In [114]:
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [115]:
ps=PorterStemmer()
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [116]:
data=pd.read_csv('/content/train.csv')

In [117]:
data.head()

Unnamed: 0,Statement,Label
0,Says the Annies List political group supports ...,False
1,When did the decline of coal start? It started...,True
2,"Hillary Clinton agrees with John McCain ""by vo...",True
3,Health care reform legislation is likely to ma...,False
4,The economic turnaround started at the end of ...,True


In [118]:
data.isnull().sum()

Statement    0
Label        0
dtype: int64

In [119]:
#seperate x &y
X=data.drop(columns='Label',axis=1)
Y=data['Label']
X
Y

0        False
1         True
2         True
3        False
4         True
         ...  
10235     True
10236     True
10237     True
10238    False
10239    False
Name: Label, Length: 10240, dtype: bool

In [120]:
#function to clean and transform data
def transform_text(text):
  text=text.lower()
  text=nltk.word_tokenize(text)
  y=[]
  for i in text:
    if i.isalnum():
      y.append(i)
  text=y[:]
  y=[]
  y=[ps.stem(word) for word in text if word not in stopwords.words('english')]
  return " ".join(y)

In [101]:
data['Statement']=data['Statement'].apply(transform_text)

In [102]:
print(data)

                                               Statement  Label
0         say anni list polit group support abort demand  False
1      declin coal start start natur ga took start be...   True
2      hillari clinton agre john mccain vote give geo...   True
3      health care reform legisl like mandat free sex...  False
4                       econom turnaround start end term   True
...                                                  ...    ...
10235  larger number shark attack florida case voter ...   True
10236      democrat becom parti atlanta metro area black   True
10237  say altern social secur oper galveston counti ...   True
10238               lift cuban embargo allow travel cuba  False
10239  depart veteran affair manual tell veteran stuf...  False

[10240 rows x 2 columns]


In [103]:
X=data['Statement'].values
Y=data['Label'].values

In [104]:
vectorizer=TfidfVectorizer()
vectorizer.fit(X)
X=vectorizer.transform(X)

In [105]:
print(X)

  (0, 6937)	0.2479480068686816
  (0, 6226)	0.13621700210761026
  (0, 5480)	0.3432626244453162
  (0, 4291)	0.3812079519870589
  (0, 3275)	0.3432626244453162
  (0, 2123)	0.40577591571178007
  (0, 690)	0.5348556431607517
  (0, 435)	0.2970927466191125
  (1, 7226)	0.20188150682360842
  (1, 6750)	0.6785289823523206
  (1, 5583)	0.14785493360425483
  (1, 4823)	0.2743530126831687
  (1, 3119)	0.2224927260756085
  (1, 3056)	0.22383327300737246
  (1, 2075)	0.2727081447947497
  (1, 1648)	0.27975461075155794
  (1, 1311)	0.19750151612482283
  (1, 1002)	0.2681559998611685
  (1, 505)	0.1963680459801407
  (2, 7641)	0.192155137008578
  (2, 4508)	0.2735772665448082
  (2, 3933)	0.26476956808168783
  (2, 3831)	0.30285034563252805
  (2, 3478)	0.2474195132217653
  (2, 3151)	0.2715268422628416
  :	:
  (10237, 4530)	0.3126285192306642
  (10237, 4335)	0.23195575212573594
  (10237, 3067)	0.3276432105752006
  (10237, 1879)	0.19058109202812845
  (10237, 628)	0.29761382788612767
  (10238, 7292)	0.3977343553550759
  

In [106]:
X.shape

(10240, 7958)

In [107]:
Y.shape

(10240,)

In [108]:
#training data
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.1,stratify=Y,random_state=2)

In [109]:
#fitting model
model=LogisticRegression()

In [110]:
model.fit(X_train,Y_train)

In [111]:
prediction_train=model.predict(X_train)
train_accuracy=accuracy_score(prediction_train,Y_train)
print(train_accuracy)

0.7650824652777778


In [112]:
prediction_test=model.predict(X_test)
test_accuracy=accuracy_score(prediction_test,Y_test)
print(test_accuracy)

0.6220703125


In [113]:
print(X)


  (0, 6937)	0.2479480068686816
  (0, 6226)	0.13621700210761026
  (0, 5480)	0.3432626244453162
  (0, 4291)	0.3812079519870589
  (0, 3275)	0.3432626244453162
  (0, 2123)	0.40577591571178007
  (0, 690)	0.5348556431607517
  (0, 435)	0.2970927466191125
  (1, 7226)	0.20188150682360842
  (1, 6750)	0.6785289823523206
  (1, 5583)	0.14785493360425483
  (1, 4823)	0.2743530126831687
  (1, 3119)	0.2224927260756085
  (1, 3056)	0.22383327300737246
  (1, 2075)	0.2727081447947497
  (1, 1648)	0.27975461075155794
  (1, 1311)	0.19750151612482283
  (1, 1002)	0.2681559998611685
  (1, 505)	0.1963680459801407
  (2, 7641)	0.192155137008578
  (2, 4508)	0.2735772665448082
  (2, 3933)	0.26476956808168783
  (2, 3831)	0.30285034563252805
  (2, 3478)	0.2474195132217653
  (2, 3151)	0.2715268422628416
  :	:
  (10237, 4530)	0.3126285192306642
  (10237, 4335)	0.23195575212573594
  (10237, 3067)	0.3276432105752006
  (10237, 1879)	0.19058109202812845
  (10237, 628)	0.29761382788612767
  (10238, 7292)	0.3977343553550759
  

In [125]:
prediction=model.predict(X_train)
print(prediction)
  

[ True  True  True ...  True False  True]
