In [2]:
#From https://data-flair.training/blogs/advanced-python-project-detecting-fake-news/

import numpy as np
import pandas as pd
import itertools
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, HashingVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier, SGDClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix

## Below is the code from 
https://data-flair.training/blogs/advanced-python-project-detecting-fake-news/

In [3]:
# FROM DATA FLAIR
df = pd.read_csv('news.csv')
df.head()

#DataFlair - Get the labels
labels=df.label
labels.head()

#DataFlair - Split the dataset
x_train,x_test,y_train,y_test=train_test_split(df['text'], labels, test_size=0.2, random_state=7)

#DataFlair - Initialize a TfidfVectorizer
tfidf_vectorizer=TfidfVectorizer(stop_words='english', max_df=0.7)
#DataFlair - Fit and transform train set, transform test set
tfidf_train=tfidf_vectorizer.fit_transform(x_train) 
tfidf_test=tfidf_vectorizer.transform(x_test)

#DataFlair - Initialize a PassiveAggressiveClassifier
pac=PassiveAggressiveClassifier(max_iter=50)
pac.fit(tfidf_train,y_train)
#DataFlair - Predict on the test set and calculate accuracy
y_pred=pac.predict(tfidf_test)
score=accuracy_score(y_test,y_pred)
print(f'Accuracy: {round(score*100,2)}%')

#DataFlair - Build confusion matrix
confusion_matrix(y_test,y_pred, labels=['FAKE','REAL'])

Accuracy: 92.74%


array([[591,  47],
       [ 45, 584]])

Ok, so starting with accuracy of 92.9, I can see from the confusion matrix that my data is split pretty evenly over the data, so an f1 score probably isn't necassary. And i'd be leaning towards saying that the cost of a false negative is high, so I'll probably prioritise recall over precision. But we dont want to falsely call something fake, so i'll check precision too.

Rejigging it a bit to match what I'm used to realised that accuracy improved, try changing the random state

In [3]:
i=0
while i < 100 :
    i=i+5
    print('random state is : '+str(i))
    y=df['label'] 
    X=df['text']

    #DataFlair - Split the dataset
    X_train,X_test,y_train,y_test=train_test_split(X, y, test_size=0.2, random_state=i)

    #DataFlair - Initialize a TfidfVectorizer
    tfidf_vectorizer=TfidfVectorizer(stop_words='english', max_df=0.7)
    #DataFlair - Fit and transform train set, transform test set
    tfidf_train=tfidf_vectorizer.fit_transform(X_train) 
    tfidf_test=tfidf_vectorizer.transform(X_test)

    #DataFlair - Initialize a PassiveAggressiveClassifier
    pac=PassiveAggressiveClassifier(max_iter=50)
    pac.fit(tfidf_train,y_train)
    #DataFlair - Predict on the test set and calculate accuracy
    y_pred=pac.predict(tfidf_test)
    score=accuracy_score(y_test,y_pred)
    print(f'Accuracy: {round(score*100,2)}%')

    #DataFlair - Build confusion matrix
    #confusion_matrix(y_test,y_pred, labels=['FAKE','REAL'])

random state is : 5
Accuracy: 94.55%
random state is : 10
Accuracy: 94.87%
random state is : 15
Accuracy: 92.98%
random state is : 20
Accuracy: 94.63%
random state is : 25
Accuracy: 94.55%
random state is : 30
Accuracy: 94.32%
random state is : 35
Accuracy: 93.37%
random state is : 40
Accuracy: 93.61%
random state is : 45
Accuracy: 94.0%
random state is : 50
Accuracy: 93.53%
random state is : 55
Accuracy: 93.05%
random state is : 60
Accuracy: 94.4%
random state is : 65
Accuracy: 94.08%
random state is : 70
Accuracy: 93.29%
random state is : 75
Accuracy: 93.76%
random state is : 80
Accuracy: 93.69%
random state is : 85
Accuracy: 94.48%
random state is : 90
Accuracy: 93.05%
random state is : 95
Accuracy: 94.71%
random state is : 100
Accuracy: 93.45%


20 seems to be the best. Open question, why does changing the random state change it so much? maybe coz there isn't enough data?
maybe run it a few times through 20 to see what the variance is

In [4]:
i=0
while i < 10 :
    i=i+1
    print('random state is 20, i = '+str(i))
    # FROM DATA FLAIR
    df = pd.read_csv('news.csv')
    df.head()

    #DataFlair - Get the labels
    y=df['label'] 
    X=df['text']

    #DataFlair - Split the dataset
    X_train,X_test,y_train,y_test=train_test_split(X, y, test_size=0.2, random_state=20)

    #DataFlair - Initialize a TfidfVectorizer
    tfidf_vectorizer=TfidfVectorizer(stop_words='english', max_df=0.7)
    #DataFlair - Fit and transform train set, transform test set
    tfidf_train=tfidf_vectorizer.fit_transform(X_train) 
    tfidf_test=tfidf_vectorizer.transform(X_test)

    #DataFlair - Initialize a PassiveAggressiveClassifier
    pac=PassiveAggressiveClassifier(max_iter=50)
    pac.fit(tfidf_train,y_train)
    #DataFlair - Predict on the test set and calculate accuracy
    y_pred=pac.predict(tfidf_test)
    score=accuracy_score(y_test,y_pred)
    print(f'Accuracy: {round(score*100,2)}%')

    #DataFlair - Build confusion matrix
    #confusion_matrix(y_test,y_pred, labels=['FAKE','REAL'])

random state is 20, i = 1
Accuracy: 94.95%
random state is 20, i = 2
Accuracy: 95.03%
random state is 20, i = 3
Accuracy: 94.87%
random state is 20, i = 4
Accuracy: 94.87%
random state is 20, i = 5
Accuracy: 94.55%
random state is 20, i = 6
Accuracy: 95.42%
random state is 20, i = 7
Accuracy: 95.19%
random state is 20, i = 8
Accuracy: 94.87%
random state is 20, i = 9
Accuracy: 94.63%
random state is 20, i = 10
Accuracy: 94.87%


try playing around with the high level params, first test size

In [5]:
#Different test size checks
test_sizes = np.linspace(0.05, 0.5, 10)
print(test_sizes)
accuracy_df = pd.DataFrame()

for test_sizer in test_sizes :
    print('test size: '+str(test_sizer))

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_sizer, random_state=101)
    
    #DataFlair - Initialize a TfidfVectorizer
    tfidf_vectorizer=TfidfVectorizer(stop_words='english', max_df=0.7)
    #DataFlair - Fit and transform train set, transform test set
    tfidf_train=tfidf_vectorizer.fit_transform(X_train) 
    tfidf_test=tfidf_vectorizer.transform(X_test)

    #DataFlair - Initialize a PassiveAggressiveClassifier
    pac=PassiveAggressiveClassifier(max_iter=50)
    pac.fit(tfidf_train,y_train)
    #DataFlair - Predict on the test set and calculate accuracy
    y_pred=pac.predict(tfidf_test)
    score=accuracy_score(y_test,y_pred)
    
    df_temp = pd.DataFrame({'test_size':[test_sizer], 'accuracy': [round(score*100,2)]})
    accuracy_df=accuracy_df.append(df_temp)
    
accuracy_df

[0.05 0.1  0.15 0.2  0.25 0.3  0.35 0.4  0.45 0.5 ]
test size: 0.05
test size: 0.1
test size: 0.15000000000000002
test size: 0.2
test size: 0.25
test size: 0.3
test size: 0.35000000000000003
test size: 0.4
test size: 0.45
test size: 0.5


Unnamed: 0,test_size,accuracy
0,0.05,95.58
0,0.1,93.53
0,0.15,93.38
0,0.2,93.61
0,0.25,93.12
0,0.3,93.63
0,0.35,93.73
0,0.4,92.62
0,0.45,93.2
0,0.5,92.71


## COMMENT
top test size 0.25, backup 20

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=20)

#Vectorize
tfidf_vectorizer=TfidfVectorizer(stop_words='english', max_df=0.7)
tfidf_train=tfidf_vectorizer.fit_transform(X_train) 
tfidf_test=tfidf_vectorizer.transform(X_test)

#Train / Fit model
pac=PassiveAggressiveClassifier(max_iter=50)
pac.fit(tfidf_train,y_train)

#Predict
y_pred=pac.predict(tfidf_test)

accuracy=accuracy_score(y_test,y_pred)
print(f'Accuracy: {round(accuracy*100,2)}%')

precision=precision_score(y_test,y_pred, pos_label='REAL')
print(f'Precision: {round(precision*100,2)}%')

recall=recall_score(y_test,y_pred, pos_label='REAL')
print(f'Recall: {round(recall*100,2)}%')


Accuracy: 94.76%
Precision: 95.64%
Recall: 93.54%


## Comment
Ok, let me check the other vectorizors

In [7]:
vectorizers = [CountVectorizer(stop_words='english'),
               HashingVectorizer(stop_words='english'), 
               TfidfVectorizer(stop_words='english', max_df=0.7)]

names = ['CountVectorizer',
         'HashingVectorizer',
         'TfidfVectorizer']

Perf_df = pd.DataFrame()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=20)

i=0
for vectorizer in vectorizers :
    print(names[i])

    #Vectorize
    vectorizerr=vectorizer
    vectorizerr_train=vectorizerr.fit_transform(X_train) 
    vectorizerr_test=vectorizerr.transform(X_test)

    #Train / Fit model
    pac=PassiveAggressiveClassifier(max_iter=50)
    pac.fit(tfidf_train,y_train)

    #Predict
    y_pred=pac.predict(tfidf_test)

    accuracy=accuracy_score(y_test,y_pred)
    precision=precision_score(y_test,y_pred, pos_label='REAL')
    recall=recall_score(y_test,y_pred, pos_label='REAL')
    
    df_temp = pd.DataFrame({'Vectorizer':[names[i]],
                            'accuracy': [round(accuracy*100,2)],
                           'precision': [round(precision*100,2)],
                           'recall': [round(recall*100,2)]})
    
    Perf_df=Perf_df.append(df_temp)
    
    i=i+1

Perf_df

CountVectorizer
HashingVectorizer
TfidfVectorizer


Unnamed: 0,Vectorizer,accuracy,precision,recall
0,CountVectorizer,94.7,95.63,93.41
0,HashingVectorizer,94.44,95.25,93.28
0,TfidfVectorizer,93.94,94.72,92.76


## Comment
top vectorizer = Tfidf, backup is count vectorizer

ok, now to poke around a bit with a bunch of different classifiers

## ALL THE CLASSIFIERS
https://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html

In [8]:
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import SGDClassifier, LogisticRegression, LogisticRegressionCV, Perceptron
from sklearn.linear_model import RidgeClassifier, RidgeClassifierCV, PassiveAggressiveClassifier
from sklearn.naive_bayes import MultinomialNB


y=df['label'] 
X=df['text']
    
names = ["Nearest Neighbors", "Linear SVM", "RBF SVM", "Gaussian Process",
         "Decision Tree", "Random Forest", "AdaBoost",
         "Naive Bayes", "QDA", "SGD", "log_regression", "log_regression_CV", 
         "Perceptron", "RidgeClass", "RidgeClassCV", "PassiveAggressiveClassifier", "MultinomialNB"]

classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="linear", C=0.025),
    SVC(gamma=2, C=1),
    GaussianProcessClassifier(1.0 * RBF(1.0)),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    AdaBoostClassifier(),
    GaussianNB(),
    QuadraticDiscriminantAnalysis(),
    SGDClassifier(), 
    LogisticRegression(), 
    LogisticRegressionCV(max_iter=300),
    Perceptron(),
    RidgeClassifier(),
    RidgeClassifierCV(cv=5), 
    PassiveAggressiveClassifier(max_iter=50),
    MultinomialNB()]


Perf_df = pd.DataFrame()


i=0
for classifier in classifiers:
    try:
        print('classifier: '+names[i])
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=20)

        #Initialise vectorizor
        tfidf_vectorizer=TfidfVectorizer(stop_words='english', max_df=0.7)

        #fit and transform test and train set
        tfidf_train=tfidf_vectorizer.fit_transform(X_train) 
        tfidf_test=tfidf_vectorizer.transform(X_test)
        
        print('vectorized')

        classy=classifier
        classy.fit(tfidf_train,y_train)
        print('trained')
        
        y_pred=classy.predict(tfidf_test)

        accuracy=accuracy_score(y_test,y_pred)
        precision=precision_score(y_test,y_pred, pos_label='REAL')
        recall=recall_score(y_test,y_pred, pos_label='REAL')


        df_temp = pd.DataFrame({'Classifier':[names[i]],
                                'accuracy': [round(accuracy*100,2)],
                               'precision': [round(precision*100,2)],
                               'recall': [round(recall*100,2)]})

        Perf_df=Perf_df.append(df_temp)
        print(df_temp)
        print('success')
    
    except:
        print('failed')
        pass

    i=i+1
    
Perf_df

classifier: Nearest Neighbors
vectorized
trained
          Classifier  accuracy  precision  recall
0  Nearest Neighbors     61.11      99.38   20.54
success
classifier: Linear SVM
vectorized
trained
   Classifier  accuracy  precision  recall
0  Linear SVM     77.15      93.64   57.11
success
classifier: RBF SVM
vectorized
trained
  Classifier  accuracy  precision  recall
0    RBF SVM     91.35      95.18   86.69
success
classifier: Gaussian Process
vectorized
failed
classifier: Decision Tree
vectorized
trained
      Classifier  accuracy  precision  recall
0  Decision Tree     77.02      78.08   73.64
success
classifier: Random Forest
vectorized
trained
      Classifier  accuracy  precision  recall
0  Random Forest     49.12      48.96   97.16
success
classifier: AdaBoost
vectorized
trained
  Classifier  accuracy  precision  recall
0   AdaBoost     87.82      88.38   86.43
success
classifier: Naive Bayes
vectorized
failed
classifier: QDA
vectorized
failed
classifier: SGD
vectorized
trai



trained
       Classifier  accuracy  precision  recall
0  log_regression     91.54      94.08   88.24
success
classifier: log_regression_CV
vectorized




trained
          Classifier  accuracy  precision  recall
0  log_regression_CV     94.44       96.6   91.86
success
classifier: Perceptron
vectorized
trained
   Classifier  accuracy  precision  recall
0  Perceptron     93.43      92.84    93.8
success
classifier: RidgeClass
vectorized
trained
   Classifier  accuracy  precision  recall
0  RidgeClass     93.94      96.19   91.21
success
classifier: RidgeClassCV
vectorized
trained
     Classifier  accuracy  precision  recall
0  RidgeClassCV     93.81      95.92   91.21
success
classifier: PassiveAggressiveClassifier
vectorized
trained
                    Classifier  accuracy  precision  recall
0  PassiveAggressiveClassifier     94.57      95.26   93.54
success
classifier: MultinomialNB
vectorized
trained
      Classifier  accuracy  precision  recall
0  MultinomialNB     82.39      74.15   98.19
success


Unnamed: 0,Classifier,accuracy,precision,recall
0,Nearest Neighbors,61.11,99.38,20.54
0,Linear SVM,77.15,93.64,57.11
0,RBF SVM,91.35,95.18,86.69
0,Decision Tree,77.02,78.08,73.64
0,Random Forest,49.12,48.96,97.16
0,AdaBoost,87.82,88.38,86.43
0,SGD,94.26,95.72,92.38
0,log_regression,91.54,94.08,88.24
0,log_regression_CV,94.44,96.6,91.86
0,Perceptron,93.43,92.84,93.8


PAC and log_reg_cv on top. classig log_regression

# NEXT STEP
Tune the Passiveaggressiveclassifier

In [9]:

y=df['label']
X = df['text']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=20)

#Initialise vectorizor
tfidf_vectorizer=TfidfVectorizer(stop_words='english', max_df=0.7)

#fit and transform test and train set
tfidf_train=tfidf_vectorizer.fit_transform(X_train) 
tfidf_test=tfidf_vectorizer.transform(X_test)

#DataFlair - Initialize a PassiveAggressiveClassifier
pac=PassiveAggressiveClassifier(max_iter=50)
pac.fit(tfidf_train,y_train)
#DataFlair - Predict on the test set and calculate accuracy
y_pred=pac.predict(tfidf_test)


#Scoring
accuracy=accuracy_score(y_test,y_pred)
print('Accuracy: '+str({round(accuracy*100,2)}))

recall=recall_score(y_test,y_pred, pos_label='REAL')
print('Recall: '+str({round(recall*100,2)}))



#DataFlair - Build confusion matrix
confusion_matrix(y_test,y_pred, labels=['FAKE','REAL'])

Accuracy: {94.76}
Recall: {93.93}


array([[774,  36],
       [ 47, 727]])

Try tuning the C param
c param is 

In [10]:
## Try tuning the C param


y=df['label']
X = df['text']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

cs = [1,2,3,5,10, 20, 50, 100, 200, 1000]

#Initialise vectorizor
tfidf_vectorizer=TfidfVectorizer(stop_words='english', max_df=0.7)

#fit and transform test and train set
tfidf_train=tfidf_vectorizer.fit_transform(X_train) 
tfidf_test=tfidf_vectorizer.transform(X_test)

Perf_df = pd.DataFrame()
for cc in cs :
    print('c = '+str(cc))
    pac=PassiveAggressiveClassifier(C=cc, max_iter=50)
    pac.fit(tfidf_train,y_train)
    y_pred=pac.predict(tfidf_test)



    
    accuracy=accuracy_score(y_test,y_pred)
    precision=precision_score(y_test,y_pred, pos_label='REAL')
    recall=recall_score(y_test,y_pred, pos_label='REAL')


    df_temp = pd.DataFrame({'C':[cc],
                            'accuracy': [round(accuracy*100,2)],
                           'precision': [round(precision*100,2)],
                           'recall': [round(recall*100,2)]})

    Perf_df=Perf_df.append(df_temp)
    print(df_temp)
    
Perf_df


c = 1
   C  accuracy  precision  recall
0  1     93.56      93.77   93.29
c = 2
   C  accuracy  precision  recall
0  2     93.31      93.73   92.78
c = 3
   C  accuracy  precision  recall
0  3      93.5      93.76   93.16
c = 5
   C  accuracy  precision  recall
0  5     93.37      93.97   92.66
c = 10
    C  accuracy  precision  recall
0  10      93.5      93.76   93.16
c = 20
    C  accuracy  precision  recall
0  20     93.69      94.01   93.29
c = 50
    C  accuracy  precision  recall
0  50     93.69      94.12   93.16
c = 100
     C  accuracy  precision  recall
0  100     93.43      93.42   93.42
c = 200
     C  accuracy  precision  recall
0  200     93.43      93.97   92.78
c = 1000
      C  accuracy  precision  recall
0  1000     93.56      94.22   92.78


Unnamed: 0,C,accuracy,precision,recall
0,1,93.56,93.77,93.29
0,2,93.31,93.73,92.78
0,3,93.5,93.76,93.16
0,5,93.37,93.97,92.66
0,10,93.5,93.76,93.16
0,20,93.69,94.01,93.29
0,50,93.69,94.12,93.16
0,100,93.43,93.42,93.42
0,200,93.43,93.97,92.78
0,1000,93.56,94.22,92.78


c = 10 top winner, but not much difference backup 5

Next I wanted to try and include the title as a feature.

In [11]:
y=df['label']
Xs = [df['text'], df['title'], df['title'] + ' ' + df['text']]

names = ['text','title','title+text']

i=0

Perf_df = pd.DataFrame()

for X in Xs :
    print('X : '+names[i])
    print('X[0][0:70] : '+X[0][0:70])
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

    #Initialise vectorizor
    tfidf_vectorizer=TfidfVectorizer(stop_words='english', max_df=0.7)

    #fit and transform test and train set
    tfidf_train=tfidf_vectorizer.fit_transform(X_train) 
    tfidf_test=tfidf_vectorizer.transform(X_test)

    #DataFlair - Initialize a PassiveAggressiveClassifier
    pac=PassiveAggressiveClassifier(max_iter=50)
    pac.fit(tfidf_train,y_train)
    #DataFlair - Predict on the test set and calculate accuracy
    y_pred=pac.predict(tfidf_test)


    accuracy=accuracy_score(y_test,y_pred)
    precision=precision_score(y_test,y_pred, pos_label='REAL')
    recall=recall_score(y_test,y_pred, pos_label='REAL')


    df_temp = pd.DataFrame({'X':names[i],
                            'accuracy': [round(accuracy*100,2)],
                           'precision': [round(precision*100,2)],
                           'recall': [round(recall*100,2)]})

    Perf_df=Perf_df.append(df_temp)
    print(df_temp)


    i=i+1
Perf_df

X : text
X[0][0:70] : Daniel Greenfield, a Shillman Journalism Fellow at the Freedom Center,
      X  accuracy  precision  recall
0  text     93.62       94.0   93.16
X : title
X[0][0:70] : You Can Smell Hillary’s Fear
       X  accuracy  precision  recall
0  title     77.65      76.39   79.87
X : title+text
X[0][0:70] : You Can Smell Hillary’s Fear Daniel Greenfield, a Shillman Journalism 
            X  accuracy  precision  recall
0  title+text     93.37      93.74   92.91


Unnamed: 0,X,accuracy,precision,recall
0,text,93.62,94.0,93.16
0,title,77.65,76.39,79.87
0,title+text,93.37,93.74,92.91


so using the title didn't help, it just made it worse. I guess the title only has a few words so adding these words to the larger text isn't super significant. In my mind "titles are important" so they must be an important feature so I wanted to see if I could hack it a bit to use the title more. I could find examples where I could use multiple models and weight them, but it wouldn't let me use different features for each model. this I found out is actually not really great practice, weighting features is something that should be left to the algorithm to work out.

In [12]:
from sklearn.ensemble import VotingClassifier, RandomForestClassifier

from sklearn.tree import DecisionTreeClassifier

decisiontree = DecisionTreeClassifier()
forest=RandomForestClassifier()



ensemble=VotingClassifier(estimators=[('Decision Tree', decisiontree), ('Random Forest', forest)], 
                       voting='soft', weights=[1,5]).fit(tfidf_train,y_train)


print('The accuracy for DecisionTree and Random Forest is:',ensemble.score(tfidf_test,y_test))



The accuracy for DecisionTree and Random Forest is: 0.8623737373737373


next i tried vectorizing the title and text seperately and then putting them together, this worked mathmatically but I'm still not sure if its meaningful...

In [13]:
y=df['label']
X = df[['title','text']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

#Initialise vectorizor
tfidf_vectorizer=TfidfVectorizer(stop_words='english', max_df=0.7)

#fit and transform test and train set (I"M NOT SURE IF THIS IS EVEN DOABLE COZ I"M FITTING ON 2 TEXT FEATURES SEPERATELY!!)
tfidf_train_title=tfidf_vectorizer.fit_transform(X_train['title']) 
tfidf_test_title=tfidf_vectorizer.transform(X_test['title'])

tfidf_train_text=tfidf_vectorizer.fit_transform(X_train['text']) 
tfidf_test_text=tfidf_vectorizer.transform(X_test['text'])

#combine vectorized text and title features
tfidf_train_text_df = pd.DataFrame(tfidf_train_text.toarray())
tfidf_train_title_df = pd.DataFrame(tfidf_train_title.toarray())
tfidf_train_all=pd.concat([tfidf_train_text_df, tfidf_train_title_df], axis=1)


tfidf_test_text_df = pd.DataFrame(tfidf_test_text.toarray())
tfidf_test_title_df = pd.DataFrame(tfidf_test_title.toarray())
tfidf_test_all=pd.concat([tfidf_test_text_df, tfidf_test_title_df], axis=1)



#Train + predict
pac=PassiveAggressiveClassifier(max_iter=50)
pac.fit(tfidf_train_all,y_train)
#DataFlair - Predict on the test set and calculate accuracy
y_pred=pac.predict(tfidf_test_all)

#Scoring
accuracy=accuracy_score(y_test,y_pred)
precision=precision_score(y_test,y_pred, pos_label='REAL')
recall=recall_score(y_test,y_pred, pos_label='REAL')


df_temp = pd.DataFrame({'accuracy': [round(accuracy*100,2)],
                       'precision': [round(precision*100,2)],
                       'recall': [round(recall*100,2)]})

df_temp

Unnamed: 0,accuracy,precision,recall
0,92.8,93.56,91.9


## BACK TO ORIGINAL

In [14]:
y=df['label']
X = df['text']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

#Initialise vectorizor
tfidf_vectorizer=TfidfVectorizer(stop_words='english', max_df=0.7)

#fit and transform test and train set
tfidf_train=tfidf_vectorizer.fit_transform(X_train) 
tfidf_test=tfidf_vectorizer.transform(X_test)

#DataFlair - Initialize a PassiveAggressiveClassifier
pac=PassiveAggressiveClassifier(max_iter=50)
pac.fit(tfidf_train,y_train)
#DataFlair - Predict on the test set and calculate accuracy
y_pred=pac.predict(tfidf_test)


#Scoring
accuracy=accuracy_score(y_test,y_pred)
print('Accuracy: '+str({round(accuracy*100,2)}))

recall=recall_score(y_test,y_pred, pos_label='REAL')
print('Recall: '+str({round(recall*100,2)}))



#DataFlair - Build confusion matrix
confusion_matrix(y_test,y_pred, labels=['FAKE','REAL'])

Accuracy: {93.24}
Recall: {92.66}


array([[745,  49],
       [ 58, 732]])

try changing max iterations in pac

In [15]:
y=df['label']
X = df['text']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

max_iterations= [5,  50,  100, 400]

#Initialise vectorizor
tfidf_vectorizer=TfidfVectorizer(stop_words='english', max_df=0.7)

#fit and transform test and train set
tfidf_train=tfidf_vectorizer.fit_transform(X_train) 
tfidf_test=tfidf_vectorizer.transform(X_test)

Perf_df = pd.DataFrame()

for max_iteration in max_iterations :

    #DataFlair - Initialize a PassiveAggressiveClassifier
    pac=PassiveAggressiveClassifier(max_iter=max_iteration)
    pac.fit(tfidf_train,y_train)
    #DataFlair - Predict on the test set and calculate accuracy
    y_pred=pac.predict(tfidf_test)


    print("max iterations = "+str(max_iteration))
    
    #Scoring
    accuracy=accuracy_score(y_test,y_pred)
    precision=precision_score(y_test,y_pred, pos_label='REAL')
    recall=recall_score(y_test,y_pred, pos_label='REAL')


    df_temp = pd.DataFrame({'max_iterations':max_iteration,
                            'accuracy': [round(accuracy*100,2)],
                           'precision': [round(precision*100,2)],
                           'recall': [round(recall*100,2)]})

    Perf_df=Perf_df.append(df_temp)
    print(df_temp)
    
Perf_df



max iterations = 5
   max_iterations  accuracy  precision  recall
0               5     93.56      93.77   93.29
max iterations = 50
   max_iterations  accuracy  precision  recall
0              50      93.5      94.09   92.78
max iterations = 100
   max_iterations  accuracy  precision  recall
0             100     93.31      93.51   93.04
max iterations = 400
   max_iterations  accuracy  precision  recall
0             400     92.99      93.25   92.66


Unnamed: 0,max_iterations,accuracy,precision,recall
0,5,93.56,93.77,93.29
0,50,93.5,94.09,92.78
0,100,93.31,93.51,93.04
0,400,92.99,93.25,92.66


didn't help



try tuning logregcv

first the solvers

In [16]:
#Test solvers

from sklearn.linear_model import LogisticRegressionCV

y=df['label']
X = df['text']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

solvers=['newton-cg', 'lbfgs', 'liblinear','sag','saga']

#Initialise vectorizor
tfidf_vectorizer=TfidfVectorizer(stop_words='english', max_df=0.7)

#fit and transform test and train set
tfidf_train=tfidf_vectorizer.fit_transform(X_train) 
tfidf_test=tfidf_vectorizer.transform(X_test)

Perf_df = pd.DataFrame()

for slvr in solvers :
    print('Solver = '+str(slvr))
    #DataFlair - Initialize a PassiveAggressiveClassifier
    lrcv=LogisticRegressionCV(max_iter=300, solver=slvr)
    lrcv.fit(tfidf_train,y_train)
    #DataFlair - Predict on the test set and calculate accuracy
    y_pred=lrcv.predict(tfidf_test)

    
    #Scoring
    accuracy=accuracy_score(y_test,y_pred)
    precision=precision_score(y_test,y_pred, pos_label='REAL')
    recall=recall_score(y_test,y_pred, pos_label='REAL')


    df_temp = pd.DataFrame({'solver':slvr,
                            'accuracy': [round(accuracy*100,2)],
                           'precision': [round(precision*100,2)],
                           'recall': [round(recall*100,2)]})

    Perf_df=Perf_df.append(df_temp)
    print(df_temp)
    
Perf_df


Solver = newton-cg




      solver  accuracy  precision  recall
0  newton-cg     93.62      94.57   92.53
Solver = lbfgs




  solver  accuracy  precision  recall
0  lbfgs     93.62      94.57   92.53
Solver = liblinear




      solver  accuracy  precision  recall
0  liblinear     93.62      94.57   92.53
Solver = sag




  solver  accuracy  precision  recall
0    sag     93.62      94.57   92.53
Solver = saga




  solver  accuracy  precision  recall
0   saga     93.62      94.57   92.53


Unnamed: 0,solver,accuracy,precision,recall
0,newton-cg,93.62,94.57,92.53
0,lbfgs,93.62,94.57,92.53
0,liblinear,93.62,94.57,92.53
0,sag,93.62,94.57,92.53
0,saga,93.62,94.57,92.53


No impact so tried a few models from this paper: No impact so tried a few models from this paper: 

In [17]:

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

y=df['label']
X = df['text']

names = ["DTC empty", "DTC prefilled", "Random Forrest"]

classifiers = [
    DecisionTreeClassifier(),
    DecisionTreeClassifier(criterion= 'entropy',
                           max_depth = 20, 
                           splitter='best',
                           random_state=42),
    RandomForestClassifier()
]


df_results = pd.DataFrame({'Classifier':[],'Accuracy':[],'Recall':[]})



X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

#Initialise vectorizor
tfidf_vectorizer=TfidfVectorizer(stop_words='english', max_df=0.7)

#fit and transform test and train set
tfidf_train=tfidf_vectorizer.fit_transform(X_train) 
tfidf_test=tfidf_vectorizer.transform(X_test)


i=0
for classifier in classifiers:
    try:
        print('classifier: '+names[i])

        classy=classifier
        classy.fit(tfidf_train,y_train)
        #DataFlair - Predict on the test set and calculate accuracy
        y_pred=classy.predict(tfidf_test)



        #Scoring
        accuracy=100*accuracy_score(y_test,y_pred)
        print('Accuracy: '+str({round(accuracy,2)}))

        recall=100*recall_score(y_test,y_pred, pos_label='REAL')
        print('Recall: '+str({round(recall,2)}))


        df_temp=pd.DataFrame({'Classifier':names[i],'Accuracy':[accuracy],'Recall':[recall]})
        df_results=df_results.append(df_temp)


        #DataFlair - Build confusion matrix
        confusion_matrix(y_test,y_pred, labels=['FAKE','REAL'])
    
    except:
        print(names[i]+"didn't work")
        pass

    print(i)
    i=i+1
    
df_results

classifier: DTC empty
Accuracy: {81.82}
Recall: {82.53}
0
classifier: DTC prefilled
Accuracy: {83.14}
Recall: {80.89}
1
classifier: Random Forrest




Accuracy: {84.85}
Recall: {81.52}
2


Unnamed: 0,Classifier,Accuracy,Recall
0,DTC empty,81.818182,82.531646
0,DTC prefilled,83.143939,80.886076
0,Random Forrest,84.848485,81.518987


In [18]:
### Try removing whitespace and punctuation

In [19]:
 

from sklearn.linear_model import LogisticRegressionCV, PassiveAggressiveClassifier
import string 

def remove_punctuations(text):
    for punctuation in string.punctuation:
        text = text.replace(punctuation, '')
    return text




y=df['label']
Xs = [df['text'],df['text'].apply(remove_punctuations)]
X_names = ['orig','remove_punctuation']

names = ["LogRegCV", "PassiveAggC"]

classifiers = [
    LogisticRegressionCV(max_iter = 300), 
    PassiveAggressiveClassifier(max_iter = 10)
]


df_results = pd.DataFrame()
j=0
for X in Xs:
    print('X : '+X_names[j])
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

    #Initialise vectorizor
    tfidf_vectorizer=TfidfVectorizer(stop_words='english', max_df=0.7)

    #fit and transform test and train set
    tfidf_train=tfidf_vectorizer.fit_transform(X_train) 
    tfidf_test=tfidf_vectorizer.transform(X_test)


    i=0
    for classifier in classifiers:
        try:
            print('classifier: '+names[i])

            classy=classifier
            classy.fit(tfidf_train,y_train)
            #DataFlair - Predict on the test set and calculate accuracy
            y_pred=classy.predict(tfidf_test)



            #Scoring
            accuracy=100*accuracy_score(y_test,y_pred)
            print('Accuracy: '+str({round(accuracy,2)}))

            recall=100*recall_score(y_test,y_pred, pos_label='REAL')
            print('Recall: '+str({round(recall,2)}))


            df_temp=pd.DataFrame({'X':X_names[j],'Classifier':names[i],'Accuracy':[accuracy],'Recall':[recall]})
            df_results=df_results.append(df_temp)


        
        except:
            print(names[i]+" didn't work")
            pass

        i=i+1
    j=j+1

df_results

X : orig
classifier: LogRegCV




Accuracy: {93.62}
Recall: {92.53}
classifier: PassiveAggC
Accuracy: {93.69}
Recall: {92.78}
X : remove_punctuation




classifier: LogRegCV




Accuracy: {93.62}
Recall: {92.28}
classifier: PassiveAggC
Accuracy: {93.62}
Recall: {93.16}




Unnamed: 0,X,Classifier,Accuracy,Recall
0,orig,LogRegCV,93.623737,92.531646
0,orig,PassiveAggC,93.686869,92.78481
0,remove_punctuation,LogRegCV,93.623737,92.278481
0,remove_punctuation,PassiveAggC,93.623737,93.164557


### Removing whitespace improved the pac a bit

### TRY LEMMETISING AND STEMMING fro this paper
https://pdf.sciencedirectassets.com/280203/1-s2.0-S1877050920X00032/1-s2.0-S1877050920300430/main.pdf?X-Amz-Security-Token=IQoJb3JpZ2luX2VjECgaCXVzLWVhc3QtMSJGMEQCIG73FR1DntKcEGNieVvT2RqwAMP%2BhOmazJWvENPNL%2BP%2BAiAyPF69OeXkgldt3UDc8yEta2MGn%2Fis4xJ0hwLq3IFKZyq9AwjA%2F%2F%2F%2F%2F%2F%2F%2F%2F%2F8BEAMaDDA1OTAwMzU0Njg2NSIMo2Lw0p1tm1mL6XTAKpED7pMpUiUQ2ZWoCO9A%2BlTtO9PZ92smEZ%2BPegfxXdayzsvXMKwTe0jcapZof2UpndNVoP8UzDu%2BTFCo8Vyc4ZuuUupGWY16f92zaXeo2VZNgwZUvKjNvsQXabPJDYMC4vLd7YD4mtkWCGoJG%2FkankSZ1zXbNuXyu4lZ3kVk%2FwgdaXqq9rHMdnGi5mDvJmTmurxCH9RDNB%2BHo%2BjOZzrwvkEPYPQPvSGDT%2B9F6dY%2F0WD4%2BZlmk7d1rjt4tyRsMaiSV%2BXT7MLerYdYBtrnIGvNXvsTRsiQAqvcgNcf6XV%2FkKwyD0OYj31FtyqQNCFH%2FGaIvPFIUMPedQEY1bhbXQ6mAU%2FYqQKBKBTtdxG4hLl5HzK46HfmdS3luJXxABH698P2VlhWKfAhmTByxOYysF3%2FOX%2B9QKzOKs9gx%2B0jaAckJsCEr2vMSyvt9OcStpVbhGwAbpeznrEB1wLulwwLzBE8D7%2BM4yErL7m46ntBUbN2xYA2kgbj%2BQiqW1%2Bkjl%2B%2FZgTMkAa%2FltO%2BRefUfXQYRLPyk8KXSC4wu7ykhQY67AHSRoRbIm67A1tz8GkkEmm%2BS4SlxyxqAeoa4Wve6H17eaweMsFDNzdgMEvtByb03Euhk15MOYo%2BanHOWyXTlcIEuweiPDIbSyBl3rQs9zHu%2FkS4qLSG5hW2Lt1Z3BewfaL4c3nx164FeOKz1SJryWR4MseSgIidEBHtqTV1TVnMPmQErzBwUSKavoj8hOO2zvLiEGZN2ptzW6L1nSoOOagPm6Lizge2nDGp%2FmEL3SSCBQLsTqSShRYsSHEyH%2F0TXjlqwA6ctDMl3M%2FFjQIIq5xQtWaWwXxwCmxzZtxF0a4SXlARNz5PAhnYGmbszA%3D%3D&X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Date=20210522T160217Z&X-Amz-SignedHeaders=host&X-Amz-Expires=300&X-Amz-Credential=ASIAQ3PHCVTYSXSEMYXH%2F20210522%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Signature=211dd998f4b9b0f3070c561a3869be44fc06f7d1ea0d95e19456f20f4da963c1&hash=dbe30429adae210aaf84e2692d537605dd824e815605db553f3f72c958fec008&host=68042c943591013ac2b2430a89b270f6af2c76d8dfd086a07176afe7c76c2c61&pii=S1877050920300430&tid=spdf-90c09903-2fa4-4014-9ed2-521c3a7439e7&sid=dad2b0101aa7d143fc19d5753c4c3b6416eegxrqb&type=client


In [50]:
#https://www.nltk.org/api/nltk.stem.html
#Perf_df = pd.DataFrame()


# NEED TO SORT OUT THIS ONE BELOW

In [64]:
from sklearn.linear_model import LogisticRegressionCV, PassiveAggressiveClassifier
import string 

names = ["LogRegCV", "PassiveAggC"]

classifiers = [
    LogisticRegressionCV(max_iter = 300), 
    PassiveAggressiveClassifier(max_iter = 10)
]



def remove_punctuations(text):
    for punctuation in string.punctuation:
        text = text.replace(punctuation, '')
    return text


y=df['label']
X = df['text'].apply(remove_punctuations)
print('punctuation removed')

import nltk

from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import EnglishStemmer
from nltk.stem.lancaster import LancasterStemmer

w_tokenizer = nltk.tokenize.WhitespaceTokenizer()  
lemmatizer = WordNetLemmatizer()
snowballengstemmer = EnglishStemmer(ignore_stopwords=False)
Lancasterstem = LancasterStemmer()


def stem_text(text):
    #return [snowballengstemmer.stem(w) for w in w_tokenizer.tokenize(text)] #snowball-stem
    #return [Lancasterstem.stem(w) for w in w_tokenizer.tokenize(text)] #lancast-stem
    return [lemmatizer.lemmatize(w) for w in w_tokenizer.tokenize(text)] #lemmatizer

X=X.apply(stem_text)
print('stemmed')

from nltk.tokenize.treebank import TreebankWordDetokenizer

detoker=TreebankWordDetokenizer()

def detokenize(text):
    return detoker.detokenize(text)

X=X.apply(detokenize)
print('detokenized')

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

i=0
for classy in classifiers:
    print(names[i])
    #Initialise vectorizor
    tfidf_vectorizer=TfidfVectorizer(stop_words='english', max_df=0.7)

    #fit and transform test and train set
    tfidf_train=tfidf_vectorizer.fit_transform(X_train) 
    tfidf_test=tfidf_vectorizer.transform(X_test)

    print('tfidf fit and trans')


    classifyer=classy
    classifyer.fit(tfidf_train,y_train)
    print('classifier fit')
    y_pred=classifyer.predict(tfidf_test)
    print('predictions done')
    
    #Scoring
    accuracy=accuracy_score(y_test,y_pred)
    precision=precision_score(y_test,y_pred, pos_label='REAL')
    recall=recall_score(y_test,y_pred, pos_label='REAL')


    df_temp = pd.DataFrame({'lemstem?':'lemmatizer.lemmatize',
                            'classifyer':names[i],
                            'accuracy': [round(accuracy*100,2)],
                           'precision': [round(precision*100,2)],
                           'recall': [round(recall*100,2)]})

    Perf_df=Perf_df.append(df_temp).drop_duplicates(subset=['lemstem?','classifyer'])
    print(df_temp)
    i=i+1
Perf_df

punctuation removed
stemmed
detokenized
LogRegCV
tfidf fit and trans




classifier fit
predictions done
               lemstem? classifyer  accuracy  precision  recall
0  lemmatizer.lemmatize   LogRegCV     93.43      94.78    91.9
PassiveAggC
tfidf fit and trans
classifier fit
predictions done
               lemstem?   classifyer  accuracy  precision  recall
0  lemmatizer.lemmatize  PassiveAggC     93.81      94.47   93.04




Unnamed: 0,lemstem?,classifyer,accuracy,precision,recall
0,none,LogRegCV,93.62,94.8,92.28
0,none,PassiveAggC,93.88,93.92,93.8
0,snowballengstemmer.stem,LogRegCV,93.24,94.29,92.03
0,snowballengstemmer.stem,PassiveAggC,93.37,93.41,93.29
0,Lancasterstem.stem,LogRegCV,93.31,94.19,92.28
0,Lancasterstem.stem,PassiveAggC,93.37,93.3,93.42
0,lemmatizer.lemmatize,LogRegCV,93.43,94.78,91.9
0,lemmatizer.lemmatize,PassiveAggC,93.81,94.47,93.04


In [65]:
Perf_df_stem_lems = Perf_df
Perf_df_stem_lems

Unnamed: 0,lemstem?,classifyer,accuracy,precision,recall
0,none,LogRegCV,93.62,94.8,92.28
0,none,PassiveAggC,93.88,93.92,93.8
0,snowballengstemmer.stem,LogRegCV,93.24,94.29,92.03
0,snowballengstemmer.stem,PassiveAggC,93.37,93.41,93.29
0,Lancasterstem.stem,LogRegCV,93.31,94.19,92.28
0,Lancasterstem.stem,PassiveAggC,93.37,93.3,93.42
0,lemmatizer.lemmatize,LogRegCV,93.43,94.78,91.9
0,lemmatizer.lemmatize,PassiveAggC,93.81,94.47,93.04


## LEMMATIZATION AND STEMMING DIDN"T HELP
Next try ngrams

In [66]:
from sklearn.feature_extraction.text import CountVectorizer
#https://aiaspirant.com/bag-of-words/

import pandas as pd
 
sentences = ['Luckey loves Isa',
          'And Isa Loves Luckey too',
          'Finya is a dog']
 
cv = CountVectorizer(ngram_range=(1,1))
vect = cv.fit_transform(sentences)
 
df_ngram_exampe = pd.DataFrame()
df_ngram_exampe['vocabulary'] = cv.get_feature_names()
df_ngram_exampe['sentence1'] = vect.toarray()[0]
df_ngram_exampe['sentence2'] = vect.toarray()[1]
df_ngram_exampe['sentence3'] = vect.toarray()[2]
df_ngram_exampe.set_index('vocabulary', inplace=True)
df_ngram_exampe

Unnamed: 0_level_0,sentence1,sentence2,sentence3
vocabulary,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
and,0,1,0
dog,0,0,1
finya,0,0,1
is,0,0,1
isa,1,1,0
loves,1,1,0
luckey,1,1,0
too,0,1,0


In [69]:
from sklearn.feature_extraction.text import CountVectorizer
#https://aiaspirant.com/bag-of-words/

import pandas as pd
 
sentences = ['Luckey loves Isa',
          'And Isa Loves Luckey too',
          'Finya is a dog']
 
cv = CountVectorizer(ngram_range=(2,2))
vect = cv.fit_transform(sentences)
 
df_ngram_exampe = pd.DataFrame()
df_ngram_exampe['vocabulary'] = cv.get_feature_names()
df_ngram_exampe['sentence1'] = vect.toarray()[0]
df_ngram_exampe['sentence2'] = vect.toarray()[1]
df_ngram_exampe['sentence3'] = vect.toarray()[2]
df_ngram_exampe.set_index('vocabulary', inplace=True)
df_ngram_exampe

Unnamed: 0_level_0,sentence1,sentence2,sentence3
vocabulary,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
and isa,0,1,0
finya is,0,0,1
is dog,0,0,1
isa loves,0,1,0
loves isa,1,0,0
loves luckey,0,1,0
luckey loves,1,0,0
luckey too,0,1,0


In [71]:
from sklearn.feature_extraction.text import CountVectorizer
#https://aiaspirant.com/bag-of-words/

import pandas as pd
 
sentences = ['Luckey loves Isa',
          'And Isa Loves Luckey too',
          'Finya is a dog']
 
cv = CountVectorizer(ngram_range=(1,2))
vect = cv.fit_transform(sentences)
 
df_ngram_exampe = pd.DataFrame()
df_ngram_exampe['vocabulary'] = cv.get_feature_names()
df_ngram_exampe['sentence1'] = vect.toarray()[0]
df_ngram_exampe['sentence2'] = vect.toarray()[1]
df_ngram_exampe['sentence3'] = vect.toarray()[2]
df_ngram_exampe.set_index('vocabulary', inplace=True)
df_ngram_exampe

Unnamed: 0_level_0,sentence1,sentence2,sentence3
vocabulary,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
and,0,1,0
and isa,0,1,0
dog,0,0,1
finya,0,0,1
finya is,0,0,1
is,0,0,1
is dog,0,0,1
isa,1,1,0
isa loves,0,1,0
loves,1,1,0


In [72]:
from sklearn.feature_extraction.text import CountVectorizer
#https://aiaspirant.com/bag-of-words/

import pandas as pd
 
sentences = ['Luckey loves Isa',
          'And Isa Loves Luckey too',
          'Finya is a dog']
 
cv = CountVectorizer(ngram_range=(3,3))
vect = cv.fit_transform(sentences)
 
df_ngram_exampe = pd.DataFrame()
df_ngram_exampe['vocabulary'] = cv.get_feature_names()
df_ngram_exampe['sentence1'] = vect.toarray()[0]
df_ngram_exampe['sentence2'] = vect.toarray()[1]
df_ngram_exampe['sentence3'] = vect.toarray()[2]
df_ngram_exampe.set_index('vocabulary', inplace=True)
df_ngram_exampe

Unnamed: 0_level_0,sentence1,sentence2,sentence3
vocabulary,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
and isa loves,0,1,0
finya is dog,0,0,1
isa loves luckey,0,1,0
loves luckey too,0,1,0
luckey loves isa,1,0,0


In [16]:
df.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [74]:
from sklearn.linear_model import LogisticRegressionCV, PassiveAggressiveClassifier
import string 

def remove_punctuations(text):
    for punctuation in string.punctuation:
        text = text.replace(punctuation, '')
    return text




y = df['label']
X = df['text'].apply(remove_punctuations)
print('removed punctuation')

names = ["LogRegCV", "PassiveAggC"]

classifiers = [
    LogisticRegressionCV(max_iter = 300), 
    PassiveAggressiveClassifier(max_iter = 10)
]


df_results = pd.DataFrame({'Classifier':[],'NgramRange':[],'Accuracy':[],'Recall':[]})



X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05, random_state=42)

#Initialise vectorizor
ngramRange = [(1,1),(1,2)]#,(3,3),(2,2),(2,3),(1,3),(1,4)]

for ngram in ngramRange :
    print('ngram Range = '+str(ngram))
    tfidf_vectorizer=TfidfVectorizer(stop_words='english', max_df=0.7, ngram_range=ngram)

    #fit and transform test and train set
    tfidf_train=tfidf_vectorizer.fit_transform(X_train) 
    print('tfidf_train done, shape is: '+str(tfidf_train.shape))
    tfidf_test=tfidf_vectorizer.transform(X_test)
    print('tfidf_test done, shape is: '+str(tfidf_test.shape))
    


    i=0
    for classifier in classifiers:
        try:
            print('classifier: '+names[i])

            classy=classifier
            classy.fit(tfidf_train,y_train)
            print('predictor fit')
            #DataFlair - Predict on the test set and calculate accuracy
            y_pred=classy.predict(tfidf_test)

            

            #Scoring
            accuracy=100*accuracy_score(y_test,y_pred)
            print('Accuracy: '+str({round(accuracy,2)}))

            recall=100*recall_score(y_test,y_pred, pos_label='REAL')
            print('Recall: '+str({round(recall,2)}))


            df_temp=pd.DataFrame({'Classifier':names[i],'NgramRange':[ngram], 'tfidf_train.shape':[tfidf_train.shape],'Accuracy':[accuracy],'Recall':[recall]})
            df_results=df_results.append(df_temp)


        except:
            print(names[i]+"didn't work")
            pass

        print(i)
        i=i+1
    
df_results


removed punctuation
ngram Range = (1, 1)
tfidf_train done, shape is: (6018, 81737)
tfidf_test done, shape is: (317, 81737)
classifier: LogRegCV




predictor fit
Accuracy: {93.38}
Recall: {92.62}
0
classifier: PassiveAggC
predictor fit
Accuracy: {93.06}
Recall: {93.96}
1
ngram Range = (1, 2)


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort,


tfidf_train done, shape is: (6018, 1582764)
tfidf_test done, shape is: (317, 1582764)
classifier: LogRegCV




predictor fit
Accuracy: {94.64}
Recall: {97.32}
0
classifier: PassiveAggC


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort,


predictor fit
Accuracy: {94.95}
Recall: {98.66}
1


Unnamed: 0,Accuracy,Classifier,NgramRange,Recall,tfidf_train.shape
0,93.375394,LogRegCV,"(1, 1)",92.61745,"(6018, 81737)"
0,93.059937,PassiveAggC,"(1, 1)",93.959732,"(6018, 81737)"
0,94.637224,LogRegCV,"(1, 2)",97.315436,"(6018, 1582764)"
0,94.952681,PassiveAggC,"(1, 2)",98.657718,"(6018, 1582764)"


In [75]:
df_results['NgramRange','tfidf_train.shape','Classifier','Accuracy', 'Recall']

Unnamed: 0,Accuracy,Classifier,NgramRange,Recall,tfidf_train.shape
0,93.375394,LogRegCV,"(1, 1)",92.61745,"(6018, 81737)"
0,93.059937,PassiveAggC,"(1, 1)",93.959732,"(6018, 81737)"
0,94.637224,LogRegCV,"(1, 2)",97.315436,"(6018, 1582764)"
0,94.952681,PassiveAggC,"(1, 2)",98.657718,"(6018, 1582764)"


In [5]:
from sklearn.linear_model import LogisticRegressionCV, PassiveAggressiveClassifier
import string 

def remove_punctuations(text):
    for punctuation in string.punctuation:
        text = text.replace(punctuation, '')
    return text




y = df['label']
X = df['text'].apply(remove_punctuations)
print('removed punctuation')

names = ["LogRegCV", "PassiveAggC"]

classifiers = [
    LogisticRegressionCV(max_iter = 300), 
    PassiveAggressiveClassifier(max_iter = 10)
]


df_results = pd.DataFrame({'Classifier':[],'NgramRange':[],'Accuracy':[],'Recall':[]})



X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05, random_state=42)

#Initialise vectorizor
ngramRange = [(1,1),(1,2),(3,3),(2,2),(2,3),(1,3),(1,4)]

for ngram in ngramRange :
    print('ngram Range = '+str(ngram))
    tfidf_vectorizer=TfidfVectorizer(stop_words='english', max_df=0.7, ngram_range=ngram)

    #fit and transform test and train set
    tfidf_train=tfidf_vectorizer.fit_transform(X_train) 
    print('tfidf_train done, shape is: '+str(tfidf_train.shape))
    tfidf_test=tfidf_vectorizer.transform(X_test)
    print('tfidf_test done, shape is: '+str(tfidf_test.shape))
    


    i=0
    for classifier in classifiers:
        try:
            print('classifier: '+names[i])

            classy=classifier
            classy.fit(tfidf_train,y_train)
            print('predictor fit')
            #DataFlair - Predict on the test set and calculate accuracy
            y_pred=classy.predict(tfidf_test)

            

            #Scoring
            accuracy=100*accuracy_score(y_test,y_pred)
            print('Accuracy: '+str({round(accuracy,2)}))

            recall=100*recall_score(y_test,y_pred, pos_label='REAL')
            print('Recall: '+str({round(recall,2)}))


            df_temp=pd.DataFrame({'Classifier':names[i],'NgramRange':[ngram], 'tfidf_train.shape':[tfidf_train.shape],'Accuracy':[accuracy],'Recall':[recall]})
            df_results=df_results.append(df_temp)


        except:
            print(names[i]+"didn't work")
            pass

        print(i)
        i=i+1
    
df_results


removed punctuation
ngram Range = (1, 1)
tfidf_train done, shape is: (6018, 81737)
tfidf_test done, shape is: (317, 81737)
classifier: LogRegCV




predictor fit
Accuracy: {93.38}
Recall: {92.62}
0
classifier: PassiveAggC
predictor fit
Accuracy: {93.06}
Recall: {92.62}
1
ngram Range = (1, 2)


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort,


tfidf_train done, shape is: (6018, 1582764)
tfidf_test done, shape is: (317, 1582764)
classifier: LogRegCV




predictor fit
Accuracy: {94.64}
Recall: {97.32}
0
classifier: PassiveAggC


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort,


predictor fit
Accuracy: {94.95}
Recall: {97.99}
1
ngram Range = (3, 3)
tfidf_train done, shape is: (6018, 2106490)
tfidf_test done, shape is: (317, 2106490)
classifier: LogRegCV




predictor fit
Accuracy: {93.06}
Recall: {97.32}
0
classifier: PassiveAggC
predictor fit


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort,


Accuracy: {92.74}
Recall: {97.32}
1
ngram Range = (2, 2)
tfidf_train done, shape is: (6018, 1501027)
tfidf_test done, shape is: (317, 1501027)
classifier: LogRegCV




predictor fit
Accuracy: {93.69}
Recall: {96.64}
0
classifier: PassiveAggC
predictor fit


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort,


Accuracy: {94.64}
Recall: {97.99}
1
ngram Range = (2, 3)
tfidf_train done, shape is: (6018, 3607517)
tfidf_test done, shape is: (317, 3607517)
classifier: LogRegCV




predictor fit
Accuracy: {92.74}
Recall: {96.64}
0
classifier: PassiveAggC


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort,


predictor fit
Accuracy: {93.69}
Recall: {97.99}
1
ngram Range = (1, 3)
tfidf_train done, shape is: (6018, 3689254)
tfidf_test done, shape is: (317, 3689254)
classifier: LogRegCV




predictor fit
Accuracy: {94.01}
Recall: {98.66}
0
classifier: PassiveAggC


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort,


predictor fit
Accuracy: {93.69}
Recall: {98.66}
1
ngram Range = (1, 4)
tfidf_train done, shape is: (6018, 5890248)
tfidf_test done, shape is: (317, 5890248)
classifier: LogRegCV




predictor fit
Accuracy: {92.43}
Recall: {98.66}
0
classifier: PassiveAggC


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort,


predictor fit
Accuracy: {93.38}
Recall: {98.66}
1


Unnamed: 0,Accuracy,Classifier,NgramRange,Recall,tfidf_train.shape
0,93.375394,LogRegCV,"(1, 1)",92.61745,"(6018, 81737)"
0,93.059937,PassiveAggC,"(1, 1)",92.61745,"(6018, 81737)"
0,94.637224,LogRegCV,"(1, 2)",97.315436,"(6018, 1582764)"
0,94.952681,PassiveAggC,"(1, 2)",97.986577,"(6018, 1582764)"
0,93.059937,LogRegCV,"(3, 3)",97.315436,"(6018, 2106490)"
0,92.744479,PassiveAggC,"(3, 3)",97.315436,"(6018, 2106490)"
0,93.690852,LogRegCV,"(2, 2)",96.644295,"(6018, 1501027)"
0,94.637224,PassiveAggC,"(2, 2)",97.986577,"(6018, 1501027)"
0,92.744479,LogRegCV,"(2, 3)",96.644295,"(6018, 3607517)"
0,93.690852,PassiveAggC,"(2, 3)",97.986577,"(6018, 3607517)"


In [8]:
#df_results_bu2=df_results[['NgramRange','tfidf_train.shape','Classifier','Accuracy', 'Recall']]
df_results_bu2

Unnamed: 0,NgramRange,tfidf_train.shape,Classifier,Accuracy,Recall
0,"(1, 1)","(6018, 81737)",LogRegCV,93.375394,92.61745
0,"(1, 1)","(6018, 81737)",PassiveAggC,93.059937,92.61745
0,"(1, 2)","(6018, 1582764)",LogRegCV,94.637224,97.315436
0,"(1, 2)","(6018, 1582764)",PassiveAggC,94.952681,97.986577
0,"(3, 3)","(6018, 2106490)",LogRegCV,93.059937,97.315436
0,"(3, 3)","(6018, 2106490)",PassiveAggC,92.744479,97.315436
0,"(2, 2)","(6018, 1501027)",LogRegCV,93.690852,96.644295
0,"(2, 2)","(6018, 1501027)",PassiveAggC,94.637224,97.986577
0,"(2, 3)","(6018, 3607517)",LogRegCV,92.744479,96.644295
0,"(2, 3)","(6018, 3607517)",PassiveAggC,93.690852,97.986577


### Try changing the cv factor thing

In [9]:
from sklearn.linear_model import LogisticRegressionCV, PassiveAggressiveClassifier
import string 

def remove_punctuations(text):
    for punctuation in string.punctuation:
        text = text.replace(punctuation, '')
    return text




y = df['label']
X = df['text']#.apply(remove_punctuations)

names = ["LogRegCV0",
        "LogRegCV2",
        "LogRegCV3",
        "LogRegCV4",
        "LogRegCV5",
        "LogRegCV10"]

classifiers = [
    LogisticRegressionCV(max_iter = 300),
    LogisticRegressionCV(max_iter = 300, cv=2),
    LogisticRegressionCV(max_iter = 300, cv=3),
    LogisticRegressionCV(max_iter = 300, cv=4),
    LogisticRegressionCV(max_iter = 300, cv=5),
    LogisticRegressionCV(max_iter = 300, cv=10)
    
]


df_results = pd.DataFrame({'Classifier':[],'NgramRange':[],'Accuracy':[],'Recall':[]})



X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)


tfidf_vectorizer=TfidfVectorizer(stop_words='english', max_df=0.7)#, ngram_range=(1,2))

#fit and transform test and train set
tfidf_train=tfidf_vectorizer.fit_transform(X_train) 
print('tfidf_train done, shape is: '+str(tfidf_train.shape))
tfidf_test=tfidf_vectorizer.transform(X_test)
print('tfidf_test done, shape is: '+str(tfidf_test.shape))



i=0
for classifier in classifiers:
    try:
        print('classifier: '+names[i])

        classy=classifier
        classy.fit(tfidf_train,y_train)
        print('predictor fit')
        #DataFlair - Predict on the test set and calculate accuracy
        y_pred=classy.predict(tfidf_test)



        #Scoring
        accuracy=100*accuracy_score(y_test,y_pred)
        print('Accuracy: '+str({round(accuracy,2)}))

        recall=100*recall_score(y_test,y_pred, pos_label='REAL')
        print('Recall: '+str({round(recall,2)}))


        df_temp=pd.DataFrame({'Classifier':names[i],'Accuracy':[accuracy],'Recall':[recall]})
        df_results=df_results.append(df_temp)


    except:
        print(names[i]+"didn't work")
        pass

    print(i)
    i=i+1
    
df_results

tfidf_train done, shape is: (4751, 59568)
tfidf_test done, shape is: (1584, 59568)
classifier: LogRegCV0




predictor fit
Accuracy: {93.62}
Recall: {92.53}
0
classifier: LogRegCV2


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort,


predictor fit
Accuracy: {93.62}
Recall: {92.53}
1
classifier: LogRegCV3


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort,


predictor fit
Accuracy: {93.62}
Recall: {92.53}
2
classifier: LogRegCV4


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort,


predictor fit
Accuracy: {93.62}
Recall: {92.53}
3
classifier: LogRegCV5


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort,


predictor fit
Accuracy: {93.62}
Recall: {92.53}
4
classifier: LogRegCV10


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort,


predictor fit
Accuracy: {93.56}
Recall: {92.41}
5


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort,


Unnamed: 0,Accuracy,Classifier,NgramRange,Recall
0,93.623737,LogRegCV0,,92.531646
0,93.623737,LogRegCV2,,92.531646
0,93.623737,LogRegCV3,,92.531646
0,93.623737,LogRegCV4,,92.531646
0,93.623737,LogRegCV5,,92.531646
0,93.560606,LogRegCV10,,92.405063


In [10]:
#df_results_bu3=df_results[['Classifier','Accuracy', 'Recall']]
df_results_bu3

Unnamed: 0,Classifier,Accuracy,Recall
0,LogRegCV0,93.623737,92.531646
0,LogRegCV2,93.623737,92.531646
0,LogRegCV3,93.623737,92.531646
0,LogRegCV4,93.623737,92.531646
0,LogRegCV5,93.623737,92.531646
0,LogRegCV10,93.560606,92.405063
