# TF-IDF and LR classifier

### 1. Loading Relevant Libraries and Data

In [79]:
#Data processing
import pandas as pd #dataframes 
import numpy as np

#Text processing
import nltk #Word preprocessing
from nltk.stem.porter import PorterStemmer
import re #regular expressions
from matplotlib import pyplot as mplot

# Machine Learning
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegressionCV

#Other
import pickle
from datetime import datetime


print("Libraries loaded")

Libraries loaded


In [60]:
df = pd.read_csv("fake_train.csv")
df.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


### 2. Data pre-processing 

In [61]:
del df["id"]
df.dropna(subset=["text"], axis=0, inplace=True)

In [62]:
topauthor = df.author.value_counts()[:10]
print(topauthor)

Pam Key                243
admin                  193
Jerome Hudson          166
Charlie Spiering       141
John Hayward           140
Katherine Rodriguez    124
Warner Todd Huston     122
Ian Hanchett           119
Breitbart News         118
Daniel Nussbaum        112
Name: author, dtype: int64


In [63]:
faketop = df[df.label == 1].author.value_counts()[:10]
print(faketop)

admin                                    193
Pakalert                                  86
Eddy Lavine                               85
Starkman                                  84
Gillian                                   82
Alex Ansary                               82
Editor                                    81
noreply@blogger.com (Alexander Light)     80
Dave Hodges                               77
Anonymous                                 77
Name: author, dtype: int64


In [64]:
realtop = df[df.label == 0].author.value_counts()[:10]
print(realtop)

Pam Key                242
Jerome Hudson          166
Charlie Spiering       141
John Hayward           140
Katherine Rodriguez    124
Warner Todd Huston     122
Ian Hanchett           119
Breitbart News         118
Daniel Nussbaum        112
AWR Hawkins            107
Name: author, dtype: int64


In [82]:
# IncreasingUsing both columns to increase the corpus of words
df['title_text'] = df['title'] + ' ' + df['text']

In [19]:
df['title_text'][20]

'News: Hope For The GOP: A Nude Paul Ryan Has Just Emerged From An Ayahuasca Tent With Visions Of A New Republican Party Email \nSince Donald Trump entered the election over a year ago, he has single-handedly destroyed the GOP, leaving both the House and the Senate in utter disarray. But although many political strategists believe permanent damage has been done, conservatives shouldn’t lose hope yet, because Speaker of the House Paul Ryan has just emerged fully nude from an ayahuasca tent with visions of a new Republican Party. \nA fresh GOP platform requires fresh leadership, and when Speaker Ryan journeyed to South America last week to embark on an immersive psychedelic vision quest of political rebirth, he just proved he’s the only one for the job. \nAfter traveling to Peru and entering a makeshift tent with nothing but a towel, 200 mg of DMT, and a bucket to vomit in, Paul Ryan (R-Wis.) spent the last 12 hours lying nude on the jungle floor, enduring bursts of vivid consciousness i

In [66]:
df['title_text']=df['title_text'].apply(str)

In [67]:
#Processing the text as per standart NLP techniques
def preprocessor(text):
    
    text = re.sub('<[^>]*>', '', text)
    text = re.sub(r'[^\w\s]','', text)
    text = text.lower()
    return text
    
df['title_text'] = df['title_text'].apply(preprocessor)

In [23]:
df['title_text'][20]

'news hope for the gop a nude paul ryan has just emerged from an ayahuasca tent with visions of a new republican party email \nsince donald trump entered the election over a year ago he has singlehandedly destroyed the gop leaving both the house and the senate in utter disarray but although many political strategists believe permanent damage has been done conservatives shouldnt lose hope yet because speaker of the house paul ryan has just emerged fully nude from an ayahuasca tent with visions of a new republican party \na fresh gop platform requires fresh leadership and when speaker ryan journeyed to south america last week to embark on an immersive psychedelic vision quest of political rebirth he just proved hes the only one for the job \nafter traveling to peru and entering a makeshift tent with nothing but a towel 200 mg of dmt and a bucket to vomit in paul ryan rwis spent the last 12 hours lying nude on the jungle floor enduring bursts of vivid consciousness in which he watched the

In [68]:
porter = PorterStemmer()
def tokenizer_porter(text):
    return [porter.stem(word) for word in text.split()]

### 3. Model training and evaluation

In [81]:
start_time = datetime.now()

#Training the tf-idf vectorization
tfidf = TfidfVectorizer(strip_accents=None,
                        lowercase=False,
                        preprocessor=None,
                        tokenizer=tokenizer_porter,
                        use_idf=True,
                        norm='l2',
                        smooth_idf=True)
X = tfidf.fit_transform(df['title_text'])
y = df.label.values
print("TF-IDF Vectorization of the corpus is scomplete")

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.1, shuffle=True)

#Training Logistic Regressoin
clf = LogisticRegressionCV(cv=5, scoring='accuracy', random_state=42, n_jobs=-1, verbose=3, max_iter=3000).fit(X_train, y_train)

end_time = datetime.now()
training_time_tfidfLR = (end_time - start_time).total_seconds()

#Saving the model
fake_news_model = open('fake_news_model.sav', 'wb')
pickle.dump(clf, fake_news_model)
fake_news_model.close()
print('Training time for the model is: {:.1f}s'.format(training_time_tfidfLR))

TF-IDF Vectorization of the corpus is scomplete


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:   45.5s remaining:  1.1min
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   51.8s finished


Training time for the model is: 278.7s


In [77]:
filename = 'fake_news_model.sav'
saved_clf = pickle.load(open(filename, 'rb'))
LR_prediction = saved_clf.predict(X_test)
print(classification_report(y_test, LR_prediction))

              precision    recall  f1-score   support

           0       0.98      0.97      0.97      1049
           1       0.97      0.98      0.97      1028

    accuracy                           0.97      2077
   macro avg       0.97      0.97      0.97      2077
weighted avg       0.97      0.97      0.97      2077

