In [1]:
import numpy as np
import pandas as pd
import sklearn
from sklearn.metrics import classification_report, confusion_matrix

## Load data 

In [2]:
df = pd.read_csv('exploration/data/cleaned_data.csv')
df.head()

Unnamed: 0,title,text,class,full_text,clean_text
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,Fake,Donald Trump Sends Out Embarrassing New Year’...,donald trump send embarrass new year eve messa...
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,Fake,Drunk Bragging Trump Staffer Started Russian ...,drunk bragging trump staffer start russian col...
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",Fake,Sheriff David Clarke Becomes An Internet Joke...,sheriff david clarke become internet joke thre...
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",Fake,Trump Is So Obsessed He Even Has Obama’s Name...,trump obsessed even obamas name code website i...
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,Fake,Pope Francis Just Called Out Donald Trump Dur...,pope francis call donald trump christmas speec...


In [3]:
#Drop null rows
df.dropna(inplace=True)
df.reset_index(inplace=True,drop=True) #reset index so that we havee an accurate index for our output dataset

df.isnull().sum()

title         0
text          0
class         0
full_text     0
clean_text    0
dtype: int64

## Build train and test sets 

In [5]:
from sklearn.model_selection import train_test_split

#drop the class column for X
X = df.drop(columns=['class'])
y = df['class']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

train_clean_text = X_train['clean_text']
test_clean_text = X_test['clean_text']

train_clean_text.head()

50076    repudiate medium q trump win consider repudiat...
25674    canada suggest could quit nafta talk dispute m...
13084    intl leader not hide disrespect obama final g ...
12923    fbi give key clinton aide immunity strange wel...
41819    top anc official say party must act corrupt me...
Name: clean_text, dtype: object

## Count vectorizer

In [6]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(min_df=0.0, max_df=1.0, ngram_range=(1, 2))

X_traincv = cv.fit_transform(train_clean_text)
X_testcv = cv.transform(test_clean_text)



### Save vectorizer


In [7]:
import pickle 

with open('vectorizer.pkl','wb') as f:
    pickle.dump(cv, f)

## Train SVM model 

### Model training and evaluation 

In [None]:
from sklearn.svm import SVC

svc= SVC()
svc.fit(X_traincv,y_train)
predictions = svc.predict(X_testcv)


print(confusion_matrix(y_test, predictions))
print(classification_report(y_test, predictions))

## Save model

In [41]:
import pickle

with open('svm_trained_model.pkl','wb') as f:
    pickle.dump(svc, f)


## Test API

In [12]:
text = ['report ivanka trump take kellyanne conway woodshed peddle product line kellyanne conway get world sht decide violate ethic rule behalf ivanka trump business ivanka pissed thursday conway whine nordstrom drop ivanka product line due lack sale desperately beg fox news viewer open wallet use prominent woman donald trump life conway begin use champion empowerment woman workplace get think go store thousand store thousand place buy buy good online go buy ivanka stuff would tell hate shopping go go get today wonderful line fully go give free commercial go buy today everybody find online promote ivanka business conway violate federal code specifically state employee shall not use public office private gain endorsement product service enterprise private gain friend relative person employee affiliate nongovernmental capacity include nonprofit organization employee officer member employee shall not use permit use government position title authority associate public office endorse product service enterprise ethic complaint bombard trump administration ever since ivanka business embroil political scandal win help sales despite apologize donald trump apparently forgive kellyanne conway still face ivanka confrontation reportedly not pretty accorde politico source close trump say daughter scold conway drag brand ethic mess tell not mention tv continuation conversation ivanka trump father week earlier leave business politic conway aware ivanka take kellyanne conway woodshed hurt company another example chaos infighting occur inside white house since trump take office one wonder long kellyanne conway get fire way repeatedly embarrass television defense trump day soon especially keep anger ivanka feature image via yana paskova getty image',
       'trump spokesman promise reporter never lie washington reuters president donald trumps press secretary promise reporter monday would never lie weekend briefing make statement crowd size trump inauguration debunk comment reporter saturday become know alternative fact briefing white house spokesman sean spicer declare trumps crowd large audience ever witness inauguration period photograph show crowd trump swearing friday small barack obamas first presidential inauguration spicer statement draw criticism trump adviser kellyanne conway say sunday white house want put alternative fact counter say biased medium first formal white house briefing monday spicer ask reporter intend always tell truth lectern intention never lie reply spicer defend right give administration point view say include television online viewer remark saturday size inauguration crowd tell reporter trump adviser frustrate demoralize coverage call constant attempt undermine credibility want healthy relationship press spicer say spicer answer question wide range policy issue focus trade policy not take question reporter saturday move away acrimonious session saturday say martha kumar political scientist emeritus professor towson university maryland study relationship white house press corps spicer take question reporter twice average kumar say interview also say would start take question four skype seat later week allow news organization outside washington participate']

In [5]:
import requests 
#aws
#url ="http://54.235.235.78:8080/predict"

#local
url = "http://192.168.4.20:8080/predict"

r = requests.post(url, json = text)
r.text.strip()

'{"predictions":["Fake","Real"]}'

In [6]:
# These are unprocessed and unseen texts 
raw_txt = ['If this is what s to come, IT S GOING TO BE AWESOME! Vicente Fox apologized to Trump and Trump shot back:  Get your money ready, you re paying for the wall  (Videos below)Former President Of Mexico Vicente Fox was interviewed by Breitbart News exclusively when Fox gave this apology to Donald Trump: I apologize. Forgiveness is one of the greatest qualities that human beings have, is the quality of a compassionate leader. You have to be humble. You have to be compassionate. You have to love thy neighbor,  ',
          'MAXINE GOT A MAKEOVER and is hopping mad about speculation she s running for POTUS in 2020. It can t help that Tucker Carlson is mocking her just about every night.Waters was in a foul mood during an interview with her favorite news anchor Joy Reid. Reid is a foot soldier for Waters. These two get together every other day to bash Trump. Pitiful!The funny thing is that no one was criticizing Waters. They were merely speculating if she is considering a presidential run, something she has admitted to if she had the support of millennials. Right?Waters told Reid: Just because I m going to New Hampshire to be at a Democratic Party event for one of my colleagues, they made this story up. , or to make people uncomfortable with me, all of that. So you re gonna be hearing a lot more from them these people who are all, you know, aligned around trying to discredit Maxine Waters because she has stayed on Trump s case so much. And so you re going to hear a lot more from them. Don t believe anything they re saying.']

In [7]:
r = requests.post(url, json = raw_txt)
r.text.strip()

'{"predictions":["Fake","Fake"]}'

In [8]:

df= pd.read_csv('exploration/data/labeled_data.csv')
df.describe()

Unnamed: 0,title,text,class
count,59221,59221.0,59221
unique,52105,52386.0,2
top,no title,,Fake
freq,186,733.0,37046


In [9]:
# taking top 1000 items of the dataframe to use as input for the prediction
# as we can see below, all 1000 texts are Fake

df_text = df.head(1000)
df_text.describe()

Unnamed: 0,title,text,class
count,1000,1000,1000
unique,1000,1000,1
top,Congressional Black Caucus: Jeff Sessions Has...,A backlash has ensued after Donald Trump launc...,Fake
freq,1,1,1000


In [10]:
df_text

Unnamed: 0,title,text,class
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,Fake
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,Fake
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",Fake
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",Fake
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,Fake
...,...,...,...
995,GOP Rep. Wants A $30k A Year Housing Allowanc...,"Not long ago, Rep. Jason Chaffetz (R-Utah), sa...",Fake
996,Brace Yourself For 74 Percent Higher Health C...,"According to a new report, health insurance pr...",Fake
997,Grandma Will Have To Pay More Than $20k A Yea...,The Congressional Budget Office released their...,Fake
998,The Absolutely Cringeworthy Moment Trump Trie...,Donald Trump was in a room full of Irish repor...,Fake


In [11]:
# here we are taking only the 'text' column of the dataframe and predicting to see if it labels
# all of them as false

r = requests.post(url, json = df_text['text'].to_list())
r.text.strip()

'{"predictions":["Fake","Fake","Fake","Fake","Fake","Fake","Fake","Fake","Fake","Fake","Fake","Fake","Fake","Fake","Fake","Fake","Fake","Fake","Fake","Fake","Fake","Fake","Fake","Fake","Fake","Fake","Fake","Fake","Fake","Fake","Fake","Fake","Fake","Fake","Fake","Fake","Fake","Fake","Fake","Fake","Fake","Fake","Fake","Fake","Fake","Fake","Fake","Fake","Fake","Fake","Fake","Fake","Fake","Fake","Fake","Fake","Fake","Fake","Fake","Fake","Fake","Fake","Fake","Fake","Fake","Fake","Fake","Fake","Fake","Fake","Fake","Fake","Fake","Fake","Fake","Fake","Fake","Fake","Fake","Fake","Fake","Fake","Fake","Fake","Fake","Fake","Fake","Fake","Fake","Fake","Fake","Fake","Fake","Fake","Fake","Fake","Fake","Fake","Fake","Fake","Fake","Fake","Fake","Fake","Fake","Fake","Fake","Fake","Fake","Fake","Fake","Fake","Fake","Fake","Fake","Fake","Fake","Fake","Fake","Fake","Fake","Fake","Fake","Fake","Fake","Fake","Fake","Fake","Fake","Fake","Fake","Fake","Fake","Fake","Fake","Fake","Fake","Fake","Fake","Fake","Fa

All 1000 values were correctly predicted as fake