In [1]:
#basics libraries
import pandas as pd 
import numpy as np
import pickle
import matplotlib.pyplot as plt

#sklearn libraries
from sklearn.model_selection import train_test_split 
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, mean_squared_error
from sklearn.model_selection import validation_curve
import sklearn.metrics as metrics
from sklearn.preprocessing import PolynomialFeatures
from scipy.sparse import csr_matrix

In [2]:
# Load the cleaned news dataset from CSV.
df1 = pd.read_csv('./data/clean_news.csv')

# Load the pickled spaCy-derived objects for the titles and the article text.
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only load pickle files that were produced by this project.
# The files are opened in context managers so the handles are closed as soon
# as loading finishes (the bare open(...) form leaked them).
with open("./data/title_spacy.pkl", "rb") as title_file:
    title_spacy_pickle = pickle.load(title_file)
with open("./data/text_spacy.pkl", "rb") as text_file:
    text_spacy_pickle = pickle.load(text_file)

In [3]:
# Preview the dataset as loaded, hiding the 'Unnamed: 0' column
# (df1 itself is not modified; drop returns a new frame).
df1.drop(columns='Unnamed: 0')

Unnamed: 0,title,text,author,label
0,House Dem Aide: We Didn’t Even See Comey’s Let...,House Dem Aide: We Didn’t Even See Comey’s Let...,0,1
1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Ever get the feeling your life circles the rou...,0,0
2,Why the Truth Might Get You Fired,"Why the Truth Might Get You Fired October 29, ...",1,1
3,15 Civilians Killed In Single US Airstrike Hav...,Videos 15 Civilians Killed In Single US Airstr...,0,1
4,Iranian woman jailed for fictional unpublished...,Print \nAn Iranian woman has been sentenced to...,0,1
...,...,...,...,...
19343,Rapper T.I.: Trump a ’Poster Child For White S...,Rapper T. I. unloaded on black celebrities who...,0,0
19344,"N.F.L. Playoffs: Schedule, Matchups and Odds -...",When the Green Bay Packers lost to the Washing...,0,0
19345,Macy’s Is Said to Receive Takeover Approach by...,The Macy’s of today grew from the union of sev...,0,0
19346,"NATO, Russia To Hold Parallel Exercises In Bal...","NATO, Russia To Hold Parallel Exercises In Bal...",0,1


### spaCy Data

In [4]:
# Wrap the unpickled text features in a pandas DataFrame with sparse columns.
# NOTE(review): assumes text_spacy_pickle is a SciPy sparse matrix, which is
# what from_spmatrix expects -- confirm against the code that wrote the pickle.
pd_text_spacy = pd.DataFrame.sparse.from_spmatrix(text_spacy_pickle)

In [None]:
#pd_title_spacy = pd.DataFrame.sparse.from_spmatrix(title_spacy_pickle)

In [5]:
# Convert the sparse-backed frame into ordinary dense columns.
# NOTE(review): this materialises every zero entry. With the ~138k feature
# columns shown in the frame displayed further down, the dense copy can need a
# very large amount of memory -- consider keeping the data sparse.
text_spacy = pd_text_spacy.sparse.to_dense()
# Title features are not used at the moment:
#title_spacy = pd_title_spacy.sparse.to_dense()

In [6]:
# Keep only the non-text columns of the dataset: the 'Unnamed: 0' column and
# the raw 'title' / 'text' strings are dropped here, and the spaCy-derived
# text features are attached in the following cell.
df = df1.drop(columns=['Unnamed: 0', 'title', 'text'])

In [7]:
# Attach the remaining dataset columns to the spaCy text features, side by side.
# The right-hand frame is rebuilt from df1 instead of reusing `df`, so this
# cell is idempotent: the original `df = pd.concat([text_spacy, df])` appended
# the feature columns again every time the cell was re-run.
df = pd.concat(
    [text_spacy, df1.drop(columns=['Unnamed: 0', 'title', 'text'])],
    axis=1,
)

In [8]:
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,138521,138522,138523,138524,138525,138526,138527,138528,author,label
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,1
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19343,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0
19344,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0
19345,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0
19346,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1


In [9]:
# Separate the feature matrix (everything except 'label') from the target.
X = df.drop(columns='label')
# The feature frame mixes integer column labels (0, 1, 2, ...) with string
# ones ('author'). Recent scikit-learn versions refuse to fit on a DataFrame
# whose column names are of mixed types, so normalise them all to strings.
# X is a new frame returned by drop(), so df keeps its original labels.
X.columns = X.columns.astype(str)
y = df['label']

In [10]:
# Split the data into a 70% training set and a 30% test set.
# A fixed random_state makes the shuffle (and therefore every metric computed
# later in the notebook) reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, shuffle=True, random_state=42
)
# Display the resulting shapes as a sanity check.
X_train.shape, X_test.shape

((13543, 138530), (5805, 138530))

In [13]:
def pickle_load(path):
    """Return the object stored in the pickle file at ``path``.

    The file is opened in a context manager so the handle is always closed.
    NOTE(review): pickle can execute arbitrary code while loading -- only use
    this on files generated by this project.
    """
    with open(path, "rb") as handle:
        return pickle.load(handle)


# The original cell called an undefined name (`picke_load`) and discarded the
# results, so it failed with NameError on a fresh kernel.
# NOTE(review): presumably the intent was to restore a previously saved
# train/test split -- confirm. Loading is all-or-nothing: the list is built
# completely before anything is assigned, so if any file is missing the split
# produced by train_test_split earlier in the notebook is left untouched.
try:
    X_train, X_test, y_train, y_test = [
        pickle_load(f'./data/{name}.pkl')
        for name in ('X_train', 'X_test', 'y_train', 'y_test')
    ]
except FileNotFoundError as err:
    print(f"Cached split not found ({err.filename}); keeping the in-memory split.")

In [14]:
from sklearn.linear_model import LogisticRegression

# Instantiate a logistic regression classifier; no arguments are passed, so
# the library's default hyper-parameters are used.
lr = LogisticRegression()

In [15]:
# Fit the logistic regression on the training split
# (predictions are computed in the next cell).
lr.fit(X_train, y_train)

LogisticRegression()

In [16]:
# Predict class labels for the held-out test split.
y_pred_lr = lr.predict(X_test)

#### Accuracy and root mean squared error

In [17]:
# Report test-set accuracy.
print("Accuracy is: ", metrics.accuracy_score(y_test, y_pred_lr))
# The square root of the MSE is the *root* mean squared error, so the label
# says RMSE (it previously claimed to be the plain mean squared error).
print("Root Mean Squared Error is:", np.sqrt(mean_squared_error(y_test, y_pred_lr)))

Accuracy is:  0.9595176571920758
Mean Squared Error is: 0.20120224354595106


#### Confusion matrix

In [18]:
# Confusion matrix of true test labels (rows) vs. predicted labels (columns).
lr_cm = confusion_matrix(y_test, y_pred_lr)
print(lr_cm)

[[3019  110]
 [ 125 2551]]


In [19]:
# Per-class precision, recall and F1 on the test split.
print(classification_report(y_test, y_pred_lr))

              precision    recall  f1-score   support

           0       0.96      0.96      0.96      3129
           1       0.96      0.95      0.96      2676

    accuracy                           0.96      5805
   macro avg       0.96      0.96      0.96      5805
weighted avg       0.96      0.96      0.96      5805



In [20]:
# Take the second column of predict_proba as the score for each test sample,
# then compute the ROC AUC of those scores against the flattened true labels.
y_pred_prob_lr = lr.predict_proba(X_test)[:, 1]
metrics.roc_auc_score(np.asarray(y_test).ravel(), y_pred_prob_lr)

0.9915094628053969