In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

In [2]:
df = pd.read_csv('cleaned_data.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3986 entries, 0 to 3985
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Author_website  3986 non-null   object
 1   Headline        3986 non-null   object
 2   Body            3986 non-null   object
 3   Label           3986 non-null   int64 
dtypes: int64(1), object(3)
memory usage: 124.7+ KB


In [4]:
df.head()

Unnamed: 0,Author_website,Headline,Body,Label
0,BEFOREITNEWS,nick folks days tampa numbered buccaneers bri...,nick folks days tampa numbered buccaneers bri...,0
1,BEFOREITNEWS,california doubles down stupidity is now a sa...,california doubles down stupidity is now a sa...,0
2,BEFOREITNEWS,where is president trumps sense of compassion,a potato battery can light room over month di...,0
3,BEFOREITNEWS,report jets sign running backreturn man trava...,report jets sign running backreturn man trava...,0
4,BEFOREITNEWS,jake arrieta say feel good ahead game 4 start...,jake arrieta say feel good ahead game 4 start...,0


In [5]:
df.tail()

Unnamed: 0,Author_website,Headline,Body,Label
3981,ACTIVISTPOST,us missile machinations undoes nonproliferati...,by ulson gunnarwhen come nuclear weapon upon ...,0
3982,BEFOREITNEWS,929 through 40 hbd cannonball paul camnitz cl...,red flag warning these california wildfires a...,0
3983,BEFOREITNEWS,107 chuck axed hbd brickyard adam moonlight a...,an embattled pharmaceutical company that sell...,0
3984,ACTIVISTPOST,msm ignores videos of syrians celebrating saa...,by brandon turbevillewhile western medium lam...,0
3985,BBC,facebook confirms russia ad instagram,image copyright getty imagesfacebook confirme...,1


In [6]:
df = df.sample(frac=1,random_state=24).reset_index()
df.drop(columns = ['index'],axis =1,inplace=True)

In [7]:
df.head()

Unnamed: 0,Author_website,Headline,Body,Label
0,REUTERS,accor 920 million mantra bid australia touris...,a sign bearing logo mantra group ltd displaye...,1
1,BBC,blade runner 2049 disappoints us box office,image copyright getty images image caption ry...,1
2,BEFOREITNEWS,shrimpton kerry cassidy in ireland 16 sept 17...,streamed live 9 hour agomirjam jansekerry cas...,0
3,REUTERS,apple name former honeywell executive new gen...,a security officer apple store staff seen ope...,1
4,CNN,how ferraris first female driver disappear,written jared zaugg cnnas one greatest female...,1


In [8]:
vectorizer_head = CountVectorizer()
vectorizer_body = CountVectorizer()
term_frequency_head = TfidfTransformer()
term_frequency_body = TfidfTransformer()

In [18]:
def prepare_head(sample):
    temp = vectorizer_head.fit_transform(sample)
    head_mat = term_frequency_head.fit_transform(temp).todense()
    return head_mat

def prepare_body(sample):
    temp = vectorizer_body.fit_transform(sample)
    body_mat = term_frequency_body.fit_transform(temp).todense()
    return body_mat

In [19]:
x_head = prepare_head(df['Headline'])
x_body = prepare_body(df['Body'])

In [20]:
x_head.shape, x_body.shape

((3986, 7350), (3986, 84165))

In [21]:
matrix = np.hstack((x_head,x_body))

In [22]:
matrix.shape

(3986, 91515)

In [23]:
from sklearn.model_selection import train_test_split

In [24]:
x_train,x_test,y_train,y_test = train_test_split(matrix,df['Label'],test_size = 0.2,random_state = 24)

In [25]:
x_train.shape, y_train.shape

((3188, 91515), (3188,))

# Logistic Regression

In [26]:
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import accuracy_score,mean_absolute_error

In [27]:
logistic_model = LogisticRegressionCV(Cs = 10)

In [28]:
logistic_model.fit(x_train,y_train)

LogisticRegressionCV()

In [29]:
trainpredict_logistic = logistic_model.predict(x_train)

In [30]:
trainscore_log_model = accuracy_score(y_train,trainpredict_logistic)

In [31]:
testpredict_logistic = logistic_model.predict(x_test)

In [33]:
testscore_log_model = accuracy_score(y_test,testpredict_logistic)

In [36]:
log_scores = pd.DataFrame({'Training score':trainscore_log_model,'Testing score':testscore_log_model},index=[0])
log_scores

Unnamed: 0,Training score,Testing score
0,1.0,0.982456


In [37]:
logistic_model.Cs_ , logistic_model.C_

(array([1.00000000e-04, 7.74263683e-04, 5.99484250e-03, 4.64158883e-02,
        3.59381366e-01, 2.78255940e+00, 2.15443469e+01, 1.66810054e+02,
        1.29154967e+03, 1.00000000e+04]),
 array([10000.]))

   # Decision Tree

In [38]:
from sklearn.tree import DecisionTreeClassifier

In [39]:
tree_model = DecisionTreeClassifier(random_state = 24,criterion='entropy')

In [40]:
from sklearn.model_selection import GridSearchCV

In [41]:
param_dict = {'max_depth':[80,81,82,83,86,88,90]}

In [42]:
search_cv = GridSearchCV(tree_model,param_dict,verbose=10)

In [43]:
search_cv.fit(x_train,y_train)

Fitting 5 folds for each of 7 candidates, totalling 35 fits
[CV] max_depth=80 ....................................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ........................ max_depth=80, score=0.934, total=  18.5s
[CV] max_depth=80 ....................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   18.6s remaining:    0.0s


[CV] ........................ max_depth=80, score=0.940, total=  19.4s
[CV] max_depth=80 ....................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   38.2s remaining:    0.0s


[CV] ........................ max_depth=80, score=0.933, total=  20.2s
[CV] max_depth=80 ....................................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   58.6s remaining:    0.0s


[CV] ........................ max_depth=80, score=0.939, total=  19.0s
[CV] max_depth=80 ....................................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:  1.3min remaining:    0.0s


[CV] ........................ max_depth=80, score=0.928, total=  18.3s


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  1.6min remaining:    0.0s


[CV] max_depth=81 ....................................................
[CV] ........................ max_depth=81, score=0.934, total=  19.1s
[CV] max_depth=81 ....................................................


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:  1.9min remaining:    0.0s


[CV] ........................ max_depth=81, score=0.940, total=  18.6s
[CV] max_depth=81 ....................................................


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:  2.2min remaining:    0.0s


[CV] ........................ max_depth=81, score=0.933, total=  19.6s
[CV] max_depth=81 ....................................................


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:  2.6min remaining:    0.0s


[CV] ........................ max_depth=81, score=0.939, total=  20.1s
[CV] max_depth=81 ....................................................


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:  2.9min remaining:    0.0s


[CV] ........................ max_depth=81, score=0.928, total=  19.4s
[CV] max_depth=82 ....................................................
[CV] ........................ max_depth=82, score=0.934, total=  19.7s
[CV] max_depth=82 ....................................................
[CV] ........................ max_depth=82, score=0.940, total=  20.0s
[CV] max_depth=82 ....................................................
[CV] ........................ max_depth=82, score=0.933, total=  19.4s
[CV] max_depth=82 ....................................................
[CV] ........................ max_depth=82, score=0.939, total=  19.3s
[CV] max_depth=82 ....................................................
[CV] ........................ max_depth=82, score=0.928, total=  18.5s
[CV] max_depth=83 ....................................................
[CV] ........................ max_depth=83, score=0.934, total=  18.7s
[CV] max_depth=83 ....................................................
[CV] .

[Parallel(n_jobs=1)]: Done  35 out of  35 | elapsed: 11.3min finished


GridSearchCV(estimator=DecisionTreeClassifier(criterion='entropy',
                                              random_state=24),
             param_grid={'max_depth': [80, 81, 82, 83, 86, 88, 90]},
             verbose=10)

In [44]:
search_cv.best_params_

{'max_depth': 80}

In [45]:
tree_model.max_depth = 80

In [46]:
tree_model.fit(x_train,y_train)

DecisionTreeClassifier(criterion='entropy', max_depth=80, random_state=24)

In [55]:
trainpredict_tree  = tree_model.predict(x_train)

In [56]:
testpredict_tree = tree_model.predict(x_test)
testpredict_tree[0:5]

array([0, 1, 0, 1, 1], dtype=int64)

In [57]:
trainscore_tree = accuracy_score(y_train,trainpredict_tree)
testscore_tree = accuracy_score(y_test,testpredict_tree)

In [58]:
tree_scores = pd.DataFrame({'Training score':trainscore_tree,'Testing score':testscore_tree},index=[0])
tree_scores

Unnamed: 0,Training score,Testing score
0,1.0,0.942356


# Our decition tree model is overfitting

In [None]:
#

In [None]:
# 