In [1]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression, PassiveAggressiveClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix
import pandas as pd
import numpy as np
import xgboost
from xgboost import XGBClassifier
import lightgbm
from lightgbm import LGBMClassifier

In [2]:
df = pd.read_csv('../data/preprocessed_train.csv', encoding='utf-8')
df.head()

Unnamed: 0,title,author,text,label,removed_punc,tokens,filtered_tokens,clean_tokens,lemma_words,clean_text
0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1,House Dem Aide We Didn’t Even See Comey’s Lett...,"['house', 'dem', 'aide', 'we', 'didn’t', 'even...","['house', 'aide', 'didn’t', 'even', 'comey’s',...","['house', 'aide', 'didn’t', 'even', 'comey’s',...","['house', 'aide', 'didn’t', 'even', 'comey’s',...",house aide didn’t even comey’s letter jason ch...
1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0,Ever get the feeling your life circles the rou...,"['ever', 'get', 'the', 'feeling', 'your', 'lif...","['ever', 'feeling', 'your', 'life', 'circles',...","['ever', 'feeling', 'life', 'circles', 'rounda...","['ever', 'feeling', 'life', 'circle', 'roundab...",ever feeling life circle roundabout rather hea...
2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1,Why the Truth Might Get You Fired October 29 2...,"['why', 'the', 'truth', 'might', 'get', 'you',...","['truth', 'might', 'fired', 'october', '2016',...","['truth', 'might', 'fired', 'october', '2016',...","['truth', 'might', 'fired', 'october', '2016',...",truth might fired october 2016 tension intelli...
3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1,Videos 15 Civilians Killed In Single US Airstr...,"['videos', '15', 'civilians', 'killed', 'in', ...","['videos', 'civilians', 'killed', 'single', 'a...","['videos', 'civilians', 'killed', 'single', 'a...","['video', 'civilian', 'killed', 'single', 'air...",video civilian killed single airstrike identif...
4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1,Print \nAn Iranian woman has been sentenced to...,"['print', 'an', 'iranian', 'woman', 'has', 'be...","['print', 'iranian', 'woman', 'been', 'sentenc...","['print', 'iranian', 'woman', 'sentenced', 'ye...","['print', 'iranian', 'woman', 'sentenced', 'ye...",print iranian woman sentenced year prison iran...


In [3]:
df = df.dropna()

In [4]:
X = df['title']
y = df['label']

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 5)

In [6]:
X_train.isna().sum()

0

In [7]:
print(X_train.shape)
print(X_test.shape)

(14533,)
(3634,)


In [8]:
tfidf = TfidfVectorizer()
tfidf_train = tfidf.fit_transform(X_train)
tfidf_test = tfidf.transform(X_test)

print(tfidf_train.toarray())
print(tfidf_train.shape)
print(tfidf_test.toarray())
print(tfidf_test.shape)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
(14533, 18476)
[[0.         0.         0.         ... 0.         0.         0.        ]
 [0.30940996 0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]]
(3634, 18476)


In [9]:
pac = PassiveAggressiveClassifier(max_iter=50)
pac.fit(tfidf_train, y_train)

pred = pac.predict(tfidf_test)
print("Accuracy score : {}".format(accuracy_score(y_test, pred)))
print("Cross val score : {}".format(cross_val_score(pac, tfidf_train, y_train, cv=5)))
print("Confusion matrix : \n {}".format(confusion_matrix(y_test, pred)))

Accuracy score : 0.9320308200330215
Cross val score : [0.93188854 0.93326453 0.93257654 0.92980041 0.92670337]
Confusion matrix : 
 [[1904  147]
 [ 100 1483]]


In [10]:
lr = LogisticRegression(max_iter = 500)
lr.fit(tfidf_train, y_train)
print('Logistic Regression model fitted..')

pred = lr.predict(tfidf_test)
print("Accuracy score : {}".format(accuracy_score(y_test, pred)))
print("Cross val score : {}".format(cross_val_score(lr, tfidf_train, y_train, cv=5)))
print("Confusion matrix : \n {}".format(confusion_matrix(y_test, pred)))

Logistic Regression model fitted..
Accuracy score : 0.9174463401210787
Cross val score : [0.92707258 0.92397661 0.91812865 0.92085341 0.92085341]
Confusion matrix : 
 [[1787  264]
 [  36 1547]]


In [11]:
# xgb = XGBClassifier()
# xgb.fit(tfidf_train, y_train)

# print('XGBoost Classifier model fitted..')
# pred = xgb.predict(tfidf_test)
# print("Accuracy score : {}".format(accuracy_score(y_test, pred)))
# print("Cross val score : {}".format(cross_val_score(xgb, tfidf_train, y_train, cv=5)))
# print("Confusion matrix : \n {}".format(confusion_matrix(y_test, pred)))

In [12]:
lgbm = LGBMClassifier()
lgbm.fit(tfidf_train, y_train)

print('LightGBM Classifier model fitted..')
pred = lgbm.predict(tfidf_test)
print("Accuracy score : {}".format(accuracy_score(y_test, pred)))
print("Cross val score : {}".format(cross_val_score(lgbm, tfidf_train, y_train, cv=5)))
print("Confusion matrix : \n {}".format(confusion_matrix(y_test, pred)))

[LightGBM] [Info] Number of positive: 6224, number of negative: 8309
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.020019 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 29231
[LightGBM] [Info] Number of data points in the train set: 14533, number of used features: 1214
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.428267 -> initscore=-0.288926
[LightGBM] [Info] Start training from score -0.288926
LightGBM Classifier model fitted..
Accuracy score : 0.9221243808475509
[LightGBM] [Info] Number of positive: 4979, number of negative: 6647
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.012884 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 22623
[LightGBM] [Info] Number of data points in the train set: 11626, number of used features: 953
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.428264 -> initscore=-0.2