In [1]:
import os
# Move the working directory two levels up so that relative paths used below
# resolve from there. The move is relative to the *current* working directory,
# so re-running this cell in the same kernel climbs two more levels.
os.chdir(os.path.join(os.getcwd(), os.pardir, os.pardir))

In [2]:
import pandas as pd
import numpy as np
import re

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC, SVC
from sklearn.metrics import confusion_matrix

from scripts.utils import *

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\PendragonS\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\PendragonS\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
# Load the news table (first CSV column is used as the index) and discard
# every row that contains a missing value.
news = pd.read_csv('data/news/preprocessed_news.csv', index_col=0).dropna()
news.head()

Unnamed: 0,title,text,label
0,"a you.s . budget fight loom , republican flip ...",the head of a conservative republican faction...,0
1,you.s . military to accept transgender recruit...,transgender people will be allow for the firs...,0
2,senior you.s . republican senator : 'let mr. m...,the special counsel investigation of link bet...,0
3,fbi russia probe help by australian diplomat t...,trump campaign adviser george papadopoulos te...,0
4,trump want postal service to charge 'much more...,president donald trump call on the you.s . po...,0


In [5]:
# Inputs: article text and its label column.
corpus, y = news['text'], news['label']

# TF-IDF features with English stop words removed.
# NOTE(review): the vectorizer is fitted on the whole corpus before the
# train/test split, so test documents contribute to the vocabulary and IDF
# weights - confirm this is intended.
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(corpus)

# Stratified 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=0,
)

In [6]:
# Fit a logistic-regression classifier on the training split, then score the
# held-out split: per-class probabilities and hard label predictions.
model = LogisticRegression(max_iter=10000)
model.fit(X_train, y_train)

probs = model.predict_proba(X_test)
y_hat = model.predict(X_test)

In [7]:
# Split the raw text with the same options (size, stratification, seed) as the
# feature-matrix split.
# NOTE(review): this rebinds y_train / y_test and presumably relies on the
# identical seed to line the text rows up with X_train / X_test - confirm.
split_options = dict(test_size=0.2, stratify=news['label'], random_state=0)
text_train, text_test, y_train, y_test = train_test_split(
    news['text'], news['label'], **split_options
)

In [26]:
# One row per held-out document: raw text, true label, predicted label and the
# two predicted class probabilities (columns 0 and 1 of `probs`).
test_df = pd.DataFrame({'text': text_test, 'label': y_test}).assign(
    pred=y_hat,
    prob_0=probs[:, 0],
    prob_1=probs[:, 1],
)

In [28]:
# Keep only the rows where the prediction disagrees with the true label,
# write them to disk and preview the first few.
is_wrong = test_df['label'].ne(test_df['pred'])
miss = test_df.loc[is_wrong]
miss.to_csv('data/news/missclassified.csv')
miss.head()

Unnamed: 0,text,label,pred,prob_0,prob_1
12340,a young chinese climb enthusiast s fatal fall...,0,1,0.428372,0.571628
32129,wow ! this be clearly something the white hous...,1,0,0.63825,0.36175
35017,barack obama s historic visit to hiroshima thi...,1,0,0.886892,0.113108
35503,immigrant afd alternative for germany party.fo...,1,0,0.718766,0.281234
33121,"regulating , punitive administration that have...",1,0,0.618504,0.381496


In [16]:
# ALL DOC

# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer `get_feature_names_out()` and fall back only on older releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    terms = vectorizer.get_feature_names_out()
else:
    terms = vectorizer.get_feature_names()

# total tf-idf weight of each term, summed over all documents
sums = X.sum(axis=0)

# pair every term with its total weight; flattening once avoids indexing the
# result element by element in a Python loop
data = list(zip(terms, np.asarray(sums).ravel()))

# terms ranked from highest to lowest total weight
ranking = pd.DataFrame(data, columns=['term','sum'])
display(ranking.sort_values('sum', ascending=False))

Unnamed: 0,term,sum
97024,trump,1980.816194
84541,say,1818.949507
90496,state,892.670579
76086,president,856.536266
80813,republican,760.450572
...,...,...
41401,google_ads_iframe,0.001576
24510,concat,0.001576
41397,goog_request_monitoring,0.001576
79234,readystatechange,0.001576


In [38]:
# ONLY FAKE

# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer `get_feature_names_out()` and fall back only on older releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    terms = vectorizer.get_feature_names_out()
else:
    terms = vectorizer.get_feature_names()

# mean tf-idf weight of each term over the documents labelled 1;
# the mask is a plain boolean array, so rows are selected by position
means = X[(y == 1).to_numpy()].mean(axis=0)

# pair every term with its mean weight; flattening once avoids indexing the
# result element by element in a Python loop
data = list(zip(terms, np.asarray(means).ravel()))

# ten terms with the highest mean weight
ranking = pd.DataFrame(data, columns=['term','mean'])
display(ranking.sort_values('mean', ascending=False).iloc[:10])

Unnamed: 0,term,mean
97024,trump,0.059594
47495,image,0.026407
84541,say,0.024158
73275,people,0.020457
23333,clinton,0.020199
52163,just,0.018506
57535,like,0.017681
76086,president,0.017344
69461,obama,0.017086
59756,make,0.01663


In [39]:
# ONLY REAL

# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer `get_feature_names_out()` and fall back only on older releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    terms = vectorizer.get_feature_names_out()
else:
    terms = vectorizer.get_feature_names()

# mean tf-idf weight of each term over the documents labelled 0;
# the mask is a plain boolean array, so rows are selected by position
means = X[(y == 0).to_numpy()].mean(axis=0)

# pair every term with its mean weight; flattening once avoids indexing the
# result element by element in a Python loop
data = list(zip(terms, np.asarray(means).ravel()))

# ten terms with the highest mean weight
ranking = pd.DataFrame(data, columns=['term','mean'])
display(ranking.sort_values('mean', ascending=False).iloc[:10])

Unnamed: 0,term,mean
84541,say,0.065913
97024,trump,0.044357
90496,state,0.030544
76086,president,0.026121
80813,republican,0.023803
46140,house,0.021259
41585,government,0.019999
105427,year,0.01763
99207,united,0.01739
72473,party,0.017002


In [35]:
# Held-out documents whose text matches the pattern 'clinton'.
mentions_clinton = test_df['text'].str.contains('clinton')
clinton = test_df.loc[mentions_clinton]