In [12]:
import pandas as pd
import numpy as np
import seaborn as sns
import re
from textblob import TextBlob
import csv
import nltk
from nltk.stem import SnowballStemmer
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize
import os
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.manifold import TSNE
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.metrics import accuracy_score, f1_score, roc_curve, auc
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.tokenize import word_tokenize, WhitespaceTokenizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [3]:

def explore_data(dataset):
	df = pd.read_csv(os.path.join(dataset))
	return df 
#data is taken from lair dataset
train_news = explore_data('train.csv')
train_news.head(100)

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1
...,...,...,...,...,...
95,95,White House Confirms More Gitmo Transfers Befo...,Edwin Mora,President Barack Obama will likely release mor...,0
96,96,The Geometry of Energy and Meditation of Buddha,,License DMCA \nA mandala is a visual symbol of...,1
97,97,Poll: Most Voters Have Not Heard of Democratic...,Katherine Rodriguez,There is a minefield of potential 2020 electio...,0
98,98,Migrants Confront Judgment Day Over Old Deport...,Vivian Yee,There are a little more than two weeks between...,0


In [6]:

df1 = train_news[['text', 'label']]


In [7]:

# Getting rid of empty lines
df1 = df1[df1.text.isna() == False]
length_df1 = len(df1)

In [8]:

# Build sublist of original df1, contains # lines picked at random, out of 20671 possible
random_indexes = list(np.random.choice(length_df1 - 2, 3000, replace=False))
df1 = df1.iloc[random_indexes]



In [13]:
# Function dissects text i, attributes polarity scores, positive/negative/neutral, polarity or not, and subject
def sentiment_analyzer(dataframe):
    sid = SentimentIntensityAnalyzer()
    scores = [sid.polarity_scores(i) for i in dataframe.text]
    compounds = np.array([i['compound'] for i in scores], dtype='float32')
    abs_compounds = np.array([np.sqrt(i ** 2) for i in compounds], dtype='float32')
    negs = np.array([i['neg'] for i in scores], dtype='float32')
    poss = np.array([i['pos'] for i in scores], dtype='float32')
    neus = np.array([i['neu'] for i in scores], dtype='float32')
    sent = dataframe['text'].apply(lambda x: TextBlob(x).sentiment)
    pol = np.array([s[0] for s in sent], dtype='float32')
    abs_pol = np.array([np.sqrt(i ** 2) for i in pol], dtype='float32')
    subj = np.array([s[1] for s in sent], dtype='float32')

    return compounds, abs_compounds, negs, poss, neus, sent, pol, abs_pol, subj


compounds, abs_compounds, negs, poss, neus, sent, pol, abs_pol, subj = sentiment_analyzer(df1)


In [14]:

# Adding columns to df1, matching them with newly created variables
df1['compounds'] = compounds
df1['abs_compounds'] = abs_compounds
df1['negs'] = negs
df1['neus'] = neus
df1['poss'] = poss
df1['pol'] = pol
df1['abs_pol'] = abs_pol
df1['subj'] = subj



In [15]:

X = df1[['compounds', 'negs', 'neus', 'poss', 'pol', 'subj']]
y = df1['label']


In [16]:

# First classifier
lrxtrain, lrxtest, lrytrain, lrytest = train_test_split(X, y)
lr = LogisticRegression()
lr.fit(lrxtrain, lrytrain)
lrpreds = lr.predict(lrxtest)
accuracy = accuracy_score(lrytest, lrpreds)
f1 = f1_score(lrytest, lrpreds)

In [17]:
print(accuracy, f1)


0.5306666666666666 0.5056179775280899


In [18]:

x_values = df1[['text', 'compounds', 'abs_compounds', 'negs', 'neus', 'poss', 'pol', 'abs_pol', 'subj']]
y_values = df1['label']
xtrain, xtest, ytrain, ytest = train_test_split(x_values, y_values)



In [19]:

# Cleans article from numbers, capital letters, punctuation and spaces for better classifier results
def clean_article(article):
    art = re.sub("[^A-Za-z0-9' ]", '', str(article))
    art2 = re.sub("[( ' )(' )( ')]", ' ', str(art))
    art3 = re.sub("\s[A-Za-z]\s", ' ', str(art2))
    return art3.lower()



In [20]:

### Tokenize
# Stop_words will ignore common english words which are noise (the / a / an / etc.)
# Max_df / min_df : ignore words which frequencies are above/under those thresholds

bow = CountVectorizer(stop_words='english', ngram_range=(1, 2), max_features=998, max_df=1.0, min_df=1, binary=False)


In [21]:

training_data = bow.fit_transform(xtrain.text)
test_data = bow.transform(xtest.text)

In [22]:

dftrain = pd.DataFrame(training_data.toarray())
dftrain.columns = bow.get_feature_names()
# dftrain = dftrain.drop('s', axis=1)
# dftrain = dftrain.drop('m', axis=1)


In [23]:

dftest = pd.DataFrame(test_data.toarray())
dftest.columns = bow.get_feature_names()
# dftest = dftest.drop('s', axis=1)
# dftest = dftest.drop('m', axis=1)


In [24]:

### Set up 2nd classifier
lr2 = LogisticRegression()
lr2.fit(dftrain, ytrain)
lr2_preds = lr2.predict(dftest)
accuracy = accuracy_score(ytest, lr2_preds)
f1 = f1_score(ytest, lr2_preds)

print(accuracy, f1)

0.928 0.926027397260274


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [26]:
import pickle
pickle.dump(lr2, open("model.pkl", "wb"), protocol=2)
pickle.dump(clean_article, open("clean_article.pkl", 'wb'))
pickle.dump(bow, open("bow2.pkl", 'wb'), protocol=2)
