In [145]:
#impoting regular libraries

import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt

In [54]:
import pickle

#----------------------------------------

from tqdm import tqdm

tqdm.pandas()

  from pandas import Panel


In [130]:
# Importing libraries and packages for the preprocessing

#nltk
from nltk.stem import WordNetLemmatizer #normalization library
from nltk.tokenize import RegexpTokenizer # tokenizer library
import nltk
from nltk.corpus import stopwords

#spacy
import spacy #deep learning nlp preprocesser
import string #punctuation library
from spacy.lang.en.stop_words import STOP_WORDS #importing stop words library

#tfidf
from sklearn.feature_extraction.text import TfidfVectorizer #tfid matrix


#sklearn
from sklearn.model_selection import train_test_split 
from sklearn.pipeline import Pipeline
import sklearn.metrics as metrics
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, mean_squared_error
from sklearn.svm import LinearSVC #model 1 
from sklearn.naive_bayes import MultinomialNB #model2
from sklearn.linear_model import LogisticRegression #model3
from sklearn.ensemble import RandomForestClassifier #model4

# Loading the dataset

In [25]:
df1 = pd.read_csv('data/clean_news.csv', index_col = False )

In [26]:
df2 = pd.read_csv('data/clean_news_notitle.csv', index_col = False )

In [27]:
df1 = df1.drop(['Unnamed: 0'], axis=1).copy()
df2 = df2.drop(['Unnamed: 0'], axis=1).copy()

In [28]:
df1.head()

Unnamed: 0,title,text,author,label
0,House Dem Aide: We Didn’t Even See Comey’s Let...,House Dem Aide: We Didn’t Even See Comey’s Let...,0,1
1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Ever get the feeling your life circles the rou...,0,0
2,Why the Truth Might Get You Fired,"Why the Truth Might Get You Fired October 29, ...",1,1
3,15 Civilians Killed In Single US Airstrike Hav...,Videos 15 Civilians Killed In Single US Airstr...,0,1
4,Iranian woman jailed for fictional unpublished...,Print \nAn Iranian woman has been sentenced to...,0,1


In [29]:
df2.head()

Unnamed: 0,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,0,Ever get the feeling your life circles the rou...,0
2,1,"Why the Truth Might Get You Fired October 29, ...",1
3,0,Videos 15 Civilians Killed In Single US Airstr...,1
4,0,Print \nAn Iranian woman has been sentenced to...,1


In [30]:
text_clean = pd.read_pickle('./data/cleaned_text.pkl')

In [31]:
text_clean.head()

0    [house, dem, aide, comey, letter, jason, chaff...
1    [feeling, life, circle, roundabout, head, stra...
2    [truth, fire, october, 29, 2016, tension, inte...
3    [videos, 15, civilian, kill, single, airstrike...
4    [print, iranian, woman, sentence, year, prison...
Name: text, dtype: object

In [32]:
title_clean = pd.read_pickle('./data/cleaned_title.pkl')

In [33]:
title_clean.head()

0    [house, dem, aide, comey, letter, jason, chaff...
1    [flynn, hillary, clinton, big, woman, campus, ...
2                           [truth, fire, truth, fire]
3    [15, civilians, kill, single, airstrike, ident...
4    [iranian, woman, jail, fictional, unpublished,...
Name: title, dtype: object

# Preprocessing our data
Since our data is all text, we have to convert it to numbers so we can then try different classification models. 

To do so, we are going to tokenize, normalize, remove stop words and then convert it to a TF-IDF matrix. First, we divide our data in target and data.

# Preprocess our data using nltk

Chopping a character sequence into pieces and throwing away commas, punctuation marks,etc.

In [49]:
from nltk.corpus import stopwords

def preprocess_text(text):
    # Tokenise words while ignoring punctuation
    tokeniser = RegexpTokenizer(r'\w+')
    tokens = tokeniser.tokenize(text)
    
    # Lowercase and lemmatise 
    lemmatiser = WordNetLemmatizer()
    lemmas = [lemmatiser.lemmatize(token.lower(), pos='v') for token in tokens]
    
    # Remove stopwords
    stopwords = set(stopwords.words('english'))
    keywords= [lemma for lemma in lemmas if lemma not in stopwords]
    return keywords


In [None]:
df1['text'] = [preprocess_text(row)for row in df1['text']]    

# Preprocessing data with Spacy

We performed this text in the topic modeling stage so we are going to use those datasets already:

In [40]:
#defining variables for punctuation and stop words 
punct = string.punctuation
stopwords = list(STOP_WORDS)
stopwords.append(['·','»','«','–','...','--'])
    
    #declaring the spacy model name we are going to be using and loading it
spacy_model_name = 'en_core_web_sm'

    #if not spacy.util.is_package(spacy_model_name):
        #spacy.cli.download(spacy_model_name)
nlp = spacy.load(spacy_model_name)
    
    
#cleaning text function (try to import it instead!)
def cleaning_news(article):
    
    doc = nlp(article)
    
    #turning the articles into tokens and lowering the case and lemmatizing them
    tokens = []
    for token in doc:
        if token.lemma_ != "-PRON-":
            temp = token.lemma_.lower().strip()
        else:
            temp = token.lower_
        tokens.append(temp)
    
    #storing clean tokens which aren't stopwords in a list.
    cleaned_tokens = []
    for token in tokens:
        if token not in stopwords and token not in punct:
            cleaned_tokens.append(token)
            
    #double cleaning tokens to eliminate any trace of punctuation and signs:
    pat = r'[^a-zA-z.,!?/:;\"\'\s]'
    cleantokens = cleaned_tokens.copy()
    for word in cleaned_tokens:
        word = re.sub(pat, '', word)
        cleantokens.append(word)
        for word in cleantokens: 
            if word == '':
                cleantokens.remove('')
                

    return cleantokens

# Vectorizing our datasets

# 1. Turning our articles into Vectors with the TF-IDF matrix

#### - Treating nltk preprocessed columns

In [45]:
tfidf = TfidfVectorizer(analyzer=preprocess_text) #vectorizer function nltk

In [50]:
#applying the tfidf function to each of the text columns 
title_nltk = tfidf.fit_transform(df1['title'])
text_nltk = tfidf.fit_transform(df1['text'])

In [51]:
title_nltk.shape, text_nltk.shape

((19348, 16131), (19348, 115909))

In [72]:
title_spacy_nltk = pd.DataFrame.sparse.from_spmatrix(title_nltk)

In [73]:
text_spacy_nltk = pd.DataFrame.sparse.from_spmatrix(text_nltk)

In [75]:
text_spacy_nltk

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,115899,115900,115901,115902,115903,115904,115905,115906,115907,115908
0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19343,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19344,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19345,0.0,0.0,0.012537,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19346,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### - Treating spacy preprocessed columns

In [41]:
tfidf = TfidfVectorizer(tokenizer = cleaning_news) #vectorizer function

In [43]:
#applying the tfidf function to each of the text columns 
title_spacy = tfidf.fit_transform(df1['title'])
text_spacy = tfidf.fit_transform(df1['text'])

In [52]:
title_spacy.shape, text_spacy.shape

((19348, 16815), (19348, 138529))

In [91]:
pd_title_spacy = pd.DataFrame.sparse.from_spmatrix(title_spacy)

In [92]:
pd_title_spacy

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16805,16806,16807,16808,16809,16810,16811,16812,16813,16814
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19343,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19344,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19345,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19346,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [86]:
pd_text_spacy = pd.DataFrame.sparse.from_spmatrix(text_spacy)

In [89]:
pd_text_spacy

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,138519,138520,138521,138522,138523,138524,138525,138526,138527,138528
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19343,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19344,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19345,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19346,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [56]:
pickle.dump(title_spacy, open("./data/title_spacy.pkl", "wb"))
pickle.dump(text_spacy, open("./data/text_spacy.pkl", "wb"))

pickle.dump(title_nltk, open("./data/title_nltk.pkl", "wb"))
pickle.dump(text_nltk, open("./data/text_nltk.pkl", "wb"))


In [57]:
title_spacy_pickle = pickle.load(open("./data/title_spacy.pkl", "rb"))

In [58]:
title_spacy_pickle

<19348x16815 sparse matrix of type '<class 'numpy.float64'>'
	with 163336 stored elements in Compressed Sparse Row format>

# 2. Word2Vec transformation

##### Loading our data

In [84]:
title_word2vec = title_clean
text_word2vec = text_clean

In [85]:
type(title_word2vec)

pandas.core.series.Series

##### Defining our model

In [None]:
import gensim.downloader as api
from gensim.models import Word2Vec
import numpy as np

In [None]:
model_title = Word2Vec(title_word2vec, min_count=1,size= 500,workers=5)

In [None]:
model_text = Word2Vec(text_word2vec, min_count=1,size= 500,workers=5)

In [None]:
# loading google news dataset 

corpus = api.load('word2vec-google-news-300')  # download the corpus and return it opened as an iterable

In [None]:
model_google = corpus
model_google.vector_size

In [None]:
model_google.most_similar("trump")

In [None]:
title_word2vec

In [None]:
#putting all the words in the title together to train with our preload model
x = [row[n] for row in title_word2vec for n in range(len(row))]
len(x)

In [None]:
# Filter the list of vectors to include only those that Word2Vec has a vector for
vector_list = [model_google[word] for word in x if word in model_google.vocab]

# Create a list of the words corresponding to these vectors
words_filtered = [word for word in x if word in model_google.vocab]

# Zip the words together with their vector representations
word_vec_zip = zip(words_filtered, vector_list)

# Cast to a dict so we can turn it into a DataFrame
word_vec_dict = dict(word_vec_zip)
df = pd.DataFrame.from_dict(word_vec_dict, orient='index')
df

In [None]:
model_title.wv.most_similar('trump')[:5]

In [None]:
model_text.wv.most_similar('trump')[:5]

In [None]:
pd_title_word2vec = pd.DataFrame(np.array(vector_title).T.tolist())

In [None]:
#def cosine_distance (model, word,target_list , num) :
    '''cosine_dict ={}
    word_list = []
    a = model[word]
    for item in target_list :
        if item != word :
            b = model [item]
            cos_sim = np.dot(a, b)/(norm(a)*norm(b))
            cosine_dict[item] = cos_sim
    dist_sort=sorted(cosine_dict.items(), key=lambda dist: dist[1],reverse = True) ## in Descedning order 
    for item in dist_sort:
        word_list.append((item[0], item[1]))
    return word_list[0:num]'''

# only get the unique Maker_Model
#Maker_Model = list(df.Maker_Model.unique()) 

# Show the most similar by cosine distance 
#cosine_distance (model,'trump',title,5)



# 3. FastText

In [60]:
import fasttext.util

In [61]:
#downloading model 
fasttext.util.download_model('en', if_exists='ignore')

'cc.en.300.bin'

In [62]:
model_ft = fasttext.load_model('cc.en.300.bin')



In [63]:
title_ft = df1['title']
text_ft = df1['text']

In [64]:
title_ft = title_ft.apply(lambda x: x.replace('\n',''))
text_ft = text_ft.apply(lambda x: x.replace('\n',''))

In [65]:
title_ft = title_ft.apply(lambda x: model_ft.get_sentence_vector(x)).T
text_ft = text_ft.apply(lambda x: model_ft.get_sentence_vector(x)).T

In [66]:
pd_title_ft = pd.DataFrame(np.array(title_ft).T.tolist())
pd_text_ft = pd.DataFrame(np.array(text_ft).T.tolist())

In [70]:
pd_text_ft

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
0,-0.012867,0.012430,0.008407,0.040035,-0.037501,-0.007560,0.005550,-0.010244,-0.007751,0.002132,...,0.024282,-0.005968,-0.054760,0.000304,-0.008345,0.001611,0.000055,0.076241,-0.001044,0.008919
1,-0.015467,0.005928,-0.002342,0.035837,-0.028701,-0.015839,0.003369,-0.005036,-0.006150,0.007371,...,0.019597,-0.005454,-0.051866,0.004816,-0.010408,-0.005244,0.002354,0.081248,0.007194,0.008962
2,-0.023951,0.002629,0.005680,0.037418,-0.039248,-0.014571,0.010038,-0.007462,-0.007391,-0.003181,...,0.025684,-0.005943,-0.050470,-0.002823,-0.006607,-0.001486,-0.000200,0.074815,0.004534,0.007072
3,-0.012166,0.003768,0.002058,0.031053,-0.024341,-0.003137,0.002987,-0.002819,-0.009796,-0.001152,...,-0.001622,0.002489,-0.045115,0.001569,-0.011371,-0.009202,-0.006907,0.076405,0.019867,0.005708
4,-0.003908,0.008755,-0.000954,0.036000,-0.010166,-0.001310,0.009808,0.000452,-0.013797,0.007584,...,0.007642,-0.000452,-0.040028,0.005083,-0.011074,0.003543,-0.011510,0.067126,0.002755,0.026403
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19343,-0.007817,0.005252,0.004396,0.035095,-0.037482,-0.013774,0.008956,-0.002838,-0.009536,0.016297,...,0.020365,-0.011106,-0.044656,0.005573,-0.006550,-0.006104,-0.005544,0.081941,0.004719,0.015054
19344,-0.019365,0.010262,0.002824,0.030501,-0.032252,0.004116,0.004063,-0.010744,-0.009511,0.007537,...,0.015969,-0.001155,-0.039115,0.001557,-0.017697,-0.014797,-0.004252,0.081399,-0.005050,0.000275
19345,-0.015881,0.011792,0.001355,0.034180,-0.022974,0.005721,0.010278,-0.011370,-0.002720,0.000054,...,0.019278,-0.002135,-0.051750,0.001448,-0.008973,-0.005062,-0.000945,0.062204,-0.001408,0.007093
19346,-0.018644,-0.004273,0.009485,0.028203,-0.023072,-0.005037,0.022662,0.002362,-0.015727,-0.001312,...,-0.002077,-0.009480,-0.034852,-0.005164,-0.008382,-0.008720,0.001735,0.068388,0.009567,0.004055


In [68]:
model_ft.get_analogies('trump', 'washington', 'obama')

[(0.5944359302520752, 'drumpf'),
 (0.5641477704048157, '0bama'),
 (0.5490846037864685, 'Obummer'),
 (0.548833966255188, 'Obams'),
 (0.5474653840065002, 'Obuma'),
 (0.5468364953994751, 'Drumpf'),
 (0.5445432066917419, 'Obama'),
 (0.5386767983436584, 'tRump'),
 (0.5359978079795837, 'trumps'),
 (0.5337789058685303, 'obamas')]

In [69]:
pd_title_ft.to_csv('./data/pd_title_ft.csv')

In [71]:
pd_text_ft.to_csv('./data/pd_text_ft.csv')

# 4. Doc2Vec

In [None]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

#tokenize and tag the card text
card_docs = [TaggedDocument(doc.split(' '), [i]) 
             for i, doc in enumerate(title_clean)]

#display the tagged docs
card_docs

### Resulting datasets for modeling 

In [122]:
#defining models
svc = LinearSVC() 
lr = LogisticRegression()
rf = RandomForestClassifier()
nb = MultinomialNB()

1. Processed with nltk and tfidf

In [102]:
title_spacy_nltk.head(1)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16121,16122,16123,16124,16125,16126,16127,16128,16129,16130
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [103]:
text_spacy_nltk.head(1)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,115899,115900,115901,115902,115903,115904,115905,115906,115907,115908
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
#making a first prediction with all of our models to see what text works best

In [114]:
X_nltk = text_spacy_nltk
y_nltk = df1['label']

In [115]:
X_train, X_test, y_train, y_test = train_test_split(X_nltk, y_nltk, test_size = 0.2, shuffle = True)
X_train.shape, X_test.shape

((15478, 115909), (3870, 115909))

### SVC 

In [182]:
svc.fit(X_train, np.ravel(y_train)).score(X_train, np.ravel(y_train))

0.9994185295257785

In [183]:
y_pred_svc = svc.predict(X_test)

In [184]:
print("Accuracy is:", metrics.accuracy_score(y_test, y_pred_svc))
print("Mean Squared Error is:", np.sqrt(mean_squared_error(y_test, y_pred_svc)))
print(metrics.classification_report(y_test, y_pred_svc))

Accuracy is: 0.9666666666666667
Mean Squared Error is: 0.18257418583505536
              precision    recall  f1-score   support

           0       0.97      0.97      0.97      2074
           1       0.97      0.96      0.96      1796

    accuracy                           0.97      3870
   macro avg       0.97      0.97      0.97      3870
weighted avg       0.97      0.97      0.97      3870



### Logistic Regression

In [191]:
lr.fit(X_train, np.ravel(y_train)).score(X_train, np.ravel(y_train))

0.9772580436748934

In [192]:
y_pred_lr = lr.predict(X_test)

In [193]:
print("Accuracy is:", metrics.accuracy_score(y_test, y_pred_lr))
print("Mean Squared Error is:", np.sqrt(mean_squared_error(y_test, y_pred_lr)))
print(metrics.classification_report(y_test, y_pred_lr))

Accuracy is: 0.9555555555555556
Mean Squared Error is: 0.21081851067789195
              precision    recall  f1-score   support

           0       0.95      0.96      0.96      2074
           1       0.96      0.94      0.95      1796

    accuracy                           0.96      3870
   macro avg       0.96      0.95      0.96      3870
weighted avg       0.96      0.96      0.96      3870



### Naive Bayes

In [194]:
nb.fit(X_train, np.ravel(y_train)).score(X_train, np.ravel(y_train))

0.892492570099496

In [195]:
y_pred_nb = nb.predict(X_test)

In [196]:
print("Accuracy is:", metrics.accuracy_score(y_test, y_pred_nb))
print("Mean Squared Error is:", np.sqrt(mean_squared_error(y_test, y_pred_nb)))
print(metrics.classification_report(y_test, y_pred_nb))

Accuracy is: 0.869250645994832
Mean Squared Error is: 0.361592801373545
              precision    recall  f1-score   support

           0       0.81      0.99      0.89      2074
           1       0.99      0.73      0.84      1796

    accuracy                           0.87      3870
   macro avg       0.90      0.86      0.86      3870
weighted avg       0.89      0.87      0.87      3870



### Random Forest 

In [197]:
rf.fit(X_train, np.ravel(y_train)).score(X_train, np.ravel(y_train))

0.9430158935262953

In [198]:
y_pred_rf = rf.predict(X_test)

In [199]:
print("Accuracy is:", metrics.accuracy_score(y_test, y_pred_rf))
print("Mean Squared Error is:", np.sqrt(mean_squared_error(y_test, y_pred_rf)))
print(metrics.classification_report(y_test, y_pred_rf))

Accuracy is: 0.8958656330749354
Mean Squared Error is: 0.3226985697598683
              precision    recall  f1-score   support

           0       0.87      0.94      0.91      2074
           1       0.93      0.84      0.88      1796

    accuracy                           0.90      3870
   macro avg       0.90      0.89      0.89      3870
weighted avg       0.90      0.90      0.90      3870



2. Processed with Spacy and tfidf

In [105]:
pd_text_spacy.head(1)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,138519,138520,138521,138522,138523,138524,138525,138526,138527,138528
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [104]:
pd_title_spacy.head(1)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16805,16806,16807,16808,16809,16810,16811,16812,16813,16814
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [110]:
#making a first prediction with all of our models to see what text works best

In [118]:
X_spacy = pd_text_spacy
y_spacy = df1['label']

In [119]:
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(X_spacy, y_spacy, test_size = 0.2, shuffle = True)
X_train_s.shape, X_test_s.shape

((15478, 138529), (3870, 138529))

### SVC 

In [200]:
svc.fit(X_train_s, np.ravel(y_train_s)).score(X_train_s, np.ravel(y_train_s))

0.9998061765085928

In [201]:
y_pred_svc_s = svc.predict(X_test_s)

In [202]:
print("Accuracy is:", metrics.accuracy_score(y_test_s, y_pred_svc_s))
print("Mean Squared Error is:", np.sqrt(mean_squared_error(y_test_s, y_pred_svc_s)))
print(metrics.classification_report(y_test_s, y_pred_svc_s))

Accuracy is: 0.9710594315245478
Mean Squared Error is: 0.1701192772011808
              precision    recall  f1-score   support

           0       0.98      0.97      0.97      2121
           1       0.97      0.97      0.97      1749

    accuracy                           0.97      3870
   macro avg       0.97      0.97      0.97      3870
weighted avg       0.97      0.97      0.97      3870



### Logistic Regression

In [203]:
lr.fit(X_train_s, np.ravel(y_train_s)).score(X_train_s, np.ravel(y_train_s))

0.9835250032303915

In [204]:
y_pred_lr_s = lr.predict(X_test_s)

In [205]:
print("Accuracy is:", metrics.accuracy_score(y_test_s, y_pred_lr_s))
print("Mean Squared Error is:", np.sqrt(mean_squared_error(y_test_s, y_pred_lr_s)))
print(metrics.classification_report(y_test_s, y_pred_lr_s))

Accuracy is: 0.9627906976744186
Mean Squared Error is: 0.19289712886816485
              precision    recall  f1-score   support

           0       0.97      0.96      0.97      2121
           1       0.95      0.96      0.96      1749

    accuracy                           0.96      3870
   macro avg       0.96      0.96      0.96      3870
weighted avg       0.96      0.96      0.96      3870



### Naive Bayes

In [206]:
nb.fit(X_train_s, np.ravel(y_train_s)).score(X_train_s, np.ravel(y_train_s))

0.9075461945987854

In [207]:
y_pred_nb_s = nb.predict(X_test_s)

In [208]:
print("Accuracy is:", metrics.accuracy_score(y_test_s, y_pred_nb_s))
print("Mean Squared Error is:", np.sqrt(mean_squared_error(y_test_s, y_pred_nb_s)))
print(metrics.classification_report(y_test_s, y_pred_nb_s))

Accuracy is: 0.8770025839793282
Mean Squared Error is: 0.3507098744270994
              precision    recall  f1-score   support

           0       0.82      1.00      0.90      2121
           1       0.99      0.73      0.84      1749

    accuracy                           0.88      3870
   macro avg       0.91      0.86      0.87      3870
weighted avg       0.90      0.88      0.87      3870



### Random Forest 

In [209]:
rf.fit(X_train_s, np.ravel(y_train_s)).score(X_train_s, np.ravel(y_train_s))

0.9583279493474609

In [210]:
y_pred_rf_s = rf.predict(X_test_s)

In [211]:
print("Accuracy is:", metrics.accuracy_score(y_test_s, y_pred_rf_s))
print("Mean Squared Error is:", np.sqrt(mean_squared_error(y_test_s, y_pred_rf_s)))
print(metrics.classification_report(y_test_s, y_pred_rf_s))

Accuracy is: 0.9219638242894057
Mean Squared Error is: 0.2793495582788602
              precision    recall  f1-score   support

           0       0.92      0.93      0.93      2121
           1       0.92      0.91      0.91      1749

    accuracy                           0.92      3870
   macro avg       0.92      0.92      0.92      3870
weighted avg       0.92      0.92      0.92      3870



In [212]:
rf.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': 12,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

3. Word2Vec transformation

In [None]:
#PENDING

4. Fasttext transformation

In [106]:
pd_title_ft.head(1)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
0,0.004179,0.027928,0.000111,0.019891,-0.044248,-0.004025,0.007626,-0.030489,-0.006896,0.030486,...,0.021898,-0.008868,-0.053589,-0.031112,0.009246,0.013437,0.015421,0.066602,0.003685,0.006341


In [107]:
pd_text_ft.head(1)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
0,-0.012867,0.01243,0.008407,0.040035,-0.037501,-0.00756,0.00555,-0.010244,-0.007751,0.002132,...,0.024282,-0.005968,-0.05476,0.000304,-0.008345,0.001611,5.5e-05,0.076241,-0.001044,0.008919


In [None]:
#making a first prediction with all of our models to see what text works best

In [116]:
X_ft = pd_text_ft
y_ft = df1['label']

In [117]:
X_train_ft, X_test_ft, y_train_ft, y_test_ft = train_test_split(X_ft, y_ft, test_size = 0.2, shuffle = True)
X_train_ft.shape, X_test_ft.shape

((15478, 300), (3870, 300))

### SVC 

In [213]:
svc.fit(X_train_ft, np.ravel(y_train_ft)).score(X_train_ft, np.ravel(y_train_ft))

0.919240211913684

In [214]:
y_pred_svc_ft = svc.predict(X_test_ft)

In [215]:
print("Accuracy is:", metrics.accuracy_score(y_test_ft, y_pred_svc_ft))
print("Mean Squared Error is:", np.sqrt(mean_squared_error(y_test_ft, y_pred_svc_ft)))
print(metrics.classification_report(y_test_ft, y_pred_svc_ft))

Accuracy is: 0.9183462532299742
Mean Squared Error is: 0.2857511973203714
              precision    recall  f1-score   support

           0       0.91      0.93      0.92      2053
           1       0.92      0.90      0.91      1817

    accuracy                           0.92      3870
   macro avg       0.92      0.92      0.92      3870
weighted avg       0.92      0.92      0.92      3870



### Logistic Regression

In [216]:
lr.fit(X_train_ft, np.ravel(y_train_ft)).score(X_train_ft, np.ravel(y_train_ft))

0.8605116940173149

In [217]:
y_pred_lr_ft = lr.predict(X_test_ft)

In [218]:
print("Accuracy is:", metrics.accuracy_score(y_test_ft, y_pred_lr_ft))
print("Mean Squared Error is:", np.sqrt(mean_squared_error(y_test_ft, y_pred_lr_ft)))
print(metrics.classification_report(y_test_ft, y_pred_lr_ft))

Accuracy is: 0.8576227390180878
Mean Squared Error is: 0.37732911494067367
              precision    recall  f1-score   support

           0       0.83      0.91      0.87      2053
           1       0.89      0.79      0.84      1817

    accuracy                           0.86      3870
   macro avg       0.86      0.85      0.86      3870
weighted avg       0.86      0.86      0.86      3870



### Naive Bayes

In [None]:
nb.fit(X_train_ft, np.ravel(y_train_ft)).score(X_train_ft, np.ravel(y_train_ft))

In [None]:
y_pred_nb = nb.predict(X_test_ft)

In [None]:
print("Accuracy is:", metrics.accuracy_score(y_test_ft, y_pred_nb))
print("Mean Squared Error is:", np.sqrt(mean_squared_error(y_test_ft, y_pred_nb)))
print(metrics.classification_report(y_test_ft, y_pred_nb))

### Random Forest 

In [219]:
rf.fit(X_train_ft, np.ravel(y_train_ft)).score(X_train_ft, np.ravel(y_train_ft))

0.9910841193952707

In [220]:
y_pred_rf_ft = rf.predict(X_test_ft)

In [221]:
print("Accuracy is:", metrics.accuracy_score(y_test_ft, y_pred_rf_ft))
print("Mean Squared Error is:", np.sqrt(mean_squared_error(y_test_ft, y_pred_rf_ft)))
print(metrics.classification_report(y_test_ft, y_pred_rf_ft))

Accuracy is: 0.8904392764857881
Mean Squared Error is: 0.3309995823474886
              precision    recall  f1-score   support

           0       0.88      0.91      0.90      2053
           1       0.90      0.86      0.88      1817

    accuracy                           0.89      3870
   macro avg       0.89      0.89      0.89      3870
weighted avg       0.89      0.89      0.89      3870



In [222]:
rf.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': 12,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

## Comparing first results to select best preprocessing data and models

The best results have been given by preprocessed data with spacy:

In [223]:
print(metrics.classification_report(y_test_s, y_pred_svc_s))
print(metrics.classification_report(y_test_s, y_pred_lr_s))
print(metrics.classification_report(y_test_s, y_pred_nb_s))
print(metrics.classification_report(y_test_s, y_pred_rf_s))

              precision    recall  f1-score   support

           0       0.98      0.97      0.97      2121
           1       0.97      0.97      0.97      1749

    accuracy                           0.97      3870
   macro avg       0.97      0.97      0.97      3870
weighted avg       0.97      0.97      0.97      3870

              precision    recall  f1-score   support

           0       0.97      0.96      0.97      2121
           1       0.95      0.96      0.96      1749

    accuracy                           0.96      3870
   macro avg       0.96      0.96      0.96      3870
weighted avg       0.96      0.96      0.96      3870

              precision    recall  f1-score   support

           0       0.82      1.00      0.90      2121
           1       0.99      0.73      0.84      1749

    accuracy                           0.88      3870
   macro avg       0.91      0.86      0.87      3870
weighted avg       0.90      0.88      0.87      3870

              preci