In [1]:
import pandas as pd
import numpy as np
import pickle
import nltk
import string
import re
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem import PorterStemmer

In [2]:
dataFrame = pd.read_csv("/Users/Oyewole Salami/Documents/WarPeaceModel/War_Peace.csv", sep =",")
df = dataFrame.sample(frac = 1)
labelEn = LabelEncoder()
df['Class'] = labelEn.fit_transform(df['Class'])
df.describe()

Unnamed: 0,Class
count,3061.0
mean,0.470434
std,0.499207
min,0.0
25%,0.0
50%,0.0
75%,1.0
max,1.0


In [3]:
df

Unnamed: 0,Headline,Class
1964,"With Yemen boiling over to the south, ISIL thr...",1
1076,JUST IN: President Trump warns Russia it shoul...,0
468,"More than 470 people, including 150 children, ...",0
1822,Prince Ahmed bin Abdulaziz is considering not ...,1
1133,A FBI translator travelled to Syria and secret...,0
...,...,...
999,One shell in Syria can change a life forever. ...,0
899,"""With the US we have an extradition agreement....",0
1403,Ahmad Khalil knew that his family's days in Sy...,0
889,The summit brought together top lawmakers of I...,0


## Where 0 == Peace and 1 == War

In [4]:
#Number of Peace Related News
Peace = df.loc[df['Class'] == 0 ].count()
Peace

Headline    1621
Class       1621
dtype: int64

In [5]:
#Number of War Related News
War = df.loc[df['Class'] == 1 ].count()
War

Headline    1440
Class       1440
dtype: int64

In [6]:
#Missing Values
df['Class'].isnull()

1964    False
1076    False
468     False
1822    False
1133    False
        ...  
999     False
899     False
1403    False
889     False
148     False
Name: Class, Length: 3061, dtype: bool

## Preprocessing

In [7]:
#Punctuations
def removePunc(news):
    newsWithoutPunc = "".join([charac for charac in news if charac not in string.punctuation])
    return newsWithoutPunc

In [18]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [8]:
df['Headline'] = df['Headline'].apply(lambda news: removePunc(news))
df

Unnamed: 0,Headline,Class
1964,With Yemen boiling over to the south ISIL thre...,1
1076,JUST IN President Trump warns Russia it should...,0
468,More than 470 people including 150 children ha...,0
1822,Prince Ahmed bin Abdulaziz is considering not ...,1
1133,A FBI translator travelled to Syria and secret...,0
...,...,...
999,One shell in Syria can change a life forever M...,0
899,With the US we have an extradition agreement T...,0
1403,Ahmad Khalil knew that his familys days in Syr...,0
889,The summit brought together top lawmakers of I...,0


In [9]:
#Tokenization
def tokenize(news):
    tokens = re.split('\W+', news)
    return tokens

df['tokenizedHeadline'] = df['Headline'].apply(lambda words: tokenize(words.lower()))
df

Unnamed: 0,Headline,Class,tokenizedHeadline
1964,With Yemen boiling over to the south ISIL thre...,1,"[with, yemen, boiling, over, to, the, south, i..."
1076,JUST IN President Trump warns Russia it should...,0,"[just, in, president, trump, warns, russia, it..."
468,More than 470 people including 150 children ha...,0,"[more, than, 470, people, including, 150, chil..."
1822,Prince Ahmed bin Abdulaziz is considering not ...,1,"[prince, ahmed, bin, abdulaziz, is, considerin..."
1133,A FBI translator travelled to Syria and secret...,0,"[a, fbi, translator, travelled, to, syria, and..."
...,...,...,...
999,One shell in Syria can change a life forever M...,0,"[one, shell, in, syria, can, change, a, life, ..."
899,With the US we have an extradition agreement T...,0,"[with, the, us, we, have, an, extradition, agr..."
1403,Ahmad Khalil knew that his familys days in Syr...,0,"[ahmad, khalil, knew, that, his, familys, days..."
889,The summit brought together top lawmakers of I...,0,"[the, summit, brought, together, top, lawmaker..."


In [10]:
#Removing Stopwords
stopwords = nltk.corpus.stopwords.words('english')
def removeStopwords(tokenizedHeadline):
    cleanedNews = [word for word in tokenizedHeadline if word not in stopwords]
    return cleanedNews
df['refinedTokens'] = df['tokenizedHeadline'].apply(lambda news: removeStopwords(news))
df

Unnamed: 0,Headline,Class,tokenizedHeadline,refinedTokens
1964,With Yemen boiling over to the south ISIL thre...,1,"[with, yemen, boiling, over, to, the, south, i...","[yemen, boiling, south, isil, threatening, nor..."
1076,JUST IN President Trump warns Russia it should...,0,"[just, in, president, trump, warns, russia, it...","[president, trump, warns, russia, get, ready, ..."
468,More than 470 people including 150 children ha...,0,"[more, than, 470, people, including, 150, chil...","[470, people, including, 150, children, killed..."
1822,Prince Ahmed bin Abdulaziz is considering not ...,1,"[prince, ahmed, bin, abdulaziz, is, considerin...","[prince, ahmed, bin, abdulaziz, considering, r..."
1133,A FBI translator travelled to Syria and secret...,0,"[a, fbi, translator, travelled, to, syria, and...","[fbi, translator, travelled, syria, secretly, ..."
...,...,...,...,...
999,One shell in Syria can change a life forever M...,0,"[one, shell, in, syria, can, change, a, life, ...","[one, shell, syria, change, life, forever, mee..."
899,With the US we have an extradition agreement T...,0,"[with, the, us, we, have, an, extradition, agr...","[us, extradition, agreement, us, hand, man, us]"
1403,Ahmad Khalil knew that his familys days in Syr...,0,"[ahmad, khalil, knew, that, his, familys, days...","[ahmad, khalil, knew, familys, days, syria, nu..."
889,The summit brought together top lawmakers of I...,0,"[the, summit, brought, together, top, lawmaker...","[summit, brought, together, top, lawmakers, ir..."


In [11]:
#Stemming
#def stemmer(refinedTokens):
#    ps = PorterStemmer()
#    stemmedWord = [ps.stem(word) for word in refinedTokens]
#    return stemmedWord
#df['stemmedTokens'] = df['refinedTokens'].apply(lambda word: stemmer(word))
#df

In [24]:
#Lemmatization
def lemmatizer(refinedTokens):
    wn = nltk.WordNetLemmatizer()
    lemmatizedWords =  [wn.lemmatize(word) for word in refinedTokens]
    return lemmatizedWords
df['lemmatizedWords'] = df['refinedTokens'].apply(lambda word: lemmatizer(word))
df

Unnamed: 0,Headline,Class,tokenizedHeadline,refinedTokens,lemmatizedWords
1964,With Yemen boiling over to the south ISIL thre...,1,"[with, yemen, boiling, over, to, the, south, i...","[yemen, boiling, south, isil, threatening, nor...","[yemen, boiling, south, isil, threatening, nor..."
1076,JUST IN President Trump warns Russia it should...,0,"[just, in, president, trump, warns, russia, it...","[president, trump, warns, russia, get, ready, ...","[president, trump, warns, russia, get, ready, ..."
468,More than 470 people including 150 children ha...,0,"[more, than, 470, people, including, 150, chil...","[470, people, including, 150, children, killed...","[470, people, including, 150, child, killed, s..."
1822,Prince Ahmed bin Abdulaziz is considering not ...,1,"[prince, ahmed, bin, abdulaziz, is, considerin...","[prince, ahmed, bin, abdulaziz, considering, r...","[prince, ahmed, bin, abdulaziz, considering, r..."
1133,A FBI translator travelled to Syria and secret...,0,"[a, fbi, translator, travelled, to, syria, and...","[fbi, translator, travelled, syria, secretly, ...","[fbi, translator, travelled, syria, secretly, ..."
...,...,...,...,...,...
999,One shell in Syria can change a life forever M...,0,"[one, shell, in, syria, can, change, a, life, ...","[one, shell, syria, change, life, forever, mee...","[one, shell, syria, change, life, forever, mee..."
899,With the US we have an extradition agreement T...,0,"[with, the, us, we, have, an, extradition, agr...","[us, extradition, agreement, us, hand, man, us]","[u, extradition, agreement, u, hand, man, u]"
1403,Ahmad Khalil knew that his familys days in Syr...,0,"[ahmad, khalil, knew, that, his, familys, days...","[ahmad, khalil, knew, familys, days, syria, nu...","[ahmad, khalil, knew, family, day, syria, numb..."
889,The summit brought together top lawmakers of I...,0,"[the, summit, brought, together, top, lawmaker...","[summit, brought, together, top, lawmakers, ir...","[summit, brought, together, top, lawmaker, ira..."


In [13]:
#Vectorization
cv = CountVectorizer(analyzer=lemmatizer)
vectors = cv.fit_transform(df['lemmatizedWords']).toarray
print(vectors)

<bound method _cs_matrix.toarray of <3061x8379 sparse matrix of type '<class 'numpy.int64'>'
	with 46905 stored elements in Compressed Sparse Row format>>


In [14]:
cv.get_feature_names()

['',
 '0100',
 '0330',
 '1',
 '10',
 '100',
 '1000',
 '10000',
 '100000',
 '1001st',
 '101',
 '103',
 '105',
 '106',
 '107',
 '10dayold',
 '10kg',
 '10km',
 '10point',
 '10year',
 '10yearold',
 '11',
 '110',
 '11000',
 '110bn',
 '1125',
 '1130',
 '115',
 '115yearold',
 '116000',
 '118',
 '11gmt',
 '11minute',
 '11th',
 '11yearold',
 '12',
 '120',
 '12000',
 '120000',
 '120day',
 '121',
 '123',
 '12600',
 '1288',
 '129',
 '12th',
 '12yearold',
 '13',
 '130',
 '1300',
 '138',
 '13th',
 '13yearold',
 '14',
 '140',
 '1400',
 '14000',
 '145',
 '14500',
 '146',
 '149',
 '14yearold',
 '15',
 '150',
 '1500',
 '150000',
 '1500km',
 '15km',
 '15yearold',
 '16',
 '1600',
 '160000',
 '1630',
 '164000',
 '16yearold',
 '17',
 '170',
 '1700',
 '170000',
 '1730',
 '175000',
 '1787',
 '17yearold',
 '17yo',
 '18',
 '1800',
 '18000',
 '182',
 '189',
 '1892',
 '18bn',
 '18th',
 '19',
 '19000',
 '19200',
 '1922',
 '1947',
 '1948',
 '1965',
 '1967',
 '1970s',
 '1971',
 '1973',
 '19751990',
 '1976',
 '1982',