In [60]:
import pandas as pd

In [61]:
#importing the dataset as a corpus using the pandas library
data = pd.read_csv('data.csv')
data.head()

Unnamed: 0,URLs,Headline,Body,Label
0,http://www.bbc.com/news/world-us-canada-414191...,Four ways Bob Corker skewered Donald Trump,Image copyright Getty Images\nOn Sunday mornin...,1
1,https://www.reuters.com/article/us-filmfestiva...,Linklater's war veteran comedy speaks to moder...,"LONDON (Reuters) - “Last Flag Flying”, a comed...",1
2,https://www.nytimes.com/2017/10/09/us/politics...,Trump’s Fight With Corker Jeopardizes His Legi...,The feud broke into public view last week when...,1
3,https://www.reuters.com/article/us-mexico-oil-...,Egypt's Cheiron wins tie-up with Pemex for Mex...,MEXICO CITY (Reuters) - Egypt’s Cheiron Holdin...,1
4,http://www.cnn.com/videos/cnnmoney/2017/10/08/...,Jason Aldean opens 'SNL' with Vegas tribute,"Country singer Jason Aldean, who was performin...",1


In [62]:
#inspecting the data to see what it looks like
data['Body'][0]

'Image copyright Getty Images\nOn Sunday morning, Donald Trump went off on a Twitter tirade against a member of his own party.\nThis, in itself, isn\'t exactly huge news. It\'s far from the first time the president has turned his rhetorical cannons on his own ranks.\nThis time, however, his attacks were particularly biting and personal. He essentially called Tennessee Senator Bob Corker, the chair of the powerful Senate Foreign Relations Committee, a coward for not running for re-election.\nHe said Mr Corker "begged" for the president\'s endorsement, which he refused to give. He wrongly claimed that Mr Corker\'s support of the Iranian nuclear agreement was his only political accomplishment.\nUnlike some of his colleagues, Mr Corker - free from having to worry about his immediate political future - didn\'t hold his tongue.\nSkip Twitter post by @SenBobCorker It\'s a shame the White House has become an adult day care center. Someone obviously missed their shift this morning. — Senator Bo

In [63]:
data['Body'][:7]

0    Image copyright Getty Images\nOn Sunday mornin...
1    LONDON (Reuters) - “Last Flag Flying”, a comed...
2    The feud broke into public view last week when...
3    MEXICO CITY (Reuters) - Egypt’s Cheiron Holdin...
4    Country singer Jason Aldean, who was performin...
5    JetNation FanDuel League; Week 4\n% of readers...
6    In 2012, Kansas lawmakers, led by Gov. Sam Bro...
Name: Body, dtype: object

In [64]:
#Looking at the data, there are some missing columns, so let's take care of that
data.fillna('Article unavailable')

Unnamed: 0,URLs,Headline,Body,Label
0,http://www.bbc.com/news/world-us-canada-414191...,Four ways Bob Corker skewered Donald Trump,Image copyright Getty Images\nOn Sunday mornin...,1
1,https://www.reuters.com/article/us-filmfestiva...,Linklater's war veteran comedy speaks to moder...,"LONDON (Reuters) - “Last Flag Flying”, a comed...",1
2,https://www.nytimes.com/2017/10/09/us/politics...,Trump’s Fight With Corker Jeopardizes His Legi...,The feud broke into public view last week when...,1
3,https://www.reuters.com/article/us-mexico-oil-...,Egypt's Cheiron wins tie-up with Pemex for Mex...,MEXICO CITY (Reuters) - Egypt’s Cheiron Holdin...,1
4,http://www.cnn.com/videos/cnnmoney/2017/10/08/...,Jason Aldean opens 'SNL' with Vegas tribute,"Country singer Jason Aldean, who was performin...",1
...,...,...,...,...
4004,http://beforeitsnews.com/sports/2017/09/trends...,Trends to Watch,Trends to Watch\n% of readers think this story...,0
4005,http://beforeitsnews.com/u-s-politics/2017/10/...,Trump Jr. Is Soon To Give A 30-Minute Speech F...,Trump Jr. Is Soon To Give A 30-Minute Speech F...,0
4006,https://www.activistpost.com/2017/09/ron-paul-...,"Ron Paul on Trump, Anarchism & the AltRight",Article unavailable,0
4007,https://www.reuters.com/article/us-china-pharm...,China to accept overseas trial data in bid to ...,SHANGHAI (Reuters) - China said it plans to ac...,1


In [65]:
#data cleaning with text preprocessing techniques
#data cleaning first round
#using regular expressions and string to clean

import re
import string

In [66]:
#function for first round of data cleaning
def clean_text_round1(text):
    text = str(text).lower() #making all text lowercase
    text = re.sub('\[.*?\]', '', text) #removing full stops and question marks
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\w*\d\w*', '', text) #removing digits
    return text

round1 = lambda x: clean_text_round1(x)

In [67]:
#Let's take a look at the updated text
data_clean = pd.DataFrame(data.Body.apply(round1))
data_clean

Unnamed: 0,Body
0,image copyright getty images\non sunday mornin...
1,london reuters “last flag flying” a comedydra...
2,the feud broke into public view last week when...
3,mexico city reuters egypt’s cheiron holdings ...
4,country singer jason aldean who was performing...
...,...
4004,trends to watch\n of readers think this story ...
4005,trump jr is soon to give a speech for \n of r...
4006,
4007,shanghai reuters china said it plans to accep...


In [68]:
#let's apply a second round of cleaning because some nonsensical text was ignored in the first clean
def clean_text_round2(text):
    text = re.sub('[''""...]', '', text)
    text = re.sub('\n', '', text)
    return text

round2 = lambda x: clean_text_round2(x)

In [69]:
#let's take a look at the updated text again
data_clean = pd.DataFrame(data_clean.Body.apply(round2))
data_clean['Body'][0]

'image copyright getty imageson sunday morning donald trump went off on a twitter tirade against a member of his own partythis in itself isnt exactly huge news its far from the first time the president has turned his rhetorical cannons on his own ranksthis time however his attacks were particularly biting and personal he essentially called tennessee senator bob corker the chair of the powerful senate foreign relations committee a coward for not running for reelectionhe said mr corker begged for the presidents endorsement which he refused to give he wrongly claimed that mr corkers support of the iranian nuclear agreement was his only political accomplishmentunlike some of his colleagues mr corker  free from having to worry about his immediate political future  didnt hold his tongueskip twitter post by senbobcorker its a shame the white house has become an adult day care center someone obviously missed their shift this morning — senator bob corker senbobcorker october   reportthat wasnt 

In [70]:
#Concatenating our cleaned data to our corpus
data['clean_body'] = data_clean
data['clean_body'][0]

'image copyright getty imageson sunday morning donald trump went off on a twitter tirade against a member of his own partythis in itself isnt exactly huge news its far from the first time the president has turned his rhetorical cannons on his own ranksthis time however his attacks were particularly biting and personal he essentially called tennessee senator bob corker the chair of the powerful senate foreign relations committee a coward for not running for reelectionhe said mr corker begged for the presidents endorsement which he refused to give he wrongly claimed that mr corkers support of the iranian nuclear agreement was his only political accomplishmentunlike some of his colleagues mr corker  free from having to worry about his immediate political future  didnt hold his tongueskip twitter post by senbobcorker its a shame the white house has become an adult day care center someone obviously missed their shift this morning — senator bob corker senbobcorker october   reportthat wasnt 

In [71]:
#Extract features and target variables
import numpy as np

X = np.array(data['clean_body'], data['URLs'])     #feature variables
y = np.array(data['Label'])                          #target variable

y = list(map(int, y))

In [72]:
#Split the data into folds
from sklearn.model_selection import StratifiedKFold

kf = StratifiedKFold(n_splits = 2)
kf.get_n_splits(X)

2

In [73]:
#Split the data into train and test
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33)

In [74]:
#create a document-term matrix for the train and test data using tfidf vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

tf = TfidfVectorizer(stop_words = 'english', max_df = 0.7)        #removing all Englilsh stop words
tf_train = tf.fit_transform(X_train)
tf_test = tf.transform(X_test)

In [75]:
#Now we feed our data into our classifiers to develop our model.
#First we try the Naive Bayes classifier
from sklearn.naive_bayes import MultinomialNB

nb = MultinomialNB()

#training the model
nb.fit(tf_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [76]:
#predicting
nb_pred = nb.predict(tf_test)
nb_pred[0:10]

array([0, 0, 0, 1, 0, 0, 0, 1, 0, 1])

In [77]:
#Evaluating the accuracy of the model
nb_score = nb.score(tf_test, y_test)
print('accuracy: %0.3f' % nb_score)

accuracy: 0.943


In [78]:
#Next let's build another model using logistic Regression
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()

#training the model
lr.fit(tf_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [79]:
#predicting
lr_pred = lr.predict(tf_test)
lr_pred[0:10]

array([0, 0, 0, 1, 0, 1, 0, 1, 0, 1])

In [80]:
#Evaluating the accuracy of the logistic regression model
lr_score = lr.score(tf_test, y_test)
print('accuracy: %0.3f' % lr_score)

accuracy: 0.966


In [81]:
#Using the random forest classifier
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()

#training the model
rf.fit(tf_train, y_train)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [82]:
#predicting
rf_pred = rf.predict(tf_test)
rf_pred[0:10]

array([0, 0, 0, 1, 0, 1, 1, 1, 1, 1])

In [83]:
#Evaluating the accuracy of the model
rf_score = rf.score(tf_test, y_test)
print('accuracy: %0.3f' % rf_score)

accuracy: 0.941


In [84]:
# We are going to create document-term matrix using the tfidf vectorizer
#from sklearn.feature_extraction.text import TfidfVectorizer

#tf = TfidfVectorizer(stop_words = 'english', max_df = 0.7) #removing all english stop words
#data_tf = tf.fit_transform(data_clean.Body)
#data_dtm = pd.DataFrame(data_tf.toarray(), columns = tf.get_feature_names())
#data_dtm.index = data_clean.index
#data_dtm

In [85]:
#let's pickle the data_dtm for future use
#data_dtm.to_pickle("dtm.pkl")

In [86]:
#let's pickle the cleaned data for future use
import pickle

data_clean.to_pickle('data_clean.pkl')
pickle.dump(tf, open('cv.pkl', 'wb'))

In [87]:
#Since the logistic regression model is the most accurate classifier, lets save it and test it with 
#other news articles

#saving the model

with open ('log_model', 'wb') as f:
    pickle.dump(lr, f)

In [88]:
#importing the model and testing

with open ('log_model', 'rb') as f:
    lr_model = pickle.load(f)

In [89]:
lr_model.predict(tf_test)

array([0, 0, 0, ..., 0, 1, 1])

In [97]:
#A function that converts articles into dtm and parses into the model

#def words_to_dtm(article):
#    tfa = TfidfVectorizer(stop_words = 'english', max_df = 0.5)
#    tfa_train = tfa.fit_transform(article)
#    tfa_test = tfa.transform(article)
    
#    return tfa_test

In [98]:
text = """"LONDON (Reuters) - “Last Flag Flying”, a comedy-drama about Vietnam war veterans, will resonate with Trump’s America, despite, or perhaps because of, its period setting, actor Bryan Cranston said on Sunday after a screening at the London Film Festival.
Set in the United States in December 2003 – when U.S. forces in Iraq were dragging Saddam Hussein out of a “spider hole” - it is the story of three ageing former servicemen who reunite to bury the son of one of them who has been killed in action.
With President Donald Trump saying he could “totally destroy” North Korea and characterizing a dinner with military commanders as “the calm before the storm”, Cranston said “Last Flag Flying” was a timely reminder of the effect on normal Americans of ill-advised military campaigns.
“I think it has a lot of relevance today in the sense that (today) it’s not clear cut as far as the (what are the) intentions of the government or military,” Cranston, acclaimed for his lead role in the TV drama “Breaking Bad”, told Reuters.
“In World War Two, it was the ‘good war’, it was clear and present danger, we had to stop this mad man. Since then, with Vietnam and Iraq, (there are) a lot of questions ... among the troops and the citizens as to if we are doing the right thing and what is the purpose of our being there.”
Cast member Bryan Cranston (R) and director Richard Linklater pose with festival director Claire Stewart as they arrive for the premiere of "Last Flag Flying" during the British Film Institute (BFI) London Film Festival at the Odeon, Leicester Square, in London, Britain October 8, 2017. REUTERS/Afolabi Sotunde
“Last Flag Flying” was produced by Amazon Studios and directed and co-written by Richard Linklater, whose greatest critical acclaim has been for the naturalistic “Before Sunset” trilogy and the 2014 “Boyhood” which won a slew of Oscar nominations.
Linklater also made comedies including “School of Rock” and “Everybody Wants Some!!”, about skirt-chasing undergraduates. “Last Flag Flying” falls somewhere between the two genres.
Cast member Bryan Cranston (R) poses with director Richard Linklater as they arrive for the premiere of "Last Flag Flying" during the British Film Institute (BFI) London Film Festival at the Odeon, Leicester Square, in London, Britain October 8, 2017. REUTERS/Afolabi Sotunde
The drama and comedy stem from the chemistry between the three leads, each played by a big Hollywood name.
Steve Carell is the awkward shy one who, we assume, was quiet and withdrawn even before the loss of his son. Cranston plays a foul-mouthed, hard-drinking bar owner who is his own best customer, and Laurence Fishburne, is a man who has found God and become an evangelical preacher, preferring to forget the sex and drugs they all indulged in back in ‘Nam.
Vanity Fair’s Richard Lawson said the film’s ability to honor the footsoldiers while being critical of the wars they are sent to fight, could hit “an Academy sweet spot, satisfying both the more conservative oldsters and the younger, leftier types.”
Other critics said “Last Flag Flying” lacked the light touch of Linklater’s best work. The Guardian’s Benjamin Lee called it “a half-baked TV movie masquerading as Oscarbait, a curious misstep for the Oscar-nominated indie auteur”.
Writing by Robin Pomeroy, editing by David Evans"""

In [99]:
#words_to_dtm([text])

In [None]:
#lr_model.predict(words_to_dtm([text]))