In [1]:
import pandas as pd
import numpy as np

In [12]:
# Importing libraries and packages for the preprocessing

from nltk.stem import WordNetLemmatizer #normalization library
from nltk.tokenize import RegexpTokenizer # tokenizer library
from nltk.corpus import stopwords #stopwords 
from sklearn.feature_extraction.text import TfidfVectorizer #tfid matrix

In [2]:
train_news = pd.read_csv('data/train_news.csv', index_col = False )

In [3]:
news = pd.read_csv('data/News.csv', index_col = False)

In [4]:
train_news.head()

Unnamed: 0.1,Unnamed: 0,author,mergedtext,label
0,0,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,Daniel J. Flynn,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",0
2,2,Consortiumnews.com,Why the Truth Might Get You FiredWhy the Truth...,1
3,3,Jessica Purkiss,15 Civilians Killed In Single US Airstrike Hav...,1
4,4,Howard Portnoy,Iranian woman jailed for fictional unpublished...,1


In [9]:
train_news = train_news.drop(['Unnamed: 0','author'], axis=1).copy()

In [10]:
train_news

Unnamed: 0,mergedtext,label
0,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",0
2,Why the Truth Might Get You FiredWhy the Truth...,1
3,15 Civilians Killed In Single US Airstrike Hav...,1
4,Iranian woman jailed for fictional unpublished...,1
...,...,...
20351,Rapper T.I.: Trump a ’Poster Child For White S...,0
20352,"N.F.L. Playoffs: Schedule, Matchups and Odds -...",0
20353,Macy’s Is Said to Receive Takeover Approach by...,0
20354,"NATO, Russia To Hold Parallel Exercises In Bal...",1


In [5]:
news.head()

Unnamed: 0.1,Unnamed: 0,title,text,subject,date,Type
0,0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,2017-12-31,1
1,1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,2017-12-31,1
2,2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,2017-12-30,1
3,3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,2017-12-29,1
4,4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,2017-12-25,1


In [6]:
news['mergedtext'] = news['title'] + news['text']
news['label'] = news['Type']

In [7]:
news = news.drop(['date','subject','title','text'], axis=1).copy()
news = news[['mergedtext','Type']]

In [19]:
news

Unnamed: 0,mergedtext,Type
0,Donald Trump Sends Out Embarrassing New Year’...,1
1,Drunk Bragging Trump Staffer Started Russian ...,1
2,Sheriff David Clarke Becomes An Internet Joke...,1
3,Trump Is So Obsessed He Even Has Obama’s Name...,1
4,Pope Francis Just Called Out Donald Trump Dur...,1
...,...,...
44883,'Fully committed' NATO backs new U.S. approach...,0
44884,LexisNexis withdrew two products from Chinese ...,0
44885,Minsk cultural hub becomes haven from authorit...,0
44886,Vatican upbeat on possibility of Pope Francis ...,0


# Preprocessing our data
Since our data is all text, we have to convert it to numbers so we can then try different classification models. 

To do so, we are going to tokenize, normalize, remove stop words and then convert it to a TF-IDF matrix. First, we divide our data in target and data.

## Tokenize our data and preprocess it using nltk

Chopping a character sequence into pieces and throwing away commas, punctuation marks,etc.

In [None]:
X.head(2)

In [None]:
tokeniser = RegexpTokenizer(r'\w+') #find all the word characters
tokens = tokeniser.tokenize(text)

# Preprocessing data with Spacy

In [None]:
#installing the library

In [15]:
pip install spacy

Collecting spacy
  Downloading spacy-2.3.4-cp38-cp38-macosx_10_9_x86_64.whl (10.2 MB)
[K     |████████████████████████████████| 10.2 MB 3.1 MB/s eta 0:00:01
[?25hCollecting preshed<3.1.0,>=3.0.2
  Downloading preshed-3.0.4-cp38-cp38-macosx_10_9_x86_64.whl (263 kB)
[K     |████████████████████████████████| 263 kB 56.5 MB/s eta 0:00:01
[?25hCollecting cymem<2.1.0,>=2.0.2
  Downloading cymem-2.0.4-cp38-cp38-macosx_10_9_x86_64.whl (31 kB)
Collecting murmurhash<1.1.0,>=0.28.0
  Downloading murmurhash-1.0.4-cp38-cp38-macosx_10_9_x86_64.whl (18 kB)
Collecting blis<0.8.0,>=0.4.0; python_version >= "3.6"
  Downloading blis-0.7.3-cp38-cp38-macosx_10_9_x86_64.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 49.2 MB/s eta 0:00:01
[?25hCollecting thinc<7.5.0,>=7.4.1
  Downloading thinc-7.4.3-cp38-cp38-macosx_10_9_x86_64.whl (981 kB)
[K     |████████████████████████████████| 981 kB 50.2 MB/s eta 0:00:01
[?25hCollecting wasabi<1.1.0,>=0.4.0
  Downloading wasabi-0.8.0-py3-none-any

In [18]:
#importing library and model 
import spacy

spacy_model_name = 'en_core_web_sm'
if not spacy.util.is_package(spacy_model_name):
    spacy.cli.download(spacy_model_name)
nlp = spacy.load(spacy_model_name)

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')


### 1. Sentence Detection, Tokenization, Stop words, Lemmatization

In [46]:
#importing punctuation signs to remove them from our articles 
import string
punct = string.punctuation
punct

#importing stop words in english to remove them from our articles 
from spacy.lang.en.stop_words import STOP_WORDS
stopwords = list(STOP_WORDS)

In [47]:
def cleaning_news(article):
    doc = nlp(article)
    
    tokens = []
    for token in doc:
        if token.lemma_ != "-PRON-":
            temp = token.lemma_.lower().strip()
        else:
            temp = token.lower_
        tokens.append(temp)
    
    cleaned_tokens = []
    for token in tokens:
        if token not in stopwords and token not in punct:
            cleaned_tokens.append(token)
    return cleaned_tokens

In [60]:
test = train_news[:100]

In [63]:
test.mergedtext.apply(lambda x: cleaning_news(x))

## 2. Turning our articles into Vectors with the TF-IDF matrix

In [82]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

In [83]:
tfidf = TfidfVectorizer(tokenizer = cleaning_news)
classifier = LinearSVC()

In [68]:
test

Unnamed: 0,mergedtext,label
0,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",0
2,Why the Truth Might Get You FiredWhy the Truth...,1
3,15 Civilians Killed In Single US Airstrike Hav...,1
4,Iranian woman jailed for fictional unpublished...,1
...,...,...
95,White House Confirms More Gitmo Transfers Befo...,0
96,The Geometry of Energy and Meditation of Buddh...,1
97,Poll: Most Voters Have Not Heard of Democratic...,0
98,Migrants Confront Judgment Day Over Old Deport...,0


In [84]:
X = test['mergedtext']
y = test['label']

In [97]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [91]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, shuffle = True)

In [92]:
tfidf.fit_transform(X_train, y_train)

<80x8298 sparse matrix of type '<class 'numpy.float64'>'
	with 21847 stored elements in Compressed Sparse Row format>

In [94]:
clf = Pipeline([('tfidf', tfidf), ('clf', classifier)])

In [95]:
clf.fit(X_train, y_train)

Pipeline(steps=[('tfidf',
                 TfidfVectorizer(tokenizer=<function cleaning_news at 0x129f55550>)),
                ('clf', LinearSVC())])

In [96]:
y_pred = clf.predict(X_test)

In [98]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.71      1.00      0.83        10
           1       1.00      0.60      0.75        10

    accuracy                           0.80        20
   macro avg       0.86      0.80      0.79        20
weighted avg       0.86      0.80      0.79        20



In [99]:
X = train_news['mergedtext']
y = train_news['label']

In [104]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, shuffle = True, random_state = 24)

In [105]:
clf.fit(X_train, y_train)

Pipeline(steps=[('tfidf',
                 TfidfVectorizer(tokenizer=<function cleaning_news at 0x129f55550>)),
                ('clf', LinearSVC())])

In [106]:
y_pred = clf.predict(X_test)

In [107]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      0.97      0.97      2115
           1       0.97      0.98      0.97      1957

    accuracy                           0.97      4072
   macro avg       0.97      0.97      0.97      4072
weighted avg       0.97      0.97      0.97      4072



In [75]:
# Convert sparse matrix to dataframe
X_train = pd.DataFrame.sparse.from_spmatrix(X_train)

# Save mapping on which index refers to which terms
col_map = {v:k for k, v in tfidf.vocabulary_.items()}

# Rename each column using the mapping
for col in X_train.columns:
    X_train.rename(columns={col: col_map[col]}, inplace=True)
X_train

KeyboardInterrupt: 