Task fnc-1 - classifying using merged two tf-idf representations

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer 
from scipy.sparse import hstack
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import cross_val_score

In [2]:
train_bodies = pd.read_csv('./data/train_bodies.csv')
train_stancies = pd.read_csv('./data/train_stances.csv')
test_bodies = pd.read_csv('./data/competition_test_bodies.csv')
test_stancies = pd.read_csv('./data/competition_test_stances.csv')

Merge two datasets into one

In [3]:
train = pd.merge(left=train_bodies, right=train_stancies, left_on='Body ID', right_on='Body ID')

In [4]:
test = pd.merge(left=test_bodies, right=test_stancies, left_on='Body ID', right_on='Body ID')

In [5]:
train

Unnamed: 0,Body ID,articleBody,Headline,Stance
0,0,A small meteorite crashed into a wooded area i...,"Soldier shot, Parliament locked down after gun...",unrelated
1,0,A small meteorite crashed into a wooded area i...,Tourist dubbed ‘Spider Man’ after spider burro...,unrelated
2,0,A small meteorite crashed into a wooded area i...,Luke Somers 'killed in failed rescue attempt i...,unrelated
3,0,A small meteorite crashed into a wooded area i...,BREAKING: Soldier shot at War Memorial in Ottawa,unrelated
4,0,A small meteorite crashed into a wooded area i...,Giant 8ft 9in catfish weighing 19 stone caught...,unrelated
...,...,...,...,...
49967,2532,"ANN ARBOR, Mich. – A pizza delivery man in Mic...","Pizza delivery man gets tipped more than $2,00...",agree
49968,2532,"ANN ARBOR, Mich. – A pizza delivery man in Mic...","Pizza delivery man gets $2,000 tip",agree
49969,2532,"ANN ARBOR, Mich. – A pizza delivery man in Mic...","Luckiest Pizza Delivery Guy Ever Gets $2,000 Tip",agree
49970,2532,"ANN ARBOR, Mich. – A pizza delivery man in Mic...",Ann Arbor pizza delivery driver surprised with...,agree


In [6]:
train = train.drop_duplicates()
train

Unnamed: 0,Body ID,articleBody,Headline,Stance
0,0,A small meteorite crashed into a wooded area i...,"Soldier shot, Parliament locked down after gun...",unrelated
1,0,A small meteorite crashed into a wooded area i...,Tourist dubbed ‘Spider Man’ after spider burro...,unrelated
2,0,A small meteorite crashed into a wooded area i...,Luke Somers 'killed in failed rescue attempt i...,unrelated
3,0,A small meteorite crashed into a wooded area i...,BREAKING: Soldier shot at War Memorial in Ottawa,unrelated
4,0,A small meteorite crashed into a wooded area i...,Giant 8ft 9in catfish weighing 19 stone caught...,unrelated
...,...,...,...,...
49966,2532,"ANN ARBOR, Mich. – A pizza delivery man in Mic...","Pizza delivery driver surprised with $2,000 tip",agree
49967,2532,"ANN ARBOR, Mich. – A pizza delivery man in Mic...","Pizza delivery man gets tipped more than $2,00...",agree
49968,2532,"ANN ARBOR, Mich. – A pizza delivery man in Mic...","Pizza delivery man gets $2,000 tip",agree
49969,2532,"ANN ARBOR, Mich. – A pizza delivery man in Mic...","Luckiest Pizza Delivery Guy Ever Gets $2,000 Tip",agree


In [7]:
test

Unnamed: 0,Body ID,articleBody,Headline,Stance
0,1,Al-Sisi has denied Israeli reports stating tha...,Apple installing safes in-store to protect gol...,unrelated
1,1,Al-Sisi has denied Israeli reports stating tha...,El-Sisi denies claims he'll give Sinai land to...,agree
2,1,Al-Sisi has denied Israeli reports stating tha...,Apple to keep gold Watch Editions in special i...,unrelated
3,1,Al-Sisi has denied Israeli reports stating tha...,Apple Stores to Keep Gold “Edition” Apple Watc...,unrelated
4,1,Al-Sisi has denied Israeli reports stating tha...,South Korean woman's hair 'eaten' by robot vac...,unrelated
...,...,...,...,...
25408,2586,Remember how much Republicans wanted to repeal...,A Sign That Obamacare Exchanges Are Failing,disagree
25409,2586,Remember how much Republicans wanted to repeal...,Republicans call Obamacare a 'failure.' These ...,agree
25410,2586,Remember how much Republicans wanted to repeal...,CBO’s Alternate Facts Show Obamacare is Unsust...,disagree
25411,2586,Remember how much Republicans wanted to repeal...,Why Obamacare failed,disagree


In [8]:
encoder = LabelEncoder().fit(train['Stance'])

In [9]:
class LemmaTokenizer:
    def __init__(self):
        self.wnl = WordNetLemmatizer()
        
    def __call__(self, text):
        return [self.wnl.lemmatize(token) for token in word_tokenize(text)]

In [10]:
X_train_headline, X_train_text, y_train = train['Headline'], train['articleBody'], encoder.transform(train['Stance'])

In [11]:
X_test_headline, X_test_text, y_test = test['Headline'], test['articleBody'], encoder.transform(test['Stance'])

In [14]:
headline_vec = CountVectorizer(tokenizer=LemmaTokenizer()).fit(X_train_headline)
text_vec = CountVectorizer(tokenizer=LemmaTokenizer()).fit(X_train_text)

In [15]:
X_train_headline_counts = headline_vec.transform(X_train_headline)
X_train_text_counts = text_vec.transform(X_train_text)

In [18]:
X_test_headline_counts = headline_vec.transform(X_test_headline)
X_test_text_counts = text_vec.transform(X_test_text)

Merge two count representations

In [19]:
X_train_counts = hstack([X_train_headline_counts, X_train_text_counts])

In [20]:
X_test_counts = hstack([X_test_headline_counts, X_test_text_counts])

Select best attributes for classifiers - chi2 with pvalue < 0.05

In [21]:
kbest = SelectKBest(chi2, k='all').fit(X_train_counts, y_train)
print(kbest.scores_)
print(kbest.pvalues_)

print(kbest.pvalues_ < 0.05)
print(np.count_nonzero(kbest.pvalues_ < 0.05))

[  6.34525664  19.97268419  64.05314582 ... 113.10775588  50.46529501
  50.46529501]
[9.59690916e-02 1.71969408e-04 7.99603237e-14 ... 2.35200975e-24
 6.35916907e-11 6.35916907e-11]
[False  True  True ...  True  True  True]
22062


In [23]:
kbest = SelectKBest(chi2, k=22062).fit(X_train_counts, y_train)

X_train_best = kbest.transform(X_train_counts)
X_test_best = kbest.transform(X_test_counts)

Calculate td-idf for documents

In [24]:
tfidf = TfidfTransformer().fit(X_train_counts)

X_train_tfidf = tfidf.transform(X_train_counts)
X_test_tfidf = tfidf.transform(X_test_counts)

GradientBoosting classifier for merged tf-idf representation for texts and headlines

In [26]:
from sklearn.ensemble import GradientBoostingClassifier

classifier = GradientBoostingClassifier()
cross_val_score(classifier, X_train_tfidf, y_train, cv=5, n_jobs=5)

array([0.74752875, 0.6990115 , 0.60349001, 0.56495864, 0.69538027])

SVC classifier for merged tf-idf representation for texts and headlines

In [28]:
from sklearn.svm import SVC

classifier = SVC()
cross_val_score(classifier, X_train_tfidf, y_train, cv=5)

array([0.67924148, 0.14333266, 0.1010692 , 0.22553964, 0.47327012])

Gradient boosting classifier has better result - check final test result

In [33]:
from sklearn.ensemble import GradientBoostingClassifier

classifier = GradientBoostingClassifier().fit(X_train_tfidf, y_train)
print(classifier.score(X_test_tfidf, y_test))

0.7162082398772281
