In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from util import CorpusLoader, TextNormalizer
from sklearn.ensemble import RandomForestClassifier

In [2]:
corpus = CorpusLoader('../datasets/News_dataset/fixed_dataset.json',is_json=True)
df = corpus.df
df.head(2)

Unnamed: 0,category,headline,authors,link,short_description,date
0,CRIME,There Were 2 Mass Shootings In Texas Last Week...,Melissa Jeltsen,https://www.huffingtonpost.com/entry/texas-ama...,She left her husband. He killed their children...,2018-05-26
1,ENTERTAINMENT,Will Smith Joins Diplo And Nicky Jam For The 2...,Andy McDonald,https://www.huffingtonpost.com/entry/will-smit...,Of course it has a song.,2018-05-26
2,ENTERTAINMENT,Hugh Grant Marries For The First Time At Age 57,Ron Dicker,https://www.huffingtonpost.com/entry/hugh-gran...,The actor and his longtime girlfriend Anna Ebe...,2018-05-26
3,ENTERTAINMENT,Jim Carrey Blasts 'Castrato' Adam Schiff And D...,Ron Dicker,https://www.huffingtonpost.com/entry/jim-carre...,The actor gives Dems an ass-kicking for not fi...,2018-05-26
4,ENTERTAINMENT,Julianna Margulies Uses Donald Trump Poop Bags...,Ron Dicker,https://www.huffingtonpost.com/entry/julianna-...,"The ""Dietland"" actress said using the bags is ...",2018-05-26


In [3]:
df.drop(['authors','link','date'],axis=1,inplace=True)
df.dtypes

category             object
headline             object
short_description    object
dtype: object

In [4]:
df.category = df.category.astype('category')
df.dtypes

category             category
headline               object
short_description      object
dtype: object

In [5]:
df['cat_code'] = df.category.cat.codes
df['cat_code'].max() # our dataset has 40 different categories

40

In [6]:
normalizer = TextNormalizer()
headline_norm = normalizer.transform(df.headline)
description_norm = normalizer.transform(df['short_description'])

In [7]:
vectorizer = CountVectorizer(min_df=5,max_df=0.6,max_features=2000)
headline_vec = vectorizer.fit_transform(headline_norm)
description_vec = vectorizer.fit_transform(description_norm)

In [8]:
from sklearn.model_selection import KFold
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score

def trainer(model, X, y, folds):
    count = 1
    for train_idx, test_idx in folds.split(X):
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]
        model.fit(X_train,y_train)
        y_pred = model.predict(X_test)
        acc = accuracy_score(y_pred, y_test)
        print('Split:',count,'Accuracy:',acc)
        count += 1
    return model

In [9]:
X = headline_vec
X2 = description_vec
y = df['cat_code']
folds = KFold(n_splits=7, shuffle=True)
clf = RandomForestClassifier(max_depth=7) # Bad efficiency in this problem 

In [10]:
trainer(SGDClassifier(),X,y,folds)

Split: 1 Accuracy: 0.4842127273994563
Split: 2 Accuracy: 0.4781138914058688
Split: 3 Accuracy: 0.480848987557941
Split: 4 Accuracy: 0.48454326839298784
Split: 5 Accuracy: 0.4776774823127592
Split: 6 Accuracy: 0.480848987557941
Split: 7 Accuracy: 0.4815460216777611


SGDClassifier()

In [11]:
trainer(SGDClassifier(),X2,y,folds)

Split: 1 Accuracy: 0.35460374991287374
Split: 2 Accuracy: 0.3671150763225762
Split: 3 Accuracy: 0.36531558219774857
Split: 4 Accuracy: 0.36894015962081345
Split: 5 Accuracy: 0.36301536960234204
Split: 6 Accuracy: 0.37280869898581537
Split: 7 Accuracy: 0.3610636740668456


SGDClassifier()

In [12]:
vectorizer_tfidf = TfidfVectorizer()
headline_tfidf = vectorizer_tfidf.fit_transform(headline_norm)

In [13]:
trainer(SGDClassifier(),headline_tfidf,y,folds)

Split: 1 Accuracy: 0.5396947096954067
Split: 2 Accuracy: 0.5453056388095072
Split: 3 Accuracy: 0.5471369323528387
Split: 4 Accuracy: 0.5405499599205381
Split: 5 Accuracy: 0.5442093890495939
Split: 6 Accuracy: 0.5478339664726588
Split: 7 Accuracy: 0.5420485832781514


SGDClassifier()

In [14]:
vectorizer_hash = HashingVectorizer(2**10)
headline_hash = vectorizer_hash.fit_transform(headline_norm)



In [15]:
trainer(SGDClassifier(),headline_hash,y,folds)

Split: 1 Accuracy: 0.537150623823796
Split: 2 Accuracy: 0.5373597267721475
Split: 3 Accuracy: 0.5365420137315722
Split: 4 Accuracy: 0.5389467814449518
Split: 5 Accuracy: 0.536019238141707
Split: 6 Accuracy: 0.540062036036664
Split: 7 Accuracy: 0.5425713588680166


SGDClassifier()