In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from util import CorpusLoader, TextNormalizer
from sklearn.ensemble import RandomForestClassifier

In [2]:
corpus = CorpusLoader('../datasets/News_dataset/fixed_dataset.json',is_json=True)
df = corpus.df
df.head(2)

Unnamed: 0,category,headline,authors,link,short_description,date
0,CRIME,There Were 2 Mass Shootings In Texas Last Week...,Melissa Jeltsen,https://www.huffingtonpost.com/entry/texas-ama...,She left her husband. He killed their children...,2018-05-26
1,ENTERTAINMENT,Will Smith Joins Diplo And Nicky Jam For The 2...,Andy McDonald,https://www.huffingtonpost.com/entry/will-smit...,Of course it has a song.,2018-05-26


In [3]:
df.drop(['authors','link','date'],axis=1,inplace=True)
df.dtypes

category             object
headline             object
short_description    object
dtype: object

In [4]:
df.category = df.category.astype('category')
df.dtypes

category             category
headline               object
short_description      object
dtype: object

In [5]:
df['cat_code'] = df.category.cat.codes
df['cat_code'].max() # our dataset has 40 different categories

40

In [6]:
normalizer = TextNormalizer()
headline_norm = normalizer.transform(df.headline)
description_norm = normalizer.transform(df['short_description'])

In [7]:
vectorizer = CountVectorizer(min_df=5,max_df=0.6,max_features=2000)
headline_vec = vectorizer.fit_transform(headline_norm)
description_vec = vectorizer.fit_transform(description_norm)

In [8]:
from sklearn.model_selection import KFold
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score

def trainer(model, X, y, folds):
    count = 1
    for train_idx, test_idx in folds.split(X):
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]
        model.fit(X_train,y_train)
        y_pred = model.predict(X_test)
        acc = accuracy_score(y_pred, y_test)
        print('Split:',count,'Accuracy:',acc)
        count += 1
    return model

In [9]:
X = headline_vec
X2 = description_vec
y = df['cat_code']
folds = KFold(n_splits=7, shuffle=True)
clf = RandomForestClassifier(max_depth=7) # Bad efficiency in this problem 

In [10]:
trainer(SGDClassifier(),X,y,folds)

Split: 1 Accuracy: 0.4836551195371855
Split: 2 Accuracy: 0.4795427615529379
Split: 3 Accuracy: 0.48339316209528455
Split: 4 Accuracy: 0.4819642421496532
Split: 5 Accuracy: 0.47656222772104695
Split: 6 Accuracy: 0.4824173143275363
Split: 7 Accuracy: 0.47788659254870525


SGDClassifier()

In [11]:
trainer(SGDClassifier(),X2,y,folds)

Split: 1 Accuracy: 0.36739388025371156
Split: 2 Accuracy: 0.366696870425873
Split: 3 Accuracy: 0.3693235283867145
Split: 4 Accuracy: 0.3538493709267069
Split: 5 Accuracy: 0.35134004809535424
Split: 6 Accuracy: 0.36953263862266056
Split: 7 Accuracy: 0.3682779772069843


SGDClassifier()

In [12]:
vectorizer_tfidf = TfidfVectorizer()
headline_tfidf = vectorizer_tfidf.fit_transform(headline_norm)

In [13]:
trainer(SGDClassifier(),headline_tfidf,y,folds)

Split: 1 Accuracy: 0.541959991635882
Split: 2 Accuracy: 0.5435282637485188
Split: 3 Accuracy: 0.5471717840588297
Split: 4 Accuracy: 0.5465096016450005
Split: 5 Accuracy: 0.5474505977067577
Split: 6 Accuracy: 0.5412121423343673
Split: 7 Accuracy: 0.540027184330673


SGDClassifier()

In [14]:
vectorizer_hash = HashingVectorizer(2**10)
headline_hash = vectorizer_hash.fit_transform(headline_norm)



In [15]:
trainer(SGDClassifier(),headline_hash,y,folds)

Split: 1 Accuracy: 0.5388234474106085
Split: 2 Accuracy: 0.537080922841012
Split: 3 Accuracy: 0.540201442860628
Split: 4 Accuracy: 0.5347645767260307
Split: 5 Accuracy: 0.536019238141707
Split: 6 Accuracy: 0.5364026069076081
Split: 7 Accuracy: 0.5409333286864392


SGDClassifier()