## News Categorization

Using dataset from huffington post

In [47]:
"""
    Import required modules
"""
import json
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn import model_selection, naive_bayes, metrics, preprocessing, svm, decomposition, ensemble, linear_model
from random import shuffle
import warnings
warnings.filterwarnings("ignore")

### 1. Load dataset
Load and structure dataset into usable form

In [2]:
"""
    LOADS DATASET
    We have chosen Pandas Dataframe as it is easy to visualize and good for data analysis
"""
def load_dataset(filename):
    dataset = pd.read_json(open(filename), lines=True) # lines=True for file containing multiple JSON objects
    return dataset

In [3]:
filename = 'News_Category_Dataset.json'
dataset = load_dataset(filename)

### 2. Pre-process dataset
##### 2.1 Get details of dataset.

In [4]:
dataset.shape

(124989, 6)

In [5]:
dataset.columns

Index(['authors', 'category', 'date', 'headline', 'link', 'short_description'], dtype='object')

In [6]:
dataset.head()

Unnamed: 0,authors,category,date,headline,link,short_description
0,Melissa Jeltsen,CRIME,2018-05-26,There Were 2 Mass Shootings In Texas Last Week...,https://www.huffingtonpost.com/entry/texas-ama...,She left her husband. He killed their children...
1,Andy McDonald,ENTERTAINMENT,2018-05-26,Will Smith Joins Diplo And Nicky Jam For The 2...,https://www.huffingtonpost.com/entry/will-smit...,Of course it has a song.
2,Ron Dicker,ENTERTAINMENT,2018-05-26,Hugh Grant Marries For The First Time At Age 57,https://www.huffingtonpost.com/entry/hugh-gran...,The actor and his longtime girlfriend Anna Ebe...
3,Ron Dicker,ENTERTAINMENT,2018-05-26,Jim Carrey Blasts 'Castrato' Adam Schiff And D...,https://www.huffingtonpost.com/entry/jim-carre...,The actor gives Dems an ass-kicking for not fi...
4,Ron Dicker,ENTERTAINMENT,2018-05-26,Julianna Margulies Uses Donald Trump Poop Bags...,https://www.huffingtonpost.com/entry/julianna-...,"The ""Dietland"" actress said using the bags is ..."


##### 2.2 Drop irrelavant features

In [7]:
# Date and link can be considered irrelavant for our needs
dataset.drop(['date', 'link'], axis=1, inplace=True)

In [8]:
dataset.shape

(124989, 4)

In [9]:
# Check importance of author name to category
pd.crosstab(dataset['authors'], dataset['category'])

category,ARTS,ARTS & CULTURE,BLACK VOICES,BUSINESS,COLLEGE,COMEDY,CRIME,EDUCATION,ENTERTAINMENT,FIFTY,...,SPORTS,STYLE,TASTE,TECH,THE WORLDPOST,TRAVEL,WEIRD NEWS,WOMEN,WORLD NEWS,WORLDPOST
authors,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
,19,32,778,522,111,28,741,7,950,113,...,494,128,56,244,1466,54,386,369,351,774
"Bill Bigelow, ContributorRethinking Schools curriculum editor, Zinn Education Project c...",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"Bustle, ContributorNews, entertainment, lifestyle site",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
"Cristian Farias, Elise Foley, and Willa Frej",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"Jamie Walker, ContributorCEO and Founder of SweatGuru & Fit Approach",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"Josh Horwitz, ContributorExecutive Director, Coalition to Stop Gun Violence",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"Logan (Mehl-Laituri) Isaac, ContributorAuthor of #ForGodandcountry & #Reborn4thJuly, speaker, educato...",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"#DoctorsSpeakOut, Contributor#DoctorsSpeakOut is a coalition of physicians seeking high qua...",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"'The Koch Sisters', ContributorSo not related to those guys",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"10,000 Small Businesses Program, ContributorAn investment to help entrepreneurs create jobs and economic o...",0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
# Drop author name also
dataset.drop(['authors'], axis=1, inplace=True)

In [11]:
dataset.shape

(124989, 3)

##### 2.3 Process and split dataset

In [12]:
# Merge headline and description into one to make data eligible for Naive Bayes Classification
dataset['text'] = dataset['headline'] + '. ' + dataset['short_description']
dataset.drop(['headline', 'short_description'], axis=1, inplace=True)

In [13]:
# Convert text to lowercase
dataset['text'] = dataset['text'].str.lower()

In [14]:
# Convert class names to integer values
le = preprocessing.LabelEncoder()
le.fit(dataset['category'])
x_dataset = dataset['text']
y_dataset = le.transform(dataset['category'])

In [15]:
train_dataset, valid_dataset, train_labels, valid_labels = model_selection.train_test_split(x_dataset, y_dataset)

### 3. Feature creation
##### TF-IDF Vector

In [16]:
"""
    We use TF-IDF because it has been found to give the best results for multiclass text classification systems.
    
    It calculates the term frequency (number of times word occurs in a document) and inverse document
    frequency (amount of information conveyed by the word)
"""
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern='\w{1,}', max_features=5000)
tfidf_vect.fit(x_dataset)

train_tfidf = tfidf_vect.transform(train_dataset)
valid_tfidf = tfidf_vect.transform(valid_dataset)

### 4. Building a model

In [24]:
def train_model(classifier, training_vector, training_labels, valid_vector, valid_labels):
    classifier.fit(training_vector, training_labels)
    
    predictions = classifier.predict(valid_vector)
    
    return metrics.accuracy_score(predictions, valid_labels), classifier

### 4.1 Naive Bayes Classifier
A simple classifier that gives excellent results in text classification models

In [25]:
accuracy, _ = train_model(naive_bayes.MultinomialNB(), train_tfidf, train_labels, valid_tfidf, valid_labels)
print("Naive Bayes accuracy:", accuracy)

Naive Bayes accuracy: 0.5003520225294419


### 4.2 Logistic Regression
Another simple classifier prone to overfitting

In [27]:
accuracy, classifier = train_model(linear_model.LogisticRegression(multi_class='ovr'), train_tfidf, train_labels, valid_tfidf, valid_labels)
print("Logistic Regression accuracy:", accuracy)

Logistic Regression accuracy: 0.5811891961085509


### 4.3 Support Vector Machine with online learning

In [23]:
accuracy = train_model(linear_model.SGDClassifier(loss='hinge', penalty='l2', max_iter=10), train_tfidf, train_labels, valid_tfidf, valid_labels)
print("Support Vector Machine with online learning:", accuracy)

Support Vector Machine with online learning: 0.5613799283154122


### 5 Actual work
Since logistic regression gives the best accuracy, we'll use this model to classify

##### 5.1 Actual example of prediction

In [40]:
sample_input = dataset.loc[0]

In [43]:
print("Input:", sample_input['text'])
print("Class:", sample_input['category'])

Input: there were 2 mass shootings in texas last week, but only 1 on tv. she left her husband. he killed their children. just another day in america.
Class: CRIME


In [49]:
print("Classifier output:", le.inverse_transform(classifier.predict(tfidf_vect.transform([sample_input['text']])))[0])

Classifier output: CRIME
