# News Classification Assignment

1.1.1 Import Libraries

In [11]:
import pandas as pd
import numpy as np
import seaborn as sns

from seaborn import countplot

from sklearn.feature_extraction.text import TfidfVectorizer  
from sklearn.preprocessing import LabelEncoder

import matplotlib.pyplot as plt

from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier

from sklearn.metrics import classification_report

import warnings

warnings.filterwarnings('ignore')

1.1.2 Load Dataset from newsCorpora.csv

In [4]:
data_path = r'newsCorpora.csv'
col_names = ["ID", "TITLE", "URL", "PUBLISHER", "CATEGORY", "STORY", "HOSTNAME", "TIMESTAMP"]
dataset = pd.read_csv(data_path, delimiter='\t', encoding='utf-8', names=col_names)

dataset.head(5)

Unnamed: 0,ID,TITLE,URL,PUBLISHER,CATEGORY,STORY,HOSTNAME,TIMESTAMP
0,1,"Fed official says weak data caused by weather,...",http://www.latimes.com/business/money/la-fi-mo...,Los Angeles Times,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.latimes.com,1394470370698
1,2,Fed's Charles Plosser sees high bar for change...,http://www.livemint.com/Politics/H2EvwJSK2VE6O...,Livemint,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.livemint.com,1394470371207
2,3,US open: Stocks fall after Fed official hints ...,http://www.ifamagazine.com/news/us-open-stocks...,IFA Magazine,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.ifamagazine.com,1394470371550
3,4,"Fed risks falling 'behind the curve', Charles ...",http://www.ifamagazine.com/news/fed-risks-fall...,IFA Magazine,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.ifamagazine.com,1394470371793
4,5,Fed's Plosser: Nasty Weather Has Curbed Job Gr...,http://www.moneynews.com/Economy/federal-reser...,Moneynews,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.moneynews.com,1394470372027


1.1.3 Data Preprocessing: Get only the needed data for training


In [5]:
_dataset = dataset[['TITLE', 'CATEGORY']]
_dataset.to_csv('training_data.csv', index=False)

# Load and prepare data for training and testing

1.2.1 Load the Training Dataset from the training_data.csv file

In [6]:
data_path = r'training_data.csv'
training_df = pd.read_csv(data_path, encoding='utf-8')

x_train = training_df["TITLE"]
y_train = training_df["CATEGORY"]

training_df.head(10)

Unnamed: 0,TITLE,CATEGORY
0,"Fed official says weak data caused by weather,...",b
1,Fed's Charles Plosser sees high bar for change...,b
2,US open: Stocks fall after Fed official hints ...,b
3,"Fed risks falling 'behind the curve', Charles ...",b
4,Fed's Plosser: Nasty Weather Has Curbed Job Gr...,b
5,Plosser: Fed May Have to Accelerate Tapering Pace,b
6,Fed's Plosser: Taper pace may be too slow,b
7,Fed's Plosser expects US unemployment to fall ...,b
8,US jobs growth last month hit by weather:Fed P...,b
9,ECB unlikely to end sterilisation of SMP purch...,b


1.2.2 Training Dataset Visualization

In [23]:
x_data_train = training_df.TITLE
y_data_train = training_df.CATEGORY

countplot(data=training_df, x='TITLE')
plt.show()


1.2.2 Load Test Dataset from test_dataset.csv

In [8]:
data_path = r'test_dataset.csv'
test_df = pd.read_csv(data_path, encoding='utf-8')

test_df.head(10) 

Unnamed: 0,TITLE,CATEGORY
0,Closing arguments next week in Trump civil fin...,b
1,Report released on Trumps foreign income,b
2,4th grader sells hot chocolate for good cause,b
3,House Dems report: Trump businesses got millio...,b
4,Trump received millions from foreign countries...,b
5,Jobs report remains strong,b
6,Apple wins major business battle,b
7,Celebrating small business success,b
8,'Never fight aging': Actress Melissa Gilbert s...,b
9,Celebrating Black Business Month,b


1.2.3 Show how many prediction class there are

In [51]:
# Show Unique Classes
unique_classes = training_df.CATEGORY.unique()

print(f"Classes: {unique_classes}")

Classes: ['b' 't' 'e' 'm']


1.2.4 Clean up data frame before testing

In [16]:
missing_values = test_df.isnull().sum()

test_df.dropna(inplace=True)
test_df.drop_duplicates(inplace=True)

x_test = test_df["TITLE"]
y_test = test_df["CATEGORY"]

# News Classification by Decision Tree Classifier

2.1.1 Train the data with training_df and then Test the model with test_df

In [59]:
# Train Model: Decision Tree Classifier
x_train = training_df["TITLE"]
y_train = training_df["CATEGORY"]


le = LabelEncoder()
y_train_encoded = le.fit_transform(y_train)

# TF-IDF vectorization for text data
tfidf = TfidfVectorizer()   
x_train_tfidf = tfidf.fit_transform(x_train)

# Train model using decision tree on the TF-IDF transformed data
DTclf = DecisionTreeClassifier(max_depth=15) 
DTclf = DTclf.fit(x_train_tfidf, y_train_encoded)

# Test Data for Decision Tree Classifier
y_test_encoded = le.transform(y_test)

x_test_tfidf = tfidf.transform(x_test)

# Make prediction on data using model: Decision Tree
y_pred_DT = DTclf.predict(x_test_tfidf)

2.1.2 Model Evaluation: Decision Tree Classifier

In [62]:
report = classification_report(y_test_encoded, y_pred_DT)

print(report)

              precision    recall  f1-score   support

           0       0.25      0.05      0.08      2008
           1       0.22      0.98      0.36      2440
           2       0.46      0.02      0.04      2510
           3       0.69      0.03      0.06      4698

    accuracy                           0.23     11656
   macro avg       0.40      0.27      0.14     11656
weighted avg       0.46      0.23      0.12     11656



2.1.3 Decision Tree Classifier Data Visualization using Seaborn

# News Classification by Mulitnomial Naive Bayes Classifier

3.1.1 Train the data with training_df and then Test the model with test_df

In [67]:
le = LabelEncoder()
y_train_encoded = le.fit_transform(y_train)

# TF-IDF vectorization for text data
tfidf = TfidfVectorizer()   
x_train_tfidf = tfidf.fit_transform(x_train)

# Train model using Multinomial Naive Bayes on the TF-IDF transformed data
MNBclf = MultinomialNB()
MNBclf = MNBclf.fit(x_train_tfidf, y_train_encoded)

# Test Data for Decision Tree Classifier
y_test_encoded = le.transform(y_test)

x_test_tfidf = tfidf.transform(x_test)

# Make prediction on data using model: Multinomial Naive Bayes
y_pred_MB = MNBclf.predict(x_test_tfidf)

3.1.2 Model Evaluation: Multinomial Naive Bayes Classifier

In [68]:
# Model Evaluation
report = classification_report(y_test_encoded, y_pred_MB)

print(report)

              precision    recall  f1-score   support

           0       0.36      0.62      0.45      2008
           1       0.47      0.68      0.56      2440
           2       0.65      0.49      0.56      2510
           3       0.72      0.43      0.54      4698

    accuracy                           0.53     11656
   macro avg       0.55      0.55      0.53     11656
weighted avg       0.59      0.53      0.53     11656



3.1.3 Multinomail Naive Bayes Classifier Data Visualization using Seaborn

# News Classification by Artificial Neural Network Classifier

4.1.1 Train the data with training_df and then Test the model with test_df

In [None]:
le = LabelEncoder()
y_train_encoded = le.fit_transform(y_train)

# TF-IDF vectorization for text data
tfidf = TfidfVectorizer()   
x_train_tfidf = tfidf.fit_transform(x_train)

# Train model using Neural Network Classifier on the TF-IDF transformed data
MLPclf = MLPClassifier(
    hidden_layer_sizes=(10, 10, 10, 10, 10),
    max_iter=50,
    solver='adam',
    learning_rate_init=0.2)

MLPclf = MLPclf.fit(x_train_tfidf, y_train_encoded)

# Make predictions on the test set
y_pred = MLPclf.predict(x_test_tfidf)

# Test Data for Neural Network Classifier
y_test_encoded = le.transform(y_test)

x_test_tfidf = tfidf.transform(x_test)

# Make prediction on data using model: Neural Network
y_pred_ANN = MLPclf.predict(x_test_tfidf)

4.1.2 Model Evaluation: Neural Network Classifier

In [None]:
report = classification_report(y_test_encoded, y_pred_ANN)

4.1.3 Neural Network Classifier Data Visualization using Seaborn