In [0]:
import pandas as pd
from sklearn import model_selection
from sklearn import ensemble
from sklearn import feature_extraction
import numpy as np
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn import tree

In [0]:
df = pd.read_json('preprocessing_News_Category_Dataset_v2.json')
df

Unnamed: 0,category,text
0,CRIME,2 mass shootings texas last week 1 tv left hus...
1,ENTERTAINMENT,smith joins diplo nicky jam 2018 world cup off...
2,ENTERTAINMENT,hugh grant marries first time age 57 actor lon...
3,ENTERTAINMENT,jim carrey blasts castrato adam schiff democra...
4,ENTERTAINMENT,julianna margulies uses donald trump poop bags...
5,ENTERTAINMENT,morgan freeman devastated sexual harassment cl...
6,ENTERTAINMENT,donald trump lovin new mcdonald jingle tonight...
7,ENTERTAINMENT,watch amazon prime new week great mini-series ...
8,ENTERTAINMENT,mike myers reveals like fourth austin powers f...
9,ENTERTAINMENT,watch hulu new week getting recent academy awa...


In [0]:
x = df['text']
y = df['category']

In [0]:
x_train, x_test, y_train, y_test = model_selection.train_test_split(x, y, test_size = 0.1, random_state = 7, stratify = y)

In [0]:
vectorized = feature_extraction.text.CountVectorizer(max_features=5000)

In [0]:
vectorized.fit(x_train)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=5000, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [0]:
x_train_vectorized = vectorized.transform(x_train)
x_test_vectorized = vectorized.transform(x_test)

In [0]:
scaler_vectorized = StandardScaler(with_mean=False)

In [0]:
x_train_vectorized = scaler_vectorized.fit_transform(x_train_vectorized)
x_test_vectorized = scaler_vectorized.transform(x_test_vectorized)

In [0]:
vect_tfidf = feature_extraction.text.TfidfVectorizer(max_features = 10250)

In [0]:
vect_tfidf.fit(x_train)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=10250,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [0]:
x_train_tfidf = vect_tfidf.transform(x_train)
x_test_tfidf = vect_tfidf.transform(x_test)

In [0]:
scaler_tfidf = StandardScaler(with_mean=False)

In [0]:
x_train_tfidf = scaler_tfidf.fit_transform(x_train_tfidf)
x_test_tfidf = scaler_tfidf.transform(x_test_tfidf)

In [0]:
rfc_vec = ensemble.RandomForestClassifier(n_estimators = 100, criterion = 'gini')

In [0]:
rfc_vec.fit(x_train_vectorized, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [0]:
y_pred_vec_train = rfc_vec.predict(x_train_vectorized)

In [0]:
metrics.accuracy_score(y_train, y_pred_vec_train)

0.9999296699174793

In [0]:
y_pred_vec_test = rfc_vec.predict(x_test_vectorized)

In [0]:
metrics.accuracy_score(y_test, y_pred_vec_test)

0.5895353130439369

In [0]:
cnf_matrix_vec = metrics.confusion_matrix(y_test, y_pred_vec_test)
df_cnf_matrix = pd.DataFrame(cnf_matrix_vec, index = rfc_vec.classes_, columns = rfc_vec.classes_)
df_cnf_matrix

Unnamed: 0,ARTS & CULTURE,BLACK VOICES,BUSINESS,COMEDY,CRIME,DIVORCE,EDUCATION,ENTERTAINMENT,ENVIRONMENT,FIFTY,FOOD & DRINK,GOOD NEWS,HEALTHY LIVING,HOME & LIVING,IMPACT,LATINO VOICES,MEDIA,MONEY,PARENTS,POLITICS,QUEER VOICES,RELIGION,SCIENCE,SPORTS,STYLE & BEAUTY,TECH,TRAVEL,WEDDINGS,WEIRD NEWS,WOMEN,WORLD NEWS
ARTS & CULTURE,122,2,6,2,1,1,0,59,5,0,14,1,39,8,0,0,1,0,6,26,5,1,3,0,16,0,17,1,1,11,8
BLACK VOICES,1,106,4,1,21,1,5,92,2,0,3,0,31,2,0,0,4,0,25,77,5,1,1,16,18,2,4,0,3,3,6
BUSINESS,5,2,157,2,0,1,2,19,12,0,14,0,127,10,0,0,4,2,21,95,1,1,0,16,6,12,10,1,0,5,17
COMEDY,4,3,2,146,1,1,1,75,4,0,9,1,32,7,0,0,5,3,15,94,3,1,2,9,13,6,7,1,7,5,3
CRIME,1,5,6,0,121,0,2,16,4,0,3,0,21,2,1,0,1,2,17,59,0,1,2,3,6,1,7,0,9,3,23
DIVORCE,0,0,1,0,0,209,0,22,0,0,2,0,31,2,0,0,0,2,26,4,2,0,0,0,2,1,5,27,1,5,0
EDUCATION,1,3,5,0,3,1,59,8,1,0,5,1,28,2,0,0,0,1,25,41,2,0,0,4,0,1,3,0,0,2,2
ENTERTAINMENT,17,16,7,35,9,5,1,949,6,0,13,1,83,9,0,0,7,0,59,116,10,1,1,27,40,3,15,11,6,7,7
ENVIRONMENT,5,2,8,4,2,0,1,22,93,0,17,3,57,5,1,0,1,0,24,61,1,1,8,5,7,2,14,0,5,0,17
FIFTY,3,0,1,0,0,1,0,4,0,0,6,0,57,3,0,0,0,0,17,4,0,0,0,1,3,1,7,2,0,5,0


In [0]:
print(metrics.classification_report(y_test, y_pred_vec_test), sep = '\n')

                precision    recall  f1-score   support

ARTS & CULTURE       0.50      0.34      0.41       356
  BLACK VOICES       0.51      0.24      0.33       434
      BUSINESS       0.41      0.29      0.34       542
        COMEDY       0.60      0.32      0.42       460
         CRIME       0.45      0.38      0.41       316
       DIVORCE       0.87      0.61      0.72       342
     EDUCATION       0.43      0.30      0.35       198
 ENTERTAINMENT       0.49      0.65      0.56      1461
   ENVIRONMENT       0.44      0.25      0.32       366
         FIFTY       0.00      0.00      0.00       115
  FOOD & DRINK       0.62      0.75      0.68       803
     GOOD NEWS       0.41      0.09      0.15       130
HEALTHY LIVING       0.53      0.79      0.64      2340
 HOME & LIVING       0.67      0.59      0.63       417
        IMPACT       0.72      0.07      0.13       328
 LATINO VOICES       0.88      0.06      0.12       109
         MEDIA       0.44      0.20      0.28  

In [0]:
rfc_tfidf = ensemble.RandomForestClassifier(n_estimators = 100, criterion = 'gini')

In [0]:
rfc_tfidf.fit(x_train_tfidf, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [0]:
y_pred_tfidf_train = rfc_tfidf.predict(x_train_tfidf)

In [0]:
metrics.accuracy_score(y_train, y_pred_tfidf_train)

0.9999296699174793

In [0]:
y_pred_tfidf_test = rfc_tfidf.predict(x_test_tfidf)

In [0]:
metrics.accuracy_score(y_test, y_pred_tfidf_test)

0.5954955430138721

In [0]:
cnf_matrix_tfidf = metrics.confusion_matrix(y_test, y_pred_tfidf_test)
df_cnf_matrix = pd.DataFrame(cnf_matrix_tfidf, index = rfc_tfidf.classes_, columns = rfc_tfidf.classes_)
df_cnf_matrix

Unnamed: 0,ARTS & CULTURE,BLACK VOICES,BUSINESS,COMEDY,CRIME,DIVORCE,EDUCATION,ENTERTAINMENT,ENVIRONMENT,FIFTY,FOOD & DRINK,GOOD NEWS,HEALTHY LIVING,HOME & LIVING,IMPACT,LATINO VOICES,MEDIA,MONEY,PARENTS,POLITICS,QUEER VOICES,RELIGION,SCIENCE,SPORTS,STYLE & BEAUTY,TECH,TRAVEL,WEDDINGS,WEIRD NEWS,WOMEN,WORLD NEWS
ARTS & CULTURE,118,2,5,3,1,1,1,60,7,0,11,0,46,5,1,0,0,0,9,24,5,0,3,0,20,0,19,2,2,7,4
BLACK VOICES,2,113,2,1,17,1,7,82,5,0,5,0,36,1,0,1,3,1,21,77,5,3,1,14,19,2,5,0,1,4,5
BUSINESS,4,2,169,1,0,0,3,20,11,0,14,0,129,9,0,0,3,4,18,91,0,2,0,16,8,8,9,1,0,3,17
COMEDY,2,3,2,160,2,1,0,59,5,0,9,1,42,4,0,0,2,1,13,98,3,0,1,11,13,4,8,2,9,4,1
CRIME,1,7,4,0,120,0,2,14,0,0,5,0,25,4,0,0,1,1,24,62,0,1,1,2,4,1,8,0,3,3,23
DIVORCE,0,0,1,0,0,212,0,14,1,0,2,0,32,3,0,0,0,1,26,5,3,0,0,0,1,1,6,30,1,3,0
EDUCATION,0,2,7,0,2,0,65,7,2,0,4,0,33,2,0,0,0,1,25,39,2,0,0,3,0,1,1,0,0,1,1
ENTERTAINMENT,8,10,5,31,12,6,1,944,7,0,15,0,92,5,0,0,7,1,66,131,8,2,2,19,42,2,17,10,4,5,9
ENVIRONMENT,2,2,6,3,2,0,1,21,97,0,14,6,64,6,1,0,0,0,23,63,1,0,8,5,6,1,16,0,4,0,14
FIFTY,2,0,2,0,0,1,0,4,0,1,3,0,62,1,0,0,0,0,16,5,0,0,0,1,5,0,6,2,0,4,0


In [0]:
print(metrics.classification_report(y_test, y_pred_tfidf_test), sep = '\n')

                precision    recall  f1-score   support

ARTS & CULTURE       0.63      0.33      0.43       356
  BLACK VOICES       0.56      0.26      0.35       434
      BUSINESS       0.45      0.31      0.37       542
        COMEDY       0.64      0.35      0.45       460
         CRIME       0.47      0.38      0.42       316
       DIVORCE       0.87      0.62      0.72       342
     EDUCATION       0.45      0.33      0.38       198
 ENTERTAINMENT       0.53      0.65      0.58      1461
   ENVIRONMENT       0.41      0.27      0.32       366
         FIFTY       1.00      0.01      0.02       115
  FOOD & DRINK       0.65      0.75      0.69       803
     GOOD NEWS       0.35      0.05      0.09       130
HEALTHY LIVING       0.51      0.80      0.63      2340
 HOME & LIVING       0.69      0.57      0.62       417
        IMPACT       0.74      0.07      0.13       328
 LATINO VOICES       0.80      0.07      0.13       109
         MEDIA       0.50      0.20      0.28  