In [0]:
import pandas as pd
from sklearn import model_selection
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn import feature_extraction
import numpy as np
from sklearn import metrics
from sklearn.preprocessing import StandardScaler

In [0]:
df = pd.read_json('preprocessing_News_Category_Dataset_v2.json')
df

Unnamed: 0,category,text
0,CRIME,2 mass shootings texas last week 1 tv left hus...
1,ENTERTAINMENT,smith joins diplo nicky jam 2018 world cup off...
2,ENTERTAINMENT,hugh grant marries first time age 57 actor lon...
3,ENTERTAINMENT,jim carrey blasts castrato adam schiff democra...
4,ENTERTAINMENT,julianna margulies uses donald trump poop bags...
5,ENTERTAINMENT,morgan freeman devastated sexual harassment cl...
6,ENTERTAINMENT,donald trump lovin new mcdonald jingle tonight...
7,ENTERTAINMENT,watch amazon prime new week great mini-series ...
8,ENTERTAINMENT,mike myers reveals like fourth austin powers f...
9,ENTERTAINMENT,watch hulu new week getting recent academy awa...


In [0]:
x = df['text']
y = df['category']

In [0]:
x_train, x_test, y_train, y_test = model_selection.train_test_split(x, y, test_size = 0.1, random_state = 7, stratify = y)

In [0]:
vectorized = feature_extraction.text.CountVectorizer(max_features=5000)

In [0]:
vectorized.fit(x_train)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=5000, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [0]:
vectorized.vocabulary_

{'artist': 336,
 'inspiring': 2341,
 'national': 3011,
 'anthem': 277,
 'performance': 3307,
 'work': 4943,
 'art': 333,
 'wait': 4822,
 'end': 1527,
 'wear': 4867,
 'next': 3054,
 'wedding': 4873,
 'guest': 2032,
 'supreme': 4381,
 'court': 1087,
 'could': 1072,
 'use': 4743,
 'good': 1973,
 'public': 3560,
 'judge': 2472,
 'jane': 2418,
 'kelly': 2496,
 'still': 4287,
 'available': 393,
 'president': 3467,
 'listening': 2674,
 'christie': 856,
 'peter': 3322,
 'cook': 1048,
 'settle': 4025,
 'divorce': 1360,
 'dispute': 1351,
 'really': 3652,
 'parties': 3257,
 'quick': 3594,
 'issue': 2399,
 'statements': 4266,
 'settlement': 4026,
 'lawyer': 2590,
 'behalf': 489,
 'british': 641,
 'vogue': 4808,
 'features': 1720,
 'openly': 3160,
 'trans': 4613,
 'woman': 4931,
 'first': 1787,
 'time': 4551,
 'activist': 106,
 'paris': 3246,
 'magazine': 2741,
 'february': 1722,
 'ladies': 2551,
 'like': 2652,
 'drink': 1411,
 'little': 2677,
 'thanks': 4500,
 'celebrate': 768,
 'anniversary': 266

In [0]:
x_train_vectorized = vectorized.transform(x_train)
x_test_vectorized = vectorized.transform(x_test)

In [0]:
# scaler_vectorized = StandardScaler(with_mean=False)

In [0]:
# x_train_vectorized = scaler_vectorized.fit_transform(x_train_vectorized)
# x_test_vectorized = scaler_vectorized.transform(x_test_vectorized)

In [0]:
vect_tfidf = feature_extraction.text.TfidfVectorizer(max_features = 10250)

In [0]:
vect_tfidf.fit(x_train)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=10250,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [0]:
vect_tfidf.vocabulary_

{'artist': 670,
 'inspiring': 4793,
 'national': 6153,
 'anthem': 546,
 'performance': 6755,
 'literal': 5470,
 'work': 10149,
 'art': 664,
 'wait': 9929,
 'end': 3157,
 'wear': 10003,
 'next': 6238,
 'wedding': 10013,
 'guest': 4180,
 'attire': 754,
 'requirements': 7689,
 'supreme': 9013,
 'court': 2204,
 'could': 2181,
 'use': 9758,
 'good': 4044,
 'public': 7280,
 'defender': 2506,
 'judge': 5058,
 'jane': 4965,
 'kelly': 5115,
 'still': 8820,
 'available': 795,
 'president': 7100,
 'listening': 5464,
 'christie': 1725,
 'brinkley': 1290,
 'peter': 6788,
 'cook': 2130,
 'settle': 8244,
 'divorce': 2814,
 'dispute': 2785,
 'really': 7459,
 'parties': 6656,
 'quick': 7349,
 'issue': 4932,
 'statements': 8772,
 'settlement': 8246,
 'lawyer': 5301,
 'behalf': 975,
 'british': 1292,
 'vogue': 9896,
 'features': 3534,
 'openly': 6448,
 'trans': 9482,
 'woman': 10133,
 'first': 3645,
 'time': 9355,
 'activist': 260,
 'paris': 6638,
 'magazine': 5608,
 'february': 3537,
 'ladies': 5226,
 '

In [0]:
x_train_tfidf = vect_tfidf.transform(x_train)
x_test_tfidf = vect_tfidf.transform(x_test)

In [0]:
# scaler_tfidf = StandardScaler(with_mean=False)

In [0]:
# x_train_tfidf = scaler_tfidf.fit_transform(x_train_tfidf)
# x_test_tfidf = scaler_tfidf.transform(x_test_tfidf)

In [0]:
x_train_bin = x_train_vectorized.toarray()
x_test_bin = x_test_vectorized.toarray()

In [0]:
scaler_bin = StandardScaler()

In [0]:
x_train_bin = scaler_bin.fit_transform(x_train_bin)
x_test_bin = scaler_bin.transform(x_test_bin)

In [0]:
parametersBNB = [{'alpha' : [0.0, 0.01, 0.1, 1, 10],
                  'fit_prior' : [True, False]
                 }]

In [0]:
bnb = model_selection.GridSearchCV(BernoulliNB(), param_grid = parametersBNB, scoring = 'accuracy', cv = 5, return_train_score = True)

In [0]:
bnb.fit(x_train_bin, y_train)

  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None,
                                   fit_prior=True),
             iid='warn', n_jobs=None,
             param_grid=[{'alpha': [0.0, 0.01, 0.1, 1, 10],
                          'fit_prior': [True, False]}],
             pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
             scoring='accuracy', verbose=0)

In [0]:
bnb.best_estimator_

BernoulliNB(alpha=1, binarize=0.0, class_prior=None, fit_prior=True)

In [0]:
bnb.best_params_

{'alpha': 1, 'fit_prior': True}

In [0]:
bnb.best_score_

0.6136358308327082

In [0]:
y_pred_bin_train = bnb.predict(x_train_bin)

In [0]:
metrics.accuracy_score(y_train, y_pred_bin_train)

0.6614427044261065

In [0]:
y_pred_bin_test = bnb.predict(x_test_bin)

In [0]:
metrics.accuracy_score(y_test, y_pred_bin_test)

0.6214990242101377

In [0]:
cnf_matrix_bin = metrics.confusion_matrix(y_test, y_pred_bin_test)
df_cnf_matrix = pd.DataFrame(cnf_matrix_bin, index = bnb.classes_, columns = bnb.classes_)

In [0]:
df_cnf_matrix

Unnamed: 0,ARTS & CULTURE,BLACK VOICES,BUSINESS,COMEDY,CRIME,DIVORCE,EDUCATION,ENTERTAINMENT,ENVIRONMENT,FIFTY,FOOD & DRINK,GOOD NEWS,HEALTHY LIVING,HOME & LIVING,IMPACT,LATINO VOICES,MEDIA,MONEY,PARENTS,POLITICS,QUEER VOICES,RELIGION,SCIENCE,SPORTS,STYLE & BEAUTY,TECH,TRAVEL,WEDDINGS,WEIRD NEWS,WOMEN,WORLD NEWS
ARTS & CULTURE,164,5,1,6,2,1,4,59,3,4,5,1,18,8,3,0,1,0,6,11,5,3,3,1,9,0,15,0,9,5,4
BLACK VOICES,9,133,4,2,39,1,10,88,2,0,4,0,12,0,10,0,11,1,13,32,5,7,0,19,15,3,3,1,3,3,4
BUSINESS,3,3,227,1,5,1,5,11,20,1,14,0,72,2,11,0,10,23,12,57,0,1,1,7,7,16,7,1,6,4,14
COMEDY,4,1,5,216,0,5,0,87,4,4,12,0,12,3,0,1,6,4,16,39,3,2,1,7,4,6,6,1,7,3,1
CRIME,3,7,4,0,207,0,1,13,3,0,1,1,2,1,1,0,1,0,7,20,1,1,1,2,0,4,5,0,22,2,6
DIVORCE,1,0,1,2,1,236,0,15,0,2,4,0,16,3,0,0,0,0,21,1,5,1,2,3,0,1,3,16,3,5,0
EDUCATION,2,4,6,1,11,1,82,3,0,2,3,0,18,0,8,0,3,0,15,22,2,1,0,5,0,1,3,0,1,2,2
ENTERTAINMENT,37,36,7,67,32,5,3,979,2,5,20,2,16,1,4,1,7,1,31,50,12,5,1,18,50,10,14,5,11,22,7
ENVIRONMENT,5,1,9,4,10,0,1,6,146,1,10,16,21,6,13,1,2,0,10,30,1,0,9,1,7,0,24,0,10,0,22
FIFTY,3,1,2,0,1,3,0,3,0,20,8,0,25,0,4,0,0,3,25,0,1,0,0,0,2,1,7,1,0,5,0


In [0]:
print(metrics.classification_report(y_test, y_pred_bin_test), sep = '\n')

                precision    recall  f1-score   support

ARTS & CULTURE       0.45      0.46      0.46       356
  BLACK VOICES       0.44      0.31      0.36       434
      BUSINESS       0.44      0.42      0.43       542
        COMEDY       0.48      0.47      0.47       460
         CRIME       0.39      0.66      0.49       316
       DIVORCE       0.71      0.69      0.70       342
     EDUCATION       0.41      0.41      0.41       198
 ENTERTAINMENT       0.54      0.67      0.60      1461
   ENVIRONMENT       0.46      0.40      0.43       366
         FIFTY       0.19      0.17      0.18       115
  FOOD & DRINK       0.63      0.81      0.71       803
     GOOD NEWS       0.36      0.27      0.31       130
HEALTHY LIVING       0.71      0.71      0.71      2340
 HOME & LIVING       0.73      0.65      0.69       417
        IMPACT       0.30      0.32      0.31       328
 LATINO VOICES       0.53      0.18      0.27       109
         MEDIA       0.42      0.36      0.39  

In [0]:
parametersMNB = [{'alpha' : [0.0, 0.01, 0.1, 1, 10],
                  'fit_prior' : [True, False]
                 }]

In [0]:
mnb_vect = model_selection.GridSearchCV(MultinomialNB(), param_grid = parametersMNB, scoring = 'accuracy', cv = 5, return_train_score = True)

In [0]:
mnb_vect.fit(x_train_vectorized, y_train)

  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=MultinomialNB(alpha=1.0, class_prior=None,
                                     fit_prior=True),
             iid='warn', n_jobs=None,
             param_grid=[{'alpha': [0.0, 0.01, 0.1, 1, 10],
                          'fit_prior': [True, False]}],
             pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
             scoring='accuracy', verbose=0)

In [0]:
mnb_vect.best_estimator_

MultinomialNB(alpha=1, class_prior=None, fit_prior=True)

In [0]:
mnb_vect.best_params_

{'alpha': 1, 'fit_prior': True}

In [0]:
mnb_vect.best_score_

0.6171464741185296

In [0]:
y_pred_vect_train = mnb_vect.predict(x_train_vectorized)

In [0]:
metrics.accuracy_score(y_train, y_pred_vect_train)

0.6637167104276069

In [0]:
y_pred_vect_test = mnb_vect.predict(x_test_vectorized)

In [0]:
metrics.accuracy_score(y_test, y_pred_bin_test)

0.6214990242101377

In [0]:
cnf_matrix_vec = metrics.confusion_matrix(y_test, y_pred_vect_test)
df_cnf_matrix_vec = pd.DataFrame(cnf_matrix_vec, index = mnb_vect.classes_, columns = mnb_vect.classes_)

In [0]:
df_cnf_matrix_vec

Unnamed: 0,ARTS & CULTURE,BLACK VOICES,BUSINESS,COMEDY,CRIME,DIVORCE,EDUCATION,ENTERTAINMENT,ENVIRONMENT,FIFTY,FOOD & DRINK,GOOD NEWS,HEALTHY LIVING,HOME & LIVING,IMPACT,LATINO VOICES,MEDIA,MONEY,PARENTS,POLITICS,QUEER VOICES,RELIGION,SCIENCE,SPORTS,STYLE & BEAUTY,TECH,TRAVEL,WEDDINGS,WEIRD NEWS,WOMEN,WORLD NEWS
ARTS & CULTURE,171,9,1,6,1,0,3,39,7,3,2,2,15,6,4,1,1,0,7,12,9,4,4,1,10,0,17,0,6,10,5
BLACK VOICES,10,141,4,1,41,2,10,77,4,0,3,1,11,0,8,0,13,0,15,35,6,5,0,18,15,3,3,0,3,3,2
BUSINESS,4,4,230,5,3,1,4,5,20,1,9,0,63,3,14,0,10,32,17,52,1,1,2,7,4,14,7,1,5,6,17
COMEDY,6,3,4,218,0,4,1,66,7,3,11,1,13,2,0,1,6,3,17,39,4,3,1,8,4,11,6,3,9,5,1
CRIME,4,8,3,1,218,0,0,2,4,0,0,4,3,1,1,0,1,0,7,21,1,1,2,4,0,3,6,0,13,2,6
DIVORCE,1,0,1,0,1,238,1,15,0,4,2,1,12,3,0,0,1,2,20,0,6,1,2,2,1,2,3,17,2,4,0
EDUCATION,1,4,6,1,9,2,93,2,0,2,4,0,13,0,8,0,2,0,15,16,2,3,1,5,0,1,2,1,0,2,3
ENTERTAINMENT,36,42,8,80,35,12,2,900,3,2,12,4,21,0,2,1,8,3,51,48,15,6,3,18,72,12,14,10,10,25,6
ENVIRONMENT,5,1,10,3,7,0,3,4,159,1,8,22,19,4,13,0,1,0,11,28,1,0,10,1,6,2,23,0,7,0,17
FIFTY,1,1,4,0,1,4,0,2,0,23,6,0,23,0,4,0,0,3,28,0,1,1,1,0,1,0,7,1,1,2,0


In [0]:
print(metrics.classification_report(y_test, y_pred_vect_test), sep = '\n')

                precision    recall  f1-score   support

ARTS & CULTURE       0.47      0.48      0.48       356
  BLACK VOICES       0.43      0.32      0.37       434
      BUSINESS       0.45      0.42      0.44       542
        COMEDY       0.47      0.47      0.47       460
         CRIME       0.40      0.69      0.51       316
       DIVORCE       0.67      0.70      0.68       342
     EDUCATION       0.43      0.47      0.45       198
 ENTERTAINMENT       0.60      0.62      0.61      1461
   ENVIRONMENT       0.44      0.43      0.44       366
         FIFTY       0.21      0.20      0.20       115
  FOOD & DRINK       0.68      0.78      0.73       803
     GOOD NEWS       0.28      0.30      0.29       130
HEALTHY LIVING       0.72      0.71      0.71      2340
 HOME & LIVING       0.72      0.66      0.69       417
        IMPACT       0.30      0.34      0.32       328
 LATINO VOICES       0.47      0.27      0.34       109
         MEDIA       0.43      0.43      0.43  

In [0]:
mnb_tfidf = model_selection.GridSearchCV(MultinomialNB(), param_grid = parametersMNB, scoring = 'accuracy', cv = 5, return_train_score = True)

In [0]:
mnb_tfidf.fit(x_train_tfidf, y_train)

  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=MultinomialNB(alpha=1.0, class_prior=None,
                                     fit_prior=True),
             iid='warn', n_jobs=None,
             param_grid=[{'alpha': [0.0, 0.01, 0.1, 1, 10],
                          'fit_prior': [True, False]}],
             pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
             scoring='accuracy', verbose=0)

In [0]:
mnb_tfidf.best_estimator_

MultinomialNB(alpha=1, class_prior=None, fit_prior=False)

In [0]:
mnb_tfidf.best_score_

0.6395700487621906

In [0]:
y_pred_tfidf_train = mnb_tfidf.predict(x_train_tfidf)

In [0]:
metrics.accuracy_score(y_train, y_pred_bin_train)

0.6614427044261065

In [0]:
y_pred_tfidf_test = mnb_tfidf.predict(x_test_tfidf)

In [0]:
metrics.accuracy_score(y_test, y_pred_tfidf_test)

0.6396961865077272

In [0]:
cnf_matrix_tfidf = metrics.confusion_matrix(y_test, y_pred_tfidf_test)
df_cnf_matrix_tfidf = pd.DataFrame(cnf_matrix_tfidf, index = mnb_tfidf.classes_, columns = mnb_tfidf.classes_)

In [0]:
df_cnf_matrix_tfidf

Unnamed: 0,ARTS & CULTURE,BLACK VOICES,BUSINESS,COMEDY,CRIME,DIVORCE,EDUCATION,ENTERTAINMENT,ENVIRONMENT,FIFTY,FOOD & DRINK,GOOD NEWS,HEALTHY LIVING,HOME & LIVING,IMPACT,LATINO VOICES,MEDIA,MONEY,PARENTS,POLITICS,QUEER VOICES,RELIGION,SCIENCE,SPORTS,STYLE & BEAUTY,TECH,TRAVEL,WEDDINGS,WEIRD NEWS,WOMEN,WORLD NEWS
ARTS & CULTURE,187,6,3,6,1,1,2,30,6,0,2,1,16,9,5,0,2,0,7,7,13,4,1,0,10,2,18,1,3,10,3
BLACK VOICES,7,165,3,2,41,1,9,70,2,0,3,0,11,0,8,0,8,0,17,29,8,7,0,20,14,2,3,1,0,1,2
BUSINESS,3,6,265,1,4,2,5,2,15,0,10,0,65,4,8,0,9,23,19,42,2,0,2,5,5,13,9,1,3,2,17
COMEDY,7,5,7,226,3,4,2,61,6,0,7,0,10,3,1,0,5,1,21,31,6,1,2,8,11,9,5,2,9,6,1
CRIME,2,5,3,3,231,0,0,1,3,0,1,0,1,0,2,0,2,1,11,17,2,2,1,2,0,1,6,0,8,1,10
DIVORCE,1,0,0,0,0,241,1,14,0,1,3,0,12,3,0,0,0,2,27,1,3,0,0,3,3,2,2,19,1,3,0
EDUCATION,0,4,7,2,9,2,82,2,0,0,2,0,11,0,10,0,1,1,25,22,4,2,0,5,0,0,4,1,0,1,1
ENTERTAINMENT,36,51,6,81,30,12,2,929,3,0,10,3,14,1,2,5,10,1,43,29,23,3,2,17,77,12,13,9,7,20,10
ENVIRONMENT,3,0,8,3,5,0,3,4,198,0,11,7,15,3,8,0,1,0,16,28,1,0,7,1,3,2,22,0,3,0,14
FIFTY,1,0,6,0,1,5,0,2,0,11,5,0,29,2,2,0,0,3,30,0,0,0,0,0,3,0,7,3,0,4,1


In [0]:
print(metrics.classification_report(y_test, y_pred_tfidf_test), sep = '\n')

                precision    recall  f1-score   support

ARTS & CULTURE       0.50      0.53      0.51       356
  BLACK VOICES       0.46      0.38      0.42       434
      BUSINESS       0.43      0.49      0.46       542
        COMEDY       0.46      0.49      0.47       460
         CRIME       0.40      0.73      0.52       316
       DIVORCE       0.67      0.70      0.68       342
     EDUCATION       0.43      0.41      0.42       198
 ENTERTAINMENT       0.65      0.64      0.64      1461
   ENVIRONMENT       0.47      0.54      0.50       366
         FIFTY       0.33      0.10      0.15       115
  FOOD & DRINK       0.69      0.83      0.75       803
     GOOD NEWS       0.41      0.21      0.28       130
HEALTHY LIVING       0.73      0.71      0.72      2340
 HOME & LIVING       0.71      0.70      0.71       417
        IMPACT       0.34      0.37      0.35       328
 LATINO VOICES       0.55      0.21      0.30       109
         MEDIA       0.45      0.47      0.46  