In [0]:
import pandas as pd
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn import feature_extraction
import numpy as np
from sklearn import metrics
from sklearn.preprocessing import StandardScaler

In [0]:
df = pd.read_json('preprocessing_News_Category_Dataset_v2.json')
df

Unnamed: 0,category,text
0,CRIME,2 mass shootings texas last week 1 tv left hus...
1,ENTERTAINMENT,smith joins diplo nicky jam 2018 world cup off...
2,ENTERTAINMENT,hugh grant marries first time age 57 actor lon...
3,ENTERTAINMENT,jim carrey blasts castrato adam schiff democra...
4,ENTERTAINMENT,julianna margulies uses donald trump poop bags...
5,ENTERTAINMENT,morgan freeman devastated sexual harassment cl...
6,ENTERTAINMENT,donald trump lovin new mcdonald jingle tonight...
7,ENTERTAINMENT,watch amazon prime new week great mini-series ...
8,ENTERTAINMENT,mike myers reveals like fourth austin powers f...
9,ENTERTAINMENT,watch hulu new week getting recent academy awa...


In [0]:
x = df['text']
y = df['category']

In [0]:
x_train, x_test, y_train, y_test = model_selection.train_test_split(x, y, test_size = 0.1, random_state = 7, stratify = y)

In [0]:
vectorized = feature_extraction.text.CountVectorizer(max_features=5000)

In [0]:
vectorized.fit(x_train)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=5000, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [0]:
x_train_vectorized = vectorized.transform(x_train)
x_test_vectorized = vectorized.transform(x_test)

In [0]:
vect_tfidf = feature_extraction.text.TfidfVectorizer(max_features = 10250)

In [0]:
vect_tfidf.fit(x_train)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=10250,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [0]:
x_train_tfidf = vect_tfidf.transform(x_train)
x_test_tfidf = vect_tfidf.transform(x_test)

In [0]:
parameters = [{'C' : [0.01, 1, 10],
               'solver' : ['newton-cg', 'sag'],
               'multi_class' : ['multinomial']
                
              }]

In [0]:
mlr_tfidfcv = model_selection.GridSearchCV(LogisticRegression(), param_grid = parameters, scoring='accuracy', cv=5, return_train_score=True)

In [0]:
mlr_tfidfcv.fit(x_train_tfidf, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='warn',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='warn',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='warn', n_jobs=None,
             param_grid=[{'C': [0.01, 1, 10], 'multi_class': ['multinomial'],
                          'solver': ['newton-cg', 'sag']}],
             pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
             scoring='accuracy', verbose=0)

In [0]:
mlr_tfidfcv.estimator

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [0]:
mlr_tfidfcv.best_score_

0.660932811327832

In [0]:
y_pred_tfidf_train = mlr_tfidfcv.predict(x_train_tfidf)

In [0]:
metrics.accuracy_score(y_train, y_pred_tfidf_train)

0.7329039291072769

In [0]:
y_pred_tfidf_test = mlr_tfidfcv.predict(x_test_tfidf)

In [0]:
metrics.accuracy_score(y_test, y_pred_tfidf_test)

0.6672292842449496

In [0]:
cnf_matrix_tfidf = metrics.confusion_matrix(y_test, y_pred_tfidf_test)
df_cnf_matrix = pd.DataFrame(cnf_matrix_tfidf, index = mlr_tfidfcv.classes_, columns = mlr_tfidfcv.classes_)
df_cnf_matrix

Unnamed: 0,ARTS & CULTURE,BLACK VOICES,BUSINESS,COMEDY,CRIME,DIVORCE,EDUCATION,ENTERTAINMENT,ENVIRONMENT,FIFTY,FOOD & DRINK,GOOD NEWS,HEALTHY LIVING,HOME & LIVING,IMPACT,LATINO VOICES,MEDIA,MONEY,PARENTS,POLITICS,QUEER VOICES,RELIGION,SCIENCE,SPORTS,STYLE & BEAUTY,TECH,TRAVEL,WEDDINGS,WEIRD NEWS,WOMEN,WORLD NEWS
ARTS & CULTURE,157,7,2,2,0,1,3,52,6,0,3,0,39,6,3,0,0,0,8,20,6,1,1,0,6,0,17,1,2,9,4
BLACK VOICES,5,139,5,2,11,1,9,86,1,0,3,0,19,1,4,0,3,0,17,70,5,5,0,19,15,3,6,0,1,1,3
BUSINESS,3,2,243,1,0,0,2,12,9,0,12,0,88,4,4,0,4,11,20,85,0,0,0,3,5,8,10,1,1,0,14
COMEDY,2,1,5,192,0,2,0,86,7,1,4,0,32,1,0,0,1,2,14,71,2,1,0,6,5,7,6,1,6,4,1
CRIME,2,12,3,3,162,0,0,7,4,0,0,0,12,2,0,0,2,0,11,58,1,1,0,2,2,0,6,0,12,4,10
DIVORCE,1,1,1,0,0,224,1,18,0,2,4,0,22,3,0,0,0,2,24,2,2,0,1,2,3,1,3,21,0,4,0
EDUCATION,1,3,5,0,5,2,68,3,0,0,4,0,22,0,3,0,0,1,18,49,2,1,0,3,0,0,5,0,0,0,3
ENTERTAINMENT,13,25,9,43,13,7,1,1084,2,1,6,1,43,3,2,1,1,0,33,72,8,1,2,15,33,4,8,8,4,8,10
ENVIRONMENT,2,1,7,3,2,0,0,13,156,0,7,4,33,4,3,0,1,0,16,56,0,0,7,1,4,1,22,0,6,0,17
FIFTY,2,0,3,0,0,4,0,3,0,13,5,0,44,0,0,0,0,2,21,3,0,0,0,0,3,0,6,2,0,3,1


In [0]:
print(metrics.classification_report(y_test, y_pred_tfidf_test), sep = '\n')

                precision    recall  f1-score   support

ARTS & CULTURE       0.64      0.44      0.52       356
  BLACK VOICES       0.53      0.32      0.40       434
      BUSINESS       0.52      0.45      0.48       542
        COMEDY       0.60      0.42      0.49       460
         CRIME       0.58      0.51      0.54       316
       DIVORCE       0.85      0.65      0.74       342
     EDUCATION       0.53      0.34      0.42       198
 ENTERTAINMENT       0.59      0.74      0.65      1461
   ENVIRONMENT       0.52      0.43      0.47       366
         FIFTY       0.57      0.11      0.19       115
  FOOD & DRINK       0.75      0.80      0.77       803
     GOOD NEWS       0.66      0.19      0.30       130
HEALTHY LIVING       0.63      0.84      0.72      2340
 HOME & LIVING       0.79      0.68      0.73       417
        IMPACT       0.46      0.23      0.31       328
 LATINO VOICES       0.84      0.25      0.38       109
         MEDIA       0.58      0.33      0.42  

In [0]:
mlr_vectcv = model_selection.GridSearchCV(LogisticRegression(), param_grid = parameters, scoring='accuracy', cv=5, return_train_score=True)

In [0]:
mlr_vectcv.fit(x_train_vectorized, y_train)



GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='warn',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='warn',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='warn', n_jobs=None,
             param_grid=[{'C': [0.01, 1, 10], 'multi_class': ['multinomial'],
                          'solver': ['newton-cg', 'sag']}],
             pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
             scoring='accuracy', verbose=0)

In [0]:
mlr_vectcv.best_estimator_

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='multinomial', n_jobs=None, penalty='l2',
                   random_state=None, solver='sag', tol=0.0001, verbose=0,
                   warm_start=False)

In [0]:
mlr_vectcv.best_score_

0.6248065922730682

In [0]:
y_pred_vec_train = mlr_vectcv.predict(x_train_vectorized)

In [0]:
metrics.accuracy_score(y_train, y_pred_vec_train)

0.7590081114028507

In [0]:
y_pred_vec_test = mlr_vectcv.predict(x_test_vectorized)

In [0]:
metrics.accuracy_score(y_test, y_pred_vec_test)

0.6328392847724036

In [0]:
cnf_matrix_vec = metrics.confusion_matrix(y_test, y_pred_vec_test)
df_cnf_matrix = pd.DataFrame(cnf_matrix_vec, index = mlr_vectcv.classes_, columns = mlr_vectcv.classes_)
df_cnf_matrix

Unnamed: 0,ARTS & CULTURE,BLACK VOICES,BUSINESS,COMEDY,CRIME,DIVORCE,EDUCATION,ENTERTAINMENT,ENVIRONMENT,FIFTY,FOOD & DRINK,GOOD NEWS,HEALTHY LIVING,HOME & LIVING,IMPACT,LATINO VOICES,MEDIA,MONEY,PARENTS,POLITICS,QUEER VOICES,RELIGION,SCIENCE,SPORTS,STYLE & BEAUTY,TECH,TRAVEL,WEDDINGS,WEIRD NEWS,WOMEN,WORLD NEWS
ARTS & CULTURE,151,5,3,4,0,1,2,47,4,4,7,4,22,8,7,1,3,0,8,17,9,1,3,3,9,0,15,2,4,4,8
BLACK VOICES,8,144,6,3,17,1,6,79,2,0,2,1,17,1,6,1,9,0,14,63,5,6,0,21,15,1,3,0,1,1,1
BUSINESS,6,4,206,1,1,0,3,8,11,0,14,1,76,5,13,2,7,23,17,83,0,0,1,9,7,10,9,2,3,3,17
COMEDY,4,3,5,194,1,4,1,93,6,2,7,0,20,1,2,0,4,1,11,57,5,2,1,9,6,6,4,0,7,3,1
CRIME,2,14,3,4,155,0,1,17,6,0,0,1,6,4,1,0,1,3,11,38,4,1,1,3,0,0,8,0,18,3,11
DIVORCE,1,1,2,1,0,240,1,15,0,2,3,0,20,4,0,0,0,2,14,2,3,1,0,1,5,1,2,18,0,3,0
EDUCATION,1,5,7,1,6,2,66,5,1,2,3,0,16,0,8,0,1,0,12,41,2,1,1,3,1,1,5,0,1,2,4
ENTERTAINMENT,27,38,9,45,14,9,2,1000,6,0,16,3,28,5,4,1,4,0,32,64,25,2,1,24,43,5,13,9,6,17,9
ENVIRONMENT,3,2,5,3,3,0,1,12,146,0,7,11,23,6,9,0,0,1,12,52,0,1,15,2,4,1,21,0,7,1,18
FIFTY,1,0,3,0,1,4,0,3,0,16,5,0,34,0,0,0,0,3,19,2,4,0,2,0,3,0,8,3,0,4,0


In [0]:
print(metrics.classification_report(y_test, y_pred_vec_test), sep = '\n')

                precision    recall  f1-score   support

ARTS & CULTURE       0.49      0.42      0.46       356
  BLACK VOICES       0.45      0.33      0.38       434
      BUSINESS       0.44      0.38      0.41       542
        COMEDY       0.53      0.42      0.47       460
         CRIME       0.51      0.49      0.50       316
       DIVORCE       0.77      0.70      0.74       342
     EDUCATION       0.46      0.33      0.39       198
 ENTERTAINMENT       0.56      0.68      0.62      1461
   ENVIRONMENT       0.45      0.40      0.43       366
         FIFTY       0.25      0.14      0.18       115
  FOOD & DRINK       0.69      0.76      0.73       803
     GOOD NEWS       0.34      0.25      0.29       130
HEALTHY LIVING       0.67      0.76      0.71      2340
 HOME & LIVING       0.71      0.65      0.68       417
        IMPACT       0.38      0.28      0.32       328
 LATINO VOICES       0.48      0.28      0.35       109
         MEDIA       0.46      0.31      0.37  