In [2]:
import pandas as pd
from nltk.corpus import stopwords

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn import metrics
from sklearn.metrics import accuracy_score


In [3]:
df_titles_info = pd.read_csv('./output/US_count_vectorizer_dataset.csv')
df_titles_info.head()

Unnamed: 0,title,filter_title,filter_title_no_stops,category_id
0,WE WANT TO TALK ABOUT OUR MARRIAGE,we want to talk about our marriage,want talk marriage,People & Blogs
1,The Trump Presidency: Last Week Tonight with J...,the trump presidency last week tonight with jo...,trump presidency last week tonight john oliver...,Entertainment
2,"Racist Superman | Rudy Mancuso, King Bach & Le...",racist superman rudy mancuso king bach lele pons,racist superman rudy mancuso king bach lele pons,Comedy
3,Nickelback Lyrics: Real or Fake?,nickelback lyrics real or fake,nickelback lyrics real fake,Entertainment
4,I Dare You: GOING BALD!?,i dare you going bald,dare going bald,Entertainment


In [27]:
df_x = df_titles_info['filter_title']
df_y = df_titles_info['category_id']

df_titles_info.to_csv('./output/US_count_vectorizer_dataset.csv', index=False)

target_names = list(df_titles_info['category_id'].unique())

In [28]:
x_train, x_test, y_train, y_test = train_test_split(df_x, df_y, test_size=0.2, random_state=117)

In [29]:
count_vectorizer = CountVectorizer()
train_count_vector = count_vectorizer.fit_transform(x_train)
test_count_vector = count_vectorizer.transform(x_test)
print(train_count_vector.shape)
# print(count_vectorizer.vocabulary_)
print(train_count_vector.toarray())

(32759, 10574)
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [30]:
tfidf_vectorizer = TfidfTransformer()
x_trained_tfidf_vector = tfidf_vectorizer.fit_transform(train_count_vector)
x_test_tfidf_vector = tfidf_vectorizer.transform(test_count_vector)
print(x_trained_tfidf_vector.shape)
# print(count_vectorizer.vocabulary_)
print(x_trained_tfidf_vector.toarray())

(32759, 10574)
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [31]:
clf_tfidf = MultinomialNB().fit(x_trained_tfidf_vector, y_train)

In [32]:
pred_tfidf = clf_tfidf.predict(x_test_tfidf_vector)
print("Accuracy: ", accuracy_score(y_test, pred_tfidf))
print(metrics.classification_report(y_test, pred_tfidf, target_names=target_names))

Accuracy:  0.860927960927961
                       precision    recall  f1-score   support

       People & Blogs       1.00      0.20      0.33        91
        Entertainment       0.92      0.77      0.84       664
               Comedy       0.98      0.79      0.88       329
 Science & Technology       0.72      0.97      0.82      2016
     Film & Animation       0.99      0.75      0.86       495
      News & Politics       1.00      0.59      0.74       149
               Sports       0.89      0.91      0.90       823
                Music       0.91      0.96      0.93      1245
       Pets & Animals       0.96      0.85      0.90       523
            Education       0.00      0.00      0.00        10
        Howto & Style       0.96      0.72      0.82       660
     Autos & Vehicles       0.99      0.77      0.87       186
      Travel & Events       0.91      0.84      0.87       461
               Gaming       1.00      0.57      0.73        14
Nonprofits & Activism    

  _warn_prf(average, modifier, msg_start, len(result))


In [33]:
clf_count = MultinomialNB().fit(train_count_vector, y_train)
pred_count = clf_count.predict(test_count_vector)

In [34]:
print("Accuracy: ", accuracy_score(y_test, pred_count))
print(metrics.classification_report(y_test, pred_count, target_names=target_names))

Accuracy:  0.8996336996336997
                       precision    recall  f1-score   support

       People & Blogs       0.93      0.75      0.83        91
        Entertainment       0.91      0.82      0.86       664
               Comedy       0.91      0.95      0.93       329
 Science & Technology       0.89      0.89      0.89      2016
     Film & Animation       0.93      0.88      0.90       495
      News & Politics       0.96      0.89      0.92       149
               Sports       0.85      0.92      0.89       823
                Music       0.92      0.96      0.94      1245
       Pets & Animals       0.92      0.93      0.92       523
            Education       1.00      0.20      0.33        10
        Howto & Style       0.91      0.83      0.87       660
     Autos & Vehicles       0.94      0.89      0.91       186
      Travel & Events       0.87      0.92      0.89       461
               Gaming       1.00      0.86      0.92        14
Nonprofits & Activism   