In [31]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn import metrics
from sklearn.metrics import accuracy_score


import pandas as pd

import string
from nltk.corpus import stopwords


In [2]:
df_dataset = pd.read_csv('./output/US_cleaned_data_2018')
df_dataset.tail()

Unnamed: 0,video_id,trending_date,title,channel_title,category_id,publish_time,tags,views,likes,dislikes,comment_count,video_error_or_removed,description
41410,BZt0qjTWNhw,18.06.14,The Cat Who Caught the Laser,AaronsAnimals,Pets & Animals,18.05.18,"['aarons animals', '""aarons""', '""animals""', '""...",1685609.0,38160.0,1385.0,2657.0,False,The Cat Who Caught the Laser - Aaron's Animals
41411,1h7KV2sjUWY,18.06.14,True Facts : Ant Mutualism,zefrank1,People & Blogs,18.05.18,['[none]'],1064798.0,60008.0,382.0,3936.0,False,
41412,D6Oy4LfoqsU,18.06.14,I GAVE SAFIYA NYGAARD A PERFECT HAIR MAKEOVER ...,Brad Mondo,Entertainment,18.05.18,['I gave safiya nygaard a perfect hair makeove...,1066451.0,48068.0,1032.0,3992.0,False,I had so much fun transforming Safiyas hair in...
41413,oV0zkMe1K8s,18.06.14,How Black Panther Should Have Ended,How It Should Have Ended,Film & Animation,18.05.17,"['Black Panther', '""HISHE""', '""Marvel""', '""Inf...",5660813.0,192957.0,2846.0,13088.0,False,How Black Panther Should Have EndedWatch More ...
41414,ooyjaVdt-jA,18.06.14,Official Call of Duty®: Black Ops 4 — Multipla...,Call of Duty,Gaming,18.05.17,"['call of duty', '""cod""', '""activision""', '""Bl...",10306119.0,357079.0,212976.0,144795.0,False,Call of Duty: Black Ops 4 Multiplayer raises t...


In [3]:
df_titles_info = pd.DataFrame(df_dataset['title'])
df_titles_info['filter_title'] = df_titles_info['title']
df_titles_info['category_id'] = df_dataset['category_id']

df_titles_info = df_titles_info.dropna()
df_titles_info

Unnamed: 0,title,filter_title,category_id
0,WE WANT TO TALK ABOUT OUR MARRIAGE,WE WANT TO TALK ABOUT OUR MARRIAGE,People & Blogs
1,The Trump Presidency: Last Week Tonight with J...,The Trump Presidency: Last Week Tonight with J...,Entertainment
2,"Racist Superman | Rudy Mancuso, King Bach & Le...","Racist Superman | Rudy Mancuso, King Bach & Le...",Comedy
3,Nickelback Lyrics: Real or Fake?,Nickelback Lyrics: Real or Fake?,Entertainment
4,I Dare You: GOING BALD!?,I Dare You: GOING BALD!?,Entertainment
...,...,...,...
41410,The Cat Who Caught the Laser,The Cat Who Caught the Laser,Pets & Animals
41411,True Facts : Ant Mutualism,True Facts : Ant Mutualism,People & Blogs
41412,I GAVE SAFIYA NYGAARD A PERFECT HAIR MAKEOVER ...,I GAVE SAFIYA NYGAARD A PERFECT HAIR MAKEOVER ...,Entertainment
41413,How Black Panther Should Have Ended,How Black Panther Should Have Ended,Film & Animation


In [4]:
# https://adataanalyst.com/scikit-learn/countvectorizer-sklearn-example/
def text_process(mess):
    """
    Takes in a string of text, then performs the following:
    1. Remove all punctuation
    3. Return String
    """
    # Check characters to see if they are in punctuation
    nopunc = [char for char in mess if char not in string.punctuation]

    # Join the characters again to form the string.
    nopunc = ''.join(nopunc)
    
    # Now just remove any stopwords
    list_words = nopunc.split()
    
    sentence = ' '.join(list_words)
    
    return sentence.lower()


In [38]:
df_titles_info['filter_title'] = df_titles_info['title'].apply(text_process)
df_x = df_titles_info['filter_title']
df_y = df_titles_info['category_id']
target_names = list(df_titles_info['category_id'].unique())

In [40]:
x_train, x_test, y_train, y_test = train_test_split(df_x, df_y, test_size=0.2, random_state=117)

In [49]:
count_vectorizer = CountVectorizer()
train_count_vector = count_vectorizer.fit_transform(x_train)
test_count_vector = count_vectorizer.transform(x_test)
print(count_vector.shape)
# print(count_vectorizer.vocabulary_)
print(count_vector.toarray())

(32759, 10605)
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [42]:
tfidf_vectorizer = TfidfTransformer()
x_trained_tfidf_vector = tfidf_vectorizer.fit_transform(train_count_vector)
x_test_tfidf_vector = tfidf_vectorizer.transform(test_count_vector)
print(x_trained_tfidf_vector.shape)
# print(count_vectorizer.vocabulary_)
print(x_trained_tfidf_vector.toarray())

(32759, 10574)
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [46]:
clf_tfidf = MultinomialNB().fit(x_trained_tfidf_vector, y_train)

In [48]:
pred_tfidf = clf_tfidf.predict(x_test_tfidf_vector)
print("Accuracy: ", accuracy_score(y_test, pred_tfidf))
print(metrics.classification_report(y_test, pred_tfidf, target_names=target_names))

Accuracy:  0.860927960927961
                       precision    recall  f1-score   support

       People & Blogs       1.00      0.20      0.33        91
        Entertainment       0.92      0.77      0.84       664
               Comedy       0.98      0.79      0.88       329
 Science & Technology       0.72      0.97      0.82      2016
     Film & Animation       0.99      0.75      0.86       495
      News & Politics       1.00      0.59      0.74       149
               Sports       0.89      0.91      0.90       823
                Music       0.91      0.96      0.93      1245
       Pets & Animals       0.96      0.85      0.90       523
            Education       0.00      0.00      0.00        10
        Howto & Style       0.96      0.72      0.82       660
     Autos & Vehicles       0.99      0.77      0.87       186
      Travel & Events       0.91      0.84      0.87       461
               Gaming       1.00      0.57      0.73        14
Nonprofits & Activism    

  _warn_prf(average, modifier, msg_start, len(result))


In [50]:
clf_count = MultinomialNB().fit(train_count_vector, y_train)
pred_count = clf_count.predict(test_count_vector)

In [51]:
print("Accuracy: ", accuracy_score(y_test, pred_count))
print(metrics.classification_report(y_test, pred_count, target_names=target_names))

Accuracy:  0.8996336996336997
                       precision    recall  f1-score   support

       People & Blogs       0.93      0.75      0.83        91
        Entertainment       0.91      0.82      0.86       664
               Comedy       0.91      0.95      0.93       329
 Science & Technology       0.89      0.89      0.89      2016
     Film & Animation       0.93      0.88      0.90       495
      News & Politics       0.96      0.89      0.92       149
               Sports       0.85      0.92      0.89       823
                Music       0.92      0.96      0.94      1245
       Pets & Animals       0.92      0.93      0.92       523
            Education       1.00      0.20      0.33        10
        Howto & Style       0.91      0.83      0.87       660
     Autos & Vehicles       0.94      0.89      0.91       186
      Travel & Events       0.87      0.92      0.89       461
               Gaming       1.00      0.86      0.92        14
Nonprofits & Activism   