In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer


In [2]:
df_titles_info = pd.read_csv('./output/US_count_vectorizer_dataset.csv')
df_titles_info.head()

Unnamed: 0,title,filter_title,filter_title_no_stops,category_id
0,WE WANT TO TALK ABOUT OUR MARRIAGE,we want to talk about our marriage,want talk marriage,People & Blogs
1,The Trump Presidency: Last Week Tonight with J...,the trump presidency last week tonight with jo...,trump presidency last week tonight john oliver...,Entertainment
2,"Racist Superman | Rudy Mancuso, King Bach & Le...",racist superman rudy mancuso king bach lele pons,racist superman rudy mancuso king bach lele pons,Comedy
3,Nickelback Lyrics: Real or Fake?,nickelback lyrics real or fake,nickelback lyrics real fake,Entertainment
4,I Dare You: GOING BALD!?,i dare you going bald,dare going bald,Entertainment


In [3]:
df_x = df_titles_info['filter_title']
df_y = df_titles_info['category_id']

target_names = list(df_titles_info['category_id'].unique())

In [4]:
x_train, x_test, y_train, y_test = train_test_split(df_x, df_y, test_size=0.2, random_state=117)

In [5]:
count_vectorizer = CountVectorizer()
train_count_vector = count_vectorizer.fit_transform(x_train)
test_count_vector = count_vectorizer.transform(x_test)

tfidf_vectorizer = TfidfTransformer()
x_trained_tfidf_vector = tfidf_vectorizer.fit_transform(train_count_vector)
x_test_tfidf_vector = tfidf_vectorizer.transform(test_count_vector)

In [8]:
svm_tfidf = SVC(kernel='linear', gamma='scale').fit(x_trained_tfidf_vector, y_train)
svm_pred_tfidf = svm_tfidf.predict(x_test_tfidf_vector)

print("Accuracy: ", accuracy_score(y_test, svm_pred_tfidf))
print(metrics.classification_report(y_test, svm_pred_tfidf, target_names=target_names))

Accuracy:  0.9807081807081807
                       precision    recall  f1-score   support

       People & Blogs       0.98      0.95      0.96        91
        Entertainment       0.98      0.96      0.97       664
               Comedy       0.98      0.99      0.99       329
 Science & Technology       0.96      0.99      0.97      2016
     Film & Animation       1.00      0.98      0.99       495
      News & Politics       1.00      0.99      0.99       149
               Sports       0.99      0.99      0.99       823
                Music       0.99      0.99      0.99      1245
       Pets & Animals       0.99      0.98      0.98       523
            Education       1.00      1.00      1.00        10
        Howto & Style       0.98      0.96      0.97       660
     Autos & Vehicles       0.98      0.99      0.99       186
      Travel & Events       0.97      0.98      0.97       461
               Gaming       1.00      1.00      1.00        14
Nonprofits & Activism   

In [9]:
svm_count = SVC(kernel='linear', gamma='scale').fit(train_count_vector, y_train)
svm_pred_count = svm_count.predict(test_count_vector)

print("Accuracy: ", accuracy_score(y_test, svm_pred_count))
print(metrics.classification_report(y_test, svm_pred_count, target_names=target_names))

Accuracy:  0.9887667887667888
                       precision    recall  f1-score   support

       People & Blogs       0.99      0.95      0.97        91
        Entertainment       0.98      0.98      0.98       664
               Comedy       0.98      1.00      0.99       329
 Science & Technology       0.99      0.99      0.99      2016
     Film & Animation       0.99      1.00      0.99       495
      News & Politics       1.00      1.00      1.00       149
               Sports       1.00      0.99      1.00       823
                Music       1.00      1.00      1.00      1245
       Pets & Animals       0.99      0.99      0.99       523
            Education       1.00      1.00      1.00        10
        Howto & Style       0.99      0.97      0.98       660
     Autos & Vehicles       0.98      0.99      0.99       186
      Travel & Events       0.98      0.98      0.98       461
               Gaming       1.00      1.00      1.00        14
Nonprofits & Activism   