# Part I: Data Gathering and Preprocessing

### Importing Libraries

In [1]:
import numpy as np
import pandas as pd
import collections
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer

### Importing scikit-learn classifiers

In [2]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

### Importing Data

In [3]:
# video = pd.read_csv("C:/Users/manud/Documents/Youtube-Video-Analysis-Classification-and-Prediction-master/Dataset/USvideos.csv", header=0)
# video.head(5)

### Creating a DataFrame for the Dictionary

In [4]:
category_json = pd.read_json("C:/Users/manud/Documents/Youtube-Video-Analysis-Classification-and-Prediction-master/Dataset/category_id.JSON")
category_json.head(5)

Unnamed: 0,kind,etag,items
0,youtube#videoCategoryListResponse,"""m2yskBQFythfE4irbTIeOgYYfBU/S730Ilt-Fi-emsQJv...","{'kind': 'youtube#videoCategory', 'etag': '""m2..."
1,youtube#videoCategoryListResponse,"""m2yskBQFythfE4irbTIeOgYYfBU/S730Ilt-Fi-emsQJv...","{'kind': 'youtube#videoCategory', 'etag': '""m2..."
2,youtube#videoCategoryListResponse,"""m2yskBQFythfE4irbTIeOgYYfBU/S730Ilt-Fi-emsQJv...","{'kind': 'youtube#videoCategory', 'etag': '""m2..."
3,youtube#videoCategoryListResponse,"""m2yskBQFythfE4irbTIeOgYYfBU/S730Ilt-Fi-emsQJv...","{'kind': 'youtube#videoCategory', 'etag': '""m2..."
4,youtube#videoCategoryListResponse,"""m2yskBQFythfE4irbTIeOgYYfBU/S730Ilt-Fi-emsQJv...","{'kind': 'youtube#videoCategory', 'etag': '""m2..."


In [5]:
category_json.columns

Index(['kind', 'etag', 'items'], dtype='object')

In [6]:
category_dict = [{'id': item['id'], 'title': item['snippet']['title']} for item in category_json['items']]
category_dict

[{'id': '1', 'title': 'Film & Animation'},
 {'id': '2', 'title': 'Autos & Vehicles'},
 {'id': '10', 'title': 'Music'},
 {'id': '15', 'title': 'Pets & Animals'},
 {'id': '17', 'title': 'Sports'},
 {'id': '18', 'title': 'Short Movies'},
 {'id': '19', 'title': 'Travel & Events'},
 {'id': '20', 'title': 'Gaming'},
 {'id': '21', 'title': 'Videoblogging'},
 {'id': '22', 'title': 'People & Blogs'},
 {'id': '23', 'title': 'Comedy'},
 {'id': '24', 'title': 'Entertainment'},
 {'id': '25', 'title': 'News & Politics'},
 {'id': '26', 'title': 'Howto & Style'},
 {'id': '27', 'title': 'Education'},
 {'id': '28', 'title': 'Science & Technology'},
 {'id': '29', 'title': 'Nonprofits & Activism'},
 {'id': '30', 'title': 'Movies'},
 {'id': '31', 'title': 'Anime/Animation'},
 {'id': '32', 'title': 'Action/Adventure'},
 {'id': '33', 'title': 'Classics'},
 {'id': '34', 'title': 'Comedy'},
 {'id': '35', 'title': 'Documentary'},
 {'id': '36', 'title': 'Drama'},
 {'id': '37', 'title': 'Family'},
 {'id': '38', '

In [7]:
categories = pd.read_csv("C:/Users/manud/Documents/Youtube-Video-Analysis-Classification-and-Prediction-master/Dataset/categories.csv", header=0)
categories.head(20)


Unnamed: 0,Category_ID,Category
0,1,Non-Educational (Film & Animation)
1,2,Non-Educational(Autos & Vehicles)
2,10,Non-Educational (Music)
3,15,Non-Educational (Pets & Animals)
4,17,Non-Educational (Sports)
5,18,Non-Educational (Short Movies)
6,19,Non-Educational (Travel & Events)
7,20,Non-Educational (Gaming)
8,21,Non-Educational (Videoblogging)
9,22,Non-Educational (People & Blogs)


In [8]:
categories['Outcome'] = categories['Category'].str.contains("Non-Educational")

In [9]:
categories['Outcome'].replace({True:0,False:1}, inplace=True)

In [10]:
categories['Outcome'].value_counts()

0    31
1     1
Name: Outcome, dtype: int64

In [11]:
categories

Unnamed: 0,Category_ID,Category,Outcome
0,1,Non-Educational (Film & Animation),0
1,2,Non-Educational(Autos & Vehicles),0
2,10,Non-Educational (Music),0
3,15,Non-Educational (Pets & Animals),0
4,17,Non-Educational (Sports),0
5,18,Non-Educational (Short Movies),0
6,19,Non-Educational (Travel & Events),0
7,20,Non-Educational (Gaming),0
8,21,Non-Educational (Videoblogging),0
9,22,Non-Educational (People & Blogs),0


### Deleting unused columns and renaming the remaining columns

In [12]:
# new_columns = ['category_id','title','description','tags']
# new_video = video[new_columns]
# new_video.to_csv("C:/Users/manud/Documents/Youtube-Video-Analysis-Classification-and-Prediction-master/Dataset/new_Usvideos.csv", index=False)
# new_video = pd.read_csv("C:/Users/manud/Documents/Youtube-Video-Analysis-Classification-and-Prediction-master/Dataset/new_Usvideos.csv", header=0,
#             names=['Category_ID','Title','Description','Tags'])
# new_video.head(5)

new_video = pd.read_csv("C:/Users/manud/Documents/Youtube-Video-Analysis-Classification-and-Prediction-master/Dataset/mydata.csv", header=0)
new_video.head(5)

Unnamed: 0,Category_ID,Title
0,27,Learning Shapes And Bubbles With Blippi | Educ...
1,24,Five Little Princesses | Little Angel And Frie...
2,27,Blippi&#39;s NEW Halloween Music Video! | Blip...
3,24,Diana and Roma Logic Games and Activities / Co...
4,27,Learn About Boats And Other Fun Vehicles With ...


In [13]:
new_video['Outcome'] = (new_video['Category_ID'] == 27) | (new_video['Category_ID'] == 28)

In [14]:
new_video['Outcome'].replace({True:1, False:0}, inplace=True)

In [15]:
new_video['Outcome'].value_counts()

0    2384
1    1834
Name: Outcome, dtype: int64

In [16]:
new_video

Unnamed: 0,Category_ID,Title,Outcome
0,27,Learning Shapes And Bubbles With Blippi | Educ...,1
1,24,Five Little Princesses | Little Angel And Frie...,0
2,27,Blippi&#39;s NEW Halloween Music Video! | Blip...,1
3,24,Diana and Roma Logic Games and Activities / Co...,0
4,27,Learn About Boats And Other Fun Vehicles With ...,1
...,...,...,...
4213,22,கண்கள் கதறும் FEEL GOOD படம் |TVO|Tamil Voice ...,0
4214,24,Thalapathy Vijay Tamil Super Hit Samantha Inte...,0
4215,1,இயேசுவின் வாழ்க்கை | Tamil | Official Full HD ...,0
4216,22,நண்பனின் குடும்பத்தை தேடி சாகச பயணம்|TVO|Tamil...,0


In [17]:
new_video.loc[2316,'Title']

'What If the Yellowstone Volcano Erupted Tomorrow?what if what happens if scifi science documentary what if scenario mysteries what if humanity what if earth yellowstone what if yellowstone explodes yellowstone volcano eruption yellowstone eruption simulation supervolcano yellowstone supervolcano supervolcano eruption how dangerous is yellowstone volcanoes earth volcanoes documentary geology volcanologist yellowstone national park super-eruptions biggest volcano eruptions'

# Part II: Training

### Splitting 'title' into string of words using CountVectorizer

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer
vector = TfidfVectorizer(min_df=1)
counts = vector.fit_transform(new_video['Title'].values)


In [19]:
vector.get_feature_names_out()

array(['000', '0001', '000fps', ..., '화사', '휘인', '𝙊𝙎𝙎𝘾ossc'], dtype=object)

### Using various classification models and targetting 'Category'

In [20]:
NB_Model = MultinomialNB()
RFC_Model = RandomForestClassifier()
SVC_Model = SVC()
KNC_Model = KNeighborsClassifier()
DTC_Model = DecisionTreeClassifier()

In [21]:
output = new_video['Category_ID'].values

In [22]:
NB_Model.fit(counts,output)

In [23]:
RFC_Model.fit(counts,output)

In [24]:
SVC_Model.fit(counts,output)

In [25]:
KNC_Model.fit(counts,output)

In [26]:
DTC_Model.fit(counts,output)

### Checking the accuracy using 80/20 train/test split

In [27]:
X = counts
Y = new_video['Outcome']
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2)

In [28]:
NBtest = MultinomialNB().fit(X_train,Y_train)
nb_predictions = NBtest.predict(X_test)
acc_nb = NBtest.score(X_test, Y_test)
print('The Naive Bayes Algorithm has an accuracy of', acc_nb)

The Naive Bayes Algorithm has an accuracy of 0.8957345971563981


In [29]:
RFCtest = RandomForestClassifier().fit(X_train,Y_train)
rfc_predictions = RFCtest.predict(X_test)
acc_rfc = RFCtest.score(X_test, Y_test)
print('The Random Forest Algorithm has an accuracy of', acc_rfc)

The Random Forest Algorithm has an accuracy of 0.9170616113744076


In [30]:
SVCtest = SVC().fit(X_train,Y_train)
svc_predictions = SVCtest.predict(X_test)
acc_svc = SVCtest.score(X_test, Y_test)
print('The Support Vector Algorithm has an accuracy of', acc_svc)

The Support Vector Algorithm has an accuracy of 0.8850710900473934


In [31]:
KNCtest = KNeighborsClassifier().fit(X_train,Y_train)
knc_predictions = KNCtest.predict(X_test)
acc_knc = KNCtest.score(X_test, Y_test)
print('The K Neighbors Algorithm has an accuracy of', acc_knc)

The K Neighbors Algorithm has an accuracy of 0.8613744075829384


In [32]:
DTCtest = DecisionTreeClassifier().fit(X_train,Y_train)
dtc_predictions = DTCtest.predict(X_test)
acc_dtc = DTCtest.score(X_test, Y_test)
print('The Decision Tree Algorithm has an accuracy of', acc_dtc)

The Decision Tree Algorithm has an accuracy of 0.9111374407582938


In [33]:
xgb_clf = XGBClassifier()
xgb_clf.fit(X_train, Y_train)
print('The XGBoost has an accuracy of', xgb_clf.score(X_test, Y_test))

The XGBoost has an accuracy of 0.9123222748815166


# Part III: Test

### Entering hypothetical titles to predict the category

In [34]:
Titles = ["The Riddle That Seems Impossible Even If You Know The Answer"]

### Inserting above titles into each classifier model

In [35]:
Titles_counts = vector.transform(Titles)

### Naive Bayes Model

In [36]:
PredictNB = NB_Model.predict(Titles_counts)
PredictNB

array([28], dtype=int64)

### Random Forest Model

In [37]:
PredictRFC = RFC_Model.predict(Titles_counts)
PredictRFC

array([27], dtype=int64)

### SVC Model

In [38]:
PredictSVC = SVC_Model.predict(Titles_counts)
PredictSVC

array([28], dtype=int64)

### K Neighbors Model

In [39]:
PredictKNC = KNC_Model.predict(Titles_counts)
PredictKNC

array([1], dtype=int64)

### Decision Tree Model

In [40]:
PredictDTC = DTC_Model.predict(Titles_counts)
PredictDTC

array([22], dtype=int64)

### Output will be an array of numbers. Iterate through the Category Dictionary (from JSON file) to find "title"

In [41]:
CategoryNamesListNB = []
for Category_ID in PredictNB:
    MatchingCategoriesNB = [x for x in category_dict if x["id"] == str(Category_ID)]
    if MatchingCategoriesNB:
        CategoryNamesListNB.append(MatchingCategoriesNB[0]["title"])

In [42]:
CategoryNamesListRFC = []
for Category_ID in PredictRFC:
    MatchingCategoriesRFC = [x for x in category_dict if x["id"] == str(Category_ID)]
    if MatchingCategoriesRFC:
        CategoryNamesListRFC.append(MatchingCategoriesRFC[0]["title"])

In [43]:
CategoryNamesListSVC = []
for Category_ID in PredictSVC:
    MatchingCategoriesSVC = [x for x in category_dict if x["id"] == str(Category_ID)]
    if MatchingCategoriesSVC:
        CategoryNamesListSVC.append(MatchingCategoriesSVC[0]["title"])

In [44]:
CategoryNamesListKNC = []
for Category_ID in PredictKNC:
    MatchingCategoriesKNC = [x for x in category_dict if x["id"] == str(Category_ID)]
    if MatchingCategoriesKNC:
        CategoryNamesListKNC.append(MatchingCategoriesKNC[0]["title"])

In [45]:
CategoryNamesListDTC = []
for Category_ID in PredictDTC:
    MatchingCategoriesDTC = [x for x in category_dict if x["id"] == str(Category_ID)]
    if MatchingCategoriesDTC:
        CategoryNamesListDTC.append(MatchingCategoriesDTC[0]["title"])

### Mapping these values to the Titles we want to Predict

In [46]:
TitleDataFrameNB = []
for i in range(0, len(Titles)):
    TitleToCategoriesNB = {'Title': Titles[i],  'Category': CategoryNamesListNB[i]}
    TitleDataFrameNB.append(TitleToCategoriesNB)

In [47]:
TitleDataFrameRFC = []
for i in range(0, len(Titles)):
    TitleToCategoriesRFC = {'Title': Titles[i],  'Category': CategoryNamesListRFC[i]}
    TitleDataFrameRFC.append(TitleToCategoriesRFC)

In [48]:
TitleDataFrameSVC = []
for i in range(0, len(Titles)):
    TitleToCategoriesSVC = {'Title': Titles[i],  'Category': CategoryNamesListSVC[i]}
    TitleDataFrameSVC.append(TitleToCategoriesSVC)

In [49]:
TitleDataFrameKNC = []
for i in range(0, len(Titles)):
    TitleToCategoriesKNC = {'Title': Titles[i],  'Category': CategoryNamesListKNC[i]}
    TitleDataFrameKNC.append(TitleToCategoriesKNC)

In [50]:
TitleDataFrameDTC = []
for i in range(0, len(Titles)):
    TitleToCategoriesDTC = {'Title': Titles[i],  'Category': CategoryNamesListDTC[i]}
    TitleDataFrameDTC.append(TitleToCategoriesDTC)

### Converting the resulting Dictionary to a Data Frame

In [51]:
PredictDFnb = pd.DataFrame(PredictNB)
TitleDFnb = pd.DataFrame(TitleDataFrameNB)
PreFinalDFnb = pd.concat([PredictDFnb, TitleDFnb], axis=1)
PreFinalDFnb.columns = (['Categ_ID', 'Predicted Category', 'Hypothetical Video Title'])
FinalDFnb = PreFinalDFnb.drop(['Categ_ID'],axis=1)
colsNB = FinalDFnb.columns.tolist()
colsNB = colsNB[-1:] + colsNB[:-1]
FinalDFnb= FinalDFnb[colsNB]

In [52]:
PredictDFrfc = pd.DataFrame(PredictRFC)
TitleDFrfc = pd.DataFrame(TitleDataFrameRFC)
PreFinalDFrfc = pd.concat([PredictDFrfc, TitleDFrfc], axis=1)
PreFinalDFrfc.columns = (['Categ_ID', 'Predicted Category', 'Hypothetical Video Title'])
FinalDFrfc = PreFinalDFrfc.drop(['Categ_ID'],axis=1)
colsRFC = FinalDFrfc.columns.tolist()
colsRFC = colsRFC[-1:] + colsRFC[:-1]
FinalDFrfc= FinalDFrfc[colsRFC]

In [53]:
PredictDFsvc = pd.DataFrame(PredictSVC)
TitleDFsvc = pd.DataFrame(TitleDataFrameSVC)
PreFinalDFsvc = pd.concat([PredictDFsvc, TitleDFsvc], axis=1)
PreFinalDFsvc.columns = (['Categ_ID', 'Predicted Category', 'Hypothetical Video Title'])
FinalDFsvc = PreFinalDFsvc.drop(['Categ_ID'],axis=1)
colsSVC = FinalDFsvc.columns.tolist()
colsSVC = colsSVC[-1:] + colsSVC[:-1]
FinalDFsvc= FinalDFsvc[colsSVC]

In [54]:
PredictDFknc = pd.DataFrame(PredictKNC)
TitleDFknc = pd.DataFrame(TitleDataFrameKNC)
PreFinalDFknc = pd.concat([PredictDFknc, TitleDFknc], axis=1)
PreFinalDFknc.columns = (['Categ_ID', 'Predicted Category', 'Hypothetical Video Title'])
FinalDFknc = PreFinalDFknc.drop(['Categ_ID'],axis=1)
colsKNC = FinalDFknc.columns.tolist()
colsKNC = colsKNC[-1:] + colsKNC[:-1]
FinalDFknc= FinalDFknc[colsKNC]

In [55]:
PredictDFdtc = pd.DataFrame(PredictDTC)
TitleDFdtc = pd.DataFrame(TitleDataFrameDTC)
PreFinalDFdtc = pd.concat([PredictDFdtc, TitleDFdtc], axis=1)
PreFinalDFdtc.columns = (['Categ_ID', 'Predicted Category', 'Hypothetical Video Title'])
FinalDFdtc = PreFinalDFdtc.drop(['Categ_ID'],axis=1)
colsDTC = FinalDFdtc.columns.tolist()
colsDTC = colsDTC[-1:] + colsDTC[:-1]
FinalDFdtc= FinalDFdtc[colsDTC]

### Viewing the Final Prediction Results

# Demo

In [56]:
Titles = ["LIFE IS TOUGH (Official Video) - Guru Mann Hindi Song | Cinematic Bollywood || Rubbal GTR"
         ,"The Riddle That Seems Impossible Even If You Know The Answer","How Electricity Actually Works",
         "Future Computers Will Be Radically Different (Analog Computing)","The Man Who Accidentally Killed The Most People In History",
         "Why can't you go faster than light?","One of the most counterintuitive facts of our universe is that you can’t go faster than the speed of light.  From this single observation arise all of the mind-bending behaviors of special relativity.  But why is this so?  In this in-depth video, Fermilab’s Dr. Don Lincoln explains the real reason that you can’t go faster than the speed of light.  It will blow your mind"]

In [57]:
Titles_counts = vector.transform(Titles)
PredictDTC = DTC_Model.predict(Titles_counts)

CategoryNamesListDTC = []
for Category_ID in PredictDTC:
    MatchingCategoriesDTC = [x for x in category_dict if x["id"] == str(Category_ID)]
    if MatchingCategoriesDTC:
        CategoryNamesListDTC.append(MatchingCategoriesDTC[0]["title"])

TitleDataFrameDTC = []
for i in range(0, len(Titles)):
    TitleToCategoriesDTC = {'Title': Titles[i],  'Category': CategoryNamesListDTC[i]}
    TitleDataFrameDTC.append(TitleToCategoriesDTC)
    
PredictDFdtc = pd.DataFrame(PredictDTC)
TitleDFdtc = pd.DataFrame(TitleDataFrameDTC)
PreFinalDFdtc = pd.concat([PredictDFdtc, TitleDFdtc], axis=1)
PreFinalDFdtc.columns = (['Categ_ID', 'Predicted Category', 'Hypothetical Video Title'])
FinalDFdtc = PreFinalDFdtc.drop(['Categ_ID'],axis=1)
colsDTC = FinalDFdtc.columns.tolist()
colsDTC = colsDTC[-1:] + colsDTC[:-1]
FinalDFdtc= FinalDFdtc[colsDTC]

# Decision Trees
FinalDFdtc

Unnamed: 0,Hypothetical Video Title,Predicted Category
0,Music,LIFE IS TOUGH (Official Video) - Guru Mann Hin...
1,People & Blogs,The Riddle That Seems Impossible Even If You K...
2,People & Blogs,How Electricity Actually Works
3,Education,Future Computers Will Be Radically Different (...
4,Science & Technology,The Man Who Accidentally Killed The Most Peopl...
5,Gaming,Why can't you go faster than light?
6,Education,One of the most counterintuitive facts of our ...


In [58]:
import pickle

In [59]:
vec_file = 'vectorizer.pickle'
pickle.dump(vector, open(vec_file, 'wb'))

In [60]:
pickle.dump(xgb_clf, open('premodel.model', 'wb'))

In [63]:
vectorizer = pickle.load(open('C:/Users/manud/Documents/MyYtServer/models/vectorizer.pickle', 'rb'))
model = pickle.load(open('C:/Users/manud/Documents/MyYtServer/models/premodel.model', 'rb'))
# Title = ["Why jump cat dog you"]

Titles = ["Why this kolaveri di  ","LIFE IS TOUGH (Official Video) - Guru Mann Hindi Song | Cinematic Bollywood || Rubbal GTR"
         ,"The Riddle That Seems Impossible Even If You Know The Answer","How Electricity Actually Works",
         "Future Computers Will Be Radically Different (Analog Computing)","The Man Who Accidentally Killed The Most People In History",
         "Why can't you go faster than light?","One of the most counterintuitive facts of our universe is that you can’t go faster than the speed of light.  From this single observation arise all of the mind-bending behaviors of special relativity.  But why is this so?  In this in-depth video, Fermilab’s Dr. Don Lincoln explains the real reason that you can’t go faster than the speed of light.  It will blow your mind"]
print(model.predict(vectorizer.transform(Titles)))

[0 0 0 0 0 1 0 0]
