## Recommender System Notebook ##

The two most common types of recommender systems are Content-Based and Collaborative Filtering(CF)
> Collaborative filtering produces recommendations based on knowledge of user's attitude to items. Two sub categories:

    > Memory based
    > Model based

> Content based systems focus on the attributes of the items.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
#import tensorflow as tf
import seaborn as sns
%matplotlib inline
import re
!pip install nltk
import nltk
nltk.download('punkt')
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
from nltk.stem import PorterStemmer
ps = PorterStemmer()
from nltk.corpus import stopwords
from googletrans import Translator
trans = Translator()



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\meetd\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
country_list = ["AU", "BR", "CA", "FR", "DE", "IN", "IT", "MX", "RU", "GB", "US"]
data = pd.DataFrame()
column_names = ['Publish-Time','Title','Description','Video-URL','Thumbnail',
                'Channel-Name','Localized-Title','Localized-Description',
                'Upload-Status','Privacy','License','Embeddable-Status',
                'Public-Stats','Made-For-Kids','View-Count','Likes','Dislikes',
                'Embed-HTML']
for country in country_list:
    temp_data = pd.read_csv('C:/Users/meetd/OneDrive/Desktop/Tech-Mahindra-ML/dataset/Cleaned_Data/'+country+'_data.csv',
                      names=column_names)
    temp_data = temp_data.drop(0)
    temp_data['Country'] = country
    data = data.append(temp_data)

In [3]:
data.head()

Unnamed: 0,Publish-Time,Title,Description,Video-URL,Thumbnail,Channel-Name,Localized-Title,Localized-Description,Upload-Status,Privacy,License,Embeddable-Status,Public-Stats,Made-For-Kids,View-Count,Likes,Dislikes,Embed-HTML,Country
1,2021-07-21T18:00:06Z,I FILLED MY ISLAND HOUSE WITH PACKING PEANUTS!,CHECK OUT CHASECRAFT on iOS :: https://apps.ap...,https://i.ytimg.com/vi/Y3sj_v62dpc/default.jpg,https://i.ytimg.com/vi/Y3sj_v62dpc/mqdefault.jpg,Unspeakable,I FILLED MY ISLAND HOUSE WITH PACKING PEANUTS!,CHECK OUT CHASECRAFT on iOS :: https://apps.ap...,processed,public,youtube,True,False,False,3945994,101089,3084,"<iframe width=""480"" height=""270"" src=""//www.yo...",AU
2,2021-07-21T21:41:14Z,"Best Sand Art Wins $5,000 Challenge! | ZHC Crafts",I can't believe we made art out of sand\nSubsc...,https://i.ytimg.com/vi/aVxHSHzm4kE/default.jpg,https://i.ytimg.com/vi/aVxHSHzm4kE/mqdefault.jpg,ZHC Crafts,"Best Sand Art Wins $5,000 Challenge! | ZHC Crafts",I can't believe we made art out of sand\nSubsc...,processed,public,youtube,True,True,False,2204582,89100,1375,"<iframe width=""480"" height=""270"" src=""//www.yo...",AU
3,2021-07-22T13:00:04Z,Dune | Official Main Trailer,IT’S TIME. #DuneMovie coming October 22.\n\n--...,https://i.ytimg.com/vi/8g18jFHCLXk/default.jpg,https://i.ytimg.com/vi/8g18jFHCLXk/mqdefault.jpg,Warner Bros. Pictures,Dune | Official Main Trailer,IT’S TIME. #DuneMovie coming October 22.\n\n--...,processed,public,youtube,True,False,False,3725450,145277,2302,"<iframe width=""480"" height=""270"" src=""//www.yo...",AU
4,2021-07-22T17:44:43Z,Battlefield 2042 | Battlefield Portal Official...,"Play Battlefield™ 2042 starting October 22, 20...",https://i.ytimg.com/vi/q4qWMcQfOCc/default.jpg,https://i.ytimg.com/vi/q4qWMcQfOCc/mqdefault.jpg,Battlefield,Battlefield 2042 | Battlefield Portal Official...,"Play Battlefield™ 2042 starting October 22, 20...",processed,public,youtube,True,False,False,1443476,154428,1905,"<iframe width=""480"" height=""270"" src=""//www.yo...",AU
5,2021-07-21T05:30:14Z,2nd ODI Highlights | Sri Lanka vs India 2021,2nd ODI Highlights | Sri Lanka vs India 2021\n...,https://i.ytimg.com/vi/8J7BoMdU-qw/default.jpg,https://i.ytimg.com/vi/8J7BoMdU-qw/mqdefault.jpg,Sri Lanka Cricket,2nd ODI Highlights | Sri Lanka vs India 2021,2nd ODI Highlights | Sri Lanka vs India 2021\n...,processed,public,youtube,True,True,False,12056482,296859,9993,"<iframe width=""480"" height=""270"" src=""//www.yo...",AU


In [4]:
data = data.drop(['Publish-Time','Video-URL','Thumbnail','Localized-Title','Localized-Description',
                'Upload-Status','Privacy','License','Embeddable-Status',
                'Public-Stats','Made-For-Kids','Embed-HTML'] , axis=1)
data.head()

Unnamed: 0,Title,Description,Channel-Name,View-Count,Likes,Dislikes,Country
1,I FILLED MY ISLAND HOUSE WITH PACKING PEANUTS!,CHECK OUT CHASECRAFT on iOS :: https://apps.ap...,Unspeakable,3945994,101089,3084,AU
2,"Best Sand Art Wins $5,000 Challenge! | ZHC Crafts",I can't believe we made art out of sand\nSubsc...,ZHC Crafts,2204582,89100,1375,AU
3,Dune | Official Main Trailer,IT’S TIME. #DuneMovie coming October 22.\n\n--...,Warner Bros. Pictures,3725450,145277,2302,AU
4,Battlefield 2042 | Battlefield Portal Official...,"Play Battlefield™ 2042 starting October 22, 20...",Battlefield,1443476,154428,1905,AU
5,2nd ODI Highlights | Sri Lanka vs India 2021,2nd ODI Highlights | Sri Lanka vs India 2021\n...,Sri Lanka Cricket,12056482,296859,9993,AU


## Multinomial Naive Bayes

In [5]:
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn import naive_bayes
from sklearn.metrics import roc_auc_score,accuracy_score
import pickle

In [6]:
stop_words = set(stopwords.words('english'))
vectorizer = TfidfVectorizer(use_idf = True, lowercase=True, strip_accents='ascii', stop_words=stop_words)

In [7]:
X_find = vectorizer.fit_transform(data['Title'])
Y_find = data['View-Count']

In [8]:
pickle.dump(vectorizer,open('transform.pkl','wb'))

In [9]:
X_train, X_test, Y_train, Y_test = train_test_split(X_find, Y_find, test_size=0.1, random_state=42)

In [10]:
clf = naive_bayes.MultinomialNB()
clf.fit(X_train,Y_train)

MultinomialNB()

In [11]:
print(X_train,X_test,Y_train,Y_test)

  (0, 1866)	0.33667852809806487
  (0, 458)	0.1896492578500298
  (0, 392)	0.33667852809806487
  (0, 651)	0.33667852809806487
  (0, 98)	0.3160530709078878
  (0, 123)	0.33667852809806487
  (0, 1125)	0.3160530709078878
  (0, 389)	0.29006806794723783
  (0, 580)	0.33667852809806487
  (0, 667)	0.33667852809806487
  (1, 1664)	0.3652039533510452
  (1, 897)	0.3652039533510452
  (1, 1688)	0.3652039533510452
  (1, 840)	0.3652039533510452
  (1, 1608)	0.31540024720183124
  (1, 1734)	0.3652039533510452
  (1, 1941)	0.34829415802329844
  (1, 104)	0.33517790177113904
  (2, 1513)	0.29367478039052897
  (2, 1528)	0.29367478039052897
  (2, 541)	0.29367478039052897
  (2, 51)	0.29367478039052897
  (2, 1014)	0.29367478039052897
  (2, 466)	0.29367478039052897
  (2, 58)	0.29367478039052897
  :	:
  (489, 945)	0.46140775021652475
  (489, 1695)	0.47466318118548023
  (489, 1792)	0.33344786897019624
  (490, 132)	0.47155714295074797
  (490, 1338)	0.47155714295074797
  (490, 531)	0.42217221032597585
  (490, 1872)	0.471

In [12]:
accuracy_score(Y_test,clf.predict(X_test))*100

21.818181818181817

In [13]:
filename = 'nlp_model.pkl'
pickle.dump(clf,open(filename,'wb'))