## YouTube Recomendation system

### Content-based filtering:
### is a recommendation system that makes recommendations to users based on their past interactions with content. The idea behind content-based filtering is to recommend content that is similar to what the user has already shown an interest in.

In [1]:
#pip install --upgrade pandas==1.3.4

In [2]:
import joblib
import pickle

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.corpus import stopwords
import re
import nltk
from surprise import Reader, Dataset, SVD, SVDpp, model_selection, NormalPredictor, KNNBasic, KNNWithMeans, KNNWithZScore, KNNBaseline, BaselineOnly, NMF, SlopeOne, CoClustering, accuracy
from surprise.accuracy import rmse
from surprise.model_selection import cross_validate, train_test_split

# Set up inline plotting
%matplotlib inline

# Silence warnings
import warnings
warnings.simplefilter('ignore')

# Uncomment if needed:
# nltk.download('stopwords')

In [4]:
pd.__version__

'1.3.4'

In [5]:
youtube= pd.read_csv("IN_youtube_trending_data.csv")

In [6]:
youtube=youtube[:30000]

In [7]:
youtube.head(2)

Unnamed: 0,video_id,title,publishedAt,channelId,channelTitle,categoryId,trending_date,tags,view_count,likes,dislikes,comment_count,thumbnail_link,comments_disabled,ratings_disabled,description
0,Iot0eF6EoNA,Sadak 2 | Official Trailer | Sanjay | Pooja | ...,2020-08-12T04:31:41Z,UCGqvJPRcv7aVFun-eTsatcA,FoxStarHindi,24,2020-08-12T00:00:00Z,sadak|sadak 2|mahesh bhatt|vishesh films|pooja...,9885899,224925,3979409,350210,https://i.ytimg.com/vi/Iot0eF6EoNA/default.jpg,False,False,Three Streams. Three Stories. One Journey. Sta...
1,x-KbnJ9fvJc,Kya Baat Aa : Karan Aujla (Official Video) Tan...,2020-08-11T09:00:11Z,UCm9SZAl03Rev9sFwloCdz1g,Rehaan Records,10,2020-08-12T00:00:00Z,[None],11308046,655450,33242,405146,https://i.ytimg.com/vi/x-KbnJ9fvJc/default.jpg,False,False,Singer/Lyrics: Karan Aujla Feat Tania Music/ D...


In [8]:
quartiles = youtube['likes'].quantile([0, 0.2, 0.4, 0.6, 0.8, 1.0])

In [9]:
# Create a new column for ratings based on the quartiles
youtube['Rating'] = pd.cut(youtube['likes'], bins=quartiles, labels=[0,1,2,3,4])

In [10]:
youtube.head(5)

Unnamed: 0,video_id,title,publishedAt,channelId,channelTitle,categoryId,trending_date,tags,view_count,likes,dislikes,comment_count,thumbnail_link,comments_disabled,ratings_disabled,description,Rating
0,Iot0eF6EoNA,Sadak 2 | Official Trailer | Sanjay | Pooja | ...,2020-08-12T04:31:41Z,UCGqvJPRcv7aVFun-eTsatcA,FoxStarHindi,24,2020-08-12T00:00:00Z,sadak|sadak 2|mahesh bhatt|vishesh films|pooja...,9885899,224925,3979409,350210,https://i.ytimg.com/vi/Iot0eF6EoNA/default.jpg,False,False,Three Streams. Three Stories. One Journey. Sta...,4
1,x-KbnJ9fvJc,Kya Baat Aa : Karan Aujla (Official Video) Tan...,2020-08-11T09:00:11Z,UCm9SZAl03Rev9sFwloCdz1g,Rehaan Records,10,2020-08-12T00:00:00Z,[None],11308046,655450,33242,405146,https://i.ytimg.com/vi/x-KbnJ9fvJc/default.jpg,False,False,Singer/Lyrics: Karan Aujla Feat Tania Music/ D...,4
2,KX06ksuS6Xo,Diljit Dosanjh: CLASH (Official) Music Video |...,2020-08-11T07:30:02Z,UCZRdNleCgW-BGUJf-bbjzQg,Diljit Dosanjh,10,2020-08-12T00:00:00Z,clash diljit dosanjh|diljit dosanjh|diljit dos...,9140911,296533,6179,30058,https://i.ytimg.com/vi/KX06ksuS6Xo/default.jpg,False,False,CLASH official music video performed by DILJIT...,4
3,UsMRgnTcchY,Dil Ko Maine Di Kasam Video | Amaal M Ft.Ariji...,2020-08-10T05:30:49Z,UCq-Fj5jknLsUf-MWSy4_brA,T-Series,10,2020-08-12T00:00:00Z,hindi songs|2020 hindi songs|2020 new songs|t-...,23564512,743931,84162,136942,https://i.ytimg.com/vi/UsMRgnTcchY/default.jpg,False,False,Gulshan Kumar and T-Series presents Bhushan Ku...,4
4,WNSEXJJhKTU,"Baarish (Official Video) Payal Dev,Stebin Ben ...",2020-08-11T05:30:13Z,UCye6Oz0mg46S362LwARGVcA,VYRLOriginals,10,2020-08-12T00:00:00Z,VYRL Original|Mohsin Khan|Shivangi Joshi|Payal...,6783649,268817,8798,22984,https://i.ytimg.com/vi/WNSEXJJhKTU/default.jpg,False,False,VYRL Originals brings to you ‘Baarish’ - the b...,4


In [11]:
youtube['Rating'].unique()

[4, 2, 1, 3, 0, NaN]
Categories (5, int64): [0 < 1 < 2 < 3 < 4]

In [12]:
youtube.dropna(inplace=True)

In [13]:
# Get the number of rows in the youtube DataFrame
num_rows = len(youtube)

# Create a list of values for the new column
values = list(range(1, num_rows + 1))

# Assign the new column to 'userID' in the youtube DataFrame
youtube['userID'] = values

In [14]:
youtube.shape

(29400, 18)

In [15]:
youtube.columns

Index(['video_id', 'title', 'publishedAt', 'channelId', 'channelTitle',
       'categoryId', 'trending_date', 'tags', 'view_count', 'likes',
       'dislikes', 'comment_count', 'thumbnail_link', 'comments_disabled',
       'ratings_disabled', 'description', 'Rating', 'userID'],
      dtype='object')

In [16]:
youtube.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 29400 entries, 0 to 29999
Data columns (total 18 columns):
 #   Column             Non-Null Count  Dtype   
---  ------             --------------  -----   
 0   video_id           29400 non-null  object  
 1   title              29400 non-null  object  
 2   publishedAt        29400 non-null  object  
 3   channelId          29400 non-null  object  
 4   channelTitle       29400 non-null  object  
 5   categoryId         29400 non-null  int64   
 6   trending_date      29400 non-null  object  
 7   tags               29400 non-null  object  
 8   view_count         29400 non-null  int64   
 9   likes              29400 non-null  int64   
 10  dislikes           29400 non-null  int64   
 11  comment_count      29400 non-null  int64   
 12  thumbnail_link     29400 non-null  object  
 13  comments_disabled  29400 non-null  bool    
 14  ratings_disabled   29400 non-null  bool    
 15  description        29400 non-null  object  
 16  Rati

In [17]:
YouTube_df = youtube[['userID','video_id','title','channelTitle','categoryId','tags','likes','description','channelId','thumbnail_link','Rating']]

In [18]:
YouTube_df.head(2)

Unnamed: 0,userID,video_id,title,channelTitle,categoryId,tags,likes,description,channelId,thumbnail_link,Rating
0,1,Iot0eF6EoNA,Sadak 2 | Official Trailer | Sanjay | Pooja | ...,FoxStarHindi,24,sadak|sadak 2|mahesh bhatt|vishesh films|pooja...,224925,Three Streams. Three Stories. One Journey. Sta...,UCGqvJPRcv7aVFun-eTsatcA,https://i.ytimg.com/vi/Iot0eF6EoNA/default.jpg,4
1,2,x-KbnJ9fvJc,Kya Baat Aa : Karan Aujla (Official Video) Tan...,Rehaan Records,10,[None],655450,Singer/Lyrics: Karan Aujla Feat Tania Music/ D...,UCm9SZAl03Rev9sFwloCdz1g,https://i.ytimg.com/vi/x-KbnJ9fvJc/default.jpg,4


In [19]:
YouTube_df.title

0        Sadak 2 | Official Trailer | Sanjay | Pooja | ...
1        Kya Baat Aa : Karan Aujla (Official Video) Tan...
2        Diljit Dosanjh: CLASH (Official) Music Video |...
3        Dil Ko Maine Di Kasam Video | Amaal M Ft.Ariji...
4        Baarish (Official Video) Payal Dev,Stebin Ben ...
                               ...                        
29995    India celebrate a win for the ages at the Gabb...
29996    Jethalal Ke Haath Ki Chai | Taarak Mehta Ka Oo...
29997    Bruised, abused but conquered! India stun Aust...
29998    Kumkum Bhagya | Premiere Episode 1751 Preview ...
29999    SISTERS Season 2 | Episode 5 | Girl Formula | ...
Name: title, Length: 29400, dtype: object

In [20]:
YouTube_df["title"] = YouTube_df["title"].str.split('|').str[0].str.strip()

In [21]:
YouTube_df.head(2)

Unnamed: 0,userID,video_id,title,channelTitle,categoryId,tags,likes,description,channelId,thumbnail_link,Rating
0,1,Iot0eF6EoNA,Sadak 2,FoxStarHindi,24,sadak|sadak 2|mahesh bhatt|vishesh films|pooja...,224925,Three Streams. Three Stories. One Journey. Sta...,UCGqvJPRcv7aVFun-eTsatcA,https://i.ytimg.com/vi/Iot0eF6EoNA/default.jpg,4
1,2,x-KbnJ9fvJc,Kya Baat Aa : Karan Aujla (Official Video) Tania,Rehaan Records,10,[None],655450,Singer/Lyrics: Karan Aujla Feat Tania Music/ D...,UCm9SZAl03Rev9sFwloCdz1g,https://i.ytimg.com/vi/x-KbnJ9fvJc/default.jpg,4


In [22]:
YouTube_df['tags'] = YouTube_df['tags'].str.replace('|', ' ')

In [23]:
YouTube_df.head(3)

Unnamed: 0,userID,video_id,title,channelTitle,categoryId,tags,likes,description,channelId,thumbnail_link,Rating
0,1,Iot0eF6EoNA,Sadak 2,FoxStarHindi,24,sadak sadak 2 mahesh bhatt vishesh films pooja...,224925,Three Streams. Three Stories. One Journey. Sta...,UCGqvJPRcv7aVFun-eTsatcA,https://i.ytimg.com/vi/Iot0eF6EoNA/default.jpg,4
1,2,x-KbnJ9fvJc,Kya Baat Aa : Karan Aujla (Official Video) Tania,Rehaan Records,10,[None],655450,Singer/Lyrics: Karan Aujla Feat Tania Music/ D...,UCm9SZAl03Rev9sFwloCdz1g,https://i.ytimg.com/vi/x-KbnJ9fvJc/default.jpg,4
2,3,KX06ksuS6Xo,Diljit Dosanjh: CLASH (Official) Music Video,Diljit Dosanjh,10,clash diljit dosanjh diljit dosanjh diljit dos...,296533,CLASH official music video performed by DILJIT...,UCZRdNleCgW-BGUJf-bbjzQg,https://i.ytimg.com/vi/KX06ksuS6Xo/default.jpg,4


In [24]:
YouTube_df.isnull().sum()

userID            0
video_id          0
title             0
channelTitle      0
categoryId        0
tags              0
likes             0
description       0
channelId         0
thumbnail_link    0
Rating            0
dtype: int64

In [25]:
YouTube_df.dropna(inplace=True)

In [26]:
YouTube_df.drop_duplicates(subset=['title'], inplace=True)

In [27]:
YouTube_df['description'].apply(lambda x:x.split())

0        [Three, Streams., Three, Stories., One, Journe...
1        [Singer/Lyrics:, Karan, Aujla, Feat, Tania, Mu...
2        [CLASH, official, music, video, performed, by,...
3        [Gulshan, Kumar, and, T-Series, presents, Bhus...
4        [VYRL, Originals, brings, to, you, ‘Baarish’, ...
                               ...                        
29984    [Voilà!, Digi, presents, the, official, music,...
29987    [#AUSvIND, #India, #Australia, Aakash, Chopra,...
29990    [Click, here, to, watch, the, full, episode, o...
29994    [Click, here, to, Subscribe, to, SAB, TV, Chan...
29996    [Click, here, to, Subscribe, to, Taarak, Mehta...
Name: description, Length: 7312, dtype: object

In [28]:
YouTube_df.head(2)

Unnamed: 0,userID,video_id,title,channelTitle,categoryId,tags,likes,description,channelId,thumbnail_link,Rating
0,1,Iot0eF6EoNA,Sadak 2,FoxStarHindi,24,sadak sadak 2 mahesh bhatt vishesh films pooja...,224925,Three Streams. Three Stories. One Journey. Sta...,UCGqvJPRcv7aVFun-eTsatcA,https://i.ytimg.com/vi/Iot0eF6EoNA/default.jpg,4
1,2,x-KbnJ9fvJc,Kya Baat Aa : Karan Aujla (Official Video) Tania,Rehaan Records,10,[None],655450,Singer/Lyrics: Karan Aujla Feat Tania Music/ D...,UCm9SZAl03Rev9sFwloCdz1g,https://i.ytimg.com/vi/x-KbnJ9fvJc/default.jpg,4


In [29]:
YouTube_df['joint_tags'] = YouTube_df['channelTitle'] + YouTube_df['description'] + YouTube_df['tags'] 


In [30]:
YouTube_df['joint_tags'][:5]

0    FoxStarHindiThree Streams. Three Stories. One ...
1    Rehaan RecordsSinger/Lyrics: Karan Aujla Feat ...
2    Diljit DosanjhCLASH official music video perfo...
3    T-SeriesGulshan Kumar and T-Series presents Bh...
4    VYRLOriginalsVYRL Originals brings to you ‘Baa...
Name: joint_tags, dtype: object

In [31]:
YouTube_df.isnull().sum()

userID            0
video_id          0
title             0
channelTitle      0
categoryId        0
tags              0
likes             0
description       0
channelId         0
thumbnail_link    0
Rating            0
joint_tags        0
dtype: int64

In [32]:
YouTube_df['joint_tags']=YouTube_df['joint_tags'].apply(lambda x:x.split())

In [33]:
YouTube_df['joint_tags'][1]

['Rehaan',
 'RecordsSinger/Lyrics:',
 'Karan',
 'Aujla',
 'Feat',
 'Tania',
 'Music/',
 'Desi',
 'Crew',
 'Mix',
 '&',
 'Master',
 '/',
 'Dc',
 'Studio’sVideo/',
 'Sukh',
 'Sanghera',
 'Project',
 'By',
 '/',
 'Deep',
 'Rehaan',
 'Sukh',
 'Bajwa',
 '&',
 'Jeewan',
 'Chahal',
 'Produced',
 'By',
 '/',
 'Sandeep',
 'RehaanLabel',
 '/',
 'Rehaan',
 'RecordsOnline',
 'Promotions',
 '-',
 'Global',
 'Digital',
 'SolutionDigital',
 'Partner',
 '/',
 'Coin',
 'Digital',
 'Social',
 'media',
 'Promotions:',
 'Gk',
 'DigitialSpotify:',
 'https://spoti.fi/3kDfnNuYoutube',
 'Music:',
 'https://bit.ly/31N2tUkGaana:',
 'https://bit.ly/2PHrUkLhttps://wynk.in/u/SSWNC5VO3Website:',
 'WWW.RehaanRecords.CAFB:',
 'https://m.facebook.com/RehaanRecords/INSTA:',
 'Instagram/rehaanrecords[None]']

In [34]:
new = YouTube_df.drop(columns=['channelTitle','categoryId','tags','description','channelId'])#'likes'
#new.head()

In [35]:
new['joint_tags'] = new['joint_tags'].apply(lambda x: " ".join(x))
new.head()

Unnamed: 0,userID,video_id,title,likes,thumbnail_link,Rating,joint_tags
0,1,Iot0eF6EoNA,Sadak 2,224925,https://i.ytimg.com/vi/Iot0eF6EoNA/default.jpg,4,FoxStarHindiThree Streams. Three Stories. One ...
1,2,x-KbnJ9fvJc,Kya Baat Aa : Karan Aujla (Official Video) Tania,655450,https://i.ytimg.com/vi/x-KbnJ9fvJc/default.jpg,4,Rehaan RecordsSinger/Lyrics: Karan Aujla Feat ...
2,3,KX06ksuS6Xo,Diljit Dosanjh: CLASH (Official) Music Video,296533,https://i.ytimg.com/vi/KX06ksuS6Xo/default.jpg,4,Diljit DosanjhCLASH official music video perfo...
3,4,UsMRgnTcchY,Dil Ko Maine Di Kasam Video,743931,https://i.ytimg.com/vi/UsMRgnTcchY/default.jpg,4,T-SeriesGulshan Kumar and T-Series presents Bh...
4,5,WNSEXJJhKTU,"Baarish (Official Video) Payal Dev,Stebin Ben",268817,https://i.ytimg.com/vi/WNSEXJJhKTU/default.jpg,4,VYRLOriginalsVYRL Originals brings to you ‘Baa...


In [36]:
#from nltk.stem.porter import PorterStemme
from nltk.stem import PorterStemmer
ps = PorterStemmer()

In [37]:
def preprocessing(data):
    stemmer = nltk.stem.RSLPStemmer()
    all_stopwords = stopwords.words('english')
    all_stopwords.remove('not')
    corpus = []
    for text in data:
                
        review = re.sub(r"@[A-Za-z0-9_]+", " ", text)
        review = re.sub(r"https?://[A-Za-z0-9./]+", " ", review)
        review = re.sub(r"https?", " ", review)
        review = re.sub('[^a-zA-Z]', ' ', review)
        review = review.lower()
        review = review.split()
        ps = PorterStemmer()
        review = [ps.stem(word) for word in review if not word in set(all_stopwords) if len(word) > 2]
        review = ' '.join(review)
        corpus.append(review)

    return np.array(corpus)

In [38]:
new['cleen_joint_tags'] = preprocessing(new['joint_tags'] .values)

In [39]:
new['cleen_joint_tags'][1]

'rehaan recordssing lyric karan aujla feat tania music desi crew mix master studio svideo sukh sanghera project deep rehaan sukh bajwa jeewan chahal produc sandeep rehaanlabel rehaan recordsonlin promot global digit solutiondigit partner coin digit social media promot digitialspotifi music wynk sswnc websit www rehaanrecord cafb instagram rehaanrecord none'

In [40]:
#new['joint_tags'] = new['joint_tags'].astype('str').str.lower()

In [41]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
# Define additional parameters
ngram_range = (2,3)
max_features = 4000

# Create the TF-IDF vectorizer with the specified parameters
tfidf_vectorizer = TfidfVectorizer(
    ngram_range=ngram_range,
    max_features=max_features
)


In [42]:
vector = tfidf_vectorizer.fit_transform(new['cleen_joint_tags']).toarray()

In [43]:
vector.shape

(7312, 4000)

In [44]:
from sklearn.preprocessing import MinMaxScaler ,StandardScaler
# Apply min-max scaling to your data
scaler = StandardScaler()
scaled_data = scaler.fit_transform(vector)

In [45]:
scaled_data

array([[-0.05242819, -0.04969901, -0.05194276, ..., -0.06944292,
        -0.10583266, -0.10583266],
       [-0.05242819, -0.04969901, -0.05194276, ..., -0.06944292,
        -0.10583266, -0.10583266],
       [-0.05242819, -0.04969901, -0.05194276, ..., -0.06944292,
        -0.10583266, -0.10583266],
       ...,
       [-0.05242819, -0.04969901, -0.05194276, ..., -0.06944292,
        -0.10583266, -0.10583266],
       [-0.05242819, -0.04969901, -0.05194276, ..., -0.06944292,
        -0.10583266, -0.10583266],
       [-0.05242819, -0.04969901, -0.05194276, ..., -0.06944292,
         9.69054177,  9.69054177]])

In [46]:
from sklearn.decomposition import PCA

pca = PCA(n_components=0.99)

vector_reduced = pca.fit_transform(scaled_data)

In [47]:
vector.shape

(7312, 4000)

In [48]:
from sklearn.neighbors import NearestNeighbors

# Assuming 'X' is your feature matrix and 'k' is the number of neighbors to consider
knn_model = NearestNeighbors(n_neighbors=6, metric='cosine')
knn_model.fit(vector)


In [49]:
new[new['title'] == 'Dil Ko Maine Di Kasam Video'].index[0]

3

### Prediction 

In [50]:
def recommend(movie):
    index = new[new['title'] == movie].index[0]
    query_vector = vector[index].reshape(1, -1)  # Reshape the query vector for KNN
    distances, indices = knn_model.kneighbors(query_vector)

    # 'indices' contains the indices of the k-nearest neighbors, including the query itself
    # 'distances' contains the corresponding distances/similarities

    # Print the top 5 recommended movies (excluding the query itself)
    recommended_movies = new.iloc[indices[0][1:6]]
    for movie in recommended_movies['title']:
        print(movie)

In [51]:
# def recommend(movie):
#     index = new[new['title'] == movie].index[0]
#     distances = sorted(list(enumerate(similarity[index])),reverse=True,key = lambda x: x[1])
#     for i in distances[1:6]:
#         print(new.iloc[i[0]].title)
        
    

In [52]:
with open("KNN_model.pkl", "wb") as f: 
    joblib.dump(knn_model, f) 

In [53]:
recommend('Diljit Dosanjh: CLASH (Official) Music Video')

Diljit Dosanjh: Born To Shine (Official Music Video) G.O.A.T
PEED: Diljit Dosanjh (Official) Music Video
Peed Diljit Dosanjh Lyrical Video Song
HABIT Lyric Video: Diljit Dosanjh
Diljit Dosanjh: Whiskey lyrical Video Song


In [54]:
recommend('NEW! Ep 2967 - Jethalal Drops Babita!')

NEW! Ep 2975 - Taarak's Boss!
NEW! Ep 3066 - Party Cancel?!
NEW! Ep 3038 - महीनों बाद Soda Shop पे स्वागत
NEW! Ep 3048 - Popatlal को ढूंढ़ने का प्लान
NEW! Ep 3051 - Tapu की पढाई


In [55]:
recommend("first Crazy RIDE on Kawasaki NINJA H2")

Finally bought my DreamBike - Ninja H2
bass Life mai HAYABUSA honi chaheye
I Surprised my Friends With NINJA H2!!
When 10R meets NINJA H2 !!
Surprising my Friends with NEW Bike!!


In [56]:
recommend('IPL 2020 - Patanjali IPL As Sponsor With 10 Big News')

Armenia vs Azerbaijan
Top of the Island Sunset
Top of Switzerland 🇨🇭
Best Cinema in the World!
My German Girlfriend's Home


### Some advantages of content-based filtering are that it does not rely on other users' behavior, it can handle new items without a large user base, and it can provide personalized recommendations even for niche interests. However, one limitation is that it may not be able to capture the diversity of user interests, as it tends to recommend similar items to what the user has already consumed.

In [57]:
import pickle
with open("vector.pkl", "wb") as f: 
    joblib.dump(vector, f) 

## colaborative recomendation ststem

### Scikit-learn's Surprise library is a popular Python library for building collaborative filtering recommendation systems. Here's a general overview of how to build a collaborative recommendation system using the Surprise library:

#### Data preparation: The first step is to prepare your data in a format that can be used by the Surprise library. This typically involves creating a user-item matrix, where each row represents a user and each column represents an item. The values in the matrix represent the user's rating or interaction with the item.

In [58]:
new = new.reset_index()
titles = new[['title']]
indices = pd.Series(new.index, index=new['title'])

In [59]:
indices[100:150]

title
TSP's Rabish ki report Unlock 2.0                                                                       100
BADSHAH – BKL (Official Lyrical Video)                                                                  101
அட இவ்வளவு நாள் இது தெரியாம போச்சே                                                                      102
GEOMETRIC SHAPE FOOD CHALLENGE                                                                          103
DIPIKA’ S BIRTHDAY CELEBRATION                                                                          104
DNA: Russia August 12 को लॉन्च करेगा Corona Vaccine                                                     105
MASTER CHEF                                                                                             106
We Made 4 Wheeler Quad-Cycle                                                                            107
Yeh Rishta Kya Kehlata Hai: Meet Naira's daughter!                                                      108
રણછોડ રંગીલા ( ગોવાળીય

In [60]:
#pickle.dump(indices,open('indices.pkl','wb'))
with open("indices.pkl", "wb") as f: 
    joblib.dump(indices, f) 

### Create the dataset: Use Surprise's Dataset module to create a dataset object from your Pandas DataFrame.

In [61]:
reader=Reader(rating_scale=(0, 4))
data = Dataset.load_from_df(youtube[['categoryId','video_id','Rating']], reader)#  'categoryId','video_id','channelId'

### Select an algorithm: Choose a collaborative filtering algorithm to use with the Surprise library. Some popular algorithms include Singular Value Decomposition (SVD), k-Nearest Neighbors (k-NN), and Non-negative Matrix Factorization (NMF).

In [62]:
benchmark = []
# Iterate over all algorithms
for algorithm in [SVD(), SVDpp(), SlopeOne(), NMF(), NormalPredictor(), KNNBaseline(), KNNBasic()]:
    # Perform cross validation
    results = cross_validate(algorithm, data, measures=['MAE'], verbose=False,cv = 3)  # cv = 1 onluy try
    
    # Get results & append algorithm name
    tmp = pd.DataFrame.from_dict(results).mean(axis=0)
    tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm']))
    benchmark.append(tmp)

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.


In [63]:
surprise_results = pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_mae')

In [64]:
surprise_results

Unnamed: 0_level_0,test_mae,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
KNNBaseline,0.270719,0.072983,0.17081
SlopeOne,0.282048,7.216309,26.000961
KNNBasic,0.286013,0.0,0.218814
NMF,0.290314,0.91207,0.08353
SVDpp,0.352268,226.978082,65.541083
SVD,0.42508,0.353534,0.181286
NormalPredictor,1.522449,0.036465,0.062519


### Hyperparameter tuning: Finally, you may want to fine-tune the hyperparameters of your model to improve its performance. The Surprise library includes functions for performing grid search or random search to find the optimal hyperparameters for your model.

In [65]:
from surprise import Dataset, SVD
from surprise.model_selection import GridSearchCV

In [66]:
knn = KNNBaseline()
param_grid = {"n_epochs": [250,500,650],"k" :[10,20,30,40,50]}
gs = GridSearchCV(KNNBaseline, param_grid, measures=["rmse", "mae"], cv=3)

In [67]:
gs.fit(data)

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matr

In [68]:

# combination of parameters that gave the best RMSE score
print(gs.best_params["mae"])

{'n_epochs': 250, 'k': 10}


### Model training & Model evaluation: After training the model, evaluate its performance using metrics such as Root Mean Squared Error (RMSE), Mean Absolute Error (MAE)

In [69]:
knn = KNNBaseline(n_epochs= 250,k=10, min_k=1,verbose=False)
cross_validate(knn_baseline, data, measures=['RMSE','MAE'],cv=3)

NameError: name 'knn_baseline' is not defined

In [None]:
trainset = data.build_full_trainset()
knn_baseline.fit(trainset)

In [None]:
#youtube[youtube['likes'] >= 5000]

### Generate recommendations: Use the trained model to generate personalized recommendations for users based on their past interactions with items.

In [None]:
knn.predict(1, 302, 3)

In [None]:
# Save the SVD object as a pickle file
with open('KNNBaseline.pkl', 'wb') as f:
    joblib.dump(knn_baseline, f)


In [None]:
#youtube[youtube['categoryId'] ==1]

### Hybrid :
### A hybrid recommendation system combines multiple recommendation techniques, such as content-based and collaborative filtering, to provide more accurate and diverse recommendations to users. The system involves collecting user behavior and preference data, selecting and training the recommendation algorithms, combining the recommendations, and evaluating and refining the system. Additional features, such as user demographics, can be incorporated for further personalization. The key is to fine-tune the system to meet the specific needs of the application.

In [None]:
smd =YouTube_df[['title','likes','categoryId','video_id','thumbnail_link','Rating']]
smd.head(5)

In [None]:
id_map= YouTube_df[['video_id','title','categoryId','channelId']].set_index('title')
id_map.head(2)#	movieId	id  title	

In [None]:
indices_map = id_map.set_index('categoryId')

In [None]:
indices_map.head(2)

In [None]:
#pickle.dump(indices_map,open('indices_map.pkl','wb'))
#indices_map=indices_map.to_csv('indices_map.csv')
with open('indices_map.pkl', 'wb') as f:
    joblib.dump(indices_map, f)

In [None]:
indices_map

In [None]:
id_map.loc['Sadak 2']['categoryId']

### Prediction 

In [None]:
def hybrid(userId, title):
    index = new[new['title'] == title].index[0]
    query_vector = vector[index].reshape(1, -1)
    distances, indices = knn_model.kneighbors(query_vector)

    # Extract the indices of the recommended videos (excluding the query itself)
    video_indices = indices[0][1:6]
    
    # Use .iloc to select rows by integer position from the 'smd' DataFrame
    recommended_videos = smd.iloc[video_indices][['title', 'categoryId', 'video_id', 'likes']]
    
    # Calculate the 'est' value for each recommended video based on user preferences
    recommended_videos['est'] = recommended_videos['categoryId'].apply(lambda x: knn_baseline.predict(userId, indices_map.iloc[x]['video_id']).est)
    
    # Sort the recommended videos by the number of likes in descending order
    recommended_videos = recommended_videos.sort_values('likes', ascending=False)
    
    # Return the top 5 recommended videos
    return recommended_videos.head(5)


In [None]:
 knn.predict(1, indices_map.iloc[24]['video_id'])

In [None]:
hybrid(1,'Ahmedabad welcomes PM Modi!')

In [None]:
hybrid(1,"BTS (방탄소년단) ‘Life Goes On’ Official MV : on my pillow")

In [None]:
hybrid(1,'Apple iPhone 12 Pro')

In [None]:
hybrid(1,'Baarish (Official Video) Payal Dev,Stebin Ben')

In [None]:
hybrid(1,'Types of Ex Girlfriends')

In [None]:
hybrid(1,'NEW! Ep 3006 - Abdul Missing?!')

In [None]:
hybrid(1,'When 10R meets NINJA H2 !!')

In [None]:
hybrid(1,'FULL MATCH - Roman Reigns vs. Murphy: SmackDown LIVE, August 13, 2019')

In [None]:
hybrid(1,'IPL 2020 - Patanjali IPL As Sponsor With 10 Big News')

In [None]:
hybrid(1,"DNA: Russia August 12 को लॉन्च करेगा Corona Vaccine")

In [None]:
hybrid(1,"KOHLI & CO start with a fabulous WIN!")

In [None]:
hybrid(1,"Finally bought my DreamBike - Ninja H2")

In [None]:
hybrid(1,"Gujarati song pucho to khara")

In [None]:
hybrid(1,"BADSHAH – BKL (Official Lyrical Video)")

In [None]:
hybrid(1,'Diljit Dosanjh: CLASH (Official) Music Video')

In [None]:
hybrid(1,'After TikTok Ban')

In [None]:
smd.head()


In [None]:
#smd.to_csv('smd.csv')

In [None]:
#pickle.dump(smd,open('smd.pkl','wb'))
with open("smd.pkl", "wb") as f: 
    joblib.dump(smd, f) 

In [None]:
#pickle.dump(smd,open('smd.pkl','wb'))
with open("new.pkl", "wb") as f: 
    joblib.dump(new, f) 

In [None]:
smd['thumbnail_link'][0][:]

In [None]:
video_list=smd[['title','likes','thumbnail_link']]

In [None]:
with open("video_list.pkl", "wb") as f: 
    joblib.dump(video_list, f) 

In [None]:
video_list['title'][150:200]

In [None]:
smd.info()

In [None]:
new.info()