In [3]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet


import warnings; warnings.simplefilter('ignore')

In [5]:
md = pd. read_csv('moviesmetadata.csv')
md.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,30-10-1995,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,15-12-1995,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,22-12-1995,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,22-12-1995,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,10-02-1995,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [7]:
md['genres'] = md['genres'].fillna('[]').apply(literal_eval).apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

In [9]:
vote_counts = md[md['vote_count'].notnull()]['vote_count'].astype('int')
vote_averages = md[md['vote_average'].notnull()]['vote_average'].astype('int')
C = vote_averages.mean()
C

5.244896612406511

In [11]:
m = vote_counts.quantile(0.95)
m


434.0

In [15]:
print(md.columns)

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')


In [17]:
md['year'] = pd.to_datetime(md['release_date'], errors='coerce').dt.year

In [19]:
qualified = md[(md['vote_count'] >= m) & 
               (md['vote_count'].notnull()) & 
               (md['vote_average'].notnull())][['title', 'year', 'vote_count', 'vote_average', 'popularity', 'genres']]

In [21]:
qualified['vote_count'] = qualified['vote_count'].astype(int)
qualified['vote_average'] = qualified['vote_average'].astype(int)

In [23]:
print(qualified.shape)

(2274, 6)


In [25]:
def weighted_rating(x):
    v = x['vote_count']
    R = x['vote_average']
    return (v/(v+m) * R) + (m/(m+v) * C)

In [27]:
qualified['wr'] = qualified.apply(weighted_rating, axis=1)

In [29]:
qualified = qualified.sort_values('wr', ascending=False).head(250)

In [31]:
qualified.head(15)

Unnamed: 0,title,year,vote_count,vote_average,popularity,genres,wr
15480,Inception,2010.0,14075,8,29.108149,"[Action, Thriller, Science Fiction, Mystery, A...",7.917588
12481,The Dark Knight,2008.0,12269,8,123.167259,"[Drama, Action, Crime, Thriller]",7.905871
22879,Interstellar,2014.0,11187,8,32.213481,"[Adventure, Drama, Science Fiction]",7.897107
2843,Fight Club,1999.0,9678,8,63.869599,[Drama],7.881753
4863,The Lord of the Rings: The Fellowship of the Ring,2001.0,8892,8,32.070725,"[Adventure, Fantasy, Action]",7.871787
292,Pulp Fiction,1994.0,8670,8,140.950236,"[Thriller, Crime]",7.86866
314,The Shawshank Redemption,1994.0,8358,8,51.645403,"[Drama, Crime]",7.864
7000,The Lord of the Rings: The Return of the King,2003.0,8226,8,29.324358,"[Adventure, Fantasy, Action]",7.861927
351,Forrest Gump,1994.0,8147,8,48.307194,"[Comedy, Drama, Romance]",7.860656
5814,The Lord of the Rings: The Two Towers,2002.0,7641,8,29.423537,"[Adventure, Fantasy, Action]",7.851924


In [33]:
s = md.apply(lambda x: pd.Series(x['genres']),axis=1).stack().reset_index(level=1, drop=True)
s.name = 'genre'
gen_md = md.drop('genres', axis=1).join(s)

In [35]:
def build_chart(genre, percentile=0.85):
    df = gen_md[gen_md['genre'] == genre]
    vote_counts = df[df['vote_count'].notnull()]['vote_count'].astype('int')
    vote_averages = df[df['vote_average'].notnull()]['vote_average'].astype('int')
    C = vote_averages.mean()
    m = vote_counts.quantile(percentile)
    
    qualified = df[(df['vote_count'] >= m) & (df['vote_count'].notnull()) & (df['vote_average'].notnull())][['title', 'year', 'vote_count', 'vote_average', 'popularity']]
    qualified['vote_count'] = qualified['vote_count'].astype('int')
    qualified['vote_average'] = qualified['vote_average'].astype('int')
    
    qualified['wr'] = qualified.apply(lambda x: (x['vote_count']/(x['vote_count']+m) * x['vote_average']) + (m/(m+x['vote_count']) * C), axis=1)
    qualified = qualified.sort_values('wr', ascending=False).head(250)
    
    return qualified

In [37]:
build_chart('Romance').head(15)

Unnamed: 0,title,year,vote_count,vote_average,popularity,wr
10309,Dilwale Dulhania Le Jayenge,1995.0,661,9,34.457024,8.565285
351,Forrest Gump,1994.0,8147,8,48.307194,7.971357
876,Vertigo,1958.0,1162,8,18.20822,7.811667
40251,Your Name.,2016.0,1030,8,34.461252,7.789489
883,Some Like It Hot,1959.0,835,8,11.845107,7.745154
1132,Cinema Paradiso,1988.0,834,8,14.177005,7.744878
19901,Paperman,2012.0,734,8,7.198633,7.713951
37863,Sing Street,2016.0,669,8,10.672862,7.689483
882,The Apartment,1960.0,498,8,11.994281,7.599317
38718,The Handmaiden,2016.0,453,8,16.727405,7.566166


In [39]:
build_chart('Horror').head(15)

Unnamed: 0,title,year,vote_count,vote_average,popularity,wr
1213,The Shining,1980.0,3890,8,19.611589,7.901294
1176,Psycho,1960.0,2405,8,36.826309,7.843335
1171,Alien,1979.0,4564,7,23.37742,6.941936
41492,Split,2016.0,4461,7,28.920839,6.940631
14236,Zombieland,2009.0,3655,7,11.063029,6.927969
1158,Aliens,1986.0,3282,7,21.761179,6.920081
21276,The Conjuring,2013.0,3169,7,14.90169,6.917338
42169,Get Out,2017.0,2978,7,36.894806,6.912248
1338,Jaws,1975.0,2628,7,19.726114,6.901088
8147,Shaun of the Dead,2004.0,2479,7,14.902948,6.895426


In [41]:
build_chart('Thriller').head(15)

Unnamed: 0,title,year,vote_count,vote_average,popularity,wr
15480,Inception,2010.0,14075,8,29.108149,7.95646
12481,The Dark Knight,2008.0,12269,8,123.167259,7.950165
292,Pulp Fiction,1994.0,8670,8,140.950236,7.929996
46,Se7en,1995.0,5915,8,18.45743,7.898573
24860,The Imitation Game,2014.0,5895,8,31.59594,7.898242
586,The Silence of the Lambs,1991.0,4549,8,4.307222,7.869538
11354,The Prestige,2006.0,4510,8,16.94556,7.868463
289,Leon: The Professional,1994.0,4293,8,20.477329,7.862142
4099,Memento,2000.0,4168,8,15.450789,7.858217
1213,The Shining,1980.0,3890,8,19.611589,7.848633


In [43]:
#testing time

input_genre=input("Enter the genre of your choice")
build_chart(input_genre).head(15)


Enter the genre of your choice Romance


Unnamed: 0,title,year,vote_count,vote_average,popularity,wr
10309,Dilwale Dulhania Le Jayenge,1995.0,661,9,34.457024,8.565285
351,Forrest Gump,1994.0,8147,8,48.307194,7.971357
876,Vertigo,1958.0,1162,8,18.20822,7.811667
40251,Your Name.,2016.0,1030,8,34.461252,7.789489
883,Some Like It Hot,1959.0,835,8,11.845107,7.745154
1132,Cinema Paradiso,1988.0,834,8,14.177005,7.744878
19901,Paperman,2012.0,734,8,7.198633,7.713951
37863,Sing Street,2016.0,669,8,10.672862,7.689483
882,The Apartment,1960.0,498,8,11.994281,7.599317
38718,The Handmaiden,2016.0,453,8,16.727405,7.566166


In [55]:
pip install scikit-learn

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [58]:
import sklearn

In [62]:
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse

In [64]:
df=pd.read_csv("ratingsmain.csv")

In [66]:
df.head()


Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [102]:
import pandas as pd
import numpy as np
from scipy.spatial.distance import cosine
df = pd.DataFrame({
    'userId': df['userId'],
    'movieId':df ['movieId'],
    'rating': df['rating'],
    
})

In [104]:
df.head()

Unnamed: 0,userId,movieId,rating
0,1,31,2.5
1,1,1029,3.0
2,1,1061,3.0
3,1,1129,2.0
4,1,1172,4.0


In [106]:
user_item_matrix = df.pivot(index='userId', columns='movieId', values='rating')

In [108]:
print(user_item_matrix)

movieId  1       2       3       4       5       6       7       8       \
userId                                                                    
1           NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN   
2           NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN   
3           NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN   
4           NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN   
5           NaN     NaN     4.0     NaN     NaN     NaN     NaN     NaN   
...         ...     ...     ...     ...     ...     ...     ...     ...   
667         NaN     NaN     NaN     NaN     NaN     4.0     NaN     NaN   
668         NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN   
669         NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN   
670         4.0     NaN     NaN     NaN     NaN     NaN     NaN     NaN   
671         5.0     NaN     NaN     NaN     NaN     NaN     NaN     NaN   

movieId  9       10     

In [110]:
user_mean = user_item_matrix.mean(axis=1)
normalized_matrix = user_item_matrix.sub(user_mean, axis=0)


In [112]:
normalized_matrix.fillna(0, inplace=True)

In [114]:
def cosine_similarity(matrix):
    similarity = np.zeros((matrix.shape[0], matrix.shape[0]))
    for i in range(matrix.shape[0]):
        for j in range(matrix.shape[0]):
            if i != j:
                similarity[i, j] = 1 - cosine(matrix[i], matrix[j])
            else:
                similarity[i, j] = 1  # Cosine similarity with itself is 1
    return similarity

similarity_matrix = cosine_similarity(normalized_matrix.values)

In [116]:
def predict_rating(user_id, item_id):
    user_index = user_id - 1  # Adjust for zero-based index
    if item_id not in user_item_matrix.columns:
        return user_mean[user_id]
    user_similarities = similarity_matrix[user_index]
    item_ratings = user_item_matrix[item_id]
    weighted_sum = np.dot(user_similarities, item_ratings.fillna(0))
    sum_of_similarities = np.sum(user_similarities)
    if sum_of_similarities == 0:
        return user_mean[user_id]  # Avoid division by zero, return the mean rating of the user
    return weighted_sum / sum_of_similarities

In [120]:
user_id = 1
item_id = 110
predicted_rating = predict_rating(user_id, item_id)
print(f"Predicted rating for user {user_id} and item {item_id}: {predicted_rating:.2f}")

Predicted rating for user 1 and item 110: 0.14
