#data link download : https://grouplens.org/datasets/movielens/25m/
#1- data loading:

In [1]:
import pandas as pd 
movie=pd.read_csv("src/movies.csv")
movie

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
62418,209157,We (2018),Drama
62419,209159,Window of the Soul (2001),Documentary
62420,209163,Bad Poems (2018),Comedy|Drama
62421,209169,A Girl Thing (2001),(no genres listed)


In [2]:
rating=pd.read_csv("src/ratings.csv")
rating.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000095 entries, 0 to 25000094
Data columns (total 4 columns):
 #   Column     Dtype  
---  ------     -----  
 0   userId     int64  
 1   movieId    int64  
 2   rating     float64
 3   timestamp  int64  
dtypes: float64(1), int64(3)
memory usage: 762.9 MB


In [3]:
tags=pd.read_csv("src/tags.csv")
tags

Unnamed: 0,userId,movieId,tag,timestamp
0,3,260,classic,1439472355
1,3,260,sci-fi,1439472256
2,4,1732,dark comedy,1573943598
3,4,1732,great dialogue,1573943604
4,4,7569,so bad it's good,1573943455
...,...,...,...,...
1093355,162521,66934,Neil Patrick Harris,1427311611
1093356,162521,103341,cornetto trilogy,1427311259
1093357,162534,189169,comedy,1527518175
1093358,162534,189169,disabled,1527518181


In [4]:
genome_score=pd.read_csv("src/genome-scores.csv")
genome_score

Unnamed: 0,movieId,tagId,relevance
0,1,1,0.02875
1,1,2,0.02375
2,1,3,0.06250
3,1,4,0.07575
4,1,5,0.14075
...,...,...,...
15584443,206499,1124,0.11000
15584444,206499,1125,0.04850
15584445,206499,1126,0.01325
15584446,206499,1127,0.14025


#pd.set_option('display.max_rows', None)
genome_tags=pd.read_csv("../src/genome-tags.csv")
genome_tags

#2- search engine :

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re

# Example movie titles with writing changes and special characters
titles = movie.title

# Preprocessing function to handle writing changes and special characters
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()

    # Remove special characters using regex
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)

    # Remove leading/trailing whitespaces
    text = text.strip()

    # Remove consecutive whitespaces
    text = re.sub('\s+', ' ', text)

    return text

# Preprocess the movie titles
preprocessed_titles = [preprocess_text(title) for title in titles]
movie['preprocessed_titles']=preprocessed_titles



In [6]:
# Create the TF-IDF vectorizer
vectorizer = TfidfVectorizer()



In [7]:
from scipy.sparse import csr_matrix



# Convert preprocessed titles to a sparse matrix
title_vectors = vectorizer.fit_transform(preprocessed_titles).astype('float32')

# Convert to CSR matrix for memory efficiency
title_vectors = csr_matrix(title_vectors)

# Calculate cosine similarities between the sparse vectors
cosine_similarities = cosine_similarity(title_vectors, dense_output=False)



In [8]:
# Function to search for movie titles
def search_movie(query, titles, cosine_similarities, top_n=5):
    preprocessed_query = preprocess_text(query)
    query_vector = vectorizer.transform([preprocessed_query])
    similarity_scores = cosine_similarity(query_vector, title_vectors).flatten()
    top_indices = similarity_scores.argsort()[::-1][:top_n]
    top_titles = [titles[i] for i in top_indices]
    return top_titles

# Example search
search_query = preprocess_text(input('tap movie title'))
search_results = search_movie(search_query, titles, cosine_similarities)
print("Search Results:")
for result in search_results:
    print(result)


tap movie title pot


Search Results:
Sex Pot (2009)
Pot O' Gold (1941)
Honey Pot, The (1967)
Pot v raj (2014)
Chongqing Hot Pot (2016)


We can use the search engine to build a customized list of movies that align with our taste and preferences.

But I will use user data from a rating.csv file to personalize the movie recommendations based on individual ratings

#3- recommendation 

**virsion I (based on user movies rating)**

remove movies with insufficient ratings from the movie ratings dataset, you can apply a filter based on a threshold of minimum ratings per movie. This ensures that only movies with a certain number of ratings are included in the final dataset.

In [9]:
import csv
from collections import defaultdict

def filter_movies_by_ratings(min_ratings):
    movie_ratings = defaultdict(list)

    # Step 1: Read the ratings file and collect ratings for each movie
    with open('src/ratings.csv', 'r') as file:
        reader = csv.reader(file)
        header = next(reader)  # Skip the header row
        for row in reader:
            movie_id = int(row[1])
            rating = float(row[2])
            movie_ratings[movie_id].append(rating)

    # Step 2: Filter movies based on minimum ratings
    filtered_movies = []
    for movie_id, ratings in movie_ratings.items():
        if len(ratings) >= min_ratings:
            filtered_movies.append(movie_id)

    return filtered_movies

# Example usage
min_ratings = 30  # Minimum number of ratings required per movie
filtered_movies = filter_movies_by_ratings(min_ratings)
#print(f"Filtered Movies: {filtered_movies}")


In [10]:
data=rating[rating.movieId.isin(filtered_movies)]

In [11]:
data.shape

(24750090, 4)

In [12]:
data['user_index']=data['userId'].astype('category').cat.codes

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['user_index']=data['userId'].astype('category').cat.codes


In [13]:
len(data['user_index'].unique())

162540

In [14]:
data['movies_index']=data['movieId'].astype('category').cat.codes

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['movies_index']=data['movieId'].astype('category').cat.codes


In [15]:
len(data['movies_index'].unique())

15915

In [16]:
from scipy.sparse import coo_matrix

ratings_mat_coo=coo_matrix((data['rating'],(data['user_index'],data["movies_index"])))

In [17]:
ratings_mat_coo

<162540x15915 sparse matrix of type '<class 'numpy.float64'>'
	with 24750090 stored elements in COOrdinate format>

In [18]:
ratings_mat=ratings_mat_coo.tocsr()

In [19]:
data[data['userId']==1]

Unnamed: 0,userId,movieId,rating,timestamp,user_index,movies_index
0,1,296,5.0,1147880044,0,288
1,1,306,3.5,1147868817,0,298
2,1,307,5.0,1147868828,0,299
3,1,665,5.0,1147878820,0,633
4,1,899,3.5,1147868510,0,838
...,...,...,...,...,...,...
65,1,27193,3.0,1147879774,0,8102
66,1,27266,4.5,1147879365,0,8110
67,1,27721,3.0,1147869115,0,8203
68,1,31956,3.5,1147877610,0,8443


In [20]:
my_index=0

In [21]:
from sklearn.metrics.pairwise import cosine_similarity

similarity=cosine_similarity(ratings_mat[my_index,:],ratings_mat).flatten()

In [22]:
similarity[18]

0.08067674381508005

In [23]:
import numpy as np
indices=np.argpartition(similarity,-15)[-15:]

In [24]:
indices

array([ 92682,  62205,  67905,  30076,  95512, 161800, 144594,  81511,
       140234,  87387,  97638,  77501,  10561,  88294,      0],
      dtype=int64)

In [25]:
similar_users=data[data["user_index"].isin(indices)].copy()

In [26]:
similar_users=similar_users[similar_users["userId"]!=1]

In [27]:
similar_users

Unnamed: 0,userId,movieId,rating,timestamp,user_index,movies_index
1577544,10563,6,4.5,1179070397,10561,5
1577545,10563,26,4.5,1179070731,10561,25
1577546,10563,32,4.0,1179070267,10561,31
1577547,10563,47,4.0,1179070296,10561,46
1577548,10563,105,5.0,1178123214,10561,102
...,...,...,...,...,...,...
24882123,161802,48783,3.5,1171153863,161800,9374
24882124,161802,48997,3.5,1173977478,161800,9386
24882125,161802,49272,0.5,1172428522,161800,9402
24882126,161802,49651,1.0,1171122162,161800,9432


In [28]:
movies=similar_users.groupby("movieId").rating.agg(['count','mean' ])

In [29]:
movies

Unnamed: 0_level_0,count,mean
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,2,3.5
6,1,4.5
10,1,1.0
16,1,4.5
18,1,4.5
...,...,...
53161,1,4.0
55444,1,2.5
55820,1,3.5
56367,1,4.0


In [30]:
movie_recs = movies.merge(movie, how="inner", on="movieId")

In [31]:
movie_recs

Unnamed: 0,movieId,count,mean,title,genres,preprocessed_titles
0,1,2,3.5,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,toy story 1995
1,6,1,4.5,Heat (1995),Action|Crime|Thriller,heat 1995
2,10,1,1.0,GoldenEye (1995),Action|Adventure|Thriller,goldeneye 1995
3,16,1,4.5,Casino (1995),Crime|Drama,casino 1995
4,18,1,4.5,Four Rooms (1995),Comedy,four rooms 1995
...,...,...,...,...,...,...
892,53161,1,4.0,"I'm a Cyborg, But That's OK (Saibogujiman kwen...",Comedy|Drama|Romance|Sci-Fi,im a cyborg but thats ok saibogujiman kwenchan...
893,55444,1,2.5,Control (2007),Drama,control 2007
894,55820,1,3.5,No Country for Old Men (2007),Crime|Drama,no country for old men 2007
895,56367,1,4.0,Juno (2007),Comedy|Drama|Romance,juno 2007


In [32]:
movie_recs["adjusted_count"] = movie_recs["count"] * movie_recs["mean"] 


In [33]:
movie_recs["adjusted_count"]


0      7.0
1      4.5
2      1.0
3      4.5
4      4.5
      ... 
892    4.0
893    2.5
894    3.5
895    4.0
896    3.5
Name: adjusted_count, Length: 897, dtype: float64

In [34]:
movie_recs = movie_recs[~movie_recs["movieId"].isin(data[data["userId"]==1]["movieId"])]

In [35]:
movie_recs["mod_title"] = movie_recs["title"].str.replace("[^a-zA-Z0-9 ]", "", regex=True).str.lower()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movie_recs["mod_title"] = movie_recs["title"].str.replace("[^a-zA-Z0-9 ]", "", regex=True).str.lower()


In [36]:
movie_recs = movie_recs[movie_recs["mean"] >=4]
movie_recs = movie_recs[movie_recs["count"]>2]

In [37]:
top_recs = movie_recs.sort_values("adjusted_count", ascending=False)


In [38]:
 top_recs

Unnamed: 0,movieId,count,mean,title,genres,preprocessed_titles,adjusted_count,mod_title
502,4235,10,4.200000,Amores Perros (Love's a Bitch) (2000),Drama|Thriller,amores perros loves a bitch 2000,42.0,amores perros loves a bitch 2000
96,750,8,4.375000,Dr. Strangelove or: How I Learned to Stop Worr...,Comedy|War,dr strangelove or how i learned to stop worryi...,35.0,dr strangelove or how i learned to stop worryi...
407,3083,7,4.571429,All About My Mother (Todo sobre mi madre) (1999),Drama,all about my mother todo sobre mi madre 1999,32.0,all about my mother todo sobre mi madre 1999
158,1206,7,4.428571,"Clockwork Orange, A (1971)",Crime|Drama|Sci-Fi|Thriller,clockwork orange a 1971,31.0,clockwork orange a 1971
39,308,7,4.285714,Three Colors: White (Trzy kolory: Bialy) (1994),Comedy|Drama,three colors white trzy kolory bialy 1994,30.0,three colors white trzy kolory bialy 1994
...,...,...,...,...,...,...,...,...
538,4967,3,4.000000,No Man's Land (2001),Drama|War,no mans land 2001,12.0,no mans land 2001
366,2726,3,4.000000,"Killing, The (1956)",Crime|Film-Noir,killing the 1956,12.0,killing the 1956
725,7941,3,4.000000,Smiles of a Summer Night (Sommarnattens leende...,Comedy|Romance,smiles of a summer night sommarnattens leende ...,12.0,smiles of a summer night sommarnattens leende ...
116,922,3,4.000000,Sunset Blvd. (a.k.a. Sunset Boulevard) (1950),Drama|Film-Noir|Romance,sunset blvd aka sunset boulevard 1950,12.0,sunset blvd aka sunset boulevard 1950


In [39]:
# Display the first 15 movie titles and genres
num_movies = 15
movie_subset = top_recs[['title', 'genres']].head(num_movies)

# Format and display the movie titles and genres
print(f"{'Title': <50} {'Genres'}")
print("-" * 70)
for _, row in movie_subset.iterrows():
    title = row['title']
    genres = row['genres']
    print(f"{title: <50} {genres}")

Title                                              Genres
----------------------------------------------------------------------
Amores Perros (Love's a Bitch) (2000)              Drama|Thriller
Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1964) Comedy|War
All About My Mother (Todo sobre mi madre) (1999)   Drama
Clockwork Orange, A (1971)                         Crime|Drama|Sci-Fi|Thriller
Three Colors: White (Trzy kolory: Bialy) (1994)    Comedy|Drama
Mulholland Drive (2001)                            Crime|Drama|Film-Noir|Mystery|Thriller
Blue Velvet (1986)                                 Drama|Mystery|Thriller
Reservoir Dogs (1992)                              Crime|Mystery|Thriller
Shining, The (1980)                                Horror
Monty Python and the Holy Grail (1975)             Adventure|Comedy|Fantasy
Seven Samurai (Shichinin no samurai) (1954)        Action|Adventure|Drama
Psycho (1960)                                      Crime|Horror
American