In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

In [2]:
movies = pd.read_csv('data/movies.csv', sep=',', usecols=range(2))
ratings = pd.read_csv('data/ratings.csv', sep=',', usecols=range(3))


In [8]:
df = pd.merge(movies, ratings)

In [4]:
df.head()

Unnamed: 0,movieId,title,userId,rating
0,1,Toy Story (1995),1,4.0
1,1,Toy Story (1995),5,4.0
2,1,Toy Story (1995),7,4.5
3,1,Toy Story (1995),15,2.5
4,1,Toy Story (1995),17,4.5


In [9]:
dfRating= df.pivot_table(columns='title', values='rating', index='userId')

In [6]:
dfRating.head()

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,4.0,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,


### We could at correlation between users or correlation between movies

In [10]:
# Extract users that rated Toy Story (1995)

toyStor = dfRating['Toy Story (1995)']
toyStor.tail()

userId
606    2.5
607    4.0
608    2.5
609    3.0
610    5.0
Name: Toy Story (1995), dtype: float64

In [11]:
similarMovie = dfRating.corrwith(toyStor)
similarMovie = similarMovie.dropna()
similarMovie.sort_values(ascending=False)

  c = cov(x, y, rowvar, dtype=dtype)
  c *= np.true_divide(1, fact)
  c *= np.true_divide(1, fact)
  c /= stddev[:, None]
  c /= stddev[None, :]


title
Land Before Time III: The Time of the Great Giving (1995)    1.0
St. Vincent (2014)                                           1.0
Mind Game (2004)                                             1.0
Misfits, The (1961)                                          1.0
Veronica Mars (2014)                                         1.0
                                                            ... 
True Story (2015)                                           -1.0
Truly, Madly, Deeply (1991)                                 -1.0
Mo' Money (1992)                                            -1.0
Magic Mike (2012)                                           -1.0
Red 2 (2013)                                                -1.0
Length: 4767, dtype: float64

## This is not giving a good result because some other movies that have less user ratings are obscoring the correlation

In [22]:
movieStat = df.groupby('title').agg({'rating': [np.size, np.mean]})
movieStat.tail()

  movieStat = df.groupby('title').agg({'rating': [np.size, np.mean]})


Unnamed: 0_level_0,rating,rating
Unnamed: 0_level_1,size,mean
title,Unnamed: 1_level_2,Unnamed: 2_level_2
eXistenZ (1999),22,3.863636
xXx (2002),24,2.770833
xXx: State of the Union (2005),5,2.0
¡Three Amigos! (1986),26,3.134615
À nous la liberté (Freedom for Us) (1931),1,1.0


In [23]:
popularMovies = movieStat['rating']['size'] >= 120
movieStat[popularMovies].sort_values([('rating', 'size')], ascending=False)[:20]

Unnamed: 0_level_0,rating,rating
Unnamed: 0_level_1,size,mean
title,Unnamed: 1_level_2,Unnamed: 2_level_2
Forrest Gump (1994),329,4.164134
"Shawshank Redemption, The (1994)",317,4.429022
Pulp Fiction (1994),307,4.197068
"Silence of the Lambs, The (1991)",279,4.16129
"Matrix, The (1999)",278,4.192446
Star Wars: Episode IV - A New Hope (1977),251,4.231076
Jurassic Park (1993),238,3.75
Braveheart (1995),237,4.031646
Terminator 2: Judgment Day (1991),224,3.970982
Schindler's List (1993),220,4.225


## Similar movies based on popular rated movies

In [29]:
movieStat.columns = ['rating_size', 'rating_mean']
popularMovies = movieStat['rating_size'] >= 110

similarMovies = movieStat[popularMovies].join(
    pd.DataFrame(similarMovie, columns=['similarity'])
)
similarMovies.sort_values(by='similarity', ascending=False)

Unnamed: 0_level_0,rating_size,rating_mean,similarity
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Toy Story (1995),215,3.920930,1.000000
Toy Story 2 (1999),97,3.860825,0.699211
"Incredibles, The (2004)",125,3.836000,0.643301
Finding Nemo (2003),141,3.960993,0.618701
Aladdin (1992),183,3.792350,0.611892
...,...,...,...
"Bourne Identity, The (2002)",112,3.816964,-0.070729
"Matrix Reloaded, The (2003)",96,3.354167,-0.087528
Natural Born Killers (1994),92,3.233696,-0.091430
Stargate (1994),140,3.375000,-0.124225


Unnamed: 0_level_0,rating_size,rating_mean,similarity
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Toy Story (1995),215,3.920930,1.000000
"Incredibles, The (2004)",125,3.836000,0.643301
Finding Nemo (2003),141,3.960993,0.618701
Aladdin (1992),183,3.792350,0.611892
"Monsters, Inc. (2001)",132,3.871212,0.490231
...,...,...,...
"Rock, The (1996)",121,3.640496,-0.038208
Good Will Hunting (1997),141,4.078014,-0.044629
Interview with the Vampire: The Vampire Chronicles (1994),109,3.458716,-0.061637
"Bourne Identity, The (2002)",112,3.816964,-0.070729


In [46]:
import joblib

In [30]:
corrMatrix = dfRating.corr(method='pearson', min_periods=100)

In [47]:
joblib.dump(corrMatrix, "corr_matrix.joblib")

['corr_matrix.joblib']

In [48]:
corrMatrix = joblib.load("corr_matrix.joblib")

In [42]:
def recommendationUserFiltering(userRatingDict):
    similarMovies = pd.Series(dtype='float64')
    
    for movieTitle, rating in userRatingDict.items():
        similar = corrMatrix[movieTitle].dropna()
        similar = similar.map(lambda x: x * rating )
        
        similarMovies = similarMovies.add(similar, fill_value=0)
        
    # Remove the movies the user already rated
    similarMovies = similarMovies.drop(labels=userRatingDict.keys(), errors="ignore")    
    similarMovies = similarMovies.sort_values(ascending=False)
        
    return similarMovies
    

In [43]:
userRatingDict = {'Good Will Hunting (1997)': 2, 'Star Wars: Episode IV - A New Hope (1977)': 4 }
recommendationUserFiltering(userRatingDict)

Good Will Hunting (1997)
Star Wars: Episode IV - A New Hope (1977)


title
Star Wars: Episode V - The Empire Strikes Back (1980)                             3.111881
Star Wars: Episode VI - Return of the Jedi (1983)                                  2.93692
Matrix, The (1999)                                                                 1.93035
Shawshank Redemption, The (1994)                                                  1.670588
Indiana Jones and the Last Crusade (1989)                                         1.643663
Lord of the Rings: The Return of the King, The (2003)                             1.626408
Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981)    1.539116
Godfather, The (1972)                                                              1.46368
Jurassic Park (1993)                                                              1.460523
Terminator 2: Judgment Day (1991)                                                 1.405914
Gladiator (2000)                                                                  1.

In [51]:
from fastapi import FastAPI
from model import recommender

In [50]:
pip install fastapi

python3.10(10289) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


Collecting fastapi
  Downloading fastapi-0.116.1-py3-none-any.whl.metadata (28 kB)
Collecting starlette<0.48.0,>=0.40.0 (from fastapi)
  Downloading starlette-0.47.2-py3-none-any.whl.metadata (6.2 kB)
Collecting pydantic!=1.8,!=1.8.1,!=2.0.0,!=2.0.1,!=2.1.0,<3.0.0,>=1.7.4 (from fastapi)
  Downloading pydantic-2.11.7-py3-none-any.whl.metadata (67 kB)
Collecting annotated-types>=0.6.0 (from pydantic!=1.8,!=1.8.1,!=2.0.0,!=2.0.1,!=2.1.0,<3.0.0,>=1.7.4->fastapi)
  Downloading annotated_types-0.7.0-py3-none-any.whl.metadata (15 kB)
Collecting pydantic-core==2.33.2 (from pydantic!=1.8,!=1.8.1,!=2.0.0,!=2.0.1,!=2.1.0,<3.0.0,>=1.7.4->fastapi)
  Downloading pydantic_core-2.33.2-cp310-cp310-macosx_10_12_x86_64.whl.metadata (6.8 kB)
Collecting typing-inspection>=0.4.0 (from pydantic!=1.8,!=1.8.1,!=2.0.0,!=2.0.1,!=2.1.0,<3.0.0,>=1.7.4->fastapi)
  Downloading typing_inspection-0.4.1-py3-none-any.whl.metadata (2.6 kB)
Downloading fastapi-0.116.1-py3-none-any.whl (95 kB)
Downloading pydantic-2.11.7-p

In [52]:
app = FastAPI()

@app.post("/recommend/")
def get_recommendations(userRatings: dict):
    recommendations = recommender(userRatings)
    return recommendations.head(10).to_dict()

In [53]:
userRatingDict = {'Good Will Hunting (1997)': 2, 'Star Wars: Episode IV - A New Hope (1977)': 4 }
get_recommendations(userRatingDict)

{'Star Wars: Episode V - The Empire Strikes Back (1980)': 3.111881244509786,
 'Star Wars: Episode VI - Return of the Jedi (1983)': 2.936920097453675,
 'Matrix, The (1999)': 1.930349931251961,
 'Shawshank Redemption, The (1994)': 1.6705877020974076,
 'Indiana Jones and the Last Crusade (1989)': 1.6436634531130165,
 'Lord of the Rings: The Return of the King, The (2003)': 1.6264078978355,
 'Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981)': 1.5391163104383685,
 'Godfather, The (1972)': 1.4636804285716432,
 'Jurassic Park (1993)': 1.4605233340603483,
 'Terminator 2: Judgment Day (1991)': 1.40591353962007}