# Import Library

In [1]:
import numpy as np
import pandas as pd
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate


import warnings
warnings.filterwarnings('ignore')

# Describe The Data

In [2]:
df = pd.read_csv("data/collaborative_filtering.csv")
df.head()

Unnamed: 0,userId,movie,rating
0,1,One Flew Over the Cuckoo's Nest (1975),5
1,1,James and the Giant Peach (1996),3
2,1,My Fair Lady (1964),3
3,1,Erin Brockovich (2000),4
4,1,"Bug's Life, A (1998)",5


In [3]:
df.userId.nunique()

6040

In [4]:
df.shape 

(1000209, 3)

In [5]:
df.isnull().sum()

userId    0
movie     0
rating    0
dtype: int64

In [6]:
df.dropna(thresh=2, inplace=True)
df.fillna(method='ffill', inplace=True)
df.drop_duplicates(inplace=True)

In [7]:
df['movie'].value_counts().head()

American Beauty (1999)                                   3428
Star Wars: Episode IV - A New Hope (1977)                2991
Star Wars: Episode V - The Empire Strikes Back (1980)    2990
Star Wars: Episode VI - Return of the Jedi (1983)        2883
Jurassic Park (1993)                                     2672
Name: movie, dtype: int64

# Training dan Modeling

In [8]:
data = Dataset.load_from_df(df, Reader())
trainset = data.build_full_trainset()

In [9]:
model = SVD()
model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1cd80038460>

# Result

### Sebutkan user mana yang ingin di rekomendasikan ! 

In [10]:
user_id=3

### Berikut adalah semua nama film yang sudah di rating 

In [11]:
all_movies = df.movie.unique()
all_movies

array(["One Flew Over the Cuckoo's Nest (1975)",
       'James and the Giant Peach (1996)', 'My Fair Lady (1964)', ...,
       'White Boys (1999)', 'One Little Indian (1973)',
       'Five Wives, Three Secretaries and Me (1998)'], dtype=object)

### Berikut semua nama film yang sudah di rating oleh user id = 3

In [12]:
watched = df[df.userId == 3].movie
watched.head()

182           Animal House (1978)
183        Full Monty, The (1997)
184    Mission: Impossible (1996)
185        Raising Arizona (1987)
186                28 Days (2000)
Name: movie, dtype: object

### Berikut adalah nama film yang belum dirating oleh user id = 3 

In [13]:
not_watched = [movie for movie in all_movies if movie not in watched]
not_watched

["One Flew Over the Cuckoo's Nest (1975)",
 'James and the Giant Peach (1996)',
 'My Fair Lady (1964)',
 'Erin Brockovich (2000)',
 "Bug's Life, A (1998)",
 'Princess Bride, The (1987)',
 'Ben-Hur (1959)',
 'Christmas Story, A (1983)',
 'Snow White and the Seven Dwarfs (1937)',
 'Wizard of Oz, The (1939)',
 'Beauty and the Beast (1991)',
 'Gigi (1958)',
 'Miracle on 34th Street (1947)',
 "Ferris Bueller's Day Off (1986)",
 'Sound of Music, The (1965)',
 'Airplane! (1980)',
 'Tarzan (1999)',
 'Bambi (1942)',
 'Awakenings (1990)',
 'Big (1988)',
 'Pleasantville (1998)',
 'Wallace & Gromit: The Best of Aardman Animation (1996)',
 'Back to the Future (1985)',
 "Schindler's List (1993)",
 'Meet Joe Black (1998)',
 'Pocahontas (1995)',
 'E.T. the Extra-Terrestrial (1982)',
 'Titanic (1997)',
 'Ponette (1996)',
 'Close Shave, A (1995)',
 'Antz (1998)',
 'Girl, Interrupted (1999)',
 'Hercules (1997)',
 'Aladdin (1992)',
 'Mulan (1998)',
 'Hunchback of Notre Dame, The (1996)',
 'Last Days of Di

In [14]:
score = [model.predict(user_id, movie).est for movie in not_watched]
score

[4.635190200022771,
 3.741177183047011,
 4.231431191409329,
 3.812561470257648,
 4.157091302161083,
 4.561848924685062,
 4.187808388803168,
 4.3327104469814035,
 4.067422899543056,
 4.488919759019826,
 4.1389617393646025,
 3.689062071603995,
 4.098230611631356,
 4.245052930423249,
 4.390554028913032,
 4.057276506344334,
 3.9049384395521947,
 3.9337487332105674,
 4.010547800708797,
 4.325794475033181,
 3.3404883035650856,
 4.6418564693750675,
 3.7913538273661467,
 4.4999616378786556,
 3.6456646798947867,
 3.4224518824142183,
 4.3294282717666075,
 3.3967412861765025,
 4.239752158182225,
 4.664898265539716,
 3.9057639634787042,
 3.5213898371650743,
 3.349016002248321,
 4.057324413662117,
 4.047731185423235,
 3.5664256515993946,
 3.265277724329681,
 4.19267299486932,
 4.440177617353751,
 4.154440008349314,
 4.121621786649576,
 4.299786231056573,
 4.306883765475977,
 4.273238924438374,
 4.564184133579995,
 4.321853607779657,
 4.012484814674951,
 4.481064890316792,
 4.435825755961417,
 4.018

### Berikut film-film yang direkomendasikan untuk user_id = 3 

In [15]:
result = pd.DataFrame({"movie" : not_watched, "pred_score":score})
result.sort_values("pred_score", ascending=False, inplace=True)
result.head(10)

Unnamed: 0,movie,pred_score
2617,Sanjuro (1962),4.881108
199,Butch Cassidy and the Sundance Kid (1969),4.713002
2473,Pather Panchali (1955),4.711485
1520,Creature Comforts (1990),4.686397
2471,Cold Fever (� k�ldum klaka) (1994),4.677015
29,"Close Shave, A (1995)",4.664898
21,Wallace & Gromit: The Best of Aardman Animatio...,4.641856
0,One Flew Over the Cuckoo's Nest (1975),4.63519
1182,Some Folks Call It a Sling Blade (1993),4.628328
167,"Shawshank Redemption, The (1994)",4.613352


In [16]:
df[df['userId']==3]

Unnamed: 0,userId,movie,rating
182,3,Animal House (1978),4
183,3,"Full Monty, The (1997)",2
184,3,Mission: Impossible (1996),3
185,3,Raising Arizona (1987),4
186,3,28 Days (2000),3
187,3,Happy Gilmore (1996),4
188,3,"Golden Child, The (1986)",4
189,3,Star Wars: Episode VI - Return of the Jedi (1983),4
190,3,Beverly Hills Ninja (1997),3
191,3,"Naked Gun: From the Files of Police Squad!, Th...",3


# Summary

In [17]:
def recommendations(user_id):
    all_movies = df.movie.unique()
    watched = df[df.userId == user_id].movie
    not_watched = [movie for movie in all_movies if movie not in watched]
    score = [model.predict(user_id, movie).est for movie in not_watched]
    result = pd.DataFrame({"movie" : not_watched, "pred_score":score})
    result.sort_values("pred_score", ascending=False, inplace=True)
    return result.head(10)


recommendations(user_id=90)

Unnamed: 0,movie,pred_score
206,Monty Python and the Holy Grail (1974),4.841328
132,"Matrix, The (1999)",4.648727
244,Pulp Fiction (1994),4.63406
349,Shanghai Noon (2000),4.61604
756,Monty Python's Life of Brian (1979),4.501911
51,Fargo (1996),4.491476
7,"Christmas Story, A (1983)",4.47619
2617,Sanjuro (1962),4.475618
177,Raising Arizona (1987),4.469787
669,"Godfather, The (1972)",4.447809


In [39]:
import mysql.connector
import pandas as pd
import numpy as np
from scipy.spatial.distance import hamming
import warnings
warnings.filterwarnings("ignore", category=RuntimeWarning)


df = pd.read_csv('data/collaborative_filtering.csv')

df.dropna(thresh=2, inplace=True)
df.fillna(method='ffill', inplace=True)
df.drop_duplicates(inplace=True)

userRatingMatrix = pd.pivot_table(df, index=['userId'],columns=['movie'] ,values='rating')

def distance(user1, user2):
    try:
        user1_ratings = userRatingMatrix.transpose()[user1]
        user2_ratings = userRatingMatrix.transpose()[user2]
        distance = hamming(user1_ratings, user2_ratings)
    except:
        distance = np.nan
    return distance

def nearestNeighbours(userId, K=10):
    allusers = pd.DataFrame(userRatingMatrix.index)
    allusers = allusers[allusers.userId != userId]
    allusers['distance'] = allusers['userId'].apply(lambda x: distance(userId, x))
    KNearestUsers = allusers.sort_values(['distance'], ascending=True)['userId'][:K]
    return KNearestUsers


def recommend(userId, id_client = None, N=10):
    KnearestUsers = nearestNeighbours(userId)
    NNRatings = userRatingMatrix[userRatingMatrix.index.isin(KnearestUsers)]
    avgRating = NNRatings.apply(np.nanmean).dropna()
    ProductAlreadyRate = userRatingMatrix.transpose()[userId].dropna().index
    avgRating = avgRating[~avgRating.index.isin(ProductAlreadyRate)]
    topNProductId = avgRating.sort_values(ascending=False).index[:N]
    result1 =  pd.DataFrame({'movie':pd.Series(topNProductId)})
    return result1

recommend(userId=1)

Unnamed: 0,movie
0,"Night to Remember, A (1958)"
1,What Ever Happened to Baby Jane? (1962)
2,Cinema Paradiso (1988)
3,Get Real (1998)
4,Without Limits (1998)
5,On the Town (1949)
6,Gentleman's Agreement (1947)
7,"Mis�rables, Les (1995)"
8,Gaslight (1944)
9,"Year My Voice Broke, The (1987)"


In [38]:
def recommendations(user_id):
    all_movies = df.movie.unique()
    watched = df[df.userId == user_id].movie
    not_watched = [movie for movie in all_movies if movie not in watched]
    score = [model.predict(user_id, movie).est for movie in not_watched]
    result = pd.DataFrame({"movie" : not_watched, "pred_score":score})
    result.sort_values("pred_score", ascending=False, inplace=True)
    return result.head(10)


recommendations(user_id=1)

Unnamed: 0,movie,pred_score
23,Schindler's List (1993),4.956423
38,"Sixth Sense, The (1999)",4.836377
167,"Shawshank Redemption, The (1994)",4.829839
48,Saving Private Ryan (1998),4.821355
40,Toy Story (1995),4.775883
171,Forrest Gump (1994),4.764282
2180,Central Station (Central do Brasil) (1998),4.747664
2617,Sanjuro (1962),4.692411
92,Braveheart (1995),4.677459
0,One Flew Over the Cuckoo's Nest (1975),4.670534


In [36]:
df[df['movie']=='Creature Comforts (1990)']

Unnamed: 0,userId,movie,rating
4216,32,Creature Comforts (1990),3
8221,57,Creature Comforts (1990),4
13935,112,Creature Comforts (1990),3
16660,131,Creature Comforts (1990),4
19967,149,Creature Comforts (1990),3
...,...,...,...
986599,5956,Creature Comforts (1990),4
986885,5957,Creature Comforts (1990),3
988068,5964,Creature Comforts (1990),5
995618,6013,Creature Comforts (1990),5


In [37]:
df[df['userId']==1]

Unnamed: 0,userId,movie,rating
0,1,One Flew Over the Cuckoo's Nest (1975),5
1,1,James and the Giant Peach (1996),3
2,1,My Fair Lady (1964),3
3,1,Erin Brockovich (2000),4
4,1,"Bug's Life, A (1998)",5
5,1,"Princess Bride, The (1987)",3
6,1,Ben-Hur (1959),5
7,1,"Christmas Story, A (1983)",5
8,1,Snow White and the Seven Dwarfs (1937),4
9,1,"Wizard of Oz, The (1939)",4
