In [96]:
#Importieren von wichtigen Plugins: Pandas für Dataframes, Surprise für Empfehlungsalgorithmen
import pandas as pd
import numpy as np
import math
import re
from scipy.sparse import csr_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from surprise import Reader, Dataset, SVD
sns.set_style("darkgrid")

In [97]:
#Importieren vom Datensatz
#jeder Eintrag besteht aus: einem User (mit ID), der einem Film (mit ID), ein Rating auf einer Skala von 1-5 (als Float) gibt 
df = pd.read_csv('ratings.csv')
print(df.head())

   userId  movieId  rating  timestamp
0       1        1     4.0  964982703
1       1        3     4.0  964981247
2       1        6     4.0  964982224
3       1       47     5.0  964983815
4       1       50     5.0  964982931


In [98]:
#Umbenennen der Spalten, um sie nutzbar für die Surprise Algorithmen zu machen
df = df.rename(columns={"userId": "Cust_Id", "rating": "Rating", "movieId": "Movie_Id"})
print(df.head())

   Cust_Id  Movie_Id  Rating  timestamp
0        1         1     4.0  964982703
1        1         3     4.0  964981247
2        1         6     4.0  964982224
3        1        47     5.0  964983815
4        1        50     5.0  964982931


In [99]:
#Timestamp ist momentan unwichtig und kann aus dem Dataframe entfernt werden
df = df.drop(['timestamp'], axis=1)
print(df.head())

   Cust_Id  Movie_Id  Rating
0        1         1     4.0
1        1         3     4.0
2        1         6     4.0
3        1        47     5.0
4        1        50     5.0


In [100]:
#Die Spalten Rating und Movie_Id werden getauscht, um das Dataframe in ein nutzbareres Format zu bringen
df = df[['Cust_Id','Rating','Movie_Id']]
print(df.head())

   Cust_Id  Rating  Movie_Id
0        1     4.0         1
1        1     4.0         3
2        1     4.0         6
3        1     5.0        47
4        1     5.0        50


In [101]:
#Datensatz wird getrimmt auf Filme und Nutzer, die genügend Bewertungen haben
f = ['count','mean']

df_movie_summary = df.groupby('Movie_Id')['Rating'].agg(f)
df_movie_summary.index = df_movie_summary.index.map(int)
movie_benchmark = round(df_movie_summary['count'].quantile(0.7),0)
drop_movie_list = df_movie_summary[df_movie_summary['count'] < movie_benchmark].index

print('Movie minimum times of review: {}'.format(movie_benchmark))

df_cust_summary = df.groupby('Cust_Id')['Rating'].agg(f)
df_cust_summary.index = df_cust_summary.index.map(int)
cust_benchmark = round(df_cust_summary['count'].quantile(0.7),0)
drop_cust_list = df_cust_summary[df_cust_summary['count'] < cust_benchmark].index

print('Customer minimum times of review: {}'.format(cust_benchmark))

Movie minimum times of review: 7.0
Customer minimum times of review: 140.0


In [102]:
#Vergleichen von Datensatz vorher/nachher
print('Original Shape: {}'.format(df.shape))
df = df[~df['Movie_Id'].isin(drop_movie_list)]
df = df[~df['Cust_Id'].isin(drop_cust_list)]
print('After Trim Shape: {}'.format(df.shape))

Original Shape: (100836, 3)
After Trim Shape: (63950, 3)


In [103]:
#Umformung vom Datensatz in eine Matrix
df_p = pd.pivot_table(df,values='Rating',index='Cust_Id',columns='Movie_Id')

print(df_p.shape)
print(df_p.head())

(185, 2949)
Movie_Id  1       2       3       4       5       6       7       8       \
Cust_Id                                                                    
1            4.0     NaN     4.0     NaN     NaN     4.0     NaN     NaN   
4            NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN   
6            NaN     4.0     5.0     3.0     5.0     4.0     4.0     3.0   
7            4.5     NaN     NaN     NaN     NaN     NaN     NaN     NaN   
10           NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN   

Movie_Id  9       10      ...  171763  174055  175303  176101  176371  177593  \
Cust_Id                   ...                                                   
1            NaN     NaN  ...     NaN     NaN     NaN     NaN     NaN     NaN   
4            NaN     NaN  ...     NaN     NaN     NaN     NaN     NaN     NaN   
6            NaN     3.0  ...     NaN     NaN     NaN     NaN     NaN     NaN   
7            NaN     NaN  ...     NaN     NaN     

In [104]:

from surprise.model_selection import cross_validate

In [105]:
#Cross Validieren
reader = Reader()

data = Dataset.load_from_df(df[['Cust_Id', 'Movie_Id', 'Rating']], reader)

svd = SVD()
cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8284  0.8352  0.8303  0.8372  0.8308  0.8324  0.0033  
MAE (testset)     0.6335  0.6401  0.6406  0.6432  0.6386  0.6392  0.0032  
Fit time          3.91    3.94    3.91    3.92    3.86    3.91    0.03    
Test time         0.11    0.11    0.24    0.11    0.11    0.14    0.05    


{'test_rmse': array([0.82844403, 0.83523676, 0.8303467 , 0.8371507 , 0.830843  ]),
 'test_mae': array([0.63353926, 0.64005855, 0.64061441, 0.64318186, 0.63859391]),
 'fit_time': (3.9121649265289307,
  3.9405651092529297,
  3.908690929412842,
  3.923476219177246,
  3.8638060092926025),
 'test_time': (0.11148691177368164,
  0.11118316650390625,
  0.24071097373962402,
  0.11483120918273926,
  0.10877013206481934)}

In [14]:
#Aufteilen des Datensatzes in Trainings- und Testdaten
from surprise import accuracy
from surprise.model_selection import train_test_split
trainset, testset = train_test_split(data, test_size=.25)

In [15]:
#Anwenden des SVD Algorithmus, um Ratings vorherzusagen 
algo = SVD()
algo.fit(trainset)
predictions = algo.test(testset)

#Ausgabe des RootMeanSquaredError und MeanAbsoluteError
print(accuracy.rmse(predictions))
print(accuracy.mae(predictions))

RMSE: 0.8333
MAE:  0.6401


0.6401135311573707

In [21]:
pred_df = pd.DataFrame(predictions)
print(pred_df.head())

Unnamed: 0,uid,iid,r_ui,est,details
0,139,52722,1.0,1.529562,{'was_impossible': False}
1,599,733,3.5,2.789235,{'was_impossible': False}
2,91,3070,5.0,3.431258,{'was_impossible': False}
3,177,79695,2.0,3.049844,{'was_impossible': False}
4,414,2640,3.0,3.797993,{'was_impossible': False}


In [24]:
#Zusammenfassung der Film_Ids und Titel
df_title = pd.read_csv('movies.csv')
df_title.set_index('movieId', inplace = True)
df_title = df_title.drop(['genres'], axis=1)
print (df_title.head(10))

                                      title
movieId                                    
1                          Toy Story (1995)
2                            Jumanji (1995)
3                   Grumpier Old Men (1995)
4                  Waiting to Exhale (1995)
5        Father of the Bride Part II (1995)
6                               Heat (1995)
7                            Sabrina (1995)
8                       Tom and Huck (1995)
9                       Sudden Death (1995)
10                         GoldenEye (1995)


In [26]:
#Formatierung derselbigen
df_title = df_title.rename(columns={"movieId": "Movie_Id", "title": "Name"})
df_title.index.names = ['Movie_Id']
print(df_title.head())

Unnamed: 0_level_0,Name
Movie_Id,Unnamed: 1_level_1
1,Toy Story (1995)
2,Jumanji (1995)
3,Grumpier Old Men (1995)
4,Waiting to Exhale (1995)
5,Father of the Bride Part II (1995)
...,...
193581,Black Butler: Book of the Atlantic (2017)
193583,No Game No Life: Zero (2017)
193585,Flint (2017)
193587,Bungo Stray Dogs: Dead Apple (2018)


In [72]:
#Methode, um 5 Star Ratings einzelner User auszulesen
def customersupport(user):
    dframeUser = df[(df['Cust_Id'] == user) & (df['Rating'] == 5)]
    dframeUser = dframeUser.set_index('Movie_Id')
    dframeUser = dframeUser.join(df_title)['Name']
    print(dframeUser)

In [73]:
#Beispiel
customersupport(1)

Movie_Id
47               Seven (a.k.a. Se7en) (1995)
50                Usual Suspects, The (1995)
101                     Bottle Rocket (1996)
151                           Rob Roy (1995)
157                    Canadian Bacon (1995)
                        ...                 
3671                  Blazing Saddles (1974)
3702                          Mad Max (1979)
3703    Road Warrior, The (Mad Max 2) (1981)
3793                            X-Men (2000)
5060            M*A*S*H (a.k.a. MASH) (1970)
Name: Name, Length: 118, dtype: object


In [74]:
#Nutzen der Daten, um Vorschläge für User zu machen
def makerecommendation(user):
    userdf = df_title.copy()
    userdf = userdf.reset_index()
    userdf = userdf[~userdf['Movie_Id'].isin(drop_movie_list)]

    # getting full dataset
    data = Dataset.load_from_df(df[['Cust_Id', 'Movie_Id', 'Rating']], reader)

    trainset = data.build_full_trainset()
    algo.fit(trainset)

    userdf['Estimate_Score'] = userdf['Movie_Id'].apply(lambda x: svd.predict(user, x).est)

    userdf = userdf.drop('Movie_Id', axis = 1)

    userdf = userdf.sort_values('Estimate_Score', ascending=False)
    print(userdf.head(10))

In [86]:
#Beispiel anhand von User 100
print(makerecommendation(100))

                                                   Name  Estimate_Score
906                           Lawrence of Arabia (1962)        4.840041
690                           North by Northwest (1959)        4.779253
2226                                  Fight Club (1999)        4.773524
254   Léon: The Professional (a.k.a. The Professiona...        4.738114
704       Sunset Blvd. (a.k.a. Sunset Boulevard) (1950)        4.708379
4909       Eternal Sunshine of the Spotless Mind (2004)        4.681310
6710                            Dark Knight, The (2008)        4.665758
920                                       Psycho (1960)        4.644980
2996                                      Snatch (2000)        4.640222
911   Star Wars: Episode VI - Return of the Jedi (1983)        4.635410


In [87]:
good_prediction =  pred_df['est']>4.0
df_good_prediction = pred_df[good_prediction]
df_good_prediction = df_good_prediction.filter(["uid", "iid", "r_ui", "est"])
print(df_good_prediction.shape)

(3017, 4)

In [88]:
good_ratings =  pred_df['r_ui']>4.0
df_good_ratings = pred_df[good_ratings]
df_good_ratings = df_good_ratings.filter(["uid", "iid", "r_ui", "est"])
print(df_good_ratings.shape)

(3198, 4)

In [106]:
pred_df['diff'] = np.abs(pred_df['est'] - pred_df['r_ui'])
print(pred_df.head())
print(pred_df.shape)

   uid    iid  r_ui       est                    details      diff
0  139  52722   1.0  1.529562  {'was_impossible': False}  0.529562
1  599    733   3.5  2.789235  {'was_impossible': False}  0.710765
2   91   3070   5.0  3.431258  {'was_impossible': False}  1.568742
3  177  79695   2.0  3.049844  {'was_impossible': False}  1.049844
4  414   2640   3.0  3.797993  {'was_impossible': False}  0.797993
(15988, 6)


In [90]:
bad_predictions =  pred_df['diff']>2
df_bad_predictions = pred_df[bad_predictions]
df_bad_predictions = df_bad_predictions.filter(["uid", "iid", "r_ui", "est", "diff"])
print(df_bad_predictions.head())
print(df_bad_predictions.shape)


     uid   iid  r_ui       est      diff
39   477  2798   0.5  2.755889  2.255889
69    68  2542   1.0  3.813525  2.813525
91    51  2402   1.0  3.429765  2.429765
110  603  3566   1.0  3.507752  2.507752
134   42  3438   1.0  3.313618  2.313618
(402, 5)


In [91]:
max_values_df = pred_df.max()
print(max_values_df)

uid        610.000000
iid     180031.000000
r_ui         5.000000
est          5.000000
diff         3.820974
dtype: float64


In [92]:
maxdiff = pred_df['diff']==pred_df['diff'].max()
maxdiffrow = pred_df[maxdiff]
maxdiffrow = maxdiffrow.filter(["uid", "iid", "r_ui", "est", "diff"])
print(maxdiffrow)

      uid   iid  r_ui       est      diff
8912  580  1203   0.5  4.320974  3.820974


In [93]:
print(customersupport(580))

Movie_Id
32               Twelve Monkeys (a.k.a. 12 Monkeys) (1995)
47                             Seven (a.k.a. Se7en) (1995)
50                              Usual Suspects, The (1995)
70                              From Dusk Till Dawn (1996)
293      Léon: The Professional (a.k.a. The Professiona...
296                                    Pulp Fiction (1994)
318                       Shawshank Redemption, The (1994)
541                                    Blade Runner (1982)
778                                   Trainspotting (1996)
1080                   Monty Python's Life of Brian (1979)
1089                                 Reservoir Dogs (1992)
1136                Monty Python and the Holy Grail (1975)
1222                              Full Metal Jacket (1987)
1884                 Fear and Loathing in Las Vegas (1998)
2028                            Saving Private Ryan (1998)
2571                                    Matrix, The (1999)
2858                                American Be

In [94]:
print(makerecommendation(580))

                                  Name  Estimate_Score
659              Godfather, The (1972)        4.785878
828              Reservoir Dogs (1992)        4.743021
510   Silence of the Lambs, The (1991)        4.704300
2226                 Fight Club (1999)        4.660820
1503        Saving Private Ryan (1998)        4.615548
1939                Matrix, The (1999)        4.596039
257                Pulp Fiction (1994)        4.590528
1298          Big Lebowski, The (1998)        4.548849
906          Lawrence of Arabia (1962)        4.541292
4615          Kill Bill: Vol. 1 (2003)        4.509380


In [95]:
maxdiffrow = maxdiffrow.rename(columns={"iid": "Movie_Id"})
maxdiffrow = maxdiffrow.set_index('Movie_Id')
maxdiffrow = maxdiffrow.join(df_title)['Name']
print(maxdiffrow)

Movie_Id
1203    12 Angry Men (1957)
Name: Name, dtype: object

In [109]:
def movieid2name(id):
    movie = df_title.loc[id]
    print(movie)

In [112]:
movieid2name(100)

Name    City Hall (1996)
Name: 100, dtype: object
