In [1]:
import pandas as pd
import seaborn as sns
import random
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Ranking algorithm: weighted rank
# We are using a **ranking algorithm** to combine mean and count into a single number:
#
#  weighted_rank = R * v / (v + m) + C * m / (v + m)
#
#  R - average rating for one movie
#  v - number of votes for one movie
#  C - average of all individual ratings across all movies
#  m - threshold hyperparameter (set this manually)

In [3]:
# Load movie ratings data (small set).
# The paths are relative, so the 'ml-latest-small' folder has to sit in the
# notebook's working directory.
df_movies = pd.read_csv('ml-latest-small/movies.csv')
df_ratings = pd.read_csv('ml-latest-small/ratings.csv')
# NOTE(review): df_tags and df_links are not referenced by any other cell shown
# in this notebook - confirm whether they are still needed.
df_tags = pd.read_csv('ml-latest-small/tags.csv')
df_links = pd.read_csv('ml-latest-small/links.csv')

In [4]:
# Attach the columns of movies.csv to every rating row, matching on movieId
# (pd.merge performs an inner join unless `how` is given).
data=pd.merge(df_ratings,df_movies,on='movieId')

In [5]:
#number of different movie Ids
#df_ls.movieId.unique().shape

In [6]:
#set(df_ls['rating'])

In [7]:
#df_ls_chartlist

In [8]:
# sns.histplot(df_ls_chartlist[['weighted_rank','av_rating']])
# #plt.ylim(0, 1500)
# #plt.xlim(0, 5.2)
# plt.show()
# plt.close()
# sns.histplot(df_ls_chartlist['no_of_ratings'],bins=30)
# plt.ylim(0, 2000)  # first bin -> ~8000
# plt.show()

In [9]:
# Correlation between average rating and number of ratings
# df_ls_chartlist[['no_of_ratings','av_rating']].corr()

In [10]:
def calculate_weighted_rank(data,m=100):
    """Add per-movie rating statistics and a weighted rank to ``data``.

    Three columns are written onto ``data`` itself (the frame is modified in
    place and is also returned):

    * ``av_rating``     - mean rating of the row's movie (R)
    * ``no_of_ratings`` - number of ratings of the row's movie (v)
    * ``weighted_rank`` - ``R * v / (v + m) + C * m / (v + m)``, where C is the
      mean of the whole ``rating`` column

    Parameters
    ----------
    data : DataFrame with ``movieId`` and ``rating`` columns.
    m : threshold hyperparameter of the weighted-rank formula.

    Returns
    -------
    The same DataFrame object that was passed in.
    """
    ratings_per_movie = data.groupby(['movieId'])['rating']

    # Per-movie statistics, broadcast back onto every rating row.
    data['av_rating'] = ratings_per_movie.transform('mean')
    data['no_of_ratings'] = ratings_per_movie.transform('count')

    votes = data['no_of_ratings']
    movie_mean = data['av_rating']
    overall_mean = data['rating'].mean()

    # Movies with few votes are pulled towards the overall mean; m sets how strongly.
    data['weighted_rank'] = movie_mean * votes / (votes + m) + overall_mean * m / (votes + m)
    return data
#df_ratings['weighted_rank']=R * v / (v+m) + C * m / (v+m) # using this would have the same result

In [11]:
# Add the av_rating, no_of_ratings and weighted_rank columns (threshold m=100).
data=calculate_weighted_rank(data,m=100)

In [12]:
def create_chartlis(data):
    """Return the movieIds ordered from highest to lowest ``weighted_rank``.

    Rows are sorted by ``weighted_rank`` in descending order and only the first
    row of each ``movieId`` is kept, so every movie appears exactly once. The
    result is the ``movieId`` column of the remaining rows and still carries the
    row labels of ``data``.
    """
    ranked_rows = data.sort_values('weighted_rank', ascending=False)
    one_row_per_movie = ranked_rows.drop_duplicates('movieId')
    return one_row_per_movie['movieId']

In [13]:
# Preview the chart: movieIds ordered by descending weighted rank.
create_chartlis(data)

16475     318
14513    2959
1918      296
1697      260
45417     858
         ... 
27778      19
59698    1562
44067    1882
31959     435
34698    2701
Name: movieId, Length: 9724, dtype: int64

In [14]:
def create_cossim_matrix(data):
    """Compute the user-to-user cosine similarity of the rating vectors.

    Parameters
    ----------
    data : DataFrame with ``userId``, ``movieId`` and ``rating`` columns.
        Each (userId, movieId) pair must occur at most once; pandas' ``unstack``
        raises a ValueError on duplicate pairs.

    Returns
    -------
    Square DataFrame whose rows and columns are both labelled with the user
    ids; entry ``[a, b]`` is the cosine similarity between the rating vectors of
    users ``a`` and ``b``.
    """
    # Rows: userId, columns: movieId. Movies a user has not rated count as 0.
    user_movie = data.set_index(['userId', 'movieId'])['rating'].unstack().fillna(0)

    # Take the labels from the matrix itself. Building them from
    # set(data['userId']) is not safe: set iteration order is arbitrary and need
    # not match the row order of the unstacked matrix, which would attach the
    # similarities to the wrong users.
    user_ids = list(user_movie.index)

    similarities = cosine_similarity(user_movie)
    #print('shape of cossim_matrix: ',similarities.shape)
    return pd.DataFrame(similarities, index=user_ids, columns=user_ids)

In [15]:
#Heatmap to see if some users have similar taste
#plt.figure(figsize=(8, 8))
#sns.heatmap(matrix, cbar=None)

In [16]:
def create_closest_user_table(cos_sim_matrix, n_neighbours=5):
    """List the most similar users for every user.

    Parameters
    ----------
    cos_sim_matrix : square DataFrame of similarities, rows and columns
        labelled with user ids.
    n_neighbours : how many similar users to keep per user (default 5).

    Returns
    -------
    DataFrame with one column per user (same labels as
    ``cos_sim_matrix.columns``) and rows labelled ``1 .. n_neighbours``;
    row ``j`` of column ``u`` holds the id of the j-th most similar user to ``u``.
    If fewer other users exist, the table has correspondingly fewer rows.
    """
    closest = {}
    # Iterate over the actual labels instead of range(1, n + 1), so user ids do
    # not have to be the consecutive integers 1..n.
    for user in cos_sim_matrix.columns:
        # Remove the user explicitly rather than dropping "the first row":
        # another user with similarity 1.0 could otherwise be sorted ahead of it.
        others = cos_sim_matrix[user].drop(labels=user, errors='ignore')
        # mergesort is stable, so ties are resolved by the order of the matrix rows.
        ranked = others.sort_values(ascending=False, kind='mergesort')
        closest[user] = list(ranked.index[:n_neighbours])

    k_closest_users = pd.DataFrame(closest, columns=cos_sim_matrix.columns)
    # Rank labels start at 1 (1 = most similar user).
    k_closest_users.index = range(1, len(k_closest_users) + 1)
    return k_closest_users

In [17]:
# find needed list selection
#list(df_ls[df_ls['userId']==1].sort_values(by='rating',ascending=False)['movieId'][0:20])

In [18]:
def create_favourites_table(data, m=20):
    """Collect the ``m`` best-rated movies of every user.

    Parameters
    ----------
    data : DataFrame with ``userId``, ``movieId`` and ``rating`` columns.
    m : number of favourites to keep per user (default 20).

    Returns
    -------
    DataFrame with one column per user id and rows ``0 .. m-1``; row ``r`` of
    column ``u`` is the movieId of the user's r-th best-rated movie. Users with
    fewer than ``m`` ratings are padded with NaN.
    """
    # Sort once for all users instead of filtering and sorting the whole frame
    # per user. mergesort is stable, so equally rated movies keep their original
    # row order and the result is deterministic.
    ranked = data.sort_values(by='rating', ascending=False, kind='mergesort')

    user_ids = []
    favourites = []
    for user, movie_ids in ranked.groupby('userId')['movieId']:
        user_ids.append(user)
        favourites.append(list(movie_ids.iloc[:m]))

    # One row per user, then transpose so that every user becomes a column.
    return pd.DataFrame(favourites, index=user_ids).T

In [19]:
def create_recommender_table(m_favourite_moviesId,k_closest_users,k=3,su=5):
    """Build, per user, the list of top movies of that user's most similar users.

    Parameters
    ----------
    m_favourite_moviesId : DataFrame with one column per user id holding that
        user's favourite movieIds, best first.
    k_closest_users : DataFrame with one column per user id and rows labelled
        ``1, 2, ...`` holding the ids of the most similar users, closest first.
    k : number of best-rated movies taken from each similar user.
    su : number of similar users considered per user.

    Returns
    -------
    DataFrame with the columns of ``k_closest_users`` and ``su * k`` rows:
    first the ``k`` favourites of the closest user, then those of the second
    closest user, and so on.
    """
    recommendations = {}
    # Walk over the real column labels so user ids need not be exactly 1..n.
    for user in k_closest_users.columns:
        picks = []
        for rank in range(1, su + 1):
            similar_user = k_closest_users.loc[rank, user]
            picks.extend(m_favourite_moviesId[similar_user].iloc[:k])
        recommendations[user] = picks

    return pd.DataFrame(recommendations, columns=k_closest_users.columns)

In [20]:
def picking_probabilities(k=3,su=5,exp=1):
    """Return decreasing selection probabilities for ``k * su`` list positions.

    Position ``p`` (0-based) gets a weight of ``(k * su - p) ** exp``; the
    weights are normalised to sum to 1, so the first position is the most
    likely one.

    Parameters
    ----------
    k : number of movies per similar user.
    su : number of similar users.
    exp : power applied to the weights; larger values favour the top of the
        list more strongly. Any power is accepted (previously only 1 and 2
        worked, every other value raised an UnboundLocalError).

    Returns
    -------
    numpy array of length ``k * su`` in descending order.
    """
    # Float weights so that negative or fractional powers are valid as well.
    weights = np.arange(1, k * su + 1, dtype=float) ** exp
    probabilities = weights / weights.sum()
    return probabilities[::-1]

In [21]:
def pick_and_rate_movie_simple(user,data,recommender_table,k=3,su=5,exp=1,max_draws=5):
    """Draw a movie the user has not rated yet from the head of a ranked list.

    Up to ``max_draws`` movies are drawn at random from the first ``k * su``
    entries of ``recommender_table`` (earlier entries are more likely, see
    ``picking_probabilities``). The first drawn movie for which ``data`` holds
    no rating by ``user`` is returned.

    Parameters
    ----------
    user : user id to draw for.
    data : DataFrame with ``userId``, ``movieId`` and ``rating`` columns.
    recommender_table : ranked sequence of movieIds with at least ``k * su``
        entries.
    k, su, exp : forwarded to ``picking_probabilities``.
    max_draws : number of attempts before giving up (default 5).

    Returns
    -------
    ``(next_movie, rating, counter)`` where ``rating`` is the mean of all
    ratings of ``next_movie`` in ``data``. ``counter`` is 0 when an unrated
    movie was found and ``max_draws`` when every draw hit a movie the user had
    already rated (the last drawn movie is returned in that case).

    Raises
    ------
    ValueError if ``max_draws`` is smaller than 1.
    """
    if max_draws < 1:
        raise ValueError('max_draws must be at least 1')

    candidates = recommender_table[0:k*su]
    # Neither the weights nor the user's rated movies change between draws.
    probabilities = picking_probabilities(k, su, exp)
    rated_by_user = data.loc[data['userId'] == user, 'movieId'].values

    for _ in range(max_draws):
        next_movie = np.random.choice(candidates, p=probabilities)
        # any() instead of "exactly one matching row": a movie that shows up
        # more than once for this user is still an already rated movie.
        if not (rated_by_user == next_movie).any():
            rating = data.loc[data['movieId'] == next_movie, 'rating'].mean()
            return next_movie, rating, 0

    # Every draw was a movie the user already rated: signal it via the counter.
    rating = data.loc[data['movieId'] == next_movie, 'rating'].mean()
    return next_movie, rating, max_draws
        
#     if data[(data['userId']==user) & (data['movieId']==next_movie)].shape[0]==1:
#         print(f'new draw for {user}')
#         return pick_and_rate_movie(user,data,recommender_table) # only 1000 recursion with python; alternative: while loop
#     else:
#         next_movie
#         rating=data[data['movieId']==next_movie]['rating'].mean()
        
#     return next_movie,rating

In [22]:
# def find_movies_not_watched(user,data):
    
#     next_movie=np.random.choice(recommender_table[user],p=picking_probabilities(k,su,exp))
#     movies_watched=data[data['userId']==user]['movieId']
#     while next_movie not in movies_watched:
#         rating=data[data['movieId']==next_movie]['rating'].mean()
#         return next_movie,rating
#     else:
#         print(f'user {user} has watched this movie.')
#         next_movie='continue'
#         rating='place_holder'
#         return next_movie,rating

In [23]:
# dict with userId as key (set of movie Ids, or dict in dict)

In [24]:
def pick_check_and_rate_movie(user,data,recommender_table,k=3,su=5,exp=1,max_draws=5):
    """Draw a movie the user has not rated yet from the user's recommendation column.

    Up to ``max_draws`` movies are drawn at random from
    ``recommender_table[user]`` (earlier entries are more likely, see
    ``picking_probabilities``). The first drawn movie for which ``data`` holds
    no rating by ``user`` is returned.

    Parameters
    ----------
    user : user id to draw for; must be a column of ``recommender_table``.
    data : DataFrame with ``userId``, ``movieId`` and ``rating`` columns.
    recommender_table : DataFrame with one column of ``k * su`` movieIds per user.
    k, su, exp : forwarded to ``picking_probabilities``.
    max_draws : number of attempts before giving up (default 5).

    Returns
    -------
    ``(next_movie, rating, counter)`` where ``rating`` is the mean of all
    ratings of ``next_movie`` in ``data``. ``counter`` is 0 when an unrated
    movie was found and ``max_draws`` when every draw hit a movie the user had
    already rated (the last drawn movie is returned in that case).

    Raises
    ------
    ValueError if ``max_draws`` is smaller than 1.
    """
    if max_draws < 1:
        raise ValueError('max_draws must be at least 1')

    candidates = recommender_table[user]
    # Neither the weights nor the user's rated movies change between draws.
    probabilities = picking_probabilities(k, su, exp)
    rated_by_user = data.loc[data['userId'] == user, 'movieId'].values

    for _ in range(max_draws):
        next_movie = np.random.choice(candidates, p=probabilities)
        # any() instead of "exactly one matching row": a movie that shows up
        # more than once for this user is still an already rated movie.
        if not (rated_by_user == next_movie).any():
            rating = data.loc[data['movieId'] == next_movie, 'rating'].mean()
            return next_movie, rating, 0

    # The message used to be prefixed with a global loop variable `i`, which is
    # not defined inside this function; only the user id is reported now.
    print(f'user {user} has watched all movies. No more movies are drawn.')
    rating = data.loc[data['movieId'] == next_movie, 'rating'].mean()
    return next_movie, rating, max_draws

In [25]:
# recommender for weighted_ranking

# def pick_check_and_rate_movie_simple(user,data):
#             next_movie=np.random.choice(recommender_table[user],p=picking_probabilities(k,su,exp))
#         #next_movie=recommender_table.iloc[0]
#         rating=data[data['movieId']==next_movie]['rating'].mean()
#         check=data[(data['userId']==user) & (data['movieId']==next_movie)].shape[0]



In [26]:
# def rate_movie(next_movie,data):
#     rating=data[data['movieId']==next_movie]['rating'].mean()           #build in randomness!?
#     return rating

In [27]:
def create_entry(user,next_movie,rating,data):
    """Return ``data`` extended by one new rating row.

    Parameters
    ----------
    user, next_movie, rating : values for the ``userId``, ``movieId`` and
        ``rating`` columns of the new row.
    data : DataFrame the row is added to; it is not modified.

    Returns
    -------
    New DataFrame with the rows of ``data`` followed by the new row. Columns of
    ``data`` other than the three above are NaN in the new row, and the new row
    keeps index label 0 (row labels are not renumbered).
    """
    new_entry = pd.DataFrame([[user, next_movie, rating]],
                             columns=['userId', 'movieId', 'rating'])
    # DataFrame.append was removed in pandas 2.0; pd.concat is its replacement.
    # Every call copies the whole frame, so calling this once per row in a long
    # loop is slow - collect rows and concatenate once if that becomes a problem.
    return pd.concat([data, new_entry])

In [28]:
def create_entries(n):
    """Collect ``n`` results of ``create_entry`` into one DataFrame.

    NOTE(review): as written this cannot run for n >= 1. It calls
    ``create_entry(i)`` with a single argument, while ``create_entry`` in this
    notebook takes ``(user, next_movie, rating, data)``, so the first iteration
    raises a TypeError. No active cell shown here calls this function - confirm
    whether it should be fixed or removed.
    """
    sim_data=pd.DataFrame()
    for i in range(n):
        # NOTE(review): DataFrame.append was removed in pandas 2.0
        # (pd.concat is the replacement).
        sim_data=sim_data.append(create_entry(i))
    return sim_data

In [29]:
# for i in list(range(1,611))*10:
#     user=i
#     k=4
#     su=4
#     next_movie,rating,counter=pick_check_and_rate_movie(user,
#                                                            data,
#                                                            create_recommender_table(create_favourites_table(data),
#                                                                                     create_closest_user_table(create_cossim_matrix(data)),k,su),k,su)
#     if counter==5:
#         continue
#     data=create_entry(user,next_movie,rating,data)
#     #print(f'{i}: dataset updated')

In [30]:
#         simulated_customer.to_csv('simulated_customer_test20.csv', sep=',', encoding='utf-8')

In [31]:
# Simulate new ratings: every user draws one movie from the head of the
# weighted-rank chart and "rates" it with that movie's current mean rating.
k=5    # together with su: only the first k*su chart entries can be drawn
su=5

# Build the chart once instead of re-sorting the whole frame for every user.
# The head of the chart cannot change inside the loop: simulated rows have no
# weighted_rank (NaN sorts last) and only repeat movies already in the chart.
chartlist=create_chartlis(data)

# The round counter has its own name; it used to be called k and was
# overwritten by the k=5 assignment inside the inner loop.
for n_round in range(1,2):
    print(n_round)
    # user ids 1..610 are hard-coded here
    for i in list(range(1,611))*n_round:
        user=i
        next_movie,rating,counter=pick_and_rate_movie_simple(user,data,chartlist,k,su,exp=1)

        # counter==5: every draw was a movie the user had already rated -> skip
        if counter==5:
            continue
        data=create_entry(user,next_movie,rating,data)
        #print(f'{i}: dataset updated')

1


In [32]:
# Show the ratings frame, including the simulated rows appended by the loop above.
data

Unnamed: 0,userId,movieId,rating,timestamp,title,genres,av_rating,no_of_ratings,weighted_rank
0,1,1,4.000000,9.649827e+08,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3.92093,215.0,3.787796
1,5,1,4.000000,8.474350e+08,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3.92093,215.0,3.787796
2,7,1,4.500000,1.106636e+09,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3.92093,215.0,3.787796
3,15,1,2.500000,1.510578e+09,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3.92093,215.0,3.787796
4,17,1,4.500000,1.305696e+09,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3.92093,215.0,3.787796
...,...,...,...,...,...,...,...,...,...
0,604,318,4.429022,,,,,,
0,605,58559,4.238255,,,,,,
0,607,1197,4.232394,,,,,,
0,608,58559,4.238255,,,,,,


In [33]:

# Persist the extended ratings frame. The row index is written as the first
# CSV column (to_csv default); the simulated rows all carry index label 0.
data.to_csv('simulated_movie_user_data_simple_large.csv', sep=',', encoding='utf-8')


In [None]:

# calculate_weighted_rank(data,m=100)

# create_cossim_matrix(data)

# create_closest_user_table(cos_sim_matrix)

# create_favourites_table(data)

# create_recommender_table(m_favourite_moviesId,k_closest_users,k=3,su=5)

# picking_probabilities(k=3,su=5,exp=1) # within pick_movie()

# pick_movie(user,data,recommender_table,k=3,su=5,exp=1)

# rate_movie(movie,data)

# create_entry(user,next_movie,rating,data)

# create_entries(n)

In [None]:
import cProfile, pstats, io
def profile(fnc):

    """A decorator that uses cProfile to profile a function.
       Starts the profile before executing a function, then executes the function,
       then stops the profile, then prints out a diagnostics report sorted by
       cumulative time. The report is printed even if the function raises; the
       exception is then propagated to the caller.

       The decorated function keeps its name and docstring.

       Lots of boilerplate code from the Python 3 documentation:
       https://docs.python.org/3/library/profile.html#profile.Profile
       """

    # Local import so this cell stays self-contained.
    from functools import wraps

    @wraps(fnc)
    def inner(*args, **kwargs):

        pr = cProfile.Profile()
        pr.enable()
        try:
            return fnc(*args, **kwargs)
        finally:
            # Always stop the profiler, otherwise it stays active after an error.
            pr.disable()
            s = io.StringIO()
            sortby = 'cumulative'
            ps = pstats.Stats(pr, stream=s).sort_stats(sortby)
            ps.print_stats()
            print(s.getvalue())

    return inner