In [1]:
import sys
import pandas as pd
import csv
from sklearn import preprocessing
from sklearn.neighbors import NearestNeighbors
import Orange

def get_user_profile(user_id, df_rating, df_a_fatures):
    
    # To be used only if the user profiles file is not already created
    df_user = df_rating.loc[df_rating['user_id'] == user_id]
    df_merged = pd.merge(df_user, df_a_fatures, how='left', left_on='anime_id', right_on='anime_id').drop(['anime_id', 'rating'], axis=1)
    
    avg_genre = df_merged[df_merged.columns.difference(['user_id', 'anime_id', 'rating'])].sum(axis=1)
    
    # Count only 1's
    df_user_sum = df_merged.sum(axis=0)
    df_user_sum.user_id = user_id
    df_user_sum['rating'] = 10.0
    df_user_sum['genre_count'] = avg_genre.sum() / float(len(avg_genre))
    
    return df_user_sum
#
def get_user_profiles(df_animes_vector, df_rating, n_users=50):
    
    # To be used only if the user profiles file is not already created
    
    # first n_users
    users = list(df_rating['user_id'].unique())[:n_users] 

    # Create user profiles:
    df_user_profiles = pd.DataFrame()
    for u in users:
        u_prof = get_user_profile(u, df_rating, df_animes_vector)
        df_user_profiles = df_user_profiles.append(u_prof, ignore_index = True)
    return df_user_profiles
#
def normalize(df_user_profiles):
    x = df_user_profiles.iloc[:,1:-1].values #returns a numpy array
    min_max_scaler = preprocessing.MinMaxScaler()
    
    x_scaled = min_max_scaler.fit_transform(x.T)
    
    df_scaled = pd.DataFrame(x_scaled.T, columns=df_user_profiles.columns.difference(['user_id','rating','genre']))
    
    df_scaled['user_id'] = df_user_profiles['user_id'].values
    df_scaled['genre_count'] = map(lambda x: x /10.0, df_user_profiles['genre_count'].values)
    df_scaled['rating'] = 1.0
    
    return df_scaled
#
def get_userids_by_indices(indices, df_user_prof_norm):
    users = []
    for i in indices:
       uid = df_user_prof_norm.loc[i]['user_id']
       users.append(uid)
    return users    
#

In [2]:
def get_collaborative_recommendations_per_user(user_id, k, df_user_prof_norm):

    # find closest k user profiles
    nbrs = NearestNeighbors(n_neighbors=k, algorithm='ball_tree').fit(df_user_prof_norm.drop(['user_id','rating','genre_count'], axis=1))
    user_prof = df_user_prof_norm[df_user_prof_norm['user_id'] == user_id]
    user_prof = user_prof.drop(['user_id','rating','genre_count'], axis=1)

    # Get closest neighbours
    distances, indices = nbrs.kneighbors(user_prof)

    # get user_ids
    uids = get_userids_by_indices(indices[0], df_user_prof_norm)
    
    # ------------------------------------------------------------
    u_animes = []
    for uid in uids:
        u_animes.append(df_rating[df_rating['user_id'] == uid]['anime_id'].tolist())
    with open('anime_trans.basket', 'wb') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerows(u_animes)
    # ------------------------------------------------------------
    
    # !!!!! Get the transactions directly from the list, not from the .basket file !!!!!

    # Get all training transactions

    data = Orange.data.Table("anime_trans.basket") #Orange.data.Table("anime_trans.basket")

    # This is the user we would like to recommend something for
    target_user = data[0]
    target_user_animes = data[0].get_metas(str).keys()

    # Drop the user's data from the transactions list
    data = data.get_items(range(1,len(data)))

    # Generate recommendation rules
    support_threshold = 0.5
    confidence_threshold = 0.8
    rulesOK = False
    while rulesOK is False:
        try:
            rules = Orange.associate.AssociationRulesSparseInducer(data, support = support_threshold, confidence = confidence_threshold,
                                                                   max_item_sets = 100000)
            rulesOK = True
        except:
            print(support_threshold, confidence_threshold)
            if confidence_threshold == 1:
                support_threshold += 0.1
            else:
                confidence_threshold += 0.1
            

    # print "%4s\t %4s  %s %s" % ("AnimeId", "Lift", "Support", "Conf")

    recommendations = {}
    for r in rules:

        # Compare the generated rules with a specific instance from the transactions list
        if(r.n_right==1):
            recommendation = str(r.right.get_metas(str).keys()[0])
            if recommendation not in target_user_animes:
                #if r.applies_left(target_user):
                try:
                    recommendations[r.n_left].append(r)
                except:
                    recommendations[r.n_left] = []
                    recommendations[r.n_left].append(r)
                    # print "%4.2f %4.4f %s %s" % (r.support, r.confidence, r, r.lift)

    user_recommendations = []
    for i, r in recommendations.iteritems():
        recommendations[i].sort(key=lambda x: (x.lift, x.support, x.confidence), reverse=True)

    for recommendation_length in sorted(recommendations.keys(), reverse=True):
        if len(user_recommendations) == 10:
            break
        for recommendation in recommendations[recommendation_length]:
            anime_id = str(recommendation.right.get_metas(str).keys()[0])
    #         print recommendation
    #         print anime_id, "\t", recommendation.lift, recommendation.support, recommendation.confidence
            if anime_id not in user_recommendations:
                user_recommendations.append(anime_id)
            if len(user_recommendations) == 10:
                break
    return user_recommendations
    # Orange.associate.AssociationRulesSparseInducer.get_itemsets(rules)

In [3]:
file_anime = "raw/anime.csv"
file_rating = "raw/rating_train.csv"

df_rating = pd.read_csv(file_rating)
df_animes = pd.read_csv(file_anime)
df_animes_genres = pd.get_dummies(df_animes['genre'].str.get_dummies(sep=", ")) # creates genre vectors
users_ids = list(df_rating['user_id'].unique())
df_animes_vector = pd.concat([df_animes['anime_id'], df_animes_genres], axis=1) # anime_id + genre vector

# Get user profiles; then normalize 
df_user_profiles = get_user_profiles(df_animes_vector, df_rating, n_users=500)
df_user_prof_norm = normalize(df_user_profiles)

In [4]:
for i in users_ids[:100]:
    print ("Results for user %4d\t " % (i))
    rec = get_collaborative_recommendations_per_user(user_id=i, k=11, df_user_prof_norm=df_user_prof_norm)
    print rec

Results for user    1	 
['11499', '5081', '10397', '1195', '10620', '7088', '15315', '17247', '11319', '11597']
Results for user    3	 
['31964']
Results for user    4	 
['269', '9919', '5114']
Results for user    5	 
['10030', '1604', '11771', '12365', '2025', '1482', '1535']
Results for user    6	 
['5081', '18679', '32', '10620', '4181', '2167', '9989', '19815', '11111', '16498']
Results for user    7	 
(0.5, 0.8)
(0.5, 0.9)
(0.5, 1.0)
['12293', '22199', '8841', '5114', '14813', '19815', '4224', '22535', '14513', '16498']
Results for user    8	 
[]
Results for user   11	 
['3457', '199', '9919', '1535']
Results for user   12	 
['813']
Results for user   13	 
['11111', '22535', '16498', '6547', '14741', '226', '30276', '23283', '19815']
Results for user   14	 
['15699', '4224', '11785', '11759', '18247', '14967', '14741', '13759', '15809', '356']
Results for user   16	 
[]
Results for user   17	 
['25183', '4224', '8841', '27989', '28025', '5958', '5341', '1840', '813', '28497']
Resu