In [18]:
import sys
import pandas as pd
import csv
from sklearn import preprocessing
from sklearn.neighbors import NearestNeighbors
import Orange

def get_user_profile(user_id, df_rating, df_a_fatures):
    """Build a one-row profile counting the 1's in each feature column for a user.

    Parameters
    ----------
    user_id : scalar
        Value matched against ``df_rating['user_id']``.
    df_rating : pandas.DataFrame
        Ratings table; must contain ``user_id``, ``anime_id`` and ``rating``.
    df_a_fatures : pandas.DataFrame
        Feature table keyed by ``anime_id``; every other column is treated as
        an indicator column whose 1's are counted.

    Returns
    -------
    pandas.DataFrame
        A single row with ``user_id`` as the first column, followed by one
        float column per feature holding how many of the user's rated animes
        have a 1 in that feature. A user with no ratings gets all zeros.
    """
    df_user = df_rating.loc[df_rating['user_id'] == user_id]
    df_merged = pd.merge(df_user, df_a_fatures, how='left', on='anime_id')
    df_features = df_merged.drop(['anime_id', 'rating', 'user_id'], axis=1)

    # Count only 1's. Comparing against 1 and summing is independent of which
    # other values occur in a column; NaNs left by the left merge (animes
    # missing from the feature table) simply do not match.
    counts = df_features.eq(1).sum().astype(float)

    df_user_sum = counts.to_frame().T
    # Keep the row labelled 1 (the value being counted); callers in this file
    # reset the index anyway.
    df_user_sum.index = [1]
    df_user_sum.insert(0, 'user_id', user_id)
    return df_user_sum
#
def get_user_profiles(df_animes_vector, df_rating, n_users=50):
    """Build the profile table for the first ``n_users`` distinct users.

    Parameters
    ----------
    df_animes_vector : pandas.DataFrame
        Feature table passed through to ``get_user_profile``.
    df_rating : pandas.DataFrame
        Ratings table; users are taken in order of first appearance in its
        ``user_id`` column.
    n_users : int or None, optional
        Maximum number of users to profile (default 50). ``None`` profiles
        every user.

    Returns
    -------
    pandas.DataFrame
        One row per user with a fresh 0..n-1 index; an empty DataFrame when
        there are no users.
    """
    # first n users
    users = list(df_rating['user_id'].unique())[:n_users]
    if not users:
        # pd.concat refuses an empty list; keep returning an empty frame.
        return pd.DataFrame()

    # Collect the per-user rows first and concatenate once, instead of
    # re-copying a growing frame on every iteration.
    profiles = [get_user_profile(u, df_rating, df_animes_vector) for u in users]
    return pd.concat(profiles, ignore_index=True)
#
def normalize(df_user_profiles):
    """Min-max scale each user's feature counts, leaving ``user_id`` untouched.

    Parameters
    ----------
    df_user_profiles : pandas.DataFrame
        Profile table whose first column is ``user_id``; all remaining columns
        are treated as numeric features.

    Returns
    -------
    pandas.DataFrame
        Same columns and same index as the input, with the feature columns
        replaced by their scaled values.
    """
    feature_columns = df_user_profiles.columns[1:]
    x = df_user_profiles.iloc[:, 1:].values  # returns a numpy array

    # MinMaxScaler works column-wise, so transpose to scale within each user
    # (row) and transpose back afterwards.
    min_max_scaler = preprocessing.MinMaxScaler()
    x_scaled = min_max_scaler.fit_transform(x.T).T

    # Reuse the input index: concat(axis=1) aligns on index labels, so a fresh
    # 0..n-1 index would misalign with user_id for any other input index.
    df_scaled = pd.DataFrame(x_scaled, columns=feature_columns,
                             index=df_user_profiles.index)
    return pd.concat([df_user_profiles['user_id'], df_scaled], axis=1)
#
def get_userids_by_indices(indices, df_user_prof_norm):
    """Translate row positions into the ``user_id`` values stored at those rows.

    Parameters
    ----------
    indices : iterable of int
        Row positions (0-based) into ``df_user_prof_norm``.
    df_user_prof_norm : pandas.DataFrame
        Profile table containing a ``user_id`` column.

    Returns
    -------
    list
        The ``user_id`` of each requested row, in the order given.
    """
    # Positions, not labels: use .iloc so the lookup stays correct even when
    # the frame's index is not the default 0..n-1 range.
    return [df_user_prof_norm.iloc[i]['user_id'] for i in indices]
#



In [19]:
# Input CSVs, relative to the notebook's working directory.
file_anime = "raw/anime.csv"
file_rating = "raw/rating.csv"

df_rating = pd.read_csv(file_rating)
df_animes = pd.read_csv(file_anime)

# Genre vectors: one 0/1 column per genre, split on ", ".
# Series.str.get_dummies already yields integer indicator columns, so no
# further pd.get_dummies pass is needed.
df_animes_genres = df_animes['genre'].str.get_dummies(sep=", ")
df_animes_vector = pd.concat([df_animes['anime_id'], df_animes_genres], axis=1)  # anime_id + genre vector

# Get user profiles; then normalize
df_user_profiles = get_user_profiles(df_animes_vector, df_rating)
df_user_prof_norm = normalize(df_user_profiles)

In [24]:
# Fit a nearest-neighbour index on the normalized profiles (every column
# after the leading user_id) and query it for one user.
k = 11
nbrs = NearestNeighbors(
    n_neighbors=k,
    algorithm='ball_tree',
).fit(df_user_prof_norm.iloc[:, 1:])

# Feature vector of the user we are querying for, without the id column
user_id = 3
user_prof = df_user_prof_norm.loc[df_user_prof_norm['user_id'] == user_id].drop('user_id', axis=1)

# kneighbors returns one row of distances / row positions per query row
distances, indices = nbrs.kneighbors(user_prof)

# Map the neighbour positions of the single query row back to user ids
uids = get_userids_by_indices(indices[0], df_user_prof_norm)
print(uids)

[3.0, 50.0, 34.0, 4.0, 43.0, 33.0, 29.0, 38.0, 10.0, 44.0, 23.0]


In [25]:
# One transaction per neighbouring user: the anime_ids that user has rated
u_animes = [
    df_rating.loc[df_rating['user_id'] == uid, 'anime_id'].tolist()
    for uid in uids
]

# Dump the transactions as comma-separated rows, one row per user.
# Binary mode is kept because this notebook targets Python 2's csv module.
with open('anime_trans.basket', 'wb') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerows(u_animes)

In [27]:
# TODO (original author's note): get the transactions directly from the
# in-memory list, not from the .basket file.

# Get all training transactions
data = Orange.data.Table("anime_trans.basket")

# This is the user we would like to recommend something for
target_user = data[0]
target_user_animes = data[0].get_metas(str).keys()

# Drop the user's data from the transactions list
data = data.get_items(range(1, len(data)))

# Generate recommendation rules
rules = Orange.associate.AssociationRulesSparseInducer(data, support = 0.5, confidence = 0.7,
                                                       max_item_sets = 100000)

# Group the usable rules by the number of items on their left-hand side.
recommendations = {}
for r in rules:
    # Only rules with a single right-hand item map onto one recommendation.
    if r.n_right != 1:
        continue
    recommendation = str(r.right.get_metas(str).keys()[0])
    # Skip anything already in the target user's own transaction.
    if recommendation in target_user_animes:
        continue
    # Compare the generated rule with the target user's transaction
    if r.applies_left(target_user):
        # setdefault creates the bucket on first use without a bare except
        # that would also hide unrelated errors.
        recommendations.setdefault(r.n_left, []).append(r)

# Within each bucket, strongest rules first: lift, then support, then confidence.
for rule_list in recommendations.values():
    rule_list.sort(key=lambda x: (x.lift, x.support, x.confidence), reverse=True)

# Walk the buckets from the longest left-hand side to the shortest, collecting
# distinct right-hand items until the cap is reached.
max_recommendations = 10
user_recommendations = []
for recommendation_length in sorted(recommendations.keys(), reverse=True):
    if len(user_recommendations) == max_recommendations:
        break
    for recommendation in recommendations[recommendation_length]:
        anime_id = str(recommendation.right.get_metas(str).keys()[0])
        if anime_id not in user_recommendations:
            user_recommendations.append(anime_id)
        if len(user_recommendations) == max_recommendations:
            break
print(user_recommendations)

['269', '121', '10087', '1575', '2001']


In [28]:
# Display the target user's transaction (row 0 of the basket table, which was
# excluded from the data used to induce the rules).
target_user

[], {"20":1.000, "154":1.000, "170":1.000, "199":1.000, "225":1.000, "341":1.000, "430":1.000, "527":1.000, "552":1.000, "813":1.000, "1119":1.000, "1121":1.000, "1122":1.000, "1132":1.000, "1292":1.000, "1313":1.000, "1526":1.000, "1535":1.000, "1564":1.000, "1689":1.000, "1764":1.000, "1943":1.000, "2201":1.000, "2404":1.000, "2847":1.000, "3588":1.000, "4026":1.000, "5114":1.000, "5231":1.000, "6178":1.000, "6702":1.000, "6880":1.000, "7695":1.000, "8074":1.000, "9107":1.000, "9135":1.000, "9760":1.000, "9917":1.000, "9919":1.000, "9989":1.000, "10408":1.000, "10507":1.000, "11111":1.000, "11703":1.000, "11737":1.000, "11757":1.000, "11759":1.000, "11771":1.000, "12671":1.000, "14075":1.000, "14093":1.000, "14345":1.000, "14513":1.000, "16498":1.000, "16512":1.000, "16782":1.000, "16894":1.000, "16918":1.000, "17265":1.000, "18097":1.000, "18115":1.000, "18393":1.000, "19315":1.000, "19815":1.000, "20021":1.000, "20159":1.000, "20507":1.000, "20583":1.000, "21507":1.000, "21881":1.0

In [None]:
for anime in uids:
    