In [1]:
import sys
import pandas as pd
import csv
from sklearn import preprocessing
from sklearn.neighbors import NearestNeighbors
import Orange

def get_user_profile(user_id, df_rating, df_a_fatures):
    df_user = df_rating.loc[df_rating['user_id'] == user_id]
    df_merged = pd.merge(df_user, df_a_fatures, how='left', left_on='anime_id', right_on='anime_id').drop(['anime_id', 'rating'], axis=1)
    
    avg_genre = df_merged[df_merged.columns.difference(['user_id', 'anime_id', 'rating'])].sum(axis=1)
    
    
    
    # Count only 1's
    df_user_sum = df_merged.sum(axis=0)
    df_user_sum.user_id = user_id
    df_user_sum['rating'] = 10.0
    df_user_sum['genre_count'] = avg_genre.sum() / float(len(avg_genre))
    
    return df_user_sum
#
def get_user_profiles(df_animes_vector, df_rating, n_users=50):
    # first n_users
    users = list(df_rating['user_id'].unique())[:n_users] 

    # Create user profiles:
    df_user_profiles = pd.DataFrame()
    for u in users:
        u_prof = get_user_profile(u, df_rating, df_animes_vector)
        df_user_profiles = df_user_profiles.append(u_prof, ignore_index = True)
    return df_user_profiles
#
def normalize(df_user_profiles):
    x = df_user_profiles.iloc[:,1:-1].values #returns a numpy array
    min_max_scaler = preprocessing.MinMaxScaler()
    
    x_scaled = min_max_scaler.fit_transform(x.T)
    
    df_scaled = pd.DataFrame(x_scaled.T, columns=df_user_profiles.columns.difference(['user_id','rating','genre']))
    
    df_scaled['user_id'] = df_user_profiles['user_id'].values
    df_scaled['genre_count'] = map(lambda x: x /10.0, df_user_profiles['genre_count'].values)
    df_scaled['rating'] = 1.0
    
    return df_scaled
#
def get_userids_by_indices(indices, df_user_prof_norm):
    users = []
    for i in indices:
       uid = df_user_prof_norm.loc[i]['user_id']
       users.append(uid)
    return users    
#



In [64]:
file_anime = "raw/anime.csv"
file_rating = "raw/rating.csv"

df_rating = pd.read_csv(file_rating)
df_animes = pd.read_csv(file_anime)
df_animes_genres = pd.get_dummies(df_animes['genre'].str.get_dummies(sep=", ")) # creates genre vectors
df_animes_vector = pd.concat([df_animes['anime_id'], df_animes_genres], axis=1) # anime_id + genre vector

# Get user profiles; then normalize 
df_user_profiles = get_user_profiles(df_animes_vector, df_rating, )
df_user_prof_norm = normalize(df_user_profiles)

In [None]:
# find closest k user profiles
k = 11
nbrs = NearestNeighbors(n_neighbors=k, algorithm='ball_tree').fit(df_user_prof_norm.iloc[:,1:])
user_id = 3
user_prof = df_user_prof_norm[df_user_prof_norm['user_id'] == user_id]
user_prof = user_prof.drop('user_id', axis=1)

# Get closest neighbours
distances, indices = nbrs.kneighbors(user_prof)

# get user_ids
uids = get_userids_by_indices(indices[0], df_user_prof_norm)
print uids

In [None]:
u_animes = []
for uid in uids:
    u_animes.append(df_rating[df_rating['user_id'] == uid]['anime_id'].tolist())
with open('anime_trans.basket', 'wb') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerows(u_animes)

In [None]:
# !!!!! Get the transactions directly from the list, not from the .basket file !!!!!

# Get all training transactions

data = Orange.data.Table("anime_trans.basket") #Orange.data.Table("anime_trans.basket")

# This is the user we would like to recommend something for
target_user = data[0]
target_user_animes = data[0].get_metas(str).keys()

# Drop the user's data from the transactions list
data = data.get_items(range(1,len(data)))

# Generate recommendation rules

rules = Orange.associate.AssociationRulesSparseInducer(data, support = 0.5, confidence = 0.7,
                                                       max_item_sets = 100000)

# print "%4s\t %4s  %s %s" % ("AnimeId", "Lift", "Support", "Conf")

recommendations = {}
for r in rules:
    
    # Compare the generated rules with a specific instance from the transactions list
    if(r.n_right==1):
        recommendation = str(r.right.get_metas(str).keys()[0])
        if recommendation not in target_user_animes:
            if r.applies_left(target_user):
                try:
                    recommendations[r.n_left].append(r)
                except:
                    recommendations[r.n_left] = []
                    recommendations[r.n_left].append(r)
                # print "%4.2f %4.4f %s %s" % (r.support, r.confidence, r, r.lift)

user_recommendations = []
for i, r in recommendations.iteritems():
    recommendations[i].sort(key=lambda x: (x.lift, x.support, x.confidence), reverse=True)
    
for recommendation_length in sorted(recommendations.keys(), reverse=True):
    if len(user_recommendations) == 10:
        break
    for recommendation in recommendations[recommendation_length]:
        anime_id = str(recommendation.right.get_metas(str).keys()[0])
#         print recommendation
#         print anime_id, "\t", recommendation.lift, recommendation.support, recommendation.confidence
        if anime_id not in user_recommendations:
            user_recommendations.append(anime_id)
        if len(user_recommendations) == 10:
            break
print user_recommendations
# Orange.associate.AssociationRulesSparseInducer.get_itemsets(rules)

In [13]:
def get_collaborative_recommendations_per_user(user_id, k, df_user_prof_norm):

    # find closest k user profiles
    nbrs = NearestNeighbors(n_neighbors=k, algorithm='ball_tree').fit(df_user_prof_norm.drop(['user_id','rating','genre_count'], axis=1))
    user_prof = df_user_prof_norm[df_user_prof_norm['user_id'] == user_id]
    user_prof = user_prof.drop(['user_id','rating','genre_count'], axis=1)

    # Get closest neighbours
    distances, indices = nbrs.kneighbors(user_prof)

    # get user_ids
    uids = get_userids_by_indices(indices[0], df_user_prof_norm)
    
    # ------------------------------------------------------------
    u_animes = []
    for uid in uids:
        u_animes.append(df_rating[df_rating['user_id'] == uid]['anime_id'].tolist())
    with open('anime_trans.basket', 'wb') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerows(u_animes)
    # ------------------------------------------------------------
    
    # !!!!! Get the transactions directly from the list, not from the .basket file !!!!!

    # Get all training transactions

    data = Orange.data.Table("anime_trans.basket") #Orange.data.Table("anime_trans.basket")

    # This is the user we would like to recommend something for
    target_user = data[0]
    target_user_animes = data[0].get_metas(str).keys()

    # Drop the user's data from the transactions list
    data = data.get_items(range(1,len(data)))

    # Generate recommendation rules

    rules = Orange.associate.AssociationRulesSparseInducer(data, support = 0.5, confidence = 0.7,
                                                           max_item_sets = 100000)

    # print "%4s\t %4s  %s %s" % ("AnimeId", "Lift", "Support", "Conf")

    recommendations = {}
    for r in rules:

        # Compare the generated rules with a specific instance from the transactions list
        if(r.n_right==1):
            recommendation = str(r.right.get_metas(str).keys()[0])
            if recommendation not in target_user_animes:
                if r.applies_left(target_user):
                    try:
                        recommendations[r.n_left].append(r)
                    except:
                        recommendations[r.n_left] = []
                        recommendations[r.n_left].append(r)
                    # print "%4.2f %4.4f %s %s" % (r.support, r.confidence, r, r.lift)

    user_recommendations = []
    for i, r in recommendations.iteritems():
        recommendations[i].sort(key=lambda x: (x.lift, x.support, x.confidence), reverse=True)

    for recommendation_length in sorted(recommendations.keys(), reverse=True):
        if len(user_recommendations) == 10:
            break
        for recommendation in recommendations[recommendation_length]:
            anime_id = str(recommendation.right.get_metas(str).keys()[0])
    #         print recommendation
    #         print anime_id, "\t", recommendation.lift, recommendation.support, recommendation.confidence
            if anime_id not in user_recommendations:
                user_recommendations.append(anime_id)
            if len(user_recommendations) == 10:
                break
    return user_recommendations
    # Orange.associate.AssociationRulesSparseInducer.get_itemsets(rules)

In [3]:
file_anime = "raw/anime.csv"
file_rating = "omer/rating_train.csv"

df_rating = pd.read_csv(file_rating)
df_animes = pd.read_csv(file_anime)
df_animes_genres = pd.get_dummies(df_animes['genre'].str.get_dummies(sep=", ")) # creates genre vectors
users_ids = list(df_rating['user_id'].unique())
df_animes_vector = pd.concat([df_animes['anime_id'], df_animes_genres], axis=1) # anime_id + genre vector

# Get user profiles; then normalize 
df_user_profiles = get_user_profiles(df_animes_vector, df_rating, n_users=100)
df_user_prof_norm = normalize(df_user_profiles)

In [4]:
for i in range(0, 20):
    print df_user_profiles.iloc[i].to_frame().T.user_id

0    1.0
Name: user_id, dtype: float64
1    2.0
Name: user_id, dtype: float64
2    3.0
Name: user_id, dtype: float64
3    4.0
Name: user_id, dtype: float64
4    5.0
Name: user_id, dtype: float64
5    6.0
Name: user_id, dtype: float64
6    7.0
Name: user_id, dtype: float64
7    8.0
Name: user_id, dtype: float64
8    9.0
Name: user_id, dtype: float64
9    10.0
Name: user_id, dtype: float64
10    11.0
Name: user_id, dtype: float64
11    12.0
Name: user_id, dtype: float64
12    13.0
Name: user_id, dtype: float64
13    14.0
Name: user_id, dtype: float64
14    15.0
Name: user_id, dtype: float64
15    16.0
Name: user_id, dtype: float64
16    17.0
Name: user_id, dtype: float64
17    18.0
Name: user_id, dtype: float64
18    19.0
Name: user_id, dtype: float64
19    20.0
Name: user_id, dtype: float64


In [11]:
for i in users_ids:
    print ("Results for user %4d\t " % (i))
    rec = get_collaborative_recommendations_per_user(user_id=i, k=11, df_user_prof_norm=df_user_prof_norm)
    print rec

Results for user    1	 
['14741', '6213', '15809', '10620', '14345', '16498', '20787', '11633', '8769']
None
Results for user    2	 
[]
None
Results for user    3	 
['14513']
None
Results for user    4	 
[]
None
Results for user    5	 
['4181', '11887', '11771', '2904', '1575', '813', '8769', '14741', '16894', '16742']
None
Results for user    6	 
['13601', '10087', '6746', '11741', '9989', '8769', '4224', '11757', '9756', '10620']
None
Results for user    7	 
['10793', '11757', '15809', '16498', '13759', '16668', '15451', '11887', '13659', '14813']
None
Results for user    8	 
[]
None
Results for user    9	 
[]
None
Results for user   10	 
[]
None
Results for user   11	 
['1535', '4224', '9989', '457', '16498', '6746']
None
Results for user   12	 
[]
None
Results for user   13	 
['1575', '6547', '5114', '10620', '16498', '4181', '121', '2904', '11771']
None
Results for user   14	 
['15451', '11617', '13759', '10719', '14741', '11759', '16668', '14967', '11633', '15809']
None
Results f

ValueError: Found array with 0 sample(s) (shape=(0, 43)) while a minimum of 1 is required.

In [12]:
get_collaborative_recommendations_per_user(user_id=42, k=11, df_user_prof_norm=df_user_prof_norm)

[]
