# Data preprocessing 

In [156]:
import numpy as np
import pandas as pd
from surprise import Reader
from surprise import Dataset
from scipy import spatial

In [48]:
df = pd.read_csv('end_style.csv')

In [49]:
df = df.drop(['Unnamed: 0', 'Unnamed: 0.1'], axis='columns')

In [50]:
df

Unnamed: 0,id,gender,masterCategory,subCategory,articleType,baseColour,season,year,usage,productDisplayName,code,predict
0,15970,Men,Apparel,Topwear,Shirts,Navy Blue,Fall,2011.0,Casual,Turtle Check Men Navy Blue Shirt,1,1
1,39386,Men,Apparel,Bottomwear,Jeans,Blue,Summer,2012.0,Casual,Peter England Men Party Blue Jeans,19,19
2,59263,Women,Accessories,Watches,Watches,Silver,Winter,2016.0,Casual,Titan Women Silver Watch,3,3
3,21379,Men,Apparel,Bottomwear,Track Pants,Black,Fall,2011.0,Casual,Manchester United Men Solid Black Track Pants,30,28
4,53759,Men,Apparel,Topwear,Tshirts,Grey,Summer,2012.0,Casual,Puma Men Grey T-shirt,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
43742,17036,Men,Footwear,Shoes,Casual Shoes,White,Summer,2013.0,Casual,Gas Men Caddy Casual Shoe,2,2
43743,6461,Men,Footwear,Flip Flops,Flip Flops,Red,Summer,2011.0,Casual,Lotto Men's Soccer Track Flip Flop,11,11
43744,18842,Men,Apparel,Topwear,Tshirts,Blue,Fall,2011.0,Casual,Puma Men Graphic Stellar Blue Tshirt,0,0
43745,46694,Women,Personal Care,Fragrance,Perfume and Body Mist,Blue,Spring,2017.0,Casual,Rasasi Women Blue Lady Perfume,18,3


In [51]:
df_male = df[df['gender']=='Men']
df_female = df[df['gender']=='Women']

In [239]:
# gathering data by men
dummy_masterCat = pd.get_dummies(df_male['masterCategory'])
dummy_code = pd.get_dummies(df_male['code'])
dummy_predict = pd.get_dummies(df_male['predict'])
dummy_baseColour = pd.get_dummies(df_male['baseColour'])
dummy_season = pd.get_dummies(df_male['season'])
dummy_usage = pd.get_dummies(df_male['usage'])
year = df_male['year']

# gathering data by women
dummy_masterCat_f = pd.get_dummies(df_female['masterCategory'])
dummy_code_f = pd.get_dummies(df_female['code'])
dummy_predict_f = pd.get_dummies(df_female['predict'])
dummy_baseColour_f = pd.get_dummies(df_female['baseColour'])
dummy_season_f = pd.get_dummies(df_female['season'])
dummy_usage_f = pd.get_dummies(df_female['usage'])
year_f = df_female['year']

In [240]:
male_real = dummy_season
male_real = male_real.join(dummy_code)
male_real = male_real.join(year)
male_real = male_real.join(dummy_baseColour)
male_real = male_real.join(dummy_usage)

male_predict = dummy_season
male_predict = male_predict.join(dummy_predict)
male_predict = male_predict.join(year)
male_predict = male_predict.join(dummy_baseColour)
male_predict = male_predict.join(dummy_usage)

female_predict = dummy_season_f
female_predict = female_predict.join(dummy_code_f)
female_predict = female_predict.join(year_f)
female_predict = female_predict.join(dummy_baseColour_f)
female_predict = female_predict.join(dummy_usage_f)

female_predict = dummy_season_f
female_predict = female_predict.join(dummy_predict_f)
female_predict = female_predict.join(year_f)
female_predict = female_predict.join(dummy_baseColour_f)
female_predict = female_predict.join(dummy_usage_f)

In [241]:
# to matrix type

male_real_mat = male_real.as_matrix()
male_predict_mat = male_predict.as_matrix()
female_real_mat = female_real.as_matrix()
female_predict_mat = female_predict.as_matrix()

  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.
  """
  


# Simple recommendation system

In [182]:
def simple_similar_euclidean(data, product):
    
    similar =[]
    for i in data:
        sim = spatial.distance.euclidean(i, product)
        similar.append(sim)
    similar = np.array(similar)
    similar_norm = similar/similar.max()
    
    return similar_norm 

def simple_rating(similarity, data):
    id_sim = []
    for n, i in enumerate(similarity):
        id_sim.append([i, list(data['id'])[n]])
    sorted_similarity = sorted(id_sim)
    return sorted_similarity

In [188]:
list(df_male['id'])[10]

12369

In [224]:
s_real = simple_recommend_euclidean(male_real_mat, male_real_mat[10])
rating_list = simple_rating(s_real, df_male)

In [236]:
rating_list[-3:-1]

[[1.0, 50764], [1.0, 50765]]

In [242]:
s_cnn = simple_recommend_euclidean(male_predict_mat, male_predict_mat[10])
rating_list_cnn = simple_rating(s_cnn, df_male)

In [247]:
rating_list_cnn[-3:-1]

[[1.0, 50764], [1.0, 50765]]

In [258]:
s_real_f = simple_recommend_euclidean(female_real_mat, female_real_mat[10])
rating_list_f = simple_rating(s_real_f, df_female)

In [260]:
s_cnn_f = simple_recommend_euclidean(female_real_mat, female_real_mat[10])
rating_list_cnn_f = simple_rating(s_cnn_f, df_female)

In [301]:
len(df_female)*0.8

14587.2

In [302]:
rating_list_f[14587:14589]

[[1.0, 45366], [1.0, 45367]]

In [303]:
rating_list_cnn_f[14587:14589]

[[1.0, 45366], [1.0, 45367]]

Real_data_male: | id: 12369 | ideal(0.0): [0.0, 9248], [0.0, 9253] | good(0.25): [0.32444284226152503, 17501], [0.32444284226152503, 17502] | normal(0.5): [0.3746343246326776, 14605], [0.3746343246326776, 14606] | bad(0.75): [0.4588314677411235, 34234], [0.4588314677411235, 34235] | very bad(1): [1.0, 50764], [1.0, 50765]

Predict_data_male: | id: 12369 | ideal(0.0): [0.0, 9248], [0.0, 9253]| good(0.25): [0.32444284226152503, 17511], [0.32444284226152503, 17512] | normal(0.5): [0.3746343246326776, 13939], [0.3746343246326776, 13945] | bad(0.75): [0.4588314677411235, 34234], [0.4588314677411235, 34235] | very bad(1): [1.0, 50764], [1.0, 50765]

Real_data_female:| id: 20099 | ideal(0.0): [0.0, 11790], [0.0, 11794] | good(0.005): [0.4472135954999579, 11545], [0.4472135954999579, 11547] | normal(0.01): [0.6324555320336759, 11247], [0.6324555320336759, 11370] | bad(0.2): [0.7745966692414834, 27428], [0.7745966692414834, 27434] | very bad(1): [1.0, 45366], [1.0, 45367]

Predict_data_male: | id: 12369 | ideal(0.0): [0.0, 11790], [0.0, 11794] | good(0.005): [0.4472135954999579, 11545], [0.4472135954999579, 11547] | normal(0.01): [0.6324555320336759, 11247], [0.6324555320336759, 11370] | bad(0.2):  [0.7745966692414834, 27428], [0.7745966692414834, 27434] | very bad(1): [1.0, 45366], [1.0, 45367]

# Creating of random personal rating

In [158]:
import random 

In [159]:
def create_random_rating(data, product):
    similar =[]
    for i in data:
        sim = spatial.distance.euclidean(i, product)
        similar.append(sim)
    similar = np.array(similar)
    similar_norm = similar/similar.max()
    
    rand = random.choice([1,2,3])
    rating = []
    # rating system 1
    if rand == 1:
        for i in similar_norm:
            if i <= 0.2:
                rating.append(5)
            elif i<= 0.4:
                rating.append(4)
            elif i <= 0.6:
                rating.append(3)
            elif i <= 0.8:
                rating.append(2)
            else:
                rating.append(1)
    elif rand == 2:
        for i in similar_norm:
            if i <= 0.1:
                rating.append(5)
            elif i<= 0.4:
                rating.append(4)
            elif i <= 0.7:
                rating.append(3)
            else:
                rating.append(1)
    else:
        for i in similar_norm:
            if i <= 0.1:
                rating.append(5)
            elif i<= 0.3:
                rating.append(4)
            else:
                rating.append(2)
        
    return rating

In [358]:
def random_person(data):
    
    ids = np.arange(0, 21914)
    n_likes = [50, 100, 150, 200, 500]
    rand_likes = random.choice(n_likes)
    lovely_product = random.choice(ids)
    print('lovely product:', lovely_product)
    rating = create_random_rating(data, data[lovely_product])
    
    personal_rating = []
    liked_products = []
    c = 0
    while c <= rand_likes:
        rand_product = random.choice(ids)
        product_rating = rating[rand_product]
        liked_products.append(list(df_male['id'])[rand_product])
        personal_rating.append(product_rating)
        c+=1
    
    return [liked_products, personal_rating]



In [349]:
names = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z']

In [359]:
name_lists = []
ratings_personal = []
for i in names:
    print(i)
    person = random_person(male_real_mat)
    n = len(person[0])
    name_list = [i]*n
    name_lists.append(name_list)
    ratings_personal.append(person)


A
lovely product: 14275
B
lovely product: 2970
C
lovely product: 13209
D
lovely product: 18626
E
lovely product: 21781
F
lovely product: 16449
G
lovely product: 11557
H
lovely product: 6188
I
lovely product: 12880
J
lovely product: 20023
K
lovely product: 8484
L
lovely product: 5329
M
lovely product: 12777
N
lovely product: 17336
O
lovely product: 21344
P
lovely product: 14337
Q
lovely product: 21214
R
lovely product: 16272
S
lovely product: 5850
T
lovely product: 15443
U
lovely product: 10645
V
lovely product: 8445
W
lovely product: 9003
X
lovely product: 13295
Y
lovely product: 18569
Z
lovely product: 13261


In [360]:
users = []
for i in name_lists:
    users = users+i

ids = []
for i in ratings_personal:
    ids = ids + i[0]
    
rates = []
for i in ratings_personal:
    rates = rates + i[1]
    

In [361]:
print(len(users), len(ids), len(rates))

5526 5526 5526


# Colaborative filtering

In [362]:
# load_data.py
# This is the same data that was plotted for similarity earlier
# with one new user "E" who has rated only movie 1
ratings_dict = {
    "item": ids ,
    "user": users,
    "rating": rates,
}

df = pd.DataFrame(ratings_dict)
reader = Reader(rating_scale=(1, 5))

# Loads Pandas dataframe
data = Dataset.load_from_df(df[["user", "item", "rating"]], reader)
trainingSet = data.build_full_trainset()

In [378]:
from surprise import KNNWithMeans

# To use item-based cosine similarity
sim_options = {
    "name": "cosine",
    "user_based": False,  # Compute  similarities between items
}
algo = KNNWithMeans(sim_options=sim_options)

In [379]:
algo.fit(trainingSet)

predicted_rating_real = []

# for i in df_male['id']:
prediction = algo.predict('A', ids[0])
# predicted_rating_real.append(prediction.est)

prediction

Computing the cosine similarity matrix...
Done computing similarity matrix.


Prediction(uid='A', iid=31418, r_ui=None, est=2.0375, details={'actual_k': 40, 'was_impossible': False})

In [380]:
prediction = algo.predict('C', random.choice(ids))

In [381]:
prediction

Prediction(uid='C', iid=4766, r_ui=None, est=2.0, details={'actual_k': 0, 'was_impossible': False})

In [382]:
print('A', 'loves', 14275)
print(algo.predict('A', ids[0]))
print(algo.predict('A', ids[1]))
print(algo.predict('A', ids[10]))

print('C', 'loves', 13209)
print(algo.predict('C', ids[0]))
print(algo.predict('C', ids[1]))
print(algo.predict('C', ids[10]))

print('D', 'loves', 18626)
print(algo.predict('D', ids[0]))
print(algo.predict('D', ids[1]))
print(algo.predict('D', ids[10]))

A loves 14275
user: A          item: 31418      r_ui = None   est = 2.04   {'actual_k': 40, 'was_impossible': False}
user: A          item: 28559      r_ui = None   est = 4.04   {'actual_k': 40, 'was_impossible': False}
user: A          item: 24817      r_ui = None   est = 4.04   {'actual_k': 40, 'was_impossible': False}
C loves 13209
user: C          item: 31418      r_ui = None   est = 2.00   {'actual_k': 0, 'was_impossible': False}
user: C          item: 28559      r_ui = None   est = 3.61   {'actual_k': 6, 'was_impossible': False}
user: C          item: 24817      r_ui = None   est = 4.00   {'actual_k': 0, 'was_impossible': False}
D loves 18626
user: D          item: 31418      r_ui = None   est = 2.00   {'actual_k': 0, 'was_impossible': False}
user: D          item: 28559      r_ui = None   est = 3.00   {'actual_k': 1, 'was_impossible': False}
user: D          item: 24817      r_ui = None   est = 4.00   {'actual_k': 0, 'was_impossible': False}
