In [1]:
import numpy as np
import math
import pandas as pd
import random
import time
from sklearn import metrics as mr
from sklearn.decomposition import PCA
from collections import defaultdict

# Similarity calculations

In [2]:
def CS_f(r_i, r_j):
    """Cosine similarity of two vectors.

    Computes ``<r_i, r_j> / (sqrt(<r_i, r_i>) * sqrt(<r_j, r_j>))``.
    No guard is applied for zero-norm inputs.
    """
    inner = np.dot(r_i.T, r_j)
    norm_i = math.sqrt(np.dot(r_i.T, r_i))
    norm_j = math.sqrt(np.dot(r_j.T, r_j))
    return inner / (norm_i * norm_j)


def ACS_f(r_i, r_j, a=0.05):
    """Asymmetric cosine similarity of two vectors.

    Computes ``<r_i, r_j> / (<r_i, r_i>**a * <r_j, r_j>**(1 - a))``.

    Parameters
    ----------
    r_i, r_j : array-like
        The two vectors to compare. The measure is not symmetric in
        (r_i, r_j) unless ``a == 0.5``.
    a : float, default 0.05
        Exponent applied to the squared norm of ``r_i``; ``1 - a`` is applied
        to the squared norm of ``r_j``. ``a = 0.5`` reduces to the ordinary
        cosine similarity.
    """
    inner = np.dot(r_i.T, r_j)
    sq_norm_i = np.dot(r_i.T, r_i)
    sq_norm_j = np.dot(r_j.T, r_j)
    return inner / (math.pow(sq_norm_i, a) * math.pow(sq_norm_j, 1 - a))


def JS_f(r_i, r_j, mode=None):
    """Jaccard-style similarity of two vectors.

    Parameters
    ----------
    r_i, r_j : array-like
        The two vectors to compare.
    mode : {'BASE', 'PCA'}
        'BASE': ``<r_i, r_j>`` divided by the number of positions where
        ``r_i + r_j > 0``.
        'PCA': ``<r_i, r_j> / (<r_i, r_i> * <r_j, r_j> - <r_i, r_j>)``.

    Raises
    ------
    ValueError
        If ``mode`` is neither 'BASE' nor 'PCA'.
    """
    if mode == 'BASE':
        # Size of the "union": positions that are positive in the summed vector.
        union_size = int(np.count_nonzero(np.asarray(r_i + r_j) > 0))
        return np.dot(r_i.T, r_j) / union_size

    if mode == 'PCA':
        inner = np.dot(r_i.T, r_j)
        # NOTE(review): the usual real-valued (Tanimoto) form adds the squared
        # norms instead of multiplying them; the product is kept here so that
        # results stay unchanged -- confirm which form is intended.
        return inner / (np.dot(r_i.T, r_i) * np.dot(r_j.T, r_j) - inner)

    raise ValueError("JS_f: mode must be 'BASE' or 'PCA', got %r" % (mode,))


def PMI_f(r_i, r_j, mode=None, v_i=None, v_j=None):
    """Pointwise-mutual-information style similarity of two vectors.

    Parameters
    ----------
    r_i, r_j : array-like
        The two vectors to compare.
    mode : {'BASE', 'PCA'}
        'BASE': ``log(p_ij / (p_i * p_j))`` with ``p_ij = <r_i, r_j> / len``,
        ``p_i = <r_i, r_i> / len`` and ``p_j = <r_j, r_j> / len``; returns 0
        when ``p_ij`` is 0.
        'PCA': ``p_ij / (p_i * p_j)`` without the logarithm, where ``p_ij`` is
        taken from ``r_i``/``r_j`` and the marginals from ``v_i``/``v_j``.
    v_i, v_j : array-like, required in 'PCA' mode
        Vectors used for the marginal terms.

    Raises
    ------
    ValueError
        If ``mode`` is unknown, or if ``mode == 'PCA'`` and ``v_i`` or ``v_j``
        is missing.
    """
    if mode == 'BASE':
        p_i_j = np.dot(r_i.T, r_j) / len(r_i)
        if p_i_j == 0:
            # No co-occurrence: avoid log(0) and report zero association.
            return 0
        p_i = np.dot(r_i.T, r_i) / len(r_i)
        p_j = np.dot(r_j.T, r_j) / len(r_j)
        return math.log(p_i_j / (p_i * p_j))

    if mode == 'PCA':
        if v_i is None or v_j is None:
            raise ValueError("PMI_f: mode 'PCA' requires both v_i and v_j")
        p_i_j = np.dot(r_i.T, r_j) / len(r_i)
        p_i = np.dot(v_i.T, v_i) / len(v_i)
        p_j = np.dot(v_j.T, v_j) / len(v_j)
        return p_i_j / (p_i * p_j)

    raise ValueError("PMI_f: mode must be 'BASE' or 'PCA', got %r" % (mode,))

# Get similarity matrix

In [3]:
def SIM_matrix(R, similarity_method, mode=None):
    """Build the pairwise column-similarity matrix of ``R``.

    Parameters
    ----------
    R : pandas.DataFrame
        Data whose columns are compared pairwise (accessed via ``R.iloc``).
    similarity_method : {'CS', 'ACS', 'JS', 'PMI'}
        Which pairwise function to use (CS_f, ACS_f, JS_f or PMI_f).
    mode : {'BASE', 'PCA'}, optional
        Mode forwarded to JS_f / PMI_f. When omitted, the notebook-level
        ``current_mode`` variable is used (only looked up for 'JS' and 'PMI').

    Returns
    -------
    numpy.ndarray
        Square matrix with ``SIM[i, j] = sim(R[:, i], R[:, j])``. For an
        unrecognised ``similarity_method`` a message is printed and the
        all-zero matrix is returned.

    Notes
    -----
    For 'PMI' in 'PCA' mode the file ``base_data_for_pca_PMI.npy`` is loaded
    from the working directory and its columns are passed as ``v_i``/``v_j``.
    """
    n_items = R.shape[1]
    SIM = np.zeros([n_items, n_items])

    def resolve_mode():
        # Fall back to the notebook-level setting only when it is needed.
        return current_mode if mode is None else mode

    # Slice every column once instead of twice per (i, j) pair.
    columns = [R.iloc[:, idx] for idx in range(n_items)]

    if similarity_method == 'CS':
        def pairwise(i, j):
            return CS_f(columns[i], columns[j])
    elif similarity_method == 'ACS':
        def pairwise(i, j):
            return ACS_f(columns[i], columns[j])
    elif similarity_method == 'JS':
        active_mode = resolve_mode()

        def pairwise(i, j):
            return JS_f(columns[i], columns[j], mode=active_mode)
    elif similarity_method == 'PMI':
        active_mode = resolve_mode()
        if active_mode == 'PCA':
            base_data_for_pca_PMI = pd.DataFrame(np.load('base_data_for_pca_PMI.npy'))
            base_columns = [base_data_for_pca_PMI.iloc[:, idx] for idx in range(n_items)]

            def pairwise(i, j):
                return PMI_f(columns[i], columns[j], mode=active_mode,
                             v_i=base_columns[i], v_j=base_columns[j])
        else:
            def pairwise(i, j):
                return PMI_f(columns[i], columns[j], mode=active_mode)
    else:
        print('Wrong similarity function choice')
        return SIM

    # ACS is not symmetric, so every ordered pair is evaluated.
    for i in range(n_items):
        for j in range(n_items):
            SIM[i, j] = pairwise(i, j)
    return SIM

# Get the indices of the top-k most similar ingredients

In [4]:
def TOP_k_index(Sim_matrix, Row, K_ingredient):
    """Return the indices of the ``K_ingredient`` most similar items to ``Row``.

    Parameters
    ----------
    Sim_matrix : numpy.ndarray
        Square similarity matrix; row ``Row`` holds the similarities used.
    Row : int
        Index of the item whose neighbours are wanted; never part of the result.
    K_ingredient : int
        Number of neighbours requested.

    Returns
    -------
    list of int
        Neighbour indices in descending order of similarity. If fewer than
        ``K_ingredient`` other items exist, all of them are returned instead
        of raising ``IndexError``.
    """
    ranked = np.argsort(Sim_matrix[Row, :])[::-1]
    # Drop the item itself first, then truncate: same result as "take K,
    # remove Row, append the next best", without indexing past the end.
    neighbours = ranked[ranked != Row]
    return neighbours[:K_ingredient].tolist()

# Random delete (with seed)

In [5]:
def random_delete_index(df_renamed_rev_test, test_index):
    """Pick, for every test recipe, one present ingredient to hide.

    Parameters
    ----------
    df_renamed_rev_test : pandas.DataFrame
        Recipe x ingredient frame; a cell equal to 1 marks a present ingredient.
    test_index : iterable of int
        Row positions (used with ``.iloc``) of the recipes to process.

    Returns
    -------
    dict
        Maps each row position to the column position of the chosen ingredient.

    Notes
    -----
    Re-seeds the global ``random`` generator with 233 on every call so the
    selection is reproducible. ``random.sample`` raises ``ValueError`` for a
    recipe that has no ingredient equal to 1.
    """
    random.seed(233)

    rand_delete_index_dict = dict()
    for recipe_pos in test_index:
        # Work on the raw values: integer keys on a Series labelled with
        # ingredient names are label lookups in current pandas, not positions.
        recipe_values = df_renamed_rev_test.iloc[recipe_pos].to_numpy()
        present = np.flatnonzero(recipe_values == 1).tolist()
        rand_delete_index_dict[recipe_pos] = random.sample(present, 1)[0]

    return rand_delete_index_dict

# Get top N(10) ingredients recommendation

In [6]:
def TOP_ingredient(df_renamed_rev_test, Recipe_index, rand_delete_index_dict, N, Sim_matrix, Sim_ingredient_list):
    """Hide one ingredient of a recipe and rank all ingredients as candidates.

    Parameters
    ----------
    df_renamed_rev_test : pandas.DataFrame
        Recipe x ingredient frame (columns are ingredient names).
    Recipe_index : int
        Row position of the recipe to evaluate.
    rand_delete_index_dict : dict
        Maps a row position to the column position of the ingredient to hide.
    N : int
        Number of top-ranked ingredient names to return.
    Sim_matrix : numpy.ndarray
        Ingredient x ingredient similarity matrix.
    Sim_ingredient_list : sequence of sequences of int
        ``Sim_ingredient_list[i]`` lists the neighbour indices of ingredient i.

    Returns
    -------
    (list, object, int)
        The names of the ``N`` best-scoring ingredients, the name of the
        hidden ingredient, and the 1-based rank of the hidden ingredient.

    Notes
    -----
    The score of ingredient i is the similarity-weighted average of the
    recipe's values over i's neighbours. An ingredient whose neighbour
    similarities sum to exactly 0 gets score 0.0 instead of NaN (NaN would
    otherwise sort to the top of the ranking). The input frame is not modified.
    """
    ingredient_names = df_renamed_rev_test.columns.values
    Delete_index = rand_delete_index_dict[Recipe_index]  # column position of the ingredient to hide
    Delete_name = ingredient_names[Delete_index]          # name of the hidden ingredient

    # Private positional copy of the recipe row: hiding the ingredient here
    # neither touches the frame nor needs to be undone afterwards.
    Recipe_vec = df_renamed_rev_test.iloc[Recipe_index].to_numpy(copy=True)
    Recipe_vec[Delete_index] = 0

    P_recipe = []
    for i in range(df_renamed_rev_test.shape[1]):
        Numerator = 0
        Denominator = 0
        for j in Sim_ingredient_list[i]:
            Numerator = Numerator + Recipe_vec[j] * Sim_matrix[i, j]
            Denominator = Denominator + Sim_matrix[i, j]
        # How well ingredient i fits the recipe; guard the degenerate 0/0 case.
        P_recipe.append(Numerator / Denominator if Denominator != 0 else 0.0)

    # Ingredient positions ordered from best to worst fit.
    Ingredient_index = np.argsort(P_recipe)[::-1]
    Top_N_ingredient_name = [ingredient_names[n] for n in Ingredient_index[:N]]

    # 1-based position of the hidden ingredient in the full ranking.
    Ingredient_rank = int(np.flatnonzero(Ingredient_index == Delete_index)[0]) + 1

    return Top_N_ingredient_name, Delete_name, Ingredient_rank

# Dataset Split

In [7]:
def data_split(path="processed_data_with_cuisine.csv", n_train=34625, seed=1):
    """Load the recipe table, shuffle its rows and split it into train/test.

    Parameters
    ----------
    path : str, default "processed_data_with_cuisine.csv"
        CSV file to read. It must contain an 'Unnamed: 0' column, which is
        dropped.
    n_train : int, default 34625
        Number of shuffled rows assigned to the training frame; all remaining
        rows form the test frame.
    seed : int, default 1
        Seed passed to ``np.random.seed`` before shuffling (this re-seeds
        NumPy's global generator).

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``(train, test)``. The test frame is re-indexed from 0 so that row
        labels and row positions coincide, whatever the size of the input.
    """
    df_renamed_rev = pd.read_csv(path).drop('Unnamed: 0', axis=1)

    np.random.seed(seed)
    # np.random.permutation shuffles along the first axis and returns a plain
    # array, so the column names have to be re-attached.
    df_renamed_rev_shuf = pd.DataFrame(np.random.permutation(df_renamed_rev),
                                       columns=df_renamed_rev.columns.values)

    df_renamed_rev_train = df_renamed_rev_shuf.iloc[:n_train]
    df_renamed_rev_test = df_renamed_rev_shuf.iloc[n_train:].reset_index(drop=True)
    return df_renamed_rev_train, df_renamed_rev_test

# Run test-points and get recall@10, mean, median...

In [18]:
def test_f(df_renamed_rev_test, test_index, N, K, S):
    """Evaluate hidden-ingredient recovery on the given test recipes.

    For every ingredient the K nearest neighbours are taken from ``S``; one
    ingredient per recipe is then hidden and all ingredients are ranked.

    Returns
    -------
    (recall, mean_rank, median_rank)
        ``recall`` is the share of recipes whose hidden ingredient is among
        the top ``N`` suggestions; the other two summarise its 1-based rank.
    """
    n_recipes = len(test_index)
    n_ingredients = df_renamed_rev_test.shape[1]

    # Neighbour list of every ingredient.
    neighbour_lists = [TOP_k_index(S, col, K) for col in range(n_ingredients)]

    # Which ingredient to hide for each test recipe.
    hidden_by_recipe = random_delete_index(df_renamed_rev_test, test_index)

    ranks = []
    hits = 0
    for recipe in test_index:
        suggested, hidden_name, hidden_rank = TOP_ingredient(
            df_renamed_rev_test, recipe, hidden_by_recipe, N, S, neighbour_lists)
        ranks.append(hidden_rank)
        if hidden_name in suggested:
            hits += 1

    return hits / n_recipes, np.mean(ranks), np.median(ranks)

In [9]:
def gen_IC_mat(df_renamed_rev_train):
    """Build the normalised cuisine x ingredient count matrix.

    Parameters
    ----------
    df_renamed_rev_train : pandas.DataFrame
        Training recipes. Must contain a 'cuisine' column; every column except
        the last one is treated as an ingredient, and a cell equal to 1 marks
        the ingredient as present.

    Returns
    -------
    pandas.DataFrame
        Rows are cuisines, columns are ingredients. Each cell is
        ``count(ingredient, cuisine) / count(cuisine) * n_recipes / 100``.

    Notes
    -----
    Rows are matched by position, so the frame's index does not have to be
    0..n-1.
    """
    cuisine_labels = df_renamed_rev_train['cuisine'].to_numpy()
    n_recipes = df_renamed_rev_train.shape[0]

    # Number of recipes per cuisine.
    C_total_count_dict = defaultdict(int)
    for label in cuisine_labels:
        C_total_count_dict[label] += 1

    # {ingredient: {cuisine: count}}
    I_C_dict = defaultdict(dict)
    for ingredient in df_renamed_rev_train.columns[:-1]:
        uses_ingredient = df_renamed_rev_train[ingredient].to_numpy() == 1
        C_count_dict = defaultdict(int)
        # Walk the matching recipes in row order so cuisines enter the dict in
        # order of first appearance.
        for label in cuisine_labels[uses_ingredient]:
            C_count_dict[label] += 1
        I_C_dict[ingredient] = C_count_dict

    IC_mat = pd.DataFrame(I_C_dict).fillna(0)

    # Normalise each cuisine row by that cuisine's recipe count. A single
    # vectorised division avoids writing floats row-by-row into int columns.
    totals = pd.Series({c: C_total_count_dict[c] for c in IC_mat.index})
    return IC_mat.div(totals, axis=0) * n_recipes / 100

In [10]:
def tfidf_transfer(IC_mat):
    """Scale every column by ``n_rows / (number of non-zero rows)``.

    Parameters
    ----------
    IC_mat : pandas.DataFrame
        Matrix to re-weight. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``IC_mat`` object, after re-weighting.

    Notes
    -----
    The weight is the plain ratio (no logarithm is taken). A column with no
    non-zero entry is left untouched instead of raising ``ZeroDivisionError``.
    """
    n_rows = IC_mat.shape[0]
    for col in IC_mat.columns:
        doc_freq = n_rows - int((IC_mat[col] == 0).sum())
        if doc_freq == 0:
            # Nothing non-zero in this column: there is nothing to scale.
            continue
        IC_mat[col] = IC_mat[col] * (n_rows / doc_freq)
    return IC_mat

In [34]:
def projection(IC_mat):
    """Min-max scale every column of ``IC_mat`` to the range [0, 1].

    Parameters
    ----------
    IC_mat : pandas.DataFrame
        Matrix to scale; it is not modified.

    Returns
    -------
    pandas.DataFrame
        A deep copy in which each column is ``(x - min) / (max - min)``.
        A constant column (``max == min``) becomes all zeros rather than NaN.
    """
    IC_mat_projected = IC_mat.copy(deep=True)
    for col in IC_mat.columns:
        column = IC_mat[col]
        min_num = column.min()
        span = column.max() - min_num
        if span == 0:
            # Constant column: the ratio would be 0/0.
            IC_mat_projected[col] = 0.0
        else:
            IC_mat_projected[col] = (column - min_num) / span
    return IC_mat_projected

# Main routine (edit the parameters below, then re-run)

## split the dataset

In [23]:
df_renamed_rev_train, df_renamed_rev_test = data_split()
# TODO: the test frame must hold ingredient columns only, so its 'cuisine'
# column is removed. Re-binding (instead of an in-place drop) avoids mutating
# a frame that was produced by slicing.
df_renamed_rev_test = df_renamed_rev_test.drop(columns='cuisine')

In [24]:
# Cuisine x ingredient matrix from the training recipes, re-weighted by
# tfidf_transfer.
IC_mat = tfidf_transfer(gen_IC_mat(df_renamed_rev_train))

In [38]:
# Display the re-weighted cuisine x ingredient matrix.
IC_mat

Unnamed: 0,active dry yeast,avocado,bacon,baguette,baking powder,basil,bay leaf,beans,beansprouts,beef,...,worcestershire sauce,zucchini,chicken breast,cheddar,egg,onion,sausage,tomato,yogurt,spray
french,5.639513,0.322258,22.620714,12.027125,8.57206,20.358643,40.411141,28.16534,0.0,17.14412,...,2.416934,12.398873,11.63351,3.571692,136.999889,98.731764,3.241535,46.993258,3.06145,23.420093
italian,6.210186,1.23133,11.302063,7.774205,8.747797,86.409922,12.05365,18.563638,0.0,24.259878,...,2.569732,14.749192,23.344411,5.707542,72.220182,120.739938,24.532125,117.739241,1.638799,22.886677
cajun_creole,8.211253,1.059517,14.818516,2.875831,7.80069,21.137355,74.735647,35.48056,0.0,15.601381,...,46.353848,3.019622,40.009993,5.032703,47.559048,239.556686,109.535311,116.255451,0.838784,10.065407
indian,4.509798,0.683303,0.0,0.185468,5.322928,1.817585,28.172572,12.982752,0.370936,6.231721,...,0.273321,5.322928,28.691882,0.577011,15.838958,207.074897,0.0,122.816835,71.54939,8.179134
jamaican,1.54438,3.86095,8.150895,0.0,26.408898,5.135064,15.405191,30.810381,0.0,29.34322,...,6.94971,3.667903,27.876059,1.630179,42.547669,214.205508,0.0,49.149894,0.815089,8.802966
chinese,1.380001,1.073334,3.722596,0.208095,6.700673,4.370004,1.456668,7.574674,38.705752,18.499684,...,4.906671,4.078671,58.84939,0.323704,84.486748,154.261149,3.427454,11.070677,0.323704,5.972339
southern_us,2.46266,0.985064,38.576201,1.336873,61.857095,5.989189,12.071959,10.013176,0.0,6.550676,...,15.859531,1.310135,10.948986,34.936937,131.107095,106.214527,8.807631,31.911149,2.807432,10.106757
russian,27.378305,0.855572,9.031038,0.0,27.634977,2.43838,30.88615,13.817488,0.0,43.890845,...,5.133432,0.812793,8.940728,2.709311,191.006455,144.67723,5.737365,39.826878,9.031038,3.251174
greek,1.427909,1.427909,1.130428,0.968938,11.191234,22.721596,10.852106,14.243389,0.0,16.617287,...,1.070931,13.226004,27.808521,1.884046,58.330069,161.425073,0.398974,128.868756,84.405267,21.025955
mexican,1.092586,65.490863,7.19107,0.436116,9.402663,2.869644,9.707944,37.915932,0.0,42.006701,...,3.663375,7.8152,44.754232,66.686994,33.580938,218.581379,4.022529,133.346852,6.919708,16.240963


In [35]:
# Min-max project every ingredient column to [0, 1] (used by the concatenate
# method) and store the result as 'cuisine_vec.npy'.
IC_mat_projected = projection(IC_mat)
np.save('cuisine_vec.npy', IC_mat_projected)

In [39]:
# Display the projected matrix on a 0-10 scale.
IC_mat_projected * 10

Unnamed: 0,active dry yeast,avocado,bacon,baguette,baking powder,basil,bay leaf,beans,beansprouts,beef,...,worcestershire sauce,zucchini,chicken breast,cheddar,egg,onion,sausage,tomato,yogurt,spray
french,2.059847,0.049207,5.863904,5.811229,1.205192,2.271904,5.348652,7.209026,0.0,1.953266,...,0.52141,5.69134,1.517131,0.53559,6.916861,0.293136,0.295935,3.431441,0.362708,8.863116
italian,2.268287,0.188016,2.929802,3.756316,1.234197,10.0,1.5059,4.460669,0.0,3.226951,...,0.554373,6.845869,3.194008,0.85587,3.218704,1.810129,2.239654,8.812794,0.194158,8.646684
cajun_creole,2.999182,0.161781,3.841362,1.389535,1.077875,2.363015,10.0,9.302908,0.0,1.677123,...,10.0,1.084043,5.580341,0.754675,1.810843,10.0,10.0,8.699928,0.099376,3.444466
indian,1.647216,0.104336,0.0,0.089614,0.668915,0.102574,3.690191,2.863216,0.023583,0.0,...,0.058964,2.215479,3.959708,0.086525,0.0,7.761076,0.0,9.199025,8.476887,2.679113
jamaican,0.564089,0.58954,2.112933,0.0,4.1492,0.490724,1.960071,7.966133,0.0,4.136844,...,1.499274,1.402493,3.842891,0.244452,1.524753,8.252579,0.0,3.595488,0.096569,2.932232
chinese,0.504049,0.163891,0.964998,0.100547,0.896315,0.401211,0.069893,1.315227,2.460753,2.195905,...,1.058525,1.604272,8.277942,0.048541,3.91898,4.120699,0.312909,0.69896,0.038351,1.783708
southern_us,0.899493,0.150412,10.0,0.645946,10.0,0.590658,1.508381,2.013215,0.0,0.057091,...,3.421405,0.244305,1.419115,5.238943,6.580452,0.808913,0.804091,2.284208,0.332613,3.461244
russian,10.0,0.13064,2.341091,0.0,4.351566,0.175208,4.057911,3.102148,0.0,6.740797,...,1.107445,0.0,1.131553,0.406273,10.0,3.460093,0.523791,2.886325,1.069961,0.679598
greek,0.521548,0.218032,0.293038,0.468168,1.637492,2.548373,1.343078,3.224056,0.0,1.858965,...,0.231034,6.097645,3.833221,0.282521,2.425742,4.614498,0.036424,9.65937,10.0,7.891697
mexican,0.39907,10.0,1.864121,0.210721,1.342284,0.225666,1.188031,10.0,0.0,6.403545,...,0.790307,3.439738,6.259666,10.0,1.012858,8.554202,0.367236,10.0,0.81982,5.950192


## parameter settings

In [19]:
# Representation of the data: one of {'PCA', 'BASE'}.
# 'PCA' reduces IC_mat with PCA before computing similarities; 'BASE' uses IC_mat as is.
current_mode = 'PCA'

# Pairwise similarity function: one of {'CS', 'ACS', 'JS', 'PMI'}.
current_similarity_method = 'CS'

# Number of ingredients recommended per recipe (the N in recall@N).
N = 10

# Number of nearest-neighbour ingredients used when scoring a candidate.
K = 50

# Number of test recipes to evaluate: 'ALL' or an integer sample size.
num_test_points = 'ALL'

# RUN

In [20]:
# Choose the matrix whose columns (ingredients) are compared pairwise.
if current_mode == 'PCA':
    pca = PCA(n_components=20)
    new_data = pca.fit_transform(IC_mat.T)
    df_new_data = pd.DataFrame(new_data.T)
elif current_mode == 'BASE':
    df_new_data = IC_mat
else:
    # Fail loudly instead of silently reusing a df_new_data left over from an
    # earlier run of this cell.
    raise ValueError("current_mode must be 'PCA' or 'BASE', got %r" % (current_mode,))

# Choose the test recipes: every row, or a reproducible random sample.
if num_test_points == 'ALL':
    test_index = list(range(df_renamed_rev_test.shape[0]))
else:
    random.seed(666)
    test_index = random.sample(list(df_renamed_rev_test.index), num_test_points)

# process_time() counts CPU time of this process, not wall-clock time.
start = time.process_time()

S = SIM_matrix(R=df_new_data, similarity_method=current_similarity_method)

recall_test, mean_rank_test, median_rank_test = test_f(df_renamed_rev_test, test_index, N, K, S)
print('Recall rate is :%.2f%%' % (recall_test * 100))
print('The mean of rank is :%.2f' % (mean_rank_test))
print('The median of rank is :%.2f' % (median_rank_test))

end = time.process_time()
print('Running time: %s Seconds' % (end - start))

Recall rate is :41.25%
The mean of rank is :32.65
The median of rank is :15.00
Running time: 799.632284 Seconds
