#### Import libraries

In [18]:
import numpy as np
import pandas as pd

#### Parameters  

In [19]:
# Path of the ratings CSV that is loaded below.
csv_in = 'dm-end1-3.csv'

# A candidate item is scored only when it has a valid (non-NaN) similarity
# to at least this many of the items the target user has rated.
min_common_items = 3

# Raise pandas' display limits so wide/long frames are shown in full.
pd.set_option('display.max_rows', 999)
pd.set_option('display.max_columns', 999)

#### Read CSV file  

In [20]:
# Load the ratings table; the first line of the file supplies the column names.
df_orig = pd.read_csv(csv_in, delimiter=',', skiprows=0, header=0)
print(df_orig.shape)
# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly; wrapping it in print() would emit a stray "None" line.
df_orig.info()
display(df_orig.head())

(50, 10)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 10 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   A       26 non-null     float64
 1   B       27 non-null     float64
 2   C       21 non-null     float64
 3   D       24 non-null     float64
 4   E       30 non-null     float64
 5   F       28 non-null     float64
 6   G       27 non-null     float64
 7   H       30 non-null     float64
 8   I       25 non-null     float64
 9   J       25 non-null     float64
dtypes: float64(10)
memory usage: 4.0 KB
None


Unnamed: 0,A,B,C,D,E,F,G,H,I,J
0,1.0,2.0,,0.0,4.0,,,3.0,,0.0
1,,2.0,,1.0,,3.0,,3.0,1.0,
2,,3.0,4.0,,2.0,1.0,4.0,1.0,,0.0
3,,2.0,3.0,,2.0,1.0,3.0,0.0,3.0,
4,,2.0,0.0,,,,2.0,4.0,,2.0


相関行列を計算し、dfに格納   
Compute the correlation matrix between the columns of `df_orig` and store it in the DataFrame `df`.


In [21]:
# Similarity matrix: pairwise Pearson correlation between the columns of
# df_orig. A pair of columns with fewer than 12 rows where both values are
# present is left as NaN instead of getting an unreliable estimate.
df = df_orig.corr(
    min_periods=12,
    method='pearson',
)
display(df)

Unnamed: 0,A,B,C,D,E,F,G,H,I,J
A,1.0,,,-0.089188,-0.648825,-0.63587,-0.214124,-0.282402,0.214412,
B,,1.0,,0.48134,0.121439,0.243721,0.173057,-0.443077,-0.043745,-0.106924
C,,,1.0,,,0.428571,,0.030041,,
D,-0.089188,0.48134,,1.0,0.038554,0.012099,,0.15745,0.123352,0.256575
E,-0.648825,0.121439,,0.038554,1.0,0.374204,-0.040359,-0.250747,-0.263955,-0.126361
F,-0.63587,0.243721,0.428571,0.012099,0.374204,1.0,-0.216386,0.468389,-0.126563,-0.168568
G,-0.214124,0.173057,,,-0.040359,-0.216386,1.0,-0.130196,0.086486,0.173514
H,-0.282402,-0.443077,0.030041,0.15745,-0.250747,0.468389,-0.130196,1.0,-0.421996,0.018282
I,0.214412,-0.043745,,0.123352,-0.263955,-0.126563,0.086486,-0.421996,1.0,0.287958
J,,-0.106924,,0.256575,-0.126361,-0.168568,0.173514,0.018282,0.287958,1.0


In [22]:
def predict_scores(df_sim, ser_target, min_items=None):
    """Predict the target user's scores for candidate items.

    Each row of ``df_sim`` is a candidate item and each column is an item the
    target user has rated. The prediction for a candidate is the
    similarity-weighted average of the user's ratings,
    ``sum(sim * rating) / sum(|sim|)``, taken over the rated items whose
    similarity to the candidate is not NaN.

    Parameters
    ----------
    df_sim : pandas.DataFrame
        Similarities; index = candidate items, columns = rated items.
    ser_target : pandas.Series
        The target user's ratings, indexed by the labels in ``df_sim.columns``.
    min_items : int, optional
        Minimum number of non-NaN similarities a candidate must have to be
        scored. When omitted, the module-level ``min_common_items`` is used.

    Returns
    -------
    pandas.Series
        Predicted scores indexed by candidate item, sorted in descending
        order. Candidates with too few valid similarities, or whose
        similarities are all zero (undefined weighted average), are omitted.
    """
    if min_items is None:
        # Fall back to the notebook-wide parameter.
        min_items = min_common_items

    ret = {}
    for item, sims in df_sim.iterrows():
        sims = sims.dropna()
        if len(sims) < min_items:
            continue
        weight = sims.abs().sum()
        if weight == 0:
            # All similarities are zero: 0/0 would yield a NaN prediction.
            continue
        ratings = ser_target.loc[sims.index]
        ret[item] = (sims * ratings).sum() / weight

    # Explicit dtype keeps an empty result a float Series.
    return pd.Series(ret, dtype=float).sort_values(ascending=False)

Function for item-based collaborative filtering.  

Arguments: the item-item similarity matrix `df` and a dictionary of  
the target user's scores (item name -> score).  

ex)
```
get_recomm_by_item_sim(df, {'maguro':1, 'ika':1, 'uni':3,
                        'awabi':4, 'hirame':4, 'aoyagi':4})  
```
-> returns a pandas Series of predicted scores, indexed by item name and sorted in descending order, e.g. akagai 2.9835603009918303, mirugai 2.945676429588114, ...

In [23]:
def get_recomm_by_item_sim(df, target_dic):
    """Score the items the target user has not rated yet.

    Parameters
    ----------
    df : pandas.DataFrame
        Square item-by-item similarity matrix (same labels on index and columns).
    target_dic : dict
        The target user's known scores, mapping item label -> score.

    Returns
    -------
    pandas.Series
        The result of ``predict_scores`` for the unrated items.
    """
    ratings = pd.Series(target_dic)
    rated_items = ratings.index

    # Keep only the similarity columns of the rated items, then remove the
    # rows of those same items so that only unrated candidates remain.
    candidates = df[rated_items].drop(index=rated_items)

    return predict_scores(candidates, ratings)

#### Do recommendation  

In [24]:
# Example 1: the target user has rated items A, B and C.
recomm = get_recomm_by_item_sim(df, {'A': 1, 'B': 2, 'C': 3})
print('Number of items calculated:', len(recomm))
print('Recommendation:')
print(recomm.head())

Number of items calculated: 2
Recommendation:
F    0.869376
H   -1.427405
dtype: float64


In [25]:
# Example 2: the target user has rated items A, B, C and D.
recomm = get_recomm_by_item_sim(df, {'A': 0, 'B': 0, 'C': 4, 'D': 4})
print('Number of items calculated:', len(recomm))
print('Recommendation:')
print(recomm.head())

Number of items calculated: 4
Recommendation:
F    1.335101
I    1.293309
H    0.821454
E    0.190669
dtype: float64
