In [1]:
import numpy as np, pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

# ==== ORIGINAL DATA ====
# User x item rating matrix: one row per user, one column per item.
# np.nan marks a rating that is missing and is to be predicted.
R = pd.DataFrame.from_dict(
    {
        'U1': [4, 3, np.nan, 4],
        'U2': [3, 3, 5, 5],
        'U3': [5, 4, 3, 4],
        'U4': [np.nan, 2, 3, 5],
    },
    orient='index',
    columns=['I1', 'I2', 'I3', 'I4'],
)
R


Unnamed: 0,I1,I2,I3,I4
U1,4.0,3,,4
U2,3.0,3,5.0,5
U3,5.0,4,3.0,4
U4,,2,3.0,5


## 1. User–User Pearson Similarity (theo đề gốc)
- Dự đoán r<sub>U1,I3</sub> và r<sub>U4,I1</sub>

In [2]:
def pearson(u, v):
    """Pearson correlation between users ``u`` and ``v`` of the global matrix ``R``.

    Only items rated by both users are used, and each user's ratings are
    centred on that user's mean over those shared items.

    Returns 0.0 when fewer than two items are shared or when either user's
    shared ratings have no variation (zero denominator).
    """
    ratings_u, ratings_v = R.loc[u], R.loc[v]
    both_rated = ratings_u.notna() & ratings_v.notna()
    if both_rated.sum() < 2:
        return 0.0

    dev_u = ratings_u[both_rated] - ratings_u[both_rated].mean()
    dev_v = ratings_v[both_rated] - ratings_v[both_rated].mean()

    norm = np.sqrt((dev_u ** 2).sum() * (dev_v ** 2).sum())
    if norm == 0:
        return 0.0
    return (dev_u * dev_v).sum() / norm

def predict_pearson(u, i):
    """Predict user ``u``'s rating of item ``i`` with user-based Pearson CF.

    The prediction is ``u``'s mean rating plus the similarity-weighted average
    of each neighbour's deviation from its own mean on item ``i``. Neighbours
    are all other users in ``R`` that have rated ``i``; the weights are
    normalised by the sum of absolute similarities.

    Returns ``u``'s mean when that sum is zero, otherwise the prediction
    clipped to the range [1, 5].
    """
    mu_u = R.loc[u].mean()

    # Every other user that actually rated item i.
    neighbours = [v for v in R.index if v != u and not pd.isna(R.loc[v, i])]

    weighted_dev = 0
    sim_total = 0
    for v in neighbours:
        sim = pearson(u, v)
        neighbour_mean = R.loc[v].mean()
        weighted_dev += sim * (R.loc[v, i] - neighbour_mean)
        sim_total += abs(sim)

    if sim_total == 0:
        return mu_u
    return np.clip(mu_u + weighted_dev / sim_total, 1, 5)

print("üéØ Pearson:")
print("r_U1,I3 =", round(predict_pearson('U1', 'I3'), 3))
print("r_U4,I1 =", round(predict_pearson('U4', 'I1'), 3))


üéØ Pearson:
r_U1,I3 = 3.5
r_U4,I1 = 3.213


## 2. Centered-Cosine Similarity (chuẩn hoá mean)
- Trừ trung bình mỗi user → cosine similarity

In [3]:
# Centre every user's row on that user's mean rating, then replace missing
# ratings with 0.0 (after centring, 0.0 is the user's own mean).
Rc = (
    R.subtract(R.mean(axis="columns"), axis="index")
     .fillna(0.0)
)

# User x user cosine similarity of the centred rows, labelled by user id.
S_cos = pd.DataFrame(
    data=cosine_similarity(Rc),
    index=R.index,
    columns=R.index,
)

def predict_centered_cosine(u, i):
    """Predict user ``u``'s rating of item ``i`` using the ``S_cos`` similarities.

    Starts from ``u``'s mean rating and adds the similarity-weighted average
    of the mean-centred ratings that the other users gave to ``i``, with
    weights looked up in the precomputed ``S_cos`` table.

    Returns ``u``'s mean when the absolute similarities sum to zero, otherwise
    the prediction clipped to [1, 5].
    """
    mu_u = R.loc[u].mean()
    num, den = 0, 0
    for v in R.index:
        if v == u:
            continue
        r_vi = R.loc[v, i]
        if pd.isna(r_vi):
            # Neighbour has no rating for this item.
            continue
        sim = S_cos.loc[u, v]
        num += sim * (r_vi - R.loc[v].mean())
        den += abs(sim)

    if den == 0:
        return mu_u
    return np.clip(mu_u + num / den, 1, 5)

print("\nüßÆ Centered-Cosine:")
print("r_U1,I3 =", round(predict_centered_cosine('U1','I3'),3))
print("r_U4,I1 =", round(predict_centered_cosine('U4','I1'),3))



üßÆ Centered-Cosine:
r_U1,I3 = 3.565
r_U4,I1 = 3.181


## 3. Item–Item Pearson (biến thể đề có thể ra)
- Dự đoán dựa trên tương đồng giữa item I<sub>3</sub> & các item user U1 đã rate.

In [4]:
def pearson_item(i, j):
    """Pearson correlation between item columns ``i`` and ``j`` of ``R``.

    Only users who rated both items are used, and each item's ratings are
    centred on that item's mean over those shared users.

    Returns 0.0 when fewer than two users rated both items or when the
    denominator is zero.
    """
    col_i, col_j = R[i], R[j]
    rated_both = col_i.notna() & col_j.notna()
    if rated_both.sum() < 2:
        return 0.0

    dev_i = col_i[rated_both] - col_i[rated_both].mean()
    dev_j = col_j[rated_both] - col_j[rated_both].mean()

    norm = np.sqrt((dev_i ** 2).sum() * (dev_j ** 2).sum())
    if norm == 0:
        return 0.0
    return (dev_i * dev_j).sum() / norm

def predict_item_based(u, i):
    """Predict user ``u``'s rating of item ``i`` with item-based Pearson CF.

    Starts from item ``i``'s mean rating and adds the similarity-weighted
    average of how far ``u``'s ratings of the other items sit from those
    items' means. Items that ``u`` has not rated are skipped.

    Returns item ``i``'s mean when the absolute similarities sum to zero,
    otherwise the prediction clipped to [1, 5].
    """
    mu_i = R[i].mean()
    num, den = 0, 0
    for j in R.columns:
        if j == i:
            continue
        r_uj = R.loc[u, j]
        if pd.isna(r_uj):
            continue
        sim = pearson_item(i, j)
        num += sim * (r_uj - R[j].mean())
        den += abs(sim)

    if den == 0:
        return mu_i
    return np.clip(mu_i + num / den, 1, 5)

print("\nüîπ Item-Item Pearson:")
print("r_U1,I3 =", round(predict_item_based('U1','I3'),3))
print("r_U4,I1 =", round(predict_item_based('U4','I1'),3))



üîπ Item-Item Pearson:
r_U1,I3 = 3.5
r_U4,I1 = 3.769


## 4. Z-Score Normalization (biến thể đề nâng cao)
- Chuẩn hoá mỗi user về Z-score, tính Pearson, giải chuẩn hoá ngược.

In [5]:
def predict_zscore(u, i):
    """Predict user ``u``'s rating of item ``i`` from z-score-normalised ratings.

    Each user's ratings in ``R`` are standardised with that user's mean and
    sample standard deviation. Neighbour z-scores on item ``i`` are averaged
    with Pearson weights (computed on the z-scores over co-rated items), and
    the result is mapped back to ``u``'s scale as ``mu_u + sigma_u * z``.

    A user whose standard deviation is zero (all ratings identical) or
    undefined (fewer than two ratings) has no z-scores. Such a neighbour is
    skipped; previously its ``0 * NaN`` contribution turned the whole
    prediction into NaN.

    Returns ``u``'s mean rating when no neighbour carries a non-zero weight,
    otherwise the prediction clipped to [1, 5].
    """
    def zscore_row(row):
        # Standardise one user's ratings; all-NaN when the spread is unusable.
        spread = row.std()
        if not spread > 0:  # also true when spread is NaN
            return pd.Series(np.nan, index=row.index)
        return (row - row.mean()) / spread

    zR = R.apply(zscore_row, axis=1)

    def pearson_z(u, v):
        # Pearson correlation of two users' z-scores over co-rated items.
        a, b = zR.loc[u], zR.loc[v]
        mask = a.notna() & b.notna()
        if mask.sum() < 2:
            return 0
        da = a[mask] - a[mask].mean()
        db = b[mask] - b[mask].mean()
        den = np.sqrt((da ** 2).sum() * (db ** 2).sum())
        return 0 if den == 0 else (da * db).sum() / den

    mu_u, sigma_u = R.loc[u].mean(), R.loc[u].std()
    num = den = 0
    for v in R.index:
        if v == u:
            continue
        z_vi = zR.loc[v, i]
        if pd.isna(z_vi):
            # v did not rate i, or v has no usable standard deviation.
            continue
        s = pearson_z(u, v)
        num += s * z_vi
        den += abs(s)
    return mu_u if den == 0 else np.clip(mu_u + sigma_u * (num / den), 1, 5)

print("\nüìä Z-score:")
print("r_U1,I3 =", round(predict_zscore('U1','I3'),3))
print("r_U4,I1 =", round(predict_zscore('U4','I1'),3))



üìä Z-score:
r_U1,I3 = 3.552
r_U4,I1 = 3.454


## 5. Baseline (Global mean + user bias + item bias)

In [6]:
# Global mean: average over every entry of R once it is stacked into one Series.
mu = R.stack().mean()

# User bias: each user's mean rating (row mean) relative to the global mean.
bu = R.mean(axis=1).sub(mu)

# Item bias: each item's mean rating (column mean) relative to the global mean.
bi = R.mean(axis=0).sub(mu)

def predict_baseline(u, i):
    """Baseline prediction for user ``u`` on item ``i``.

    Adds the user's bias ``bu[u]`` and the item's bias ``bi[i]`` to the global
    mean ``mu``, then clips the sum to [1, 5].
    """
    return np.clip(mu + bu[u] + bi[i], 1, 5)

print("\nüìà Baseline bias model:")
print("r_U1,I3 =", round(predict_baseline('U1','I3'),3))
print("r_U4,I1 =", round(predict_baseline('U4','I1'),3))



üìà Baseline bias model:
r_U1,I3 = 3.548
r_U4,I1 = 3.548


## 6. Slope-One (biến thể hay ra đề “tính chênh lệch trung bình”)

In [7]:
def slope_one(u, i):
    """Weighted Slope-One prediction of user ``u``'s rating for item ``i``.

    For every other item ``j`` that shares at least one rater with ``i``, the
    average difference ``R[i] - R[j]`` over those raters is computed. If ``u``
    rated ``j``, the estimate ``R[u, j] + difference`` is weighted by the
    number of shared raters.

    Returns the weighted mean of those estimates clipped to [1, 5], or NaN
    when no item contributes.
    """
    weighted_sum = 0
    weight_total = 0
    for j in R.columns:
        if j == i:
            continue
        co_rated = R[i].notna() & R[j].notna()
        support = co_rated.sum()
        if support < 1:
            continue
        if pd.isna(R.loc[u, j]):
            # u cannot contribute an estimate through an item it has not rated.
            continue
        avg_diff = (R.loc[co_rated, i] - R.loc[co_rated, j]).mean()
        weighted_sum += (R.loc[u, j] + avg_diff) * support
        weight_total += support

    if weight_total > 0:
        return np.clip(weighted_sum / weight_total, 1, 5)
    return np.nan

# Print the Slope-One predictions for the two cells that are NaN in R,
# rounded to 3 decimals.
print("\n‚öóÔ∏è Slope-One:")
print("r_U1,I3 =", round(slope_one('U1','I3'),3))
print("r_U4,I1 =", round(slope_one('U4','I1'),3))



‚öóÔ∏è Slope-One:
r_U1,I3 = 3.5
r_U4,I1 = 3.5


## 6 (lặp lại). Slope-One — ô dưới đây lặp lại nguyên ô Slope-One ở trên; Significance Weighting (giảm nhiễu khi giao nhau ít) nằm ở mục 7

In [8]:
def slope_one(u, i):
    """Weighted Slope-One prediction of user ``u``'s rating for item ``i``.

    NOTE(review): this cell redefines ``slope_one`` with the same logic as the
    definition in the preceding Slope-One cell; one of the two is redundant.

    Each other item ``j`` that shares raters with ``i`` and was rated by ``u``
    yields the estimate ``R[u, j] + mean(R[i] - R[j])``, weighted by the number
    of shared raters. Returns the weighted mean clipped to [1, 5], or NaN when
    no item yields an estimate.
    """
    # Phase 1: mean rating difference and supporting-rater count per item.
    diffs, counts = {}, {}
    for j in (col for col in R.columns if col != i):
        shared = R[i].notna() & R[j].notna()
        if not shared.any():
            continue
        diffs[j] = (R.loc[shared, i] - R.loc[shared, j]).mean()
        counts[j] = shared.sum()

    # Phase 2: keep the items u actually rated, in column order.
    usable = [j for j in R.columns if not pd.isna(R.loc[u, j]) and j in diffs]
    num = sum((R.loc[u, j] + diffs[j]) * counts[j] for j in usable)
    den = sum(counts[j] for j in usable)
    return np.clip(num / den, 1, 5) if den > 0 else np.nan

# Print the Slope-One predictions for the two cells that are NaN in R,
# rounded to 3 decimals.
# NOTE(review): these statements repeat the prints of the preceding Slope-One cell.
print("\n‚öóÔ∏è Slope-One:")
print("r_U1,I3 =", round(slope_one('U1','I3'),3))
print("r_U4,I1 =", round(slope_one('U4','I1'),3))



‚öóÔ∏è Slope-One:
r_U1,I3 = 3.5
r_U4,I1 = 3.5


## 7. Significance Weighting (giảm nhiễu khi giao nhau ít)

In [9]:
def pearson_sig(u, v, threshold=5):
    """Significance-weighted Pearson correlation between users ``u`` and ``v``.

    The Pearson correlation over the items both users rated in ``R`` is
    multiplied by ``min(n, threshold) / threshold``, where ``n`` is the number
    of co-rated items. Correlations backed by fewer than ``threshold`` shared
    items shrink toward 0; those backed by more keep their full value instead
    of being scaled above 1.

    Parameters
    ----------
    u, v : row labels of ``R``.
    threshold : number of co-rated items from which no shrinkage is applied
        (default 5, the constant that was previously hard-coded).

    Returns
    -------
    0 when fewer than two items are co-rated or when either user's co-rated
    ratings have no variation (zero denominator); otherwise the shrunk
    correlation.
    """
    a, b = R.loc[u], R.loc[v]
    mask = a.notna() & b.notna()
    n = mask.sum()
    if n < 2:
        return 0
    au, bv = a[mask] - a[mask].mean(), b[mask] - b[mask].mean()
    den = np.sqrt((au ** 2).sum() * (bv ** 2).sum())
    if den == 0:
        # A constant rater has no defined correlation; avoid 0/0 -> NaN.
        return 0
    s = (au * bv).sum() / den
    # Cap at 1 so large overlaps are not amplified.
    return s * (min(n, threshold) / threshold)

def predict_sig(u, i):
    """Predict user ``u``'s rating of item ``i`` with significance-weighted Pearson.

    Same scheme as the plain user-based predictor, with ``pearson_sig``
    supplying the weights: ``u``'s mean plus the weighted average of the
    neighbours' mean-centred ratings of ``i``, normalised by the sum of
    absolute weights.

    Returns ``u``'s mean when that sum is zero, otherwise the prediction
    clipped to [1, 5].
    """
    mu_u = R.loc[u].mean()

    # (weight, neighbour deviation on item i) for every usable neighbour.
    contributions = []
    for v in R.index:
        if v == u or pd.isna(R.loc[v, i]):
            continue
        weight = pearson_sig(u, v)
        contributions.append((weight, R.loc[v, i] - R.loc[v].mean()))

    den = sum(abs(weight) for weight, _ in contributions)
    if den == 0:
        return mu_u
    num = sum(weight * deviation for weight, deviation in contributions)
    return np.clip(mu_u + num / den, 1, 5)

print("\nüßÆ Significance-weighted Pearson:")
print("r_U1,I3 =", round(predict_sig('U1','I3'),3))
print("r_U4,I1 =", round(predict_sig('U4','I1'),3))



üßÆ Significance-weighted Pearson:
r_U1,I3 = 3.533
r_U4,I1 = 3.119


## 8. Tính toàn bộ ma trận dự đoán (cho các ô thiếu)

In [10]:
# Start from a copy so the original ratings in R stay untouched.
pred = R.copy()

# Visit every (user, item) cell row by row and fill only the missing ones
# with the user-based Pearson prediction.
for u, i in pd.MultiIndex.from_product([R.index, R.columns]):
    if pd.notna(R.loc[u, i]):
        continue
    pred.loc[u, i] = predict_pearson(u, i)

print("\nüóÇÔ∏è Ma tr·∫≠n ho√†n ch·ªânh d·ª± ƒëo√°n (Pearson):")
print(pred.round(3))



üóÇÔ∏è Ma tr·∫≠n ho√†n ch·ªânh d·ª± ƒëo√°n (Pearson):
       I1  I2   I3  I4
U1  4.000   3  3.5   4
U2  3.000   3  5.0   5
U3  5.000   4  3.0   4
U4  3.213   2  3.0   5
