In [4]:
import pandas as pd
import numpy as np
import copy

u.data     -- The full u data set, 100000 ratings by 943 users on 1682 items.
              Each user has rated at least 20 movies.  Users and items are
              numbered consecutively from 1.  The data is randomly
              ordered. This is a tab separated list of
	         user id | item id | rating | timestamp.
              The time stamps are unix seconds since 1/1/1970 UTC

In [5]:
def similarity_matrix(user_data: pd.DataFrame) -> pd.DataFrame:
    """Compute pairwise cosine similarity between the rows of a rating matrix.

    Parameters
    ----------
    user_data : pd.DataFrame
        One row per user and one column per item. Missing ratings (NaN) are
        treated as 0 before the similarity is computed.

    Returns
    -------
    pd.DataFrame
        Square frame whose rows and columns are both labelled with
        ``user_data.index`` (both axes are named ``'user id'``). Entry
        ``[a, b]`` is the cosine similarity between the rating vectors of
        users ``a`` and ``b``. A user whose vector is all zeros has
        similarity 0 with everyone (including themselves) instead of NaN.
    """
    ratings = user_data.fillna(0)
    values = ratings.to_numpy(dtype=float)

    # Gram matrix of the user vectors; its diagonal holds the squared norms.
    dot_products = values @ values.T
    norms = np.sqrt(np.diag(dot_products))
    denominators = np.outer(norms, norms)

    # Divide only where the denominator is non-zero so that users without any
    # rating end up with similarity 0 rather than 0/0 = NaN.
    cosine_sim = np.zeros_like(dot_products)
    np.divide(dot_products, denominators, out=cosine_sim, where=denominators > 0)

    # Label both axes with the user ids (the row labels of the input). Using
    # the item ids here would only work when they coincide with the user ids.
    similarities = pd.DataFrame(cosine_sim, index=ratings.index, columns=ratings.index)
    similarities = similarities.rename_axis('user id', axis=1).rename_axis('user id', axis=0)
    return similarities

def recommend(user_id, k, data):
    """Recommend one unrated item to a user from their k most similar users.

    Parameters
    ----------
    user_id
        Row label in ``data`` of the user to recommend for.
    k : int
        Number of nearest neighbours (by cosine similarity) to aggregate.
    data : pd.DataFrame
        Rating matrix with one row per user and one column per item; NaN
        marks an item the user has not rated.

    Returns
    -------
    The column label of the unrated item with the highest
    similarity-weighted sum of neighbour ratings, or ``None`` when the user
    has no unrated items or the best score is 0.
    """
    # calculating similarity matrix
    sim_mat = similarity_matrix(data)

    # getting k-nn for user: rank everyone by similarity, then remove the
    # user by label (not by position, which breaks when another user ties at
    # the top) and ignore undefined similarities.
    ranked = sim_mat.loc[:, user_id].sort_values(ascending=False)
    neighbors = ranked.drop(labels=user_id, errors='ignore').dropna().iloc[:k]

    # Accumulator indexed by the item labels of `data` itself, so the function
    # does not depend on any global frame or on item ids being 1..n.
    recommendations_vector = pd.Series(0.0, index=data.columns)

    # adding up weighted recommendations
    for neighbor_id, weight in neighbors.items():
        recommendations_vector = recommendations_vector + data.loc[neighbor_id, :].fillna(0) * weight

    # Only items the user has not rated yet are candidates.
    user_vector = data.loc[user_id, :]
    candidates = recommendations_vector[user_vector.isna()].sort_values(ascending=False)
    if candidates.empty:
        return None

    best_item, best_score = candidates.index[0], candidates.iloc[0]

    # returning none if highest score is 0 or recommendation in other case
    if best_score == 0:
        return None
    return best_item


In [7]:
# Location of the MovieLens 100k ratings file.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
path_to_data = '/home/michalmierzejewski/PycharmProjects/SUS2023/lab2/ml-100k/u.data'

# Read the first three tab-separated columns (user, item, rating) and reshape
# them into a user-by-item matrix; unrated combinations become NaN.
user_data = pd.read_csv(
    path_to_data,
    sep='\t',
    names=['user id', 'item id', 'rating'],
    usecols=[0, 1, 2],
).pivot(index='user id', columns='item id', values='rating')

# Two groups of users to produce recommendations for.
arbitrary_10_users = [11, 22, 33, 44, 55, 66, 77, 88, 99, 111]
users_from_task = [1, 2, 3, 4, 5]

# Print one recommendation per user, the arbitrary group first.
for user_id in arbitrary_10_users + users_from_task:
    print(f"For user {user_id} I recommend {recommend(user_id=user_id, k=5, data=user_data)}")


For user 11 I recommend 50
For user 22 I recommend 183
For user 33 I recommend 259
For user 44 I recommend 210
For user 55 I recommend 183
For user 66 I recommend 118
For user 77 I recommend 318
For user 88 I recommend 269
For user 99 I recommend 96
For user 111 I recommend 345
For user 1 I recommend 474
For user 2 I recommend 750
For user 3 I recommend 313
For user 4 I recommend 305
For user 5 I recommend 746


# Homework 24.03.2023

In [32]:
def split_data(data, proportions, random_state=None):
    """Shuffle the rows of ``data`` and split them into train/validation/test.

    Parameters
    ----------
    data : pd.DataFrame
        Frame whose rows are to be distributed over the three subsets.
    proportions : sequence of 3 floats
        Fractions for train, validation and test, in that order. They must
        sum to 1 (up to floating-point rounding).
    random_state : optional
        Passed to ``DataFrame.sample`` to make the shuffle reproducible.
        The default ``None`` gives a different shuffle on every call.

    Returns
    -------
    tuple of (train, valid, test) DataFrames, or ``None`` when the
    proportions do not sum to 1.
    """
    # Compare with a tolerance: e.g. 0.7 + 0.2 + 0.1 == 0.9999999999999999,
    # which an exact `!= 1` check would wrongly reject.
    if abs(sum(proportions) - 1) > 1e-9:
        return None
    number_of_data_points = len(data)

    train_size = int(number_of_data_points * proportions[0])
    valid_size = int(number_of_data_points * proportions[1])
    test_size = int(number_of_data_points * proportions[2])

    # Rows lost to truncation in int() are given to the training set.
    train_size += number_of_data_points - (train_size + valid_size + test_size)

    # frac=1 returns every row, in random order.
    shuffled = data.sample(frac=1, random_state=random_state)

    train = shuffled.iloc[:train_size, :]
    valid = shuffled.iloc[train_size:(train_size + valid_size), :]
    test = shuffled.iloc[(train_size + valid_size):, :]

    return train, valid, test



In [35]:
# Shuffle the user-by-item matrix and carve it into 80% / 10% / 10%
# train / validation / test subsets (rows, i.e. users, are what gets split).
train, valid, test = split_data(data=user_data, proportions=[0.8, 0.1, 0.1])

# Show the validation subset as plain text.
print(valid)

item id  1     2     3     4     5     6     7     8     9     10    ...  \
user id                                                              ...   
809       NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   
52        NaN   NaN   NaN   NaN   NaN   NaN   5.0   NaN   NaN   NaN  ...   
144       4.0   NaN   NaN   4.0   NaN   NaN   2.0   4.0   5.0   NaN  ...   
580       3.0   NaN   5.0   NaN   NaN   NaN   3.0   NaN   NaN   NaN  ...   
754       NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   4.0   NaN  ...   
...       ...   ...   ...   ...   ...   ...   ...   ...   ...   ...  ...   
895       4.0   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   
777       4.0   NaN   NaN   NaN   NaN   NaN   NaN   NaN   5.0   NaN  ...   
761       1.0   NaN   NaN   NaN   NaN   NaN   4.0   NaN   2.0   NaN  ...   
855       NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   
133       NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   

item id  16