# real stuff

In [2]:
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [None]:
"""
The dimensions of n_users, n_items and X must be checked for consistency before calling this.
"""

class Bias_Model(nn.Module):
    """Additive-bias rating model: ``prediction[u, i] = user_bias[u] + item_bias[i]``.

    Cells flagged in ``nan_map`` are forced to 0 in the prediction.

    Args:
        n_users: number of rows of the prediction matrix (length of ``user_bias``).
        n_items: number of columns of the prediction matrix (length of ``item_bias``).
        nan_map: boolean tensor of shape ``(n_users, n_items)``; ``True`` marks a
            cell whose prediction is replaced by 0.
        rand_init: if true, draw both bias vectors from ``U(min_bias, max_bias)``;
            otherwise they stay at zero.
        max_bias: upper bound of the uniform initialisation.
        min_bias: lower bound of the uniform initialisation.

    Raises:
        ValueError: if ``nan_map`` does not have shape ``(n_users, n_items)``.
    """

    def __init__(self, n_users, n_items, nan_map, rand_init=True, max_bias=3, min_bias=0):
        super().__init__()
        # Fail early: a differently shaped (but broadcastable) mask would otherwise
        # be accepted silently by torch.where in forward().
        if tuple(nan_map.shape) != (n_users, n_items):
            raise ValueError(
                "nan_map has shape {} but expected ({}, {})".format(
                    tuple(nan_map.shape), n_users, n_items
                )
            )
        # NOTE: this reseeds the *global* torch RNG so the random init is reproducible.
        torch.manual_seed(0)
        self.user_bias = nn.Parameter(torch.zeros(n_users))
        self.item_bias = nn.Parameter(torch.zeros(n_items))
        # Registered as a buffer so the mask follows the module across .to()/.cuda();
        # non-persistent so the state_dict keeps exactly the two bias entries.
        self.register_buffer("nan_map", nan_map, persistent=False)
        if rand_init:
            nn.init.uniform_(self.user_bias, min_bias, max_bias)
            nn.init.uniform_(self.item_bias, min_bias, max_bias)

    def forward(self, X):
        """Return the ``(n_users, n_items)`` prediction matrix.

        ``X`` is accepted for interface compatibility but is not read.
        """
        # Broadcasting a column vector against a row vector yields every
        # user_bias[u] + item_bias[i] without materialising all (u, i) pairs.
        y_pred = self.user_bias.unsqueeze(1) + self.item_bias.unsqueeze(0)
        return torch.where(self.nan_map, torch.zeros_like(y_pred), y_pred)
        

In [None]:
def fit_v1(interaction_matrix, model, nan_map, epochs = 100, learning_rate = 1):
    """Train ``model`` with Adam on an MSE loss, then plot and print the loss.

    Args:
        interaction_matrix: tensor of ratings; cells flagged in ``nan_map`` are
            replaced by 0 before training.
        model: module called as ``model(X_train)``; its output is compared with
            ``X_train`` element-wise.
        nan_map: boolean tensor, ``True`` where ``interaction_matrix`` is missing.
        epochs: number of full-batch optimisation steps.
        learning_rate: Adam learning rate.

    Returns:
        None. The loss curve is plotted and the first/last losses are printed.
        With ``epochs <= 0`` nothing is trained and only a notice is printed.

    Note:
        ``F.mse_loss`` averages over every cell of the matrix, including the
        zero-filled missing ones.
    """
    loss_history = []
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # NaNs would poison the loss and its gradients, so zero them out up front.
    X_train = torch.where(nan_map, torch.zeros_like(interaction_matrix), interaction_matrix)

    for _ in range(epochs):
        # Clear first so gradients left over from before this call cannot leak
        # into the first update.
        optimizer.zero_grad()
        y_hat = model(X_train)
        loss = F.mse_loss(y_hat, X_train)
        loss_history.append(loss.item())

        loss.backward()
        optimizer.step()

    if not loss_history:
        # Nothing was run; indexing loss_history[0] below would raise IndexError.
        print('No epochs were run; nothing to report')
        return

    plt.plot(loss_history, 'r-')
    plt.xlabel('epoch')
    plt.ylabel('MSE loss')
    plt.show()
    print('Loss before training', loss_history[0])
    print('Loss after training', loss_history[-1])

## Testing with small handcrafted input

In [None]:
# Hand-made ratings matrix used as a smoke test; NaN marks a missing rating.
train = torch.tensor(
    [
        [4,      np.nan, np.nan, 3,      5, 3],
        [np.nan, 1,      2,      np.nan, 3, np.nan],
        [4,      np.nan, np.nan, 3,      4, 2],
        [5,      1,      2,      np.nan, 3, np.nan],
        [4,      np.nan, 3,      3,      5, 1],
    ]
)

# Mask of the missing cells, plus the matrix dimensions (rows, columns).
nan_map = torch.isnan(train)
n_users, n_items = train.size()


In [None]:
model = Bias_Model(n_users=n_users, n_items=n_items, nan_map=nan_map)
fit_v1(interaction_matrix=train, nan_map=nan_map, model=model)

In [None]:
model_v2 = Bias_Model(n_users=n_users, n_items=n_items, nan_map=nan_map, rand_init=False)
fit_v1(interaction_matrix=train, nan_map=nan_map, model=model_v2)

In [None]:
model_v2.user_bias, model_v2.item_bias

In [None]:
model.user_bias, model.item_bias

In [None]:
final = torch.cartesian_prod(model.user_bias, model.item_bias).sum(-1).view(len(model.user_bias), len(model.item_bias))
final

In [None]:
final_v2 = torch.cartesian_prod(model_v2.user_bias, model_v2.item_bias).sum(-1).view(len(model_v2.user_bias), len(model_v2.item_bias))
final_v2

In [None]:
train

In [None]:
torch.where(nan_map, final, train)

In [None]:
torch.where(nan_map, final_v2, train)

## Playing with ratings only input from Amazon dataset

In [3]:
import os

In [17]:
def _get_mad_from_mean(arr):
    mean_arr = np.array([np.mean(arr)]*len(arr))
    return np.mean(abs(mean_arr - arr))


def _enumerate_arr(value_count_series):
    arr = np.empty(0)
    for rating, occurence in zip(value_count_series.index, value_count_series.values):
        arr = np.append(arr, [rating]*occurence)
    return arr


def _get_unique_user_item_data(df):
    unique_users, unique_items = len(df['u_id'].unique()), len(df['p_id'].unique())
    return unique_users, unique_items, unique_users*unique_items


def _get_unique_pairs_with_count(df):
    result_df = df.groupby(['u_id','p_id']).size().reset_index().rename(columns={0:'count'})
    return result_df


def _get_ratings_stats(counts_df, original_df):
    same_ratings, diff_ratings = 0, 0
    mad_arr = np.empty(0)
    multi_rating_df = counts_df[counts_df['count']>1]
    
    for ind, row in multi_rating_df.iterrows():
        temp = original_df[(original_df['u_id']==row['u_id']) & (original_df['p_id']==row['p_id'])]
        if len(temp['rating'].value_counts()) > 1:
            diff_ratings+=1
            mad = _get_mad_from_mean(_enumerate_arr(temp['rating'].value_counts()))
            mad_arr = np.append(mad_arr, mad)
        else:
            same_ratings+=1
    print(mad_arr)
    return len(multi_rating_df), same_ratings, diff_ratings, np.mean(mad_arr)


def get_template_dict():
    """Return a fresh report dict with every statistic preset to ``np.nan``."""
    field_names = (
        'unique_users',
        'unique_items',
        'im_size',
        'unique_user_item_pairs',
        "sparsity(unique_user_item_pairs/im_size x 100)",
        'no_of_users_having_multiple_ratings',
        'same_ratings',
        'diff_ratings',
        'diff_mad',
    )
    return dict.fromkeys(field_names, np.nan)


def get_report(file_name, folder_path):
    """Read one headerless ratings CSV and return its statistics dict.

    The file at ``folder_path + "/" + file_name`` is read with the columns
    named ``u_id``, ``p_id`` and ``rating``. The returned dict has the keys of
    ``get_template_dict()``, filled from the helper functions of this notebook.
    """
    ratings_df = pd.read_csv(folder_path+"/"+file_name, header=None, names=['u_id', 'p_id', 'rating'])

    n_users, n_items, matrix_size = _get_unique_user_item_data(ratings_df)

    pair_counts = _get_unique_pairs_with_count(df=ratings_df)
    n_pairs = len(pair_counts)
    # Share of the user x item grid that holds at least one rating, in percent.
    filled_pct = (n_pairs/matrix_size)*100

    n_multi, n_same, n_diff, mean_mad = _get_ratings_stats(counts_df=pair_counts, original_df=ratings_df)

    stats_dict = get_template_dict()
    stats_dict.update({
        'unique_users': n_users,
        'unique_items': n_items,
        'im_size': matrix_size,
        'unique_user_item_pairs': n_pairs,
        "sparsity(unique_user_item_pairs/im_size x 100)": filled_pct,
        'no_of_users_having_multiple_ratings': n_multi,
        'same_ratings': n_same,
        'diff_ratings': n_diff,
        'diff_mad': mean_mad,
    })
    return stats_dict

In [18]:
import multiprocessing

if __name__ == "__main__":
    folder_path = "file_server/dataset/ratings/"
    # To process every file in the folder instead: files = os.listdir(folder_path)
    files = ['Gift_Cards.csv', 'Pet_Supplies.csv', 'AMAZON_FASHION.csv', 'Patio_Lawn_and_Garden.csv', 'Books.csv']
    # One (file_name, folder_path) argument tuple per file for starmap.
    data = list(zip(files, [folder_path]*len(files)))
    print(data)
    # starmap blocks until every report is done; leaving the with-block then
    # shuts the workers down instead of leaking them into the kernel session.
    with multiprocessing.Pool(6) as process_pool:
        output = process_pool.starmap(get_report, data)
    print(output)


# t = pd.read_csv(folder+"/Magazine_Subscriptions.csv", header=None, names=['u_id', 'p_id', 'rating'])
# temp_df = t.head(10)
# temp_df

[('Gift_Cards.csv', 'file_server/dataset/ratings/'), ('Pet_Supplies.csv', 'file_server/dataset/ratings/'), ('AMAZON_FASHION.csv', 'file_server/dataset/ratings/'), ('Patio_Lawn_and_Garden.csv', 'file_server/dataset/ratings/'), ('Books.csv', 'file_server/dataset/ratings/')]
[0.5        0.5        0.5        0.44444444 0.5       ]
[0.5        0.5        1.         0.5        0.5        0.5
 1.5        0.5        1.         0.5        1.         1.5
 0.375      0.44444444 1.         0.5        0.5        0.88888889
 0.5        0.5        0.5        0.96       0.44444444 0.5
 0.5        0.5        1.         1.         0.5        0.5
 2.         1.5        0.5        0.5        0.5        1.33333333
 0.5        0.5        0.5        1.         1.         0.5
 0.5        2.         1.5        0.5        0.5        0.5
 1.33333333 1.         0.5        1.         0.5        0.5
 1.33333333 1.         0.5        1.         0.5        0.5
 1.         0.5        0.5        2.         1.         

Process ForkPoolWorker-34:
Process ForkPoolWorker-36:
Process ForkPoolWorker-38:
Process ForkPoolWorker-33:
Process ForkPoolWorker-35:


In [20]:
print(output)

In [None]:
# The helper is defined with a leading underscore; the un-prefixed name raises NameError.
# NOTE(review): temp_df must already exist in the kernel for this cell to run.
_get_unique_user_item_data(temp_df)

In [None]:
k = temp_df['rating'].value_counts()
k, k.index, k.values, np.dot(k.index, k.values)

In [None]:
# One column per p_id; each row carries its rating in the column of its own
# p_id and NaN everywhere else. Built as a pure expression: writing through
# `.values` is not guaranteed to modify the frame and cannot store ratings
# when the dummy columns are boolean.
dummies = (
    pd.get_dummies(temp_df['p_id'], dtype=float)
    .mul(temp_df['rating'], axis=0)
    .replace(0, np.nan)
)
dummies
# l = pd.concat([temp_df['u_id'], pd.get_dummies()])

In [None]:
l = t.groupby(['u_id','p_id']).size().reset_index().rename(columns={0:'count'})
print(len(pd.unique(t['u_id'])), len(pd.unique(t['p_id'])))
print(len(l))

In [None]:
# Among the rows of `l` with count > 1, tally how many (u_id, p_id) pairs have a
# single distinct rating in `t` and how many have several.
# NOTE(review): relies on `l` and `t` from earlier cells; `t` is rebound to a
# tensor further down the notebook, so this cell is order-sensitive.
same_ratings, diff_ratings = 0, 0

for ind, row in l[l['count']>1].iterrows():
    # All rows of `t` that belong to this (u_id, p_id) pair.
    temp = t[(t['u_id']==row['u_id']) & (t['p_id']==row['p_id'])]
    # value_counts() yields one entry per distinct rating value.
    if len(temp['rating'].value_counts()) > 1:
        diff_ratings+=1
    else:
        same_ratings+=1
    
same_ratings, diff_ratings

In [None]:
a,\
b = 0, 0
a,b

In [None]:
l[l["count"]>1]

In [None]:
index = pd.MultiIndex.from_product([t.head(10)[0], t.head(10)[1]])
pd.DataFrame(index = index).reset_index()


In [None]:
folder = "file_server/dataset/ratings/"
file_names = os.listdir(folder)
for file_name in file_names:
    # read_csv() needs a path; use the same headerless three-column layout
    # that get_report() reads. temp_df ends up holding the last file listed.
    temp_df = pd.read_csv(os.path.join(folder, file_name), header=None, names=['u_id', 'p_id', 'rating'])

In [None]:
%%time
for i in range(100):
    a = np.random.randn(10000, 10000)
    b = np.random.randn(10000, 10000)
    c = np.matmul(a, b)

In [None]:
%%time
for i in range(100):
    a = torch.randn(10000, 10000)
    b = torch.randn(10000, 10000)
    c = torch.matmul(a, b)

In [None]:
torch.cuda.device_count()

In [None]:
def get_gpu():
    """Return ``torch.device('cuda:0')`` when torch reports a CUDA device, else ``None``."""
    has_cuda_device = torch.cuda.device_count() > 0
    return torch.device('cuda:0') if has_cuda_device else None

gpu = get_gpu()
gpu

In [None]:
# Toy regression: targets follow y = 3x - 2, the model starts at w = 1, b = 1.
x = torch.randn([20, 1], requires_grad=True)
y = 3*x - 2

w = torch.tensor([1.], requires_grad=True)
b = torch.tensor([1.], requires_grad=True)

# Linear prediction (the original `w*x = b` was a syntax error).
y_hat = w*x + b
loss = torch.sum((y_hat - y)**2)
# Populate w.grad / b.grad, which a later cell displays.
loss.backward()

In [None]:
loss

In [None]:
w.grad, b.grad

In [None]:
t = torch.ones(2,3)
t

In [None]:
df = pd.DataFrame([[1,2,3,4],[12,13,14,15]])
df

In [None]:
# df_npy = df.values()

df_tensors = torch.tensor(df.values)
df_tensors

In [None]:
n_users, n_items = df_tensors.shape

bi = torch.zeros(n_users)
rj = torch.zeros(n_items)
# provision for random initialization 
 
# create a map of present values
# Column vector + row vector broadcasts to the full (n_users, n_items) grid;
# adding the two 1-D vectors directly fails when their lengths differ.
rij = bi.unsqueeze(1) + rj.unsqueeze(0)
# NOTE(review): the original statement was left unfinished (`loss = `); mean
# squared error against the data is assumed here -- confirm the intended loss.
loss = F.mse_loss(rij, df_tensors.float())

In [None]:
df_tensors.sum(-1).unsqueeze(-1)

In [None]:
def model()

In [None]:
def mse(y, y_hat):
    """
    Mean of the element-wise squared differences between ``y`` and ``y_hat``.
    """
    residual = y - y_hat
    return (residual ** 2).mean()

In [None]:
0*np.nan

In [None]:
import torch.nn.functional as F
F.mse_loss

In [None]:
torch.optim.

In [None]:
dummy_train = np.array([[4, np.nan, np.nan, 3, 5, 2],
                        [np.nan, 1, 2, np.nan, 3, np.nan]])

In [None]:
tr = torch.tensor(dummy_train)
map_ = torch.isnan(tr)
print(tr)
print(map_)
# tr[torch.isnan(tr)] = 0
# tr

In [None]:
torch.where(map_, torch.zeros_like(tr), tr)

In [None]:
dd = torch.tensor([[2,2,2,2,2,2], [4,4,4,4,4,4]], dtype=float)
dd = torch.where(map_, torch.zeros_like(dd), dd)
dd

In [None]:
k = 3*torch.zeros(5) - 3*torch.rand(5)
k.repeat(1,3)

In [None]:
import torch.nn.init as init

k = torch.zeros(5)
init.uniform_(k, 4,5)
k


In [None]:
a = torch.tensor([2,3,4,5], dtype=float)
b = torch.tensor([6, 10], dtype=float)

def vector_cartesian_sum(x, y):
    """Return the ``(len(x), len(y))`` matrix whose ``[i, j]`` entry is ``x[i] + y[j]``.

    Both arguments are 1-D tensors.
    """
    x_column = x.unsqueeze(1)  # shape (len(x), 1)
    y_row = y.unsqueeze(0)     # shape (1, len(y))
    return x_column + y_row


vector_cartesian_sum(a,b)

In [None]:
torch.cartesian_prod(a,b).sum(-1).view(len(a), len(b))

In [None]:
from pyspark import SparkContext, SparkConf
