In [None]:
# Enable IPython's autoreload extension in mode 2, which re-imports modules
# before each cell runs so edits to local .py files take effect without a restart.
%load_ext autoreload
%autoreload 2

In [None]:
import os
import torch
import numpy as np
import torch.nn as nn
import pandas as pd
import matplotlib.pyplot as plt
from torch.utils.data import Dataset,DataLoader
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.utils.tensorboard import SummaryWriter
#from sklearn.preprocessing import MinMaxScaler

# importing the dataset

In [None]:
# Load the reviews table. The location can be overridden through the
# REVIEWS_CSV environment variable; the original local path is the fallback,
# so the notebook behaves as before when the variable is not set.
REVIEWS_CSV=os.environ.get('REVIEWS_CSV','C:/Users/nebiy/Documents/Dataset/datasets/Reviews.csv')
ds=pd.read_csv(REVIEWS_CSV)

In [None]:
# Preview the first rows of the freshly loaded table.
ds.head()

# EDA

In [None]:
# Column names, dtypes and non-null counts of the loaded table.
ds.info()

In [None]:
# Number of missing values in each column.
ds.isna().sum()

In [None]:
# Discard every row that has a missing value in any column
# (how='any' is the dropna default).
ds=ds.dropna()

In [None]:
# Preview the table after rows with missing values were removed.
ds.head()

In [None]:
#ds[ds['UserId']]

In [None]:
# Remove the free-text and helpfulness columns, which are not used as features.
unused_columns=['ProfileName','HelpfulnessNumerator','HelpfulnessDenominator','Summary','Text']
ds=ds.drop(columns=unused_columns)

In [None]:
# How often each raw timestamp value occurs.
ds['Time'].value_counts()

In [None]:
# Re-check dtypes and non-null counts after the column drop.
ds.info()

# Encoding the features

In [None]:
# Replace the string IDs with integer codes. Each fitted encoder is kept in
# object_encoders (keyed by column name) so codes can be mapped back later.
columns=['ProductId','UserId']
object_encoders={}
for id_column in columns:
    encoder=LabelEncoder()
    encoder.fit(ds[id_column])
    ds[id_column]=encoder.transform(ds[id_column])
    object_encoders[id_column]=encoder

# Let's make more features:
* day of the week
* month
* time of the day (hour)

In [None]:
# Interpret the integer timestamps as seconds since the Unix epoch (UTC) and
# express them in US Eastern time.
ds['Time']=(
    pd.to_datetime(ds['Time'],unit='s',utc=True)
    .dt.tz_convert('America/New_York')
)

In [None]:
# Preview the table with the converted Time column.
ds.head()

In [None]:
# Same preview as the previous cell (duplicate).
ds.head()

In [None]:
# Split the localized timestamp into calendar features.
time_parts=ds['Time'].dt
ds['year']=time_parts.year
ds['month']=time_parts.month
# dayofweek counts from Monday=0 to Sunday=6
ds['day_of_week']=time_parts.dayofweek
# stored as datetime.time objects here; a later cell reduces them to the integer hour
ds['hour']=time_parts.time

In [None]:
# Preview the table with the new year / month / day_of_week / hour columns.
ds.head()

In [None]:
# Confirm the derived columns introduced no missing values.
ds.isna().sum()

In [None]:
# Build the modelling frame without the raw timestamp; drop() returns a new
# frame, so `ds` itself keeps its Time column.
clean_data=ds.drop(columns=['Time'])

In [None]:
# The row identifier is not a feature, so remove it.
clean_data=clean_data.drop(columns=['Id'])

In [None]:
# Dtypes and non-null counts of the modelling frame.
clean_data.info()

In [None]:
# Store the hour of day as an integer. It is read straight from the datetime
# column of `ds` (same row index as clean_data) instead of unpacking the
# datetime.time objects row by row: this is vectorised and the cell can be
# re-run safely, whereas calling .hour on already-converted ints would fail.
clean_data['hour']=ds['Time'].dt.hour.astype('int64')

# Let's add another feature:
* The name of the feature is `Recomended`.
* The feature is going to be boolean.
* We assign it True if `Score` is greater than or equal to 3, and False otherwise.

In [None]:
# Binary label: True when the review score is at least 3.
clean_data['Recomended']=clean_data['Score'].ge(3)

In [None]:
# List the columns currently in the modelling frame.
clean_data.columns

## Dropping the 'Rate' column (`Score`)

In [None]:
# Score has been turned into the Recomended label, so remove the raw column.
clean_data=clean_data.drop(columns=['Score'])

# mapping features

In [None]:
# Number of rows per year.
clean_data['year'].value_counts()

In [None]:
def mapping_function_year(year,base_year=1999):
    """Return ``year`` as an offset from ``base_year``.

    Args:
        year: Calendar year (a scalar or anything supporting subtraction).
        base_year: Year that maps to 0. Defaults to 1999, the constant that
            was previously hard-coded.

    Returns:
        ``year - base_year``.
    """
    return year-base_year

In [None]:
#clone=clean_data.copy()

In [None]:
# Shift the year with mapping_function_year. The input is taken from
# ds['year'], which is never modified, rather than from clean_data['year'];
# re-running this cell therefore cannot apply the shift a second time.
clean_data['year']=ds['year'].map(mapping_function_year)

# mapping hour

In [None]:
def mapping_function_hour(hour,base_hour=19):
    """Return ``hour`` as an offset from ``base_hour``.

    Args:
        hour: Hour of day (a scalar or anything supporting subtraction).
        base_hour: Hour that maps to 0. Defaults to 19, the constant that
            was previously hard-coded.

    Returns:
        ``hour - base_hour``; negative when ``hour`` is below ``base_hour``.
    """
    return hour-base_hour

In [None]:
# Number of rows per hour value.
clean_data['hour'].value_counts()

In [None]:
# Shift the hour with mapping_function_hour. The raw hour is recomputed from
# ds['Time'] (same row index as clean_data) instead of being read back from
# clean_data['hour'], so re-running this cell cannot shift the column twice.
clean_data['hour']=ds['Time'].dt.hour.map(mapping_function_hour)

In [None]:
# Preview the modelling frame after the year and hour shifts.
clean_data.head()

In [None]:
# scaler=MinMaxScaler()

# columns_to_scale=['year','month','hour','day_of_week']
# clean_data[columns_to_scale]=scaler.fit_transform(clean_data[columns_to_scale])

In [None]:
# Store the boolean label as an integer (False -> 0, True -> 1).
clean_data=clean_data.astype({'Recomended':int})

# split into train and test

In [None]:
# Hold out 20% of the rows for testing. Stratifying on the label keeps the
# class ratio the same in both parts; the fixed seed makes the split repeatable.
train,test=train_test_split(
    clean_data,
    test_size=0.2,
    stratify=clean_data['Recomended'],
    random_state=42,
)

# lets check for data distribution

In [None]:
# Rows per month. Bin edges sit on half-integers so every month value gets its
# own bar; with more than 10 distinct values the default 10 equal-width bins
# would merge neighbouring months.
month_values=clean_data['month']
month_edges=np.arange(month_values.min(),month_values.max()+2)-0.5
plt.hist(month_values,bins=month_edges,edgecolor="black")
plt.xlabel('month')
plt.ylabel('rows')
plt.title('Distribution of month')
plt.show()

In [None]:
# Rows per (shifted) year. Bin edges sit on half-integers so every year value
# gets its own bar; with more than 10 distinct values the default 10
# equal-width bins would merge neighbouring years.
year_values=clean_data['year']
year_edges=np.arange(year_values.min(),year_values.max()+2)-0.5
plt.hist(year_values,bins=year_edges,edgecolor='black')
plt.xlabel('year (after mapping_function_year)')
plt.ylabel('rows')
plt.title('Distribution of year')
plt.show()

In [None]:
# Rows per (shifted) hour, one bar per integer value, with labelled axes.
hour_values=clean_data['hour']
hour_edges=np.arange(hour_values.min(),hour_values.max()+2)-0.5
plt.hist(hour_values,bins=hour_edges,edgecolor='black')
plt.xlabel('hour (after mapping_function_hour)')
plt.ylabel('rows')
plt.title('Distribution of hour')
plt.show()

In [None]:
# Class balance of the label, one bar per value, with labelled axes.
label_values=clean_data['Recomended']
label_edges=np.arange(label_values.min(),label_values.max()+2)-0.5
plt.hist(label_values,bins=label_edges,edgecolor="black")
plt.xlabel('Recomended')
plt.ylabel('rows')
plt.title('Distribution of Recomended')
plt.show()

In [None]:
# Rows per day of week. Bin edges sit on half-integers so the bars line up
# with the integer day codes instead of the default 10 equal-width bins.
day_values=clean_data['day_of_week']
day_edges=np.arange(day_values.min(),day_values.max()+2)-0.5
plt.hist(day_values,bins=day_edges,edgecolor="black")
plt.xlabel('day_of_week')
plt.ylabel('rows')
plt.title('Distribution of day_of_week')
plt.show()

# turn them into torch dataset

In [None]:
# Run on the GPU when CUDA is usable, otherwise on the CPU.
device=torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [None]:
# Scratch 0-dimensional integer tensor (torch.long is an alias of torch.int64).
p=torch.tensor(data=90,dtype=torch.int64)

In [None]:
class Mydataset(Dataset):
    """Map-style dataset that serves one row as a dict of 1-element tensors.

    Every column is converted to a tensor once in ``__init__``; ``__getitem__``
    then only indexes those tensors instead of building a pandas Series and
    seven new tensors for every sample.

    Args:
        data: DataFrame with the numeric columns ``ProductId``, ``UserId``,
            ``year``, ``month``, ``day_of_week``, ``hour`` and ``Recomended``.
        device: Device the tensors are placed on. When omitted, CUDA is used
            if it is available and the CPU otherwise.
    """

    # output key -> (DataFrame column, tensor dtype)
    _FIELDS={
        "product_id":("ProductId",torch.long),
        "user_id":("UserId",torch.long),
        "year":("year",torch.long),
        "month":("month",torch.long),
        "day_of_week":("day_of_week",torch.long),
        "hour":("hour",torch.long),
        "recomended":("Recomended",torch.float),
    }

    def __init__(self,data,device=None):
        super().__init__()
        self.data=data
        if device is None:
            device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.device=torch.device(device)
        # torch.tensor copies, so later edits to the DataFrame do not leak in
        self._columns={
            key:torch.tensor(data[column].to_numpy(),dtype=dtype).to(self.device)
            for key,(column,dtype) in self._FIELDS.items()
        }

    def __len__(self):
        """Return the number of rows."""
        return len(self.data)

    def __getitem__(self,idx):
        """Return the row at position ``idx`` as ``{key: tensor of shape (1,)}``.

        ``recomended`` is float, all other entries are long. An out-of-range
        ``idx`` raises IndexError.
        """
        # clone() hands out independent tensors rather than views of the columns
        return {
            key:values[idx].reshape(1).clone()
            for key,values in self._columns.items()
        }

## loading the torch dataset format

In [None]:
# Wrap both splits so they can be served through a DataLoader.
train_dataset,test_dataset=(Mydataset(split) for split in (train,test))

In [None]:
# Show the default repr of both dataset objects, one per line.
print(train_dataset,test_dataset,sep='\n')

# turning it into torch dataloader

In [None]:
# Batch both datasets. shuffle is left at its default (off), so batches follow
# the row order of each split.
BATCH_SIZE=32
train_dataloader=DataLoader(train_dataset,batch_size=BATCH_SIZE)
test_dataloader=DataLoader(test_dataset,batch_size=BATCH_SIZE)

In [None]:
# Print the shape of the two ID tensors in the first training batch only.
for batch in train_dataloader:
    for key in ('product_id','user_id'):
        print(batch[key].shape)
    break

# model package importing

In [None]:
# Make the folder that contains trial.py importable. sys.path entries are
# searched as directories (or zip archives), so the folder has to be added,
# not the path of the .py file itself.
import sys
TRIAL_DIR='C:/Users/nebiy/Documents/recommendation_system/tiny_recommendation/jupyter_notes'
if TRIAL_DIR not in sys.path:   # avoid piling up duplicates when the cell is re-run
    sys.path.append(TRIAL_DIR)
from trial import WideDeep,WD_Config

In [None]:
# Vocabulary size of every categorical feature.
# NOTE(review): these are presumably used as embedding-table sizes inside
# WideDeep - confirm in trial.py. A table indexed by the raw value needs
# max value + 1 rows. nunique() gives the same number only when every value in
# 0..max occurs (true for the LabelEncoder codes), and is too small as soon as
# one value is missing from the data.
config=WD_Config()
config.num_product=int(clean_data['ProductId'].max())+1
config.num_users=int(clean_data['UserId'].max())+1
config.num_year=int(clean_data["year"].max())+1
config.num_time_day=int(clean_data["hour"].max())+1
config.num_month=int(clean_data['month'].max())+1   #months start at 1, index 0 stays unused
config.num_day_week=int(clean_data["day_of_week"].max())+1
config.embedding_dim=100

In [None]:
# Show the configuration that will be passed to the model.
print(config)

In [None]:
# Instantiate the model from the configuration built above.
model=WideDeep(config)

# setup the config of the model

In [None]:
# Number of rows per day-of-week value.
clean_data["day_of_week"].value_counts()

In [None]:
# config.num_product=clean_data['ProductId'].nunique()+1
# config.num_users=clean_data['UserId'].nunique() +1
# config.num_year=clean_data["year"].nunique()+1
# config.num_time_day=clean_data["hour"].nunique()+1
# config.num_month=clean_data['month'].nunique()+1
# config.num_day_week=clean_data["day_of_week"].nunique()+1
# config.embedding_dim=100

In [None]:
# Move the model to the device chosen earlier (GPU if available, else CPU).
model=model.to(device)

In [None]:
# Binary cross-entropy that takes raw logits (the sigmoid is part of the loss).
# Alternatives tried earlier and left disabled: nn.BCELoss, nn.MSELoss.
loss_fn=nn.BCEWithLogitsLoss()

# AdamW over every parameter of the model.
optim=torch.optim.AdamW(
    params=model.parameters(),
    lr=0.01,
    weight_decay=0.01,
)

# lets debug some stuff with data and model

In [None]:
# Inspect the first training batch: its size and the shape of every field.
# (The earlier version printed year.shape twice and never showed hour.)
for batch in train_dataloader:
    print(f"batch size: {batch['product_id'].size(0)}")
    for field,values in batch.items():
        print(f"{field}: {tuple(values.shape)}")
    break

# train my model

In [None]:
# Keep the first batch of the training loader in `one_batch` for ad-hoc debugging.
for batch in train_dataloader:
    one_batch=batch
    break

In [None]:
# Cache the first four training batches (fewer if the loader is shorter);
# zip stops as soon as range(4) is exhausted.
batches=[batch for _,batch in zip(range(4),train_dataloader)]

In [None]:
#print(model_pred)

In [None]:
# TensorBoard writer; event files are written under runs/wide_deep_weights_grads.
writer=SummaryWriter('runs/wide_deep_weights_grads')

In [None]:
# Overfitting sanity check: repeatedly fit the model on the few batches cached
# in `batches` and log the run to TensorBoard through `writer`.
n_epoch=100
global_step=0   # counts optimizer steps so every logged loss gets its own x-value

model.train()   # training mode, in case the model has layers that differ in eval mode
for epoch in range(n_epoch):
    for batch in batches:
        # BCEWithLogitsLoss needs float targets
        recomended=batch['recomended'].float()

        # forward pass; the positional order matches the model call used so far
        model_pred=model(
            batch['product_id'],
            batch['user_id'],
            batch['year'],
            batch['month'],
            batch['day_of_week'],
            batch['hour']
        )

        # loss_fn is BCEWithLogitsLoss, i.e. model_pred is treated as raw logits
        loss=loss_fn(model_pred,recomended)

        # backward pass and parameter update
        optim.zero_grad()
        loss.backward()

        #optional gradient clipping, currently disabled
        #torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)

        optim.step()

        # one point per optimizer step; logging every batch at step=epoch put
        # several points on the same x-value
        writer.add_scalar('loss/train',loss.item(),global_step)
        global_step+=1

    # once per epoch: histogram of every weight and of the gradient left by the
    # last batch (parameters that received no gradient are skipped)
    for name,param in model.named_parameters():
        writer.add_histogram(name,param,epoch)
        if param.grad is not None:
            writer.add_histogram(f"{name}.grad",param.grad,epoch)

    #display the loss of the last batch every 10 epochs
    if(epoch%10==0):
        print(f"the loss is {loss.item()}")

#close the TensorBoard writer so pending events are written to disk
writer.close()

# lets debug

## check for nan values in our dataset

In [None]:
# clean_data.columns

In [None]:
# print(f"product id: {clean_data['ProductId'].isnull().sum()} ")
# print(f"user id: {clean_data['UserId'].isnull().sum()} ")
# print(f"year : {clean_data['year'].isnull().sum()} ")
# print(f"month: {clean_data['month'].isnull().sum()} ")
# print(f"day of week: {clean_data['day_of_week'].isnull().sum()} ")
# print(f"hour: {clean_data['hour'].isnull().sum()} ")
# print(f"recomended: {clean_data['Recomended'].isnull().sum()} ")

# wide part

In [None]:
# lin=nn.Linear(2,1)
# a=one_batch['product_id']
# b=one_batch['user_id']

# a=a.float()
# b=b.float()

# #print(a)
# #print(b)
# for i in range(1000):
#     c=lin(torch.cat((a,b),dim=1))
#     if(i%100==0):
#         print(f"epoch is {i} and value is {c} ")

# deep part

In [None]:
# config.num_product=clean_data['ProductId'].nunique()+1
# config.num_users=clean_data['UserId'].nunique() +1
# config.num_year=clean_data["year"].nunique()+1
# config.num_time_day=clean_data["hour"].nunique()+1
# config.num_month=clean_data['month'].nunique()+1
# config.num_day_week=clean_data["day_of_week"].nunique()+1
# config.embedding_dim=100

In [None]:
# embed_input.shape

In [None]:
# em1=nn.Embedding(clean_data['ProductId'].nunique()+1,100)
# em2=nn.Embedding(clean_data['UserId'].nunique()+1,100)
# em3=nn.Embedding(clean_data['year'].nunique()+1,100)
# em4=nn.Embedding(clean_data['hour'].nunique()+1,100)
# em5=nn.Embedding(clean_data['month'].nunique()+1,100)
# em6=nn.Embedding(clean_data['day_of_week'].nunique()+1,100)

# #em6=nn.Embedding(clean_data['Recomended'].nunique()+1,100)
# true_label=one_batch['recomended']

# #lets forward data through it
# em1_embd=em1(one_batch['product_id'])
# em2_embd=em2(one_batch['user_id'])
# em3_embd=em3(one_batch['year'])
# em4_embd=em4(one_batch['hour'])
# em5_embd=em5(one_batch['month'])
# em6_embd=em6(one_batch['day_of_week'])


# #lets concatinate man
# embed_input=torch.cat((
#     em1_embd.view(em1_embd.size(0),-1),
#     em2_embd.view(em2_embd.size(0),-1),
#     em3_embd.view(em3_embd.size(0),-1),
#     em4_embd.view(em4_embd.size(0),-1),
#     em5_embd.view(em5_embd.size(0),-1),
#     em6_embd.view(em6_embd.size(0),-1)
# ),dim=1)

# #model architecture
# l1=nn.Linear(600,800)
# a1=nn.ReLU()

# l2=nn.Linear(800,400)
# a2=nn.ReLU()

# l3=nn.Linear(400,360)
# a3=nn.ReLU()

# l4=nn.Linear(360,1)

# all_model=nn.Sequential(
#     nn.Linear(600,800),
#     nn.ReLU(),
    
#     nn.Linear(800,400),
#     nn.ReLU(),
    
#     nn.Linear(400,200)
# )

# optim=torch.optim.Adam(all_model.parameters())
# loss_fn=nn.BCELoss()


# #lets feed forward through man
# for i in range(1000):
#     out1=l1(embed_input)
#     act=a1(out1)
    
#     out2=l2(act)
#     act2=a2(out2)
    
#     out3=l3(act2)
#     act3=a3(out3)
    
#     out4=l4(out3)

#     out4=torch.sigmoid(out4)

#     loss=loss_fn(out4,true_label)
    
#     if(i%100==0):
#         print(loss.item())

In [None]:
# true_label.shape