## **Movie Recommendation System**

In [5]:
import torch
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from torch.utils.data import Dataset, DataLoader, SequentialSampler, BatchSampler, TensorDataset
from tqdm import tqdm
from IPython.display import display, clear_output
import random
import os
from sklearn.model_selection import KFold
import copy


def seed_everything(seed=1234):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
seed_everything()

## **Read Dataset**

In [8]:
df = pd.read_csv('ml-latest-small/ratings.csv')

In [9]:
df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


## **Data Investigation**

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


In [11]:
df.isnull().sum()

userId       0
movieId      0
rating       0
timestamp    0
dtype: int64

In [12]:
df.duplicated().sum()

0

In [13]:
df.rating.unique()

array([4. , 5. , 3. , 2. , 1. , 4.5, 3.5, 2.5, 0.5, 1.5])

---------------------------------------------------------------------------------------------------------

## **Data Preprocessing**

In [14]:
df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [15]:
df_train = df.groupby('userId').head(-2).reset_index(drop=True)
df_test = df.groupby('userId').tail(2).reset_index(drop=True)

In [16]:
df_copy = df
df_copy['Train'] = (df_copy.groupby(['userId']).cumcount(ascending=False) != 0).replace({True:1, False:0})
df_copy[['userId','movieId','Train']].head()

Unnamed: 0,userId,movieId,Train
0,1,1,1
1,1,3,1
2,1,6,1
3,1,47,1
4,1,50,1


In [17]:
df_copy.to_csv("train.csv")

In [18]:
df_copy = df[df['Train'] == 1].copy()
df_copy['ones'] = 1

history = df_copy.pivot_table(index='userId', columns='movieId', values='ones', fill_value=0)

In [19]:
history.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,190219,191005,193565,193567,193571,193573,193579,193581,193583,193585
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
history_norm = history / history.values.sum(axis=1, keepdims=True)
history_norm

movieId,1,2,3,4,5,6,7,8,9,10,...,190219,191005,193565,193567,193571,193573,193579,193581,193583,193585
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.004329,0.000000,0.004329,0.0,0.0,0.004329,0.000000,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.023256,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,0.000898,0.000000,0.000000,0.0,0.0,0.000000,0.000898,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
607,0.005376,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
608,0.001205,0.001205,0.001205,0.0,0.0,0.000000,0.000000,0.0,0.0,0.001205,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
609,0.027778,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.027778,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
history_norm.shape

(610, 9665)

In [22]:
user_movie = df[['userId','movieId']]
user_movie.head()

Unnamed: 0,userId,movieId
0,1,1
1,1,3
2,1,6
3,1,47
4,1,50


In [23]:
active_encoder = OneHotEncoder(sparse_output=False).fit(user_movie)

In [29]:
active_user_movie = pd.DataFrame(
    active_encoder.transform(
        df[active_encoder.feature_names_in_]
    ),
    columns=active_encoder.get_feature_names_out()
)

In [24]:
new_df = pd.concat([user_movie, df.timestamp, df.Train, df.rating], axis=1)
new_df

Unnamed: 0,userId,movieId,timestamp,Train,rating
0,1,1,964982703,1,4.0
1,1,3,964981247,1,4.0
2,1,6,964982224,1,4.0
3,1,47,964983815,1,5.0
4,1,50,964982931,1,5.0
...,...,...,...,...,...
100831,610,166534,1493848402,1,4.0
100832,610,168248,1493850091,1,5.0
100833,610,168250,1494273047,1,5.0
100834,610,168252,1493846352,1,5.0


In [25]:
train = new_df[new_df.Train == 1]
test = new_df[new_df.Train == 0]

In [26]:
len(df.movieId.unique())

9724

# **Model Building**

In [127]:
class DPMovieDataset(Dataset):
  def __init__(self, user_ids:np.ndarray, data: pd.DataFrame, agg_hist: pd.DataFrame, active_encoder: OneHotEncoder, recommendation: bool=False):
    self.user_ids = user_ids
    self.data = data
    self.agg_hist = agg_hist
    self.active_encoder = active_encoder
    self.recommendation = recommendation

  def __len__(self):
    return self.user_ids.shape[0]

  def __getitem__(self, idx):
    batch_data = self.data.iloc[idx] # Select the rows corresponding to the list of user indices `idx` from self.data dataframe
    agg_history = batch_data[['userId']].merge(self.agg_hist, left_on='userId', right_index=True) # Get the aggregated history for each selected transaction using merge
    others = batch_data[['timestamp']]
    active_groups = self.active_encoder.transform(batch_data[self.active_encoder.feature_names_in_]) # Use active_encoder to generate the active columns for user/item pairs available in batch_data
    features = torch.from_numpy(np.hstack((active_groups, agg_history.values))) # Concatenate the processed columns together horizontally

    if not self.recommendation:
      targets = batch_data['rating']
      return features, targets
    else:
      return features

In [128]:
dataset_train = DPMovieDataset(df.userId.unique(), train, history_norm, active_encoder)
dataset_test = DPMovieDataset(df.userId.unique(), test, history_norm, active_encoder)

In [32]:
dataloader_train = DataLoader(
    dataset_train,
    sampler=BatchSampler(SequentialSampler(dataset_train), batch_size=20, drop_last=False),
    batch_size=None)

dataloader_test = DataLoader(
    dataset_test,
    sampler=BatchSampler(SequentialSampler(dataset_test), batch_size=20, drop_last=False),
    batch_size=None)

In [33]:
class FactorizationMachine(torch.nn.Module):
  def __init__(self, n, k, bias=False):
    super(FactorizationMachine, self).__init__()
    self.n = n
    self.k = k
    self.linear = torch.nn.Linear(self.n, 1, bias)
    self.V = torch.nn.Parameter(torch.randn(n,k)) # Creating the latent matrix V of size (n X k) and initializing it with random values

  def forward(self, x_batch):
    x_batch = x_batch.float()
    part_1 = torch.matmul(x_batch, self.V).pow(2).sum(dim=1)  # perform the first part of the interaction term: row-wise-sum((XV)^2)
    part_2 = torch.matmul(x_batch.pow(2), self.V.pow(2)).sum(dim=1)  # perform the second part of the interaction term: row-wise-sum((X)^2 * (V)^2))
    inter_term = (part_1 - part_2) * 0.5 # Put the interaction term parts together (refer to the equations above)
    var_strength = self.linear(x_batch).squeeze(dim=1)  # Perform the linear part of the model equation (refer to the demo notebook on how to use layers in pytorch models)

    return var_strength + inter_term

In [34]:
for x, y in dataloader_train:
    print(x.shape)

torch.Size([20, 20000])
torch.Size([20, 20000])
torch.Size([20, 20000])
torch.Size([20, 20000])
torch.Size([20, 20000])
torch.Size([20, 20000])
torch.Size([20, 20000])
torch.Size([20, 20000])
torch.Size([20, 20000])
torch.Size([20, 20000])


torch.Size([20, 20000])
torch.Size([20, 20000])
torch.Size([20, 20000])
torch.Size([20, 20000])
torch.Size([20, 20000])
torch.Size([20, 20000])
torch.Size([20, 20000])
torch.Size([20, 20000])
torch.Size([20, 20000])
torch.Size([20, 20000])
torch.Size([20, 20000])
torch.Size([20, 20000])
torch.Size([20, 20000])
torch.Size([20, 20000])
torch.Size([20, 20000])
torch.Size([20, 20000])
torch.Size([20, 20000])
torch.Size([20, 20000])
torch.Size([20, 20000])
torch.Size([20, 20000])
torch.Size([10, 20000])


In [35]:
model = FactorizationMachine(n=20000, k=20)

# **Model Training**

In [36]:
def model_step(mode, x, y=None, optimizer=None, train=True):
  if train: # If we're in training phase, then zero the gradients and make sure the model is set to train
    model.train()
    optimizer.zero_grad()
  else: # If we're in evaluation phase, then make sure the model is set to eval
    model.eval()

  with torch.set_grad_enabled(train): # Either to perform the next lines with gradient tracing or not
    pred = model(x) # Get the model output from x
    pred = pred.reshape(pred.shape[0], ) # Flatten the prediction values

    y = torch.from_numpy(y.values.reshape(y.shape[0], )).float()

    criterion = torch.nn.MSELoss() # Define the criterion as MSELoss from torch
    loss = criterion(pred, y)

    if train:
      loss.backward()
      optimizer.step()
    else:
      return loss, pred

  return loss

In [37]:
def train_loop(model, train_loader, eval_loader, lr, w_decay, epochs, eval_step):
  step = 0
  """ Defining our optimizer """
  optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=w_decay)

  epochs_l, steps, t_losses, v_losses = [], [], [], []

  epochs_tqdm = tqdm(range(epochs), desc='Training in Progress', leave=True)
  for epoch in epochs_tqdm:
    for x, y in train_loader:
      loss_batch = model_step(model, x, y, optimizer, train=True)
      step +=1
      if step % eval_step == 0:
        train_loss = loss_batch
        val_loss = 0
        for x, y in eval_loader:
          val_loss += model_step(model, x, y, train=False)[0]

        steps.append(step)
        t_losses.append(train_loss.detach().numpy())
        v_losses.append(val_loss.detach().numpy())
        epochs_l.append(epoch+1)
        clear_output(wait=True)
        display(pd.DataFrame({'Epoch': epochs_l, 'Step': steps, 'Training Loss': t_losses, 'Validation Loss': v_losses}))

        optimizer.zero_grad()

In [750]:
train_loop(model, dataloader_train, dataloader_test, lr=0.004, w_decay=0.0003, epochs=2500, eval_step=2500)

Unnamed: 0,Epoch,Step,Training Loss,Validation Loss
0,81,2500,0.01249405,1995038.2
1,162,5000,0.00078166474,1794947.1
2,242,7500,0.0025054931,1862817.1
3,323,10000,0.09085606,1885066.0
4,404,12500,0.00014715325,1925194.8
5,484,15000,0.010368319,1883792.0
6,565,17500,0.003917676,1683660.0
7,646,20000,0.06885128,1829339.4
8,726,22500,9.9842495e-05,1748863.6
9,807,25000,0.0008763955,1896376.2


Training in Progress: 100%|██████████| 2500/2500 [35:46<00:00,  1.16it/s]


In [None]:
train_loop(model, dataloader_train, dataloader_test, lr=0.00004, w_decay=0.8, epochs=2500, eval_step=2500)

Unnamed: 0,Epoch,Step,Training Loss,Validation Loss
0,81,2500,0.47550693,1650423.4
1,162,5000,0.49834624,1658930.4
2,242,7500,0.525947,1673756.8
3,323,10000,0.9536068,1688523.5
4,404,12500,0.62350667,1702503.1
5,484,15000,0.9604079,1718183.6
6,565,17500,1.946258,1737076.6
7,646,20000,0.8570792,1760712.6
8,726,22500,0.88092434,1790356.5
9,807,25000,3.1751926,1824812.0


Training in Progress: 100%|██████████| 2500/2500 [27:27<00:00,  1.52it/s]


In [137]:
obs = dataset_test[[10]]
obs

(tensor([[0., 0., 0.,  ..., 0., 0., 0.]], dtype=torch.float64),
 1322    5.0
 Name: rating, dtype: float64)

In [138]:
model.eval()
with torch.no_grad():
  print(f'Predicted rating for User of interest: {model_step(model, obs[0], obs[1], train=False)}') # Get the model output on the user of interest after running the previous cell to now their new_id
  print(f'Actual Rating: {obs[1].values}')

Predicted rating for User of interest: (tensor(1130.7527), tensor([38.6267]))
Actual Rating: [5.]


---------------------------------------------

##### **Because Factorization Machine is very simple and naive model and it's a accuracy isn't good, So we will use factroization machines build-in model in DeepCTR-Torch library called DeepFM**

In [61]:
movies = pd.read_csv('ml-latest-small/movies.csv')

In [64]:
links = pd.read_csv('ml-latest-small/links.csv')

##### **References**
- https://www.kaggle.com/code/leejunseok97/deepfm-movie-len-pytorch 
- https://deepctr-torch.readthedocs.io/en/latest/deepctr_torch.models.deepfm.html 

## **Another Model (Implemented)**

In [None]:
pip install deepctr_torch

In [75]:
import gc 
from deepctr_torch.inputs import SparseFeat, VarLenSparseFeat, get_feature_names
from deepctr_torch.models import DeepFM
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf

In [115]:
df_ = df.copy()
complete_train_df = df_.merge(movies, how='left', on=['movieId'])[df_.Train == 1]

In [116]:
complete_test_df = df_.merge(movies[['movieId', 'genres']], how='left', on=['movieId'])[df_.Train == 0]

In [117]:
complete_train_df = complete_train_df.merge(links[['imdbId', 'movieId']], how='left', on=['movieId'])

In [118]:
complete_train_df.shape, complete_test_df.shape

((100226, 8), (610, 6))

In [119]:
complete_train_df.head()

Unnamed: 0,userId,movieId,rating,timestamp,Train,title,genres,imdbId
0,1,1,4.0,964982703,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,114709
1,1,3,4.0,964981247,1,Grumpier Old Men (1995),Comedy|Romance,113228
2,1,6,4.0,964982224,1,Heat (1995),Action|Crime|Thriller,113277
3,1,47,5.0,964983815,1,Seven (a.k.a. Se7en) (1995),Mystery|Thriller,114369
4,1,50,5.0,964982931,1,"Usual Suspects, The (1995)",Crime|Mystery|Thriller,114814


In [120]:
complete_train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100226 entries, 0 to 100225
Data columns (total 8 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100226 non-null  int64  
 1   movieId    100226 non-null  int64  
 2   rating     100226 non-null  float64
 3   timestamp  100226 non-null  int64  
 4   Train      100226 non-null  int64  
 5   title      100226 non-null  object 
 6   genres     100226 non-null  object 
 7   imdbId     100226 non-null  int64  
dtypes: float64(1), int64(5), object(2)
memory usage: 6.1+ MB


### **Training**

In [571]:
def split(x):
    key_ans = x.split('|')
    for key in key_ans:
        if key not in key2index:
            key2index[key] = len(key2index) + 1
    return list(map(lambda x: key2index[x], key_ans))



if __name__ == "__main__":
    data = complete_train_df
    sparse_features = ["userId", "movieId"]
    target = ['rating']

    # 1.Label Encoding for sparse features,and process sequence features
    for feat in sparse_features:
        lbe = LabelEncoder()
        # ohe = OneHotEncoder()
        data[feat] = lbe.fit_transform(data[feat])

    key2index = {}
    genres_list = list(map(split, data['genres'].values))
    genres_length = np.array(list(map(len, genres_list)))
    max_len = max(genres_length)
    # Notice : padding=`post`
    genres_list = tf.keras.utils.pad_sequences(genres_list, maxlen=max_len, padding='post', )

    # 2.count #unique features for each sparse field and generate feature config for sequence feature

    fixlen_feature_columns = [SparseFeat(feat, data[feat].nunique(), embedding_dim=4)
                              for feat in sparse_features]

    varlen_feature_columns = [VarLenSparseFeat(SparseFeat('genres', vocabulary_size=len(
        key2index) + 1, embedding_dim=4), maxlen=max_len, combiner='mean')]  # Notice : value 0 is for padding for sequence input feature
    
    linear_feature_columns = fixlen_feature_columns + varlen_feature_columns
    dnn_feature_columns = fixlen_feature_columns + varlen_feature_columns

    feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

    # 3.generate input data for model
    model_input = {name: data[name] for name in sparse_features}  #
    model_input["genres"] = genres_list

    # 4.Define Model,compile and train

    device = 'cpu'
    use_cuda = True
    if use_cuda and torch.cuda.is_available():
        print('cuda ready...')
        device = 'cuda:0'

    model = DeepFM(linear_feature_columns, dnn_feature_columns, task='regression', device=device)

    model.compile("adam", "mse", metrics=['mse'], )
    history = model.fit(model_input, data[target].values, batch_size=256, epochs=100, verbose=2)

cpu
Train on 100226 samples, validate on 0 samples, 392 steps per epoch
Epoch 1/100
3s - loss:  1.6019 - mse:  1.6008
Epoch 2/100
3s - loss:  0.7109 - mse:  0.7110
Epoch 3/100
3s - loss:  0.6848 - mse:  0.6849
Epoch 4/100
3s - loss:  0.6725 - mse:  0.6726
Epoch 5/100
3s - loss:  0.6664 - mse:  0.6664
Epoch 6/100
3s - loss:  0.6609 - mse:  0.6608
Epoch 7/100
3s - loss:  0.6580 - mse:  0.6580
Epoch 8/100
4s - loss:  0.6555 - mse:  0.6553
Epoch 9/100
3s - loss:  0.6541 - mse:  0.6542
Epoch 10/100
3s - loss:  0.6520 - mse:  0.6520
Epoch 11/100
3s - loss:  0.6508 - mse:  0.6508
Epoch 12/100
3s - loss:  0.6497 - mse:  0.6499
Epoch 13/100
3s - loss:  0.6479 - mse:  0.6480
Epoch 14/100
3s - loss:  0.6475 - mse:  0.6473
Epoch 15/100
3s - loss:  0.6479 - mse:  0.6479
Epoch 16/100
3s - loss:  0.6464 - mse:  0.6463
Epoch 17/100
3s - loss:  0.6459 - mse:  0.6457
Epoch 18/100
3s - loss:  0.6454 - mse:  0.6458
Epoch 19/100
3s - loss:  0.6435 - mse:  0.6437
Epoch 20/100
3s - loss:  0.6435 - mse:  0.64

In [569]:
model_input['userId']

0           0
1           0
2           0
3           0
4           0
         ... 
100221    609
100222    609
100223    609
100224    609
100225    609
Name: userId, Length: 100226, dtype: int64

In [711]:
model_input['userId'][550]

4

In [612]:
model_input['movieId']

0            0
1            2
2            5
3           43
4           46
          ... 
100221    9380
100222    9381
100223    9405
100224    9406
100225    9407
Name: movieId, Length: 100226, dtype: int64

In [613]:
model_input['genres'][10]

array([ 7,  6, 14,  0,  0,  0,  0,  0,  0,  0])

In [614]:
[11, 2028] + split('Action|Drama|War')

[11, 2028, 7, 12, 13]

### **Testing**

In [621]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler
key2index = {}
def model_step(data):

    sparse_features = ["userId", "movieId"]

    # 1.Label Encoding for sparse features,and process sequence features
    for feat in sparse_features:
        lbe = LabelEncoder()
        # ohe = OneHotEncoder()
    #     print(type(data[feat]))
        data[feat] = lbe.fit_transform(data[feat])
    #     print(type(data[feat]))

    genres_list = list(map(split, data['genres'].values))
    genres_length = np.array(list(map(len, genres_list)))
    max_len = max(genres_length)
    
    # 3.generate input data for model
    model_input = {name: pd.Series(int(data[name])) for name in sparse_features}  #
    genres_list[0] += (10 - len(genres_list[0])) * [0]
    model_input["genres"] = np.array(genres_list)

    return model_input

In [622]:
tt = pd.DataFrame(complete_test_df.iloc[1].values, index=complete_test_df.columns).T
tt

Unnamed: 0,userId,movieId,rating,timestamp,Train,genres
0,2,131724,5.0,1445714851,0,Documentary


In [712]:
obs = model_step(pd.DataFrame(complete_test_df.iloc[69].values, index=complete_test_df.columns).T)

  model_input = {name: pd.Series(int(data[name])) for name in sparse_features}  #


In [714]:
obs

{'userId': 0    0
 dtype: int64,
 'movieId': 0    0
 dtype: int64,
 'genres': array([[1, 2, 0, 0, 0, 0, 0, 0, 0, 0]])}

In [626]:
model.predict(obs)

array([[5.68309593]])

------------------------------

In [109]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [110]:
movies[movies.movieId == 1].genres.values[0].split('|')

['Adventure', 'Animation', 'Children', 'Comedy', 'Fantasy']

# **Our App**

### **Unique Users and Movies**

In [417]:
len(movies.movieId.unique())

9742

In [418]:
unique_users = df.userId.unique()
unique_movies = df.movieId.unique()
unique_users[0], unique_movies

(1, array([     1,      3,      6, ..., 160836, 163937, 163981], dtype=int64))

### **Movies Rated By User**

In [474]:
def user_rated_movies(user_id):
    items_our_user_rated = sorted(complete_train_df[complete_train_df.userId==user_id][['movieId', 'title', 'genres','rating']].values, key=lambda x: x[1])
    return items_our_user_rated

### **Movies Recommended For User**

In [706]:
import pickle

saved_model = "model.pkl"
with open(saved_model, 'wb') as file:
    pickle.dump(model, file)

In [708]:
with open(saved_model, 'rb') as file:
    DeepFM_model = pickle.load(file)

In [475]:
def not_watched(user_id):
    items_our_user_can_rate = data[~data.userId.isin(user_rated_movies(user_id))].movieId.values
    return items_our_user_can_rate

In [668]:
data['prediction'] = 0

In [709]:
def user_recommends(user_id):
    obs = {}
    
    not_watched_list = data[~data.userId.isin(user_rated_movies(user_id))][data.userId == user_id].values
    for movie_info in not_watched_list:

        obs['userId'] = pd.Series(0, LabelEncoder().fit_transform(np.array([user_id])))
        obs['movieId'] = pd.Series(0, LabelEncoder().fit_transform(np.array([movie_info[1]])))
        obs['genres'] = np.array(split(movie_info[6])).reshape(1, -1)
        data.loc[(data.userId == user_id) & (data.movieId == movie_info[1]), 'prediction'] = DeepFM_model.predict(obs)[0][0]
    
    return sorted(data[data.userId == user_id].values, key=lambda x: x[-1], reverse=True)

In [710]:
user_recommends(0)

[array([0, 913, 5.0, 964982951, 1, 'Goodfellas (1990)', 'Crime|Drama',
        99685, 5.925660133361816], dtype=object),
 array([0, 1331, 5.0, 964983034, 1, 'Newton Boys, The (1998)',
        'Crime|Drama', 120769, 5.925660133361816], dtype=object),
 array([0, 1733, 5.0, 964983263, 1, 'American History X (1998)',
        'Crime|Drama', 120586, 5.925660133361816], dtype=object),
 array([0, 2369, 5.0, 964983873, 1, 'Green Mile, The (1999)',
        'Crime|Drama', 120689, 5.925660133361816], dtype=object),
 array([0, 734, 5.0, 964983219, 1, 'Mr. Smith Goes to Washington (1939)',
        'Drama', 31679, 5.91655969619751], dtype=object),
 array([0, 1430, 5.0, 964982176, 1, 'Rocky (1976)', 'Drama', 75148,
        5.91655969619751], dtype=object),
 array([0, 995, 5.0, 964984086, 1, 'Pink Floyd: The Wall (1982)',
        'Drama|Musical', 84503, 5.801673412322998], dtype=object),
 array([0, 1686, 4.0, 964982989, 1, 'Few Good Men, A (1992)',
        'Crime|Drama|Thriller', 104257, 5.763352870941

In [685]:
data.head()

Unnamed: 0,userId,movieId,rating,timestamp,Train,title,genres,imdbId,prediction
0,0,0,4.0,964982703,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,114709,5.266891
1,0,2,4.0,964981247,1,Grumpier Old Men (1995),Comedy|Romance,113228,4.941928
2,0,5,4.0,964982224,1,Heat (1995),Action|Crime|Thriller,113277,5.453032
3,0,43,5.0,964983815,1,Seven (a.k.a. Se7en) (1995),Mystery|Thriller,114369,5.077784
4,0,46,5.0,964982931,1,"Usual Suspects, The (1995)",Crime|Mystery|Thriller,114814,5.314119


### **Movie Info**

In [575]:
def movie_info(movie_id):
    try:
        return movies[movies.movieId == movie_id].values[0]
    except:
        return None


In [576]:
movie_info(99)

array([99, 'Heidi Fleiss: Hollywood Madam (1995)', 'Documentary'],
      dtype=object)

### **Most Similar Movies**

In [581]:
def adjusted_cosine_sim(vec_a, vec_b):

    a_avg = np.average(vec_a)
    b_avg = np.average(vec_b)
    
    sim_score = np.dot(vec_a - a_avg, vec_b - b_avg) / (np.linalg.norm(vec_a - a_avg) * np.linalg.norm(vec_b - b_avg))

    return sim_score

In [604]:
complete_train_df

Unnamed: 0,userId,movieId,rating,timestamp,Train,title,genres,imdbId
0,0,0,4.0,964982703,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,114709
1,0,2,4.0,964981247,1,Grumpier Old Men (1995),Comedy|Romance,113228
2,0,5,4.0,964982224,1,Heat (1995),Action|Crime|Thriller,113277
3,0,43,5.0,964983815,1,Seven (a.k.a. Se7en) (1995),Mystery|Thriller,114369
4,0,46,5.0,964982931,1,"Usual Suspects, The (1995)",Crime|Mystery|Thriller,114814
...,...,...,...,...,...,...,...,...
100221,609,9380,4.0,1493879365,1,Rogue One: A Star Wars Story (2016),Action|Adventure|Fantasy|Sci-Fi,3748528
100222,609,9381,4.0,1493848402,1,Split (2017),Drama|Horror|Thriller,4972582
100223,609,9405,5.0,1493850091,1,John Wick: Chapter Two (2017),Action|Crime|Thriller,4425200
100224,609,9406,5.0,1494273047,1,Get Out (2017),Horror,5052448


In [609]:
def most_sim(mid):
    sim_movies = []
    all_info = []
    utility_matrix = history
    for j in range(utility_matrix.shape[1]):
        sim_movies.append((j + 1, adjusted_cosine_sim(utility_matrix.iloc[:, mid - 1], utility_matrix.iloc[:, j])))
    
    sim_movies = sorted(sim_movies, key=lambda x: x[1], reverse=True)
    
    for mov in sim_movies:
        ret = complete_train_df[complete_train_df.movieId == mov]
        if ret is None:
            all_info.append((mov))
        else:
            all_info.append(ret)
    return all_info

In [747]:
def get_image(movie_id):
    try:
        return f"Images/{movie_id}.jpg"
    except:
        return "img.jpg"

In [748]:
get_image(3825)

'Images/3825.jpg'

--------------------------

### **Scarping Images**

In [None]:
from selenium import webdriver
import pandas as pd
import urllib
driver = webdriver.Chrome()
for imdb_id, movie_id in zip(links.imdbId.unique()[2646:], links.movieId.unique()[2646:]):
    i = 7 - len(str(imdb_id))
    imdb = "0" * i + str(imdb_id)
    driver.get(f"https://www.imdb.com/title/tt{imdb}/")
    url = driver.find_element("xpath", "//img[@class='ipc-image']")
    url = url.get_attribute('src')
    urllib.request.urlretrieve(str(url),f"./Images/{movie_id}.jpg")
driver.quit()