# **CF movie recommendation engine**

### Importing libraries

In [1]:
import pandas as pd
import numpy as np

### Loading Movies CSV

In [2]:
# movies_df = pd.read_csv("./data/movies.csv")
movies_df = pd.read_csv("./data/movies_modified.csv")
print(movies_df.info())
movies_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 86537 entries, 0 to 86536
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  86537 non-null  int64 
 1   movieId     86537 non-null  int64 
 2   title       86537 non-null  object
 3   genres      86537 non-null  object
 4   tags        53452 non-null  object
dtypes: int64(2), object(3)
memory usage: 3.3+ MB
None


Unnamed: 0.1,Unnamed: 0,movieId,title,genres,tags
0,0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,sci-fi|battleship game|water|arm|humorous|come...
1,1,2,Jumanji (1995),Adventure|Children|Fantasy,driving a car into a building|construction sit...
2,2,3,Grumpier Old Men (1995),Comedy|Romance,best friend|old|old age|NO_FA_GANES|moldy|sequ...
3,3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,based on novel or book|girl movie|characters|i...
4,4,5,Father of the Bride Part II (1995),Comedy,Comedy|growing old|confidence|aging|Diane Keat...


### Loading ratings CSV

In [3]:
# ratings_df = pd.read_csv("./data/ratings.csv")
ratings_df = pd.read_csv("./data/rates_modified.csv")
print(ratings_df.info())
ratings_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33832162 entries, 0 to 33832161
Data columns (total 5 columns):
 #   Column      Dtype  
---  ------      -----  
 0   Unnamed: 0  int64  
 1   userId      int64  
 2   movieId     int64  
 3   rating      float64
 4   timestamp   int64  
dtypes: float64(1), int64(4)
memory usage: 1.3 GB
None


Unnamed: 0.1,Unnamed: 0,userId,movieId,rating,timestamp
0,0,1,1,3.666667,1225734739
1,1,1,110,3.666667,1225865086
2,2,1,158,3.666667,1225733503
3,3,1,260,4.333333,1225735204
4,4,1,356,5.0,1225735119


### Loading tags CSV

In [4]:
tags_df = pd.read_csv("./data/tags.csv")
print(tags_df.info())
tags_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2328315 entries, 0 to 2328314
Data columns (total 4 columns):
 #   Column     Dtype 
---  ------     ----- 
 0   userId     int64 
 1   movieId    int64 
 2   tag        object
 3   timestamp  int64 
dtypes: int64(3), object(1)
memory usage: 71.1+ MB
None


Unnamed: 0,userId,movieId,tag,timestamp
0,10,260,good vs evil,1430666558
1,10,260,Harrison Ford,1430666505
2,10,260,sci-fi,1430666538
3,14,1221,Al Pacino,1311600756
4,14,1221,mafia,1311600746


### Extracting user given tags for each movie

In [5]:
moviesTags = []
for id in movies_df["movieId"]:
    tags = set(tags_df[tags_df["movieId"] == id]["tag"].astype(str))
    moviesTags.append('|'.join(list(tags)))

### Adding tags column for the movies dataframe

In [6]:
movies_df["tags"] = moviesTags
movies_df.head(10)

Unnamed: 0,movieId,title,genres,tags
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,kids and family|chrysler lebaron convertible|m...
1,2,Jumanji (1995),Adventure|Children|Fantasy,cutting one's own hair|reference to wilt chamb...
2,3,Grumpier Old Men (1995),Comedy|Romance,sequel|good soundtrack|Ann Margaret|old|moldy|...
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,interracial relationship|revenge|slurs|girl mo...
4,5,Father of the Bride Part II (1995),Comedy,sequel|daughter|pregnancy|growing old|childhoo...
5,6,Heat (1995),Action|Crime|Thriller,In The Nucleus|Al Pacino Vs Robert De Niro|tra...
6,7,Sabrina (1995),Comedy|Romance,fashion assistant|sexuality|female protagonist...
7,8,Tom and Huck (1995),Adventure|Children,flintlock rifle|alabama|evil man|kiss|disarmin...
8,9,Sudden Death (1995),Action,mercilessness|evil man|disarming someone|stabb...
9,10,GoldenEye (1995),Action|Adventure|Thriller,m character|villainess|automobile|shaken not s...


In [7]:
# Saving modified dataframe
# movies_df.to_csv("./data/movies_modified.csv")

### Normalizing users rating

In [26]:
temp_df = ratings_df.copy()

# Calculate the minimum and maximum ratings for each user
user_min_ratings = temp_df.groupby("userId")["rating"].transform("min")
user_max_ratings = temp_df.groupby("userId")["rating"].transform("max")

# Normalize the ratings for each user
temp_df["rating"] = 4 / (user_max_ratings - user_min_ratings) * (temp_df["rating"] - user_max_ratings) + 5

temp_df.head(20)
ratings_df = temp_df

In [27]:
# Saving modified dataframe
# temp_df.to_csv("./data/rates_modified.csv")

### Importing deep learning libraries

In [29]:
# !pip install tensorflow
# !pip install keras
import tensorflow
from keras.layers import Embedding, Reshape, Concatenate
from keras.models import Sequential
from keras.optimizers import Adamax
from keras.callbacks import EarlyStopping, ModelCheckpoint

### Building Our Neural Network Model

In [32]:
factors = 100
model = Sequential()
users_layer = Sequential()
movies_layer = Sequential()
users_layer.add(Embedding(ratings_df["userId"].unique().shape[0], factors, input_length=1))
users_layer.add(Reshape(target_shape=(factors,)))
movies_layer.add(Embedding(movies_df.shape[0], factors, input_length=1))
movies_layer.add(Reshape(target_shape=(factors,)))
model.add(Concatenate([users_layer, movies_layer]))
model.compile(loss="mse", optimizer="adamax")

### Fitting Model

In [33]:
shuffled_ratings = ratings_df.sample(frac=1.)
U = shuffled_ratings["userId"]
M = shuffled_ratings["movieId"]
R = shuffled_ratings["rating"]
callbacks = [EarlyStopping('val_loss', patience=2), ModelCheckpoint('movie_weights.h5', save_best_only=True)]
model.fit([U, M], R, batch_size=1000000, validation_split=.1, callbacks=callbacks)

TypeError: in user code:

    File "C:\Users\YN\AppData\Local\Programs\Python\Python311\Lib\site-packages\keras\src\engine\training.py", line 1377, in train_function  *
        return step_function(self, iterator)
    File "C:\Users\YN\AppData\Local\Programs\Python\Python311\Lib\site-packages\keras\src\engine\training.py", line 1360, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "C:\Users\YN\AppData\Local\Programs\Python\Python311\Lib\site-packages\keras\src\engine\training.py", line 1349, in run_step  **
        outputs = model.train_step(data)
    File "C:\Users\YN\AppData\Local\Programs\Python\Python311\Lib\site-packages\keras\src\engine\training.py", line 1126, in train_step
        y_pred = self(x, training=True)
    File "C:\Users\YN\AppData\Local\Programs\Python\Python311\Lib\site-packages\keras\src\utils\traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "C:\Users\YN\AppData\Local\Programs\Python\Python311\Lib\site-packages\keras\src\layers\merging\concatenate.py", line 107, in build
        del reduced_inputs_shapes[i][self.axis]

    TypeError: Exception encountered when calling layer 'sequential_6' (type Sequential).
    
    list indices must be integers or slices, not ListWrapper
    
    Call arguments received by layer 'sequential_6' (type Sequential):
      • inputs=('tf.Tensor(shape=(None, 1), dtype=int64)', 'tf.Tensor(shape=(None, 1), dtype=int64)')
      • training=True
      • mask=None
