# <h1> <font color="teal"> Importing Libraries : </font> </h1>

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors
from collections import defaultdict
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator
from collections import defaultdict
import pickle
from sklearn.metrics.pairwise import cosine_distances
from wordcloud import WordCloud
from scipy.spatial import distance

### <h1> <font color="teal"> Importing Datasets : </font> </h1> 

In [2]:
# Read the movies and ratings tables from their CSV files.
# NOTE(review): both paths are absolute and machine-specific — consider a
# configurable data directory so the notebook runs elsewhere.
movies_csv_path = "/Users/mukesh/Documents/01-Dissertation/ml-25m/movies.csv"
ratings_csv_path = "/Users/mukesh/Documents/01-Dissertation/ml-25m/ratings.csv"
movies = pd.read_csv(movies_csv_path)
ratings = pd.read_csv(ratings_csv_path)

### <h1> <font color="teal"> PreProcessing :  </font> </h1> 

In [3]:
# Ratings without the timestamp column; `ratings` itself is left untouched.
temp = ratings.drop(columns=['timestamp'])

In [4]:
# Movies without the title column; `movies` itself is left untouched.
features = movies.drop(columns=['title'])

In [5]:
# Join every rating row to its movie record on the shared movieId key
# (default inner join).
data = movies.merge(temp, on="movieId")

###  <h1> <font color="teal"> Feature Engineering :   </font> </h1> 

---
<font face="Times New Roman" color="green">**Data Preprocessing Steps:**</font>
---

<font face="Times New Roman" color="blue">
A series of data preprocessing steps are performed to extract and transform specific features for further analysis.
</font>


<font face="Times New Roman" color="blue">
    From the title column of the movies DataFrame, the year of release is extracted using a regular expression pattern. 
    </font>

In [6]:
# Capture the first four-digit number wrapped in parentheses in each title;
# titles without such a group get NaN. The captured values are strings.
release_year_pattern = r'\((\d{4})\)'
movies['year'] = movies['title'].str.extract(release_year_pattern, expand=False)

<font face="Times New Roman" color="blue">
    The genres column in the movies DataFrame contains genre information for each movie, often with multiple genres separated by the '|' character. The str.get_dummies('|') method is employed to convert this categorical genre data into a one-hot encoded format.
      </font>

In [7]:
# One indicator (0/1) column per genre, splitting the genres string on '|'.
genre_columns = movies["genres"].str.get_dummies(sep='|')

<font face="Times New Roman" color="blue">
    A new DataFrame named content is created by dropping the title column from the movies DataFrame.
      </font>

In [8]:
# Start the content table from a copy of `movies` without the title column.
content = movies.drop(columns="title")

In [9]:
# Append the genre indicator columns side by side (aligned on the row index).
frames_to_join = [content, genre_columns]
content = pd.concat(frames_to_join, axis="columns")

<font face="Times New Roman" color="blue">
    Any rows with missing values in the year column are removed from both the content and movies DataFrames.
    </font>

In [10]:
# Discard content rows for which no release year was extracted.
content = content.dropna(subset=['year'])

In [11]:
# Discard the same year-less rows from movies so it stays in step with content.
movies = movies.dropna(subset=['year'])

<font face="Times New Roman" color="blue">
    The extracted year values are initially of string data type due to the extraction process. For ease of analysis and to ensure accurate numerical operations, the year column in the content DataFrame is converted to an integer data type.
    </font>

In [12]:
# The extracted years are strings; store them as integers instead.
year_as_int = content['year'].astype(int)
content = content.assign(year=year_as_int)

In [13]:
# The raw '|'-separated genres string is redundant once the indicator columns
# exist, so remove it.
content = content.drop(columns=["genres"])

<font face="Times New Roman" color="blue">
    A new array named genres_features is created to store the genre information. scikit-learn's StandardScaler is then used to standardize genres_features: each column is centred on zero and scaled by its standard deviation.
    </font>

In [14]:
# Select the genre indicator columns by name rather than by position.
# The previous positional slice `iloc[:, 2:-1]` stopped one column short and
# silently left out the last genre indicator column.
genres_features = content[genre_columns.columns].to_numpy()

In [15]:
# Fit a StandardScaler on the genre features, then apply it to the same array.
scaler = StandardScaler()
scaler.fit(genres_features)
genres_features_scaled = scaler.transform(genres_features)

<font face="Times New Roman" color="blue">
    Since the full dataset is too large to handle, I selected a subset containing the most active users and the most popular movies. The top new_sample_size (= 2500) active users and popular movies are identified by how frequently they occur in the dataset.
    </font>

In [16]:
# How many of the most frequent users / titles to keep.
new_sample_size = 2500

# value_counts() sorts by descending frequency, so the leading index entries
# are the users with the most ratings and the titles with the most ratings.
ratings_per_user = data["userId"].value_counts()
ratings_per_title = data["title"].value_counts()
active_users = ratings_per_user.index[:new_sample_size]
popular_movies = ratings_per_title.index[:new_sample_size]

In [17]:
# Keep only ratings made by an active user on a popular movie.
by_active_user = data["userId"].isin(active_users)
on_popular_movie = data["title"].isin(popular_movies)
subset_data = data.loc[by_active_user & on_popular_movie]

<font face="Times New Roman" color="blue">
    To help the Collaborative Filtering method, I transformed the subset_data DataFrame into a pivot table, user_movie. This table has users as rows, movies as columns, and the corresponding ratings as values. Also, I replaced all missing values (NaN) with zeros.
    </font>

In [18]:
# User x title rating matrix; cells without a rating are NaN at this point.
user_movie = pd.pivot_table(
    subset_data,
    index="userId",
    columns="title",
    values="rating",
)

In [19]:
# Treat every missing (unrated) cell as a rating of 0.
user_movie = user_movie.fillna(0)

### For content based model

In [20]:
# Relative weights applied to the genre block and to the release year when
# the content-based feature matrix is built.
genre_weight, year_weight = 0.8, 0.2

In [21]:
# The release year is a four-digit number, whereas the genre columns have been
# standardised. Adding the raw year lets it swamp the genre signal regardless
# of the weights, so standardise the year to the same scale first.
year_scaled = StandardScaler().fit_transform(
    content[['year']].to_numpy(dtype=float)
)

# year_scaled has shape (n_movies, 1) and is broadcast across every genre
# column, exactly as the raw year was; the output shape is unchanged.
# NOTE(review): if year is meant to be a separate feature rather than an
# offset on every genre column, concatenate it as an extra column instead.
weighted_features = (genres_features_scaled * genre_weight) + (year_scaled * year_weight)


### For collaborative based model

In [22]:
# Pairwise cosine similarity between the rows (users) of the rating matrix.
user_rating_rows = user_movie.to_numpy()
user_similarity_matrix = cosine_similarity(user_rating_rows)

In [23]:
# Work on an array copy so the cell can be re-run: after the first run
# `user_similarity_matrix` is a DataFrame, not the ndarray that
# np.fill_diagonal is written for.
similarity_values = np.array(user_similarity_matrix)

# Zero the diagonal (each user's similarity with themselves).
np.fill_diagonal(similarity_values, 0)

# Label both axes with the user ids from the rating matrix.
user_ids = user_movie.index
user_similarity_matrix = pd.DataFrame(similarity_values, index=user_ids, columns=user_ids)

In [24]:
# Write the content table to CSV without the row index.
# NOTE(review): absolute, machine-specific path — consider a configurable output directory.
file_path = "/Users/mukesh/Documents/01-Dissertation/final/content.csv"
content.to_csv(path_or_buf=file_path, index=False)

In [25]:
# Write the movies table to CSV without the row index.
# NOTE(review): absolute, machine-specific path — consider a configurable output directory.
file_path = "/Users/mukesh/Documents/01-Dissertation/final/movies.csv"
movies.to_csv(path_or_buf=file_path, index=False)

In [26]:
# Write the ratings table to CSV without the row index.
# NOTE(review): absolute, machine-specific path — consider a configurable output directory.
file_path = "/Users/mukesh/Documents/01-Dissertation/final/ratings.csv"
ratings.to_csv(path_or_buf=file_path, index=False)

In [27]:
# Store the standardised genre feature array in NumPy's .npy format.
# NOTE(review): absolute, machine-specific path — consider a configurable output directory.
file_path = "/Users/mukesh/Documents/01-Dissertation/final/genres_features_scaled.npy"
np.save(file=file_path, arr=genres_features_scaled)

In [28]:
# Store the weighted content feature array in NumPy's .npy format.
# NOTE(review): absolute, machine-specific path — consider a configurable output directory.
file_path = "/Users/mukesh/Documents/01-Dissertation/final/weighted_features.npy"
np.save(file=file_path, arr=weighted_features)

In [29]:
# Write the user x title rating matrix to CSV, keeping the userId index.
# NOTE(review): absolute, machine-specific path — consider a configurable output directory.
file_path = "/Users/mukesh/Documents/01-Dissertation/final/user_movie.csv"
user_movie.to_csv(path_or_buf=file_path, index=True)

In [30]:
# Write the user-user similarity matrix to CSV, keeping the userId index.
# NOTE(review): absolute, machine-specific path — consider a configurable output directory.
file_path = "/Users/mukesh/Documents/01-Dissertation/final/user_similarity_matrix.csv"
user_similarity_matrix.to_csv(path_or_buf=file_path, index=True)