# Recommendation System

In [None]:
import numpy as np
import pandas as pd
import sklearn
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
sns.set_style('whitegrid')
%matplotlib inline
import sys

print('python:',sys.version)
print('Numpy:',np.__version__)
print('Pandas:',pd.__version__)
print('Scikitlearn:',sklearn.__version__)
print('Seaborn: ',sns.__version__)
print('matplotlib:',matplotlib.__version__)


In [None]:

# Load the TMDB movie metadata.
# The path uses a forward slash: it resolves on Windows, Linux and macOS,
# whereas a backslash is only a directory separator on Windows.
MoviesDF = pd.read_csv('Data1/tmdb_5000_movies.csv', low_memory=False)
MoviesDF.head()


In [None]:
# Load the TMDB credits file (kept under the historical name VotesDF).
# The path uses a forward slash: it resolves on Windows, Linux and macOS,
# whereas a backslash is only a directory separator on Windows.
VotesDF = pd.read_csv('Data1/tmdb_5000_credits.csv', low_memory=False)
VotesDF.head()


# I. EDA 

### 1) Data Summarization:

In [None]:
# A bare expression is only rendered when it is the last line of a cell, so the
# shape is printed explicitly before the per-column dtype / non-null summary.
print(MoviesDF.shape)
MoviesDF.info()


In [None]:
from collections import Counter

# Tally how many times each value occurs in the 'homepage' column.
homepage_counts = Counter(MoviesDF['homepage'])
homepage_counts


In [None]:

# Remove columns that are not used further down.
# Re-binding the name (instead of inplace=True) together with errors='ignore'
# keeps this cell safe to execute more than once: a second run no longer raises
# KeyError for columns that were already removed.
MoviesDF = MoviesDF.drop(columns=['homepage', 'tagline', 'title'], errors='ignore')
MoviesDF.head()


In [None]:

# Combine movie metadata with the credits table by matching the movie key
# ('id' in the movies file, 'movie_id' in the credits file) rather than by
# pasting the two frames side by side, which silently pairs unrelated rows
# whenever the files are not in exactly the same order.
# how='left' keeps every movie, in its original order; both key columns are
# retained in the result so they can still be inspected afterwards.
Movies_DFF = MoviesDF.merge(VotesDF, how='left', left_on='id', right_on='movie_id')
Movies_DFF.head()


In [None]:
# Inspect both key columns, then drop the redundant credits-side key.
# The guard makes the cell re-runnable: once 'movie_id' has been removed a
# second execution skips the inspection/drop instead of raising KeyError.
Movies_DFF['id'].info()
if 'movie_id' in Movies_DFF.columns:
    Movies_DFF['movie_id'].info()
    Movies_DFF = Movies_DFF.drop(columns=['movie_id'])
Movies_DFF.head()


In [None]:
from ast import literal_eval


def _genre_names(parsed):
    """Return the 'name' of every record when *parsed* is a list, otherwise an empty list."""
    if not isinstance(parsed, list):
        return []
    return [record['name'] for record in parsed]


# Missing cells are replaced by the string '[]' before the text is evaluated
# into Python objects; only the genre names are kept.
genres_parsed = Movies_DFF['genres'].fillna('[]').apply(literal_eval)
Movies_DFF['genres'] = genres_parsed.apply(_genre_names)
Movies_DFF['genres']


In [None]:

# Evaluate the stringified keyword records and keep only each record's name;
# missing cells are treated as an empty list.
keyword_records = Movies_DFF['keywords'].fillna('[]').apply(literal_eval)
Movies_DFF['keywords'] = keyword_records.apply(
    lambda records: [record['name'] for record in records] if isinstance(records, list) else []
)
Movies_DFF['keywords']


In [None]:

# Extract the release year as a string (e.g. '2009').
# Unparseable or missing dates are coerced to NaT and come out as NaN here.
# The previous element-wise test `x != np.nan` is always True (NaN never
# compares equal to anything), so missing dates used to become the text 'NaT'.
release_dates = pd.to_datetime(Movies_DFF['release_date'], errors='coerce')
Movies_DFF['Year'] = release_dates.dt.strftime('%Y')
Movies_DFF['Year']


In [None]:
# Per column: does it contain at least one missing value?
Movies_DFF.isna().any()


In [None]:
# Summary statistics of the merged frame (pandas' default describe() output).
Movies_DFF.describe()

In [None]:
# Columns summarised by describe() are treated as numeric; everything else is
# treated as categorical.
numeric = list(Movies_DFF.describe().columns)
# Index.difference(sort=False) keeps the frame's own column order, unlike a
# Python set whose iteration order for strings can change between sessions.
categoric = list(Movies_DFF.columns.difference(numeric, sort=False))
# 'id' is an identifier, not a measurement, so it is excluded from the numeric
# list (after the split above, so it does not end up among the categoricals).
numeric.remove('id')
numeric


In [None]:
# A bare expression is only rendered when it is the last line of a cell, so the
# missing-value flags are printed explicitly before the release_date summary.
print(Movies_DFF[categoric].isnull().any())
Movies_DFF['release_date'].info()


In [None]:

# One box per numeric column. The frame is passed through the `data=` keyword
# so it is interpreted as wide-form data regardless of the seaborn version
# (the meaning of the first positional argument has changed between releases).
fig, ax = plt.subplots(figsize=(20, 12))
sns.boxplot(data=Movies_DFF[numeric], ax=ax)
ax.tick_params(axis='x', labelrotation=45)
# The y-axis is clipped so that the boxes of small-valued columns stay visible.
ax.set_ylim(0, 1000)
ax.set_title('Numeric columns (y-axis clipped to 0-1000)')
plt.show()


# II. Modelling:

### 1) Calculate parameters:

The model predicts the rating of movie $i$ by user $j$ as <br>
$y^{(i,j)} = \mathbf{w}^{(j)}\cdot \mathbf{x}^{(i)} + b^{(j)}$ <br>


<b>y</b>= Rating value - $y^{(i,j)}$ is the rating given to movie $i$ by user $j$; it is the target that the model tries to reproduce.<br>
<b>w </b>= Weight (user parameter) vector - $\mathbf{w}^{(j)}$ holds the learned preferences of user $j$. Each weight corresponds to the importance or relevance of a particular feature in determining the recommendation for that user.<br>
<b>x</b>= Item feature vector - $\mathbf{x}^{(i)}$ represents the latent features or characteristics of movie $i$. It captures the underlying properties or attributes of the items that are relevant for making recommendations.<br>
<b>b</b>= Bias term - $b^{(j)}$ is an additional constant added to the recommendation scores of user $j$. It captures the overall tendency of that user, such as rating consistently higher or lower than average.<br>
<b>r</b>= Rating indicator - $r(i,j) = 1$ if user $j$ has rated movie $i$ and $0$ otherwise, so that only observed user-item interactions contribute to the cost.<br>


In [None]:

# Load the user ratings.
# The path uses a forward slash: it resolves on Windows, Linux and macOS,
# whereas a backslash is only a directory separator on Windows.
RatingsDF = pd.read_csv('Data2/ratings_small.csv', low_memory=False, index_col=None)
RatingsDF.head()


We use the TMDB ratings to build our **Top Movies Chart**. The chart is constructed with IMDB's *weighted rating* formula, which is represented mathematically as follows:

Weighted Rating (WR) = $(\frac{v}{v + m} \cdot R) + (\frac{m}{v + m} \cdot C)$

where,
* *v* is the number of votes for the movie
* *m* is the minimum votes required to be listed in the chart
* *R* is the average rating of the movie
* *C* is the mean vote across the whole report


In [None]:
# Raw ingredients of the weighted-rating formula, read from the merged frame:
# per-movie vote counts and averages, the smallest vote count, and the mean
# of the vote averages.
v = Movies_DFF['vote_count']
R = Movies_DFF['vote_average']
m = v.min()
C = R.mean()
C


In [None]:
# Non-missing vote counts (as integers) and vote averages.
vote_counts = Movies_DFF['vote_count'].dropna().astype('int')
# The averages stay floating point: casting them to int would truncate every
# value (7.9 -> 7) and pull the mean C systematically downwards.
vote_averages = Movies_DFF['vote_average'].dropna()
# C: mean vote across all movies; m: vote count at the 95th percentile, used as
# the minimum number of votes required to enter the chart.
C = vote_averages.mean()
m = vote_counts.quantile(0.95)
C


In [None]:
def weighted_rating(x, min_votes=None, mean_vote=None):
    """Compute the IMDB-style weighted rating WR = v/(v+m) * R + m/(v+m) * C.

    The vote count ``v`` and vote average ``R`` are read from ``x`` itself, so
    the result always describes the rows that were passed in and no longer
    depends on whatever the module-level names ``v`` and ``R`` happen to be
    bound to when the function is called.

    Parameters
    ----------
    x : DataFrame, Series or mapping
        Must provide ``'vote_count'`` and ``'vote_average'``. A DataFrame gives
        one score per row; a single row or dict gives a scalar.
    min_votes : number, optional
        The threshold ``m``. Defaults to the module-level ``m``.
    mean_vote : number, optional
        The prior ``C``. Defaults to the module-level ``C``.

    Returns
    -------
    Series or scalar
        The weighted rating, in the same shape as ``x['vote_count']``.
    """
    votes = x['vote_count']
    rating = x['vote_average']
    # Fall back to the notebook-level chart parameters when none are supplied.
    threshold = m if min_votes is None else min_votes
    prior = C if mean_vote is None else mean_vote
    return (votes / (votes + threshold) * rating) + (threshold / (threshold + votes) * prior)


In [None]:
# Movies that have both a vote count and a vote average and reach the
# minimum number of votes m.
has_votes = Movies_DFF['vote_count'].notnull() & Movies_DFF['vote_average'].notnull()
# .copy() gives an independent frame, so the column assignment below does not
# write into a slice of Movies_DFF (which triggers SettingWithCopyWarning).
Qualified = Movies_DFF.loc[
    has_votes & (Movies_DFF['vote_count'] >= m),
    ['title', 'Year', 'vote_count', 'vote_average', 'popularity', 'genres'],
].copy()
Qualified['vote_count'] = Qualified['vote_count'].astype('int')
# vote_average is deliberately left as a float: an integer cast truncates it
# (7.9 -> 7) and distorts any score derived from it.
Qualified.shape


In [None]:

# Score the qualifying movies, then rank them from best to worst.
wr_scores = weighted_rating(Qualified)
Qualified['WR'] = wr_scores
Top = Qualified.sort_values(by='WR', ascending=False)
Top

In [None]:

# Convert every genre entry of every movie to a plain str.
transformed_genre = [list(map(str, names)) for names in Qualified['genres']]
len(transformed_genre)


In [None]:

# Replace the genres column with the string-converted lists built in the
# previous cell (one list per row, in row order).
Qualified['genres']=transformed_genre
Qualified


In [None]:

# Keep the movies whose genre list contains 'Crime' and rank them by weighted rating.
is_crime = Qualified['genres'].apply(lambda names: 'Crime' in names)
Gen = Qualified[is_crime]
Top2 = Gen.sort_values(by='WR', ascending=False)
Top2


### 2) Collaborative Filtering:

In [None]:
# Display the ratings frame that feeds the collaborative-filtering cells below.
RatingsDF


In [None]:
from surprise import Reader, Dataset, SVD

try:
    # Older scikit-surprise releases still ship the legacy helper.
    from surprise import evaluate
except ImportError:
    # `evaluate` was removed in scikit-surprise 1.1; `cross_validate` is its
    # documented replacement and runs 5-fold cross-validation by default.
    from surprise.model_selection import cross_validate as evaluate

# NOTE(review): Reader() uses its default rating scale — confirm it matches the
# range of values in RatingsDF['rating'].
reader = Reader()
data = Dataset.load_from_df(RatingsDF[['userId', 'movieId', 'rating']], reader)
if hasattr(data, 'split'):
    # Legacy API only: pre-split the dataset into 5 folds. Newer releases have
    # no Dataset.split(); folds are produced by cross_validate instead.
    data.split(n_folds=5)
data


### 3) 2nd Method

In [None]:
# Y: every column of the merged frame except the two vote columns.
col = Movies_DFF.columns.tolist()
for vote_column in ('vote_average', 'vote_count'):
    col.remove(vote_column)
Y = Movies_DFF[col]
Y


In [None]:
# W

# W: the id, revenue and popularity columns placed side by side.
w_columns = ('id', 'revenue', 'popularity')
W = pd.concat([Movies_DFF[name] for name in w_columns], axis=1)
W


In [None]:
# X

# X: descriptive movie columns placed side by side, in the order listed here.
x_columns = (
    'title', 'cast', 'genres', 'keywords',
    'popularity', 'revenue', 'production_countries',
    'release_date', 'spoken_languages',
)
X = pd.concat([Movies_DFF[name] for name in x_columns], axis=1)
X


In [None]:
# B  (Constant)
# B: the movie id next to its vote average.
b_columns = ('id', 'vote_average')
B = pd.concat([Movies_DFF[name] for name in b_columns], axis=1)
B
            

In [None]:
# R

# R: the first three columns of the ratings frame.
# Note that this re-binds the notebook-level name R, which earlier held the
# vote_average Series.
R = RatingsDF.iloc[:, :3]
R


In [None]:
# Print the shape of each parameter frame, in the order X, W, B, Y, R.
param = [X, W, B, Y, R]
print('the parameters [X, W, B, Y, R] shapes are:')
for a in param:
    print('\n param=', a.shape)


### Collaborative filtering cost function

The collaborative filtering cost function is given by
$$ J({\mathbf{x}^{(0)},...,\mathbf{x}^{(n_m-1)},\mathbf{w}^{(0)},b^{(0)},...,\mathbf{w}^{(n_u-1)},b^{(n_u-1)}})= \frac{1}{2}\sum_{(i,j):r(i,j)=1}(\mathbf{w}^{(j)} \cdot \mathbf{x}^{(i)} + b^{(j)} - y^{(i,j)})^2
+\underbrace{
\frac{\lambda}{2}
\sum_{j=0}^{n_u-1}\sum_{k=0}^{n-1}(\mathbf{w}^{(j)}_k)^2
+ \frac{\lambda}{2}\sum_{i=0}^{n_m-1}\sum_{k=0}^{n-1}(\mathbf{x}_k^{(i)})^2
}_{regularization}
\tag{1}$$
The first summation in (1) is "for all $i$, $j$ where $r(i,j)$ equals $1$" and could be written:

$$
= \frac{1}{2}\sum_{j=0}^{n_u-1} \sum_{i=0}^{n_m-1}r(i,j)*(\mathbf{w}^{(j)} \cdot \mathbf{x}^{(i)} + b^{(j)} - y^{(i,j)})^2
+\text{regularization}
$$

The function `cofi_cost_func` (collaborative filtering cost function) below returns this cost.

In [None]:

def cofi_cost_func(X, W, B, Y, R, lambda_):
    """Return the regularised collaborative-filtering cost as a float.

    J = 1/2 * sum_{i,j} r(i,j) * (w_j . x_i + b_j - y(i,j))^2
        + lambda_/2 * (sum(W^2) + sum(X^2))

    Inputs may be NumPy arrays or DataFrames; they are converted to float
    arrays. The orientation follows the indexing this function has always
    used, i.e. user ``j`` is *column* ``j`` of ``W`` and of ``B``:

    Parameters
    ----------
    X : (n_movies, n_features)  movie feature vectors, one row per movie.
    W : (n_features, n_users)   user weight vectors, one column per user.
        A (n_users, n_features) matrix must be transposed by the caller.
    B : n_users values          per-user bias; any shape holding exactly
        n_users elements (e.g. (1, n_users)) is accepted.
    Y : (n_movies, n_users)     ratings.
    R : (n_movies, n_users)     rating indicator / weight; entries equal to 0
        are ignored, so ``Y`` may hold NaN there.
    lambda_ : float             regularisation strength.

    Raises
    ------
    ValueError
        If an input is not numeric or the shapes are inconsistent.
    """
    X = np.asarray(X, dtype=float)
    W = np.asarray(W, dtype=float)
    Y = np.asarray(Y, dtype=float)
    R = np.asarray(R, dtype=float)
    b = np.asarray(B, dtype=float).reshape(-1)

    if X.ndim != 2 or W.ndim != 2 or Y.ndim != 2:
        raise ValueError("X, W and Y must be 2-dimensional")
    nm, nu = Y.shape
    n = X.shape[1]
    if X.shape[0] != nm or W.shape != (n, nu) or b.shape != (nu,) or R.shape != (nm, nu):
        raise ValueError(
            "inconsistent shapes: "
            f"X{X.shape}, W{W.shape}, B({b.size} values), Y{Y.shape}, R{R.shape}; "
            "expected X(nm, n), W(n, nu), B with nu values, Y(nm, nu), R(nm, nu)"
        )

    # Prediction error for every (movie, user) pair in one matrix product
    # instead of an nm * nu Python loop.
    residual = X @ W + b - Y
    # Only rated entries contribute; masking (rather than multiplying by 0)
    # keeps NaN placeholders in unrated cells of Y out of the sum.
    rated = R != 0
    data_term = np.sum(R[rated] * np.square(residual[rated]))
    reg_term = lambda_ * (np.sum(np.square(W)) + np.sum(np.square(X)))
    return float((data_term + reg_term) / 2)


In [None]:
# Sizes used to cut the parameter frames down before evaluating the cost.
num_users_r = 300
num_movies_r = 2000
num_features_r = 18  # not referenced again within this cell

# NOTE(review): X, W, B and Y are column selections of the movie metadata
# frame (see the cells that define them) and include non-numeric columns such
# as title/cast/genres, while R is the first three columns of RatingsDF
# (presumably userId/movieId/rating in long format — confirm). These do not
# look like the movie-feature, user-weight, bias, rating and indicator
# matrices of the cost formula — TODO build numeric matrices (e.g. by pivoting
# RatingsDF) before relying on the number printed below.
X_r = X.iloc[:num_movies_r, : ]
W_r = W.iloc[:num_users_r,  :]
b_r = B.iloc[0 :num_users_r]#.reshape(1,-1)
Y_r = Y.iloc[:num_movies_r]#, :num_users_r]
R_r = R[:num_movies_r]

# Evaluate cost function (lambda_ = 0, i.e. no regularisation term)
J = cofi_cost_func(X_r, W_r, b_r, Y_r, R_r, 0);
print(f"Cost: {J:0.2f}")


In [None]:
# Display the row-sliced ratings frame that was passed to the cost function.
R_r

### 2) Data Transformation : 