In [None]:
import pandas as pd
import numpy as np
import ast
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from google.colab import drive

import warnings; warnings.simplefilter('ignore')

## Loading the dataset from Google Drive in Google Colab

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
drive.mount("/content/drive")

In [None]:
# Load the two input tables. NOTE(review): both path strings look like
# placeholders -- replace them with the real CSV locations before running.
md = pd.read_csv('movies_metadata_path')    # movie metadata table
links_small = pd.read_csv('link_small_path')  # table providing the 'tmdbId' column

In [None]:
def convert_int(x):
    """Convert ``x`` to ``int``, returning ``np.nan`` when that is impossible.

    Parameters
    ----------
    x : object
        Value to convert; anything ``int()`` accepts (str, float, ...).

    Returns
    -------
    int or float
        ``int(x)`` on success, otherwise ``np.nan``.
    """
    try:
        return int(x)
    except (TypeError, ValueError, OverflowError):
        # Only genuine conversion failures (None, non-numeric text, NaN, inf)
        # map to NaN. A bare ``except`` would also swallow KeyboardInterrupt
        # and hide unrelated bugs.
        return np.nan

## Converting IDs from string to int and dropping rows whose ID is null

In [None]:
# Reduce the links table to a single integer Series: the 'tmdbId' values of
# the rows where that id is present.
links_small = links_small.loc[
    links_small['tmdbId'].notnull(), 'tmdbId'
].astype('int')

In [None]:
# Coerce the 'id' column to int; values that cannot be parsed become NaN
# (via convert_int) and their rows are dropped.
md['id'] = md['id'].apply(convert_int)
# .copy() makes the filtered frame independent, so the column assignment
# below is not a chained assignment on a derived frame.
md = md.dropna(subset=['id']).copy()
md['id'] = md['id'].astype('int')

# Keep only the movies whose id appears in the links table. The explicit copy
# lets later cells add or overwrite columns on `smd` without touching `md`
# and without triggering SettingWithCopyWarning.
smd = md[md['id'].isin(links_small)].copy()

In [None]:
def _genre_names(raw):
    """Return the genre names contained in ``raw`` as a space-separated string.

    ``raw`` is expected to be the text of a Python-literal list of dicts that
    each carry a ``'name'`` key (an already-parsed list is accepted too).
    Anything that cannot be parsed into that shape yields ``''``.
    """
    if isinstance(raw, str):
        if not raw.strip():
            return ''
        try:
            raw = literal_eval(raw)
        except (ValueError, SyntaxError, TypeError):
            return ''
    if not isinstance(raw, (list, tuple)):
        return ''
    return ' '.join(
        str(entry['name'])
        for entry in raw
        if isinstance(entry, dict) and 'name' in entry
    )


smd['tagline'] = smd['tagline'].fillna('')
# The column is read from CSV, so every cell is a string; iterating over it
# directly would join its individual characters. Parse it and keep the names.
smd['genres'] = smd['genres'].fillna('').apply(_genre_names)
# Join the pieces with spaces so the last word of one field is not fused with
# the first word of the next, and treat a missing overview as empty instead of
# letting NaN wipe out the whole description.
smd['description'] = (
    smd['overview'].fillna('') + ' ' + smd['tagline'] + ' ' + smd['genres']
).str.strip()

## Creating the TF-IDF matrix

In [None]:
# Word-level unigrams and bigrams with English stop words removed.
# min_df=1 keeps every term that occurs in at least one document -- the same
# vocabulary the former integer 0 selected -- while staying inside the range
# scikit-learn's parameter validation accepts (an int min_df must be >= 1).
tf = TfidfVectorizer(
    analyzer='word',
    ngram_range=(1, 2),
    min_df=1,
    stop_words='english',
)
# One row per movie description, one column per n-gram.
tfidf_matrix = tf.fit_transform(smd['description'])

## Calculating the Cosine Similarity

In [None]:
# Pairwise dot products between all description vectors; cosine_sim[i][j] is
# the score between movie i and movie j.
# NOTE(review): a plain dot product equals cosine similarity only if the
# TF-IDF rows are L2-normalised (TfidfVectorizer's documented default norm) --
# confirm if the vectorizer settings ever change.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [None]:
# Renumber the rows 0..n-1 so that row positions can be used to address
# cosine_sim; the previous index is kept as an ordinary column.
smd = smd.reset_index()

# Positional lookup tables: position -> title, and title -> position.
titles = smd['title']
indices = pd.Series(data=smd.index, index=titles)

In [None]:
def get_recommendations(title, top_n=30):
    """Return the titles most similar to ``title``, best match first.

    Parameters
    ----------
    title : str
        Movie title to look up in the module-level ``indices`` Series.
    top_n : int, default 30
        Maximum number of recommendations to return (non-negative).

    Returns
    -------
    pandas.Series
        Up to ``top_n`` entries of ``titles``, ordered by decreasing
        similarity, never including the queried movie itself.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    """
    idx = indices[title]
    # Several movies can share a title, in which case the lookup yields a
    # Series of positions rather than a scalar; use the first match.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    # Drop the queried movie by position instead of assuming it sorts first:
    # ties (identical descriptions, an all-zero row) can place another movie
    # ahead of it.
    scored = [
        (position, score)
        for position, score in enumerate(cosine_sim[idx])
        if position != idx
    ]
    scored.sort(key=lambda pair: pair[1], reverse=True)

    movie_indices = [position for position, _ in scored[:top_n]]
    return titles.iloc[movie_indices]

## Testing

In [None]:
# Show the first ten recommendations returned for 'The Godfather'.
get_recommendations('The Godfather').head(10)