In [1]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import NearestNeighbors
from sklearn.decomposition import TruncatedSVD

DATA COLLECTION:

In [3]:
movies=pd.read_csv('top10K-TMDB-movies.csv')

Data Preprocessing:

In [4]:
movies=movies[['id','title','genre','overview']]

In [5]:
movies

Unnamed: 0,id,title,genre,overview
0,278,The Shawshank Redemption,"Drama,Crime",Framed in the 1940s for the double murder of h...
1,19404,Dilwale Dulhania Le Jayenge,"Comedy,Drama,Romance","Raj is a rich, carefree, happy-go-lucky second..."
2,238,The Godfather,"Drama,Crime","Spanning the years 1945 to 1955, a chronicle o..."
3,424,Schindler's List,"Drama,History,War",The true story of how businessman Oskar Schind...
4,240,The Godfather: Part II,"Drama,Crime",In the continuing saga of the Corleone crime f...
...,...,...,...,...
9995,10196,The Last Airbender,"Action,Adventure,Fantasy","The story follows the adventures of Aang, a yo..."
9996,331446,Sharknado 3: Oh Hell No!,"Action,TV Movie,Science Fiction,Comedy,Adventure",The sharks take bite out of the East Coast whe...
9997,13995,Captain America,"Action,Science Fiction,War","During World War II, a brave, patriotic Americ..."
9998,2312,In the Name of the King: A Dungeon Siege Tale,"Adventure,Fantasy,Action,Drama",A man named Farmer sets out to rescue his kidn...


Feature Engineering:

In [6]:
movies['details']=movies['genre']+movies['overview']

In [7]:
movies

Unnamed: 0,id,title,genre,overview,details
0,278,The Shawshank Redemption,"Drama,Crime",Framed in the 1940s for the double murder of h...,"Drama,CrimeFramed in the 1940s for the double ..."
1,19404,Dilwale Dulhania Le Jayenge,"Comedy,Drama,Romance","Raj is a rich, carefree, happy-go-lucky second...","Comedy,Drama,RomanceRaj is a rich, carefree, h..."
2,238,The Godfather,"Drama,Crime","Spanning the years 1945 to 1955, a chronicle o...","Drama,CrimeSpanning the years 1945 to 1955, a ..."
3,424,Schindler's List,"Drama,History,War",The true story of how businessman Oskar Schind...,"Drama,History,WarThe true story of how busines..."
4,240,The Godfather: Part II,"Drama,Crime",In the continuing saga of the Corleone crime f...,"Drama,CrimeIn the continuing saga of the Corle..."
...,...,...,...,...,...
9995,10196,The Last Airbender,"Action,Adventure,Fantasy","The story follows the adventures of Aang, a yo...","Action,Adventure,FantasyThe story follows the ..."
9996,331446,Sharknado 3: Oh Hell No!,"Action,TV Movie,Science Fiction,Comedy,Adventure",The sharks take bite out of the East Coast whe...,"Action,TV Movie,Science Fiction,Comedy,Adventu..."
9997,13995,Captain America,"Action,Science Fiction,War","During World War II, a brave, patriotic Americ...","Action,Science Fiction,WarDuring World War II,..."
9998,2312,In the Name of the King: A Dungeon Siege Tale,"Adventure,Fantasy,Action,Drama",A man named Farmer sets out to rescue his kidn...,"Adventure,Fantasy,Action,DramaA man named Farm..."


In [8]:
movies=movies.drop(columns=['genre','overview'])

In [9]:
movies = movies.dropna(subset=['details'])

In [10]:
movies

Unnamed: 0,id,title,details
0,278,The Shawshank Redemption,"Drama,CrimeFramed in the 1940s for the double ..."
1,19404,Dilwale Dulhania Le Jayenge,"Comedy,Drama,RomanceRaj is a rich, carefree, h..."
2,238,The Godfather,"Drama,CrimeSpanning the years 1945 to 1955, a ..."
3,424,Schindler's List,"Drama,History,WarThe true story of how busines..."
4,240,The Godfather: Part II,"Drama,CrimeIn the continuing saga of the Corle..."
...,...,...,...
9995,10196,The Last Airbender,"Action,Adventure,FantasyThe story follows the ..."
9996,331446,Sharknado 3: Oh Hell No!,"Action,TV Movie,Science Fiction,Comedy,Adventu..."
9997,13995,Captain America,"Action,Science Fiction,WarDuring World War II,..."
9998,2312,In the Name of the King: A Dungeon Siege Tale,"Adventure,Fantasy,Action,DramaA man named Farm..."


#COUNT VECTORIZER
#converting text data into a structured numerical format

In [11]:
# Count Vectorizer
cv = CountVectorizer(max_features=1000, stop_words='english')
count_matrix = cv.fit_transform(movies['details'].values.astype('U')).toarray()

In [12]:
cv

In [13]:
# X_train_cv
count_matrix

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [14]:
# X_train_cv.shape
count_matrix.shape

(9985, 1000)

Model Building:

In [15]:
# Cosine similarity matrix using Count Vectorizer
cs_matrix = cosine_similarity(count_matrix)
cs_matrix

array([[1.        , 0.11009638, 0.09534626, ..., 0.1254363 , 0.11396058,
        0.05025189],
       [0.11009638, 1.        , 0.17320508, ..., 0.        , 0.        ,
        0.        ],
       [0.09534626, 0.17320508, 1.        , ..., 0.0438529 , 0.05976143,
        0.        ],
       ...,
       [0.1254363 , 0.        , 0.0438529 , ..., 1.        , 0.05241424,
        0.04622502],
       [0.11396058, 0.        , 0.05976143, ..., 0.05241424, 1.        ,
        0.06299408],
       [0.05025189, 0.        , 0.        , ..., 0.04622502, 0.06299408,
        1.        ]])

In [None]:
# KNN model
model_knn = NearestNeighbors(metric='cosine', algorithm='brute')
model_knn

In [None]:
svd = TruncatedSVD(n_components=2, random_state=0)
matrix_svd = svd.fit_transform(count_matrix)
matrix_svd

array([[ 1.21593765,  0.84559315],
       [ 1.28719421,  0.63602032],
       [ 1.64213033,  0.76206855],
       ...,
       [ 1.0726891 , -1.09874716],
       [ 1.01157149, -0.92447201],
       [ 0.6918453 , -0.55033016]])

Model Evaluation:

In [None]:

# Cosine similarity matrix using Count Vectorizer
cs_matrix = cosine_similarity(count_matrix)

# Recommendation function
def recommend(movie, cs_matrix, movie_data):
    try:
        index = movie_data[movie_data['title'] == movie].index[0]
        dist = sorted(list(enumerate(cs_matrix[index])), reverse=True, key=lambda x: x[1])
        recommended_titles = [movie_data.iloc[i[0]].title for i in dist[1:6]]  # Excluding the movie itself
        return recommended_titles
    except IndexError:
        return []

# Split data into train and test sets
train_data, test_data = train_test_split(movies, test_size=0.2, random_state=0)

# Generate recommendations for test set movies using Cosine Similarity
recommendations_cs = {}
for movie in test_data['title']:
    recommendations_cs[movie] = recommend(movie, cs_matrix, movies)

# Ground truth for evaluation (mock data)
ground_truth = {movie: recommend(movie, cs_matrix, movies)[:5] for movie in test_data['title']}

# KNN model
model_knn = NearestNeighbors(metric='cosine', algorithm='brute')
model_knn.fit(count_matrix)

def knn_recommend(movie_title, model_knn, movie_data, count_matrix, n_recommendations=5):
    try:
        movie_index = movie_data[movie_data['title'] == movie_title].index[0]
        distances, indices = model_knn.kneighbors(count_matrix[movie_index], n_neighbors=n_recommendations + 1)
        recommendations = [movie_data.iloc[i]['title'] for i in indices.flatten()[1:]]
        return recommendations
    except IndexError:
        return []

# Generate recommendations using KNN
recommendations_knn = {movie: knn_recommend(movie, model_knn, movies, count_matrix) for movie in test_data['title']}

# SVD model
svd = TruncatedSVD(n_components=2, random_state=0)
matrix_svd = svd.fit_transform(count_matrix)

def svd_recommend(movie_title, matrix_svd, movie_data, n_recommendations=5):
    try:
        movie_index = movie_data[movie_data['title'] == movie_title].index[0]
        movie_svd_vector = matrix_svd[movie_index]
        similarity_scores = cosine_similarity([movie_svd_vector], matrix_svd)[0]
        similar_movies = movie_data.iloc[np.argsort(similarity_scores)[-n_recommendations-1:-1]].title.values
        return similar_movies
    except IndexError:
        return []

# Generate recommendations using SVD
recommendations_svd = {movie: svd_recommend(movie, matrix_svd, movies) for movie in test_data['title']}

# Evaluate recommendations
def evaluate_recommendations(ground_truth, recommendations):
    all_y_true = []
    all_y_pred = []
    for movie, recs in recommendations.items():
        if movie in ground_truth:
            y_true = [1 if rec in ground_truth[movie] else 0 for rec in recs]
            y_pred = [1] * len(recs)
            all_y_true.extend(y_true)
            all_y_pred.extend(y_pred)
    precision = precision_score(all_y_true, all_y_pred, zero_division=0)
    recall = recall_score(all_y_true, all_y_pred, zero_division=0)
    f1 = f1_score(all_y_true, all_y_pred, zero_division=0)
    return precision, recall, f1

# Calculate precision, recall, and F1-score for each algorithm
precision_cs, recall_cs, f1_cs = evaluate_recommendations(ground_truth, recommendations_cs)
precision_knn, recall_knn, f1_knn = evaluate_recommendations(ground_truth, recommendations_knn)
precision_svd, recall_svd, f1_svd = evaluate_recommendations(ground_truth, recommendations_svd)

# Print results
print(f"Cosine Similarity - Precision: {precision_cs}, Recall: {recall_cs}, F1-score: {f1_cs}")
print(f"KNN - Precision: {precision_knn}, Recall: {recall_knn}, F1-score: {f1_knn}")
print(f"SVD - Precision: {precision_svd}, Recall: {recall_svd}, F1-score: {f1_svd}")


Cosine Similarity - Precision: 1.0, Recall: 1.0, F1-score: 1.0
KNN - Precision: 0.9649370277078085, Recall: 1.0, F1-score: 0.9821556763408881
SVD - Precision: 0.0017128463476070528, Recall: 1.0, F1-score: 0.003419835043250855


conclusion:

COSINE SIMILARITY achieves perfect scores in precision, recall, and F1-score, indicating that it is the most effective algorithm for your movie recommendation system in this context.

KNN also performs well, with a high precision and F1-score, but slightly lower than Cosine Similarity.

SVD performs poorly, especially in terms of precision and F1-score, making it the least effective algorithm for this task.

In [None]:
import pickle

In [None]:
pickle.dump(movies, open('moviess.pkl','wb'))

In [None]:
pickle.dump(cs_matrix, open('similarityy.pkl','wb'))

In [None]:
pickle.load(open('moviess.pkl','rb'))

Unnamed: 0,id,title,details
0,278,The Shawshank Redemption,"Drama,CrimeFramed in the 1940s for the double ..."
1,19404,Dilwale Dulhania Le Jayenge,"Comedy,Drama,RomanceRaj is a rich, carefree, h..."
2,238,The Godfather,"Drama,CrimeSpanning the years 1945 to 1955, a ..."
3,424,Schindler's List,"Drama,History,WarThe true story of how busines..."
4,240,The Godfather: Part II,"Drama,CrimeIn the continuing saga of the Corle..."
...,...,...,...
9995,10196,The Last Airbender,"Action,Adventure,FantasyThe story follows the ..."
9996,331446,Sharknado 3: Oh Hell No!,"Action,TV Movie,Science Fiction,Comedy,Adventu..."
9997,13995,Captain America,"Action,Science Fiction,WarDuring World War II,..."
9998,2312,In the Name of the King: A Dungeon Siege Tale,"Adventure,Fantasy,Action,DramaA man named Farm..."


In [None]:
pickle.load(open('similarityy.pkl','rb'))

array([[1.        , 0.11009638, 0.09534626, ..., 0.1254363 , 0.11396058,
        0.05025189],
       [0.11009638, 1.        , 0.17320508, ..., 0.        , 0.        ,
        0.        ],
       [0.09534626, 0.17320508, 1.        , ..., 0.0438529 , 0.05976143,
        0.        ],
       ...,
       [0.1254363 , 0.        , 0.0438529 , ..., 1.        , 0.05241424,
        0.04622502],
       [0.11396058, 0.        , 0.05976143, ..., 0.05241424, 1.        ,
        0.06299408],
       [0.05025189, 0.        , 0.        , ..., 0.04622502, 0.06299408,
        1.        ]])