In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.base import BaseEstimator, TransformerMixin

from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
import requests
import time
import warnings; warnings.simplefilter('ignore')
import os
os.makedirs('../backend/data', exist_ok=True)
import re
from collections import Counter
import pickle

In [3]:
# Reading Cleaned Dataset

md = pd.read_csv("../backend/data/good_cinema.csv")

## Building the content-based recommendation system

In [6]:
# Implementing content based filtering based on Overviews, Cast, Director, Writer, and Genre

In [8]:
# For manual standardization of most common named entities

MOVIE_ENTITY_ABBREVIATIONS = {
    # Wars and Historical Events
    r'\bworld war i\b': 'WWI',
    r'\bworld war ii\b': 'WWII',
    r'\bworld war 1\b': 'WWI',
    r'\bworld war 2\b': 'WWII',
    r'\bthe great war\b': 'WWI',
    r'\bsecond world war\b': 'WWII',
    
    # Popular Movie Franchises
    r'\bstar wars(?:\s*episode\s*[ivxlcdm]+)?\b': 'SW',
    r'\blord of the rings\b': 'LOTR',
    r'\bthe lord of the rings\b': 'LOTR',
    r'\bmarvel cinematic universe\b': 'MCU',
    r'\bdc extended universe\b': 'DCEU',
    
    # Countries/Regions
    r'\bunited states\b': 'US',
    r'\bunited states of america\b': 'US',
    r'\bamerica\b': 'US',
    r'\busa\b': 'US',
    r'\bunited kingdom\b': 'UK',
    r'\bgreat britain\b': 'UK',
    r'\bsoviet union\b': 'USSR',
    
    # Time Periods
    r'\b17\d{2}s\b': r'\g<0>',  # Preserve "1980s" etc
    r'\b18\d{2}s\b': r'\g<0>',
    r'\b19\d{2}s\b': r'\g<0>',
    r'\b20\d{2}s\b': r'\g<0>',
}

In [10]:
# Standardizing the named entities from the MOVIE_ENTITY_ABBREVIATIONS dictionary

def standardize_entities(text, abbreviation_map=None):
    if pd.isna(text):
        return ""
    
    if abbreviation_map is None:
        abbreviation_map = MOVIE_ENTITY_ABBREVIATIONS
    
    standardized = text
    for pattern, abbrev in abbreviation_map.items():
        standardized = re.sub(
            pattern, 
            abbrev, 
            standardized, 
            flags=re.IGNORECASE
        )
    return standardized

In [12]:
# Processing 'overview'

md['processed_overview'] = (
    md['overview']
    .apply(standardize_entities)  # manual standardization
    .str.lower()                 # lowercasing
    .apply(lambda x: re.sub(r'[^\w\s]', '', x))  # Removing punctuation
    .apply(lambda x: ' '.join([w for w in x.split() if len(w) > 2])))  # Removing short words

In [14]:
all_genres = pd.Series([genre.strip() for sublist in md['genres'].str.split(',') for genre in sublist]).unique()

In [16]:
# Removing Space between words so that our engine doesn't confuse between Ryan Gosling and Ryan Reynolds

md['genres_clean'] = md['genres'].str.replace(' ', '')
md['directors_clean'] = md['directed_by'].str.replace(' ', '')
md['writers_clean'] = md['written_by'].str.replace(' ', '')
md['cast_clean'] = md['cast'].str.replace(' ', '')

# Replacing commas with a space for TF-IDF

md['genres_clean'] = md['genres_clean'].str.replace(',', ' ')
md['directors_clean'] = md['directors_clean'].str.replace(',', ' ')
md['writers_clean'] = md['writers_clean'].str.replace(',', ' ')
md['cast_clean'] = md['cast_clean'].str.replace(',', ' ')

# Replacing NaNs with empty string

md.fillna({'processed_overview': '', 'genres_clean': '', 'directors_clean': '', 'writers_clean': '', 'cast_clean': ''}, inplace=True)

md[['cast_clean','directors_clean']].head(5)
#md[md['directed_by'].str.contains(',',na=False)].head(10)['directed_by']

Unnamed: 0,cast_clean,directors_clean
0,EetuHilkamo TuroPajala JormaMarkkula HannuKivi...,AkiKaurismäki
1,HaijeAlanoja AkiKaurismäki Jukka-PekkaPalo Esk...,AkiKaurismäki
2,AntonioBanderas SammiDavis KimberlyBlair PaulS...,AllisonAnders QuentinTarantino RobertRodriguez...
3,StephenDorff DeirdreKelly EmilioEstevez Robert...,StephenHopkins
4,HalWamsley RobertA.Denham PamRose TimCondren I...,GeorgeLucas


In [18]:
# Creating a custom transformer for each feature

class ColumnExtractor(BaseEstimator, TransformerMixin):
    def __init__(self, column):
        self.column = column
        
    def fit(self, X, y=None):
        return self
        
    def transform(self, X):
        return X[self.column]

In [22]:
# Define vectorizers

overview_tfidf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), stop_words='english', min_df=0.0,)  # overview
people_tfidf = TfidfVectorizer(analyzer='word', min_df=0.0)  # directors, writers, cast
genres_tfidf = TfidfVectorizer(analyzer='word', min_df=0.0) # genres

In [24]:
# Creating the FeatureUnion pipeline

feature_union = FeatureUnion([
    ('overview', Pipeline([
        ('extractor', ColumnExtractor('processed_overview')),
        ('tfidf', overview_tfidf)
    ])),
    ('genres', Pipeline([
        ('extractor', ColumnExtractor('genres_clean')),
        ('tfidf', genres_tfidf)
    ])),
    ('directors', Pipeline([
        ('extractor', ColumnExtractor('directors_clean')),
        ('tfidf', people_tfidf)
    ])),
    ('writers', Pipeline([
        ('extractor', ColumnExtractor('writers_clean')),
        ('tfidf', people_tfidf)
    ])),
    ('cast', Pipeline([
        ('extractor', ColumnExtractor('cast_clean')),
        ('tfidf', people_tfidf)
    ]))
])

In [26]:
# Generating combined TF-IDF matrix

tfidf_matrix = feature_union.fit_transform(md)

In [28]:
tfidf_matrix.shape

(18790, 717128)

In [30]:
# Computing cosine similarity

cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [32]:
cosine_sim.shape

(18790, 18790)

In [38]:
md = md.reset_index()
titles = md['title']
indices = pd.Series(md.index, index=md['title'])

In [40]:
def get_recommendations(title, cosine_sim=cosine_sim, movies_df=md, top_n=10):
    idx = indices[title]  # Find movie index
    sim_scores = list(enumerate(cosine_sim[idx]))  # Get similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)  # Sort
    sim_scores = sim_scores[1:top_n+1]  # Exclude self & get top matches
    movie_indices = [i[0] for i in sim_scores]
    return movies_df.iloc[movie_indices]  # Return recommendations

In [48]:
get_recommendations('I, Daniel Blake').head(10)

Unnamed: 0,index,imdb_id,tmdb_id,title,original_title,overview,imdb_rating,imdb_votes,genres,cast,...,cinematography,music_by,release_date,original_language,poster_path,processed_overview,genres_clean,directors_clean,writers_clean,cast_clean
15663,15663,tt8359816,522369,Sorry We Missed You,Sorry We Missed You,Ricky and his family have been fighting an uph...,7.6,27052.0,Drama,"Andy Kidd, Lee Hall, Randolph Paul, Christine ...",...,Robbie Ryan,George Fenton,2019-10-04,en,/jNvlqNDnXH8aqBeiBxNNP0wWWO3.jpg,ricky and his family have been fighting uphill...,Drama,KenLoach,PaulLaverty,AndyKidd LeeHall RandolphPaul ChristineBeck Ru...
17931,17931,tt19883634,970348,The Old Oak,The Old Oak,A pub landlord in a previously thriving mining...,7.1,12647.0,Drama,"Chrissie Robinson, Ali Mohamed, Sabah Hassoun,...",...,Robbie Ryan,George Fenton,2023-09-29,en,/xN293BKB0MDqpILmLCtGGYxQXKW.jpg,pub landlord previously thriving mining commun...,Drama,KenLoach,PaulLaverty,ChrissieRobinson AliMohamed SabahHassoun KikaM...
2072,2072,tt0807054,7992,It's a Free World...,It's a Free World...,Angie is a working class woman. After being fi...,7.0,6426.0,Drama,"Kierston Wareing, Frank Gilhooley, Davoud Rast...",...,Nigel Willoughby,George Fenton,2007-09-28,en,/mf21jW15cjoe95yIW1DvlYUN4YO.jpg,angie working class woman after being fired sh...,Drama,KenLoach,PaulLaverty,KierstonWareing FrankGilhooley DavoudRastgou J...
2109,2109,tt0151691,8129,My Name Is Joe,My Name Is Joe,"Two thirtysomethings, unemployed former alcoho...",7.4,9135.0,Drama,"Paul Gillan, Louise Goodall, Ann Marie Laffert...",...,Barry Ackroyd,George Fenton,1998-05-15,en,/e9awu6tsCyWPAUA2eEoelFSTbcb.jpg,two thirtysomethings unemployed former alcohol...,Drama,KenLoach,PaulLaverty,PaulGillan LouiseGoodall AnnMarieLafferty Simo...
5539,5539,tt1242545,18898,Looking for Eric,Looking for Eric,A man trying to put his life back on track get...,7.1,18330.0,"Drama, Comedy","Justin Moorhouse, Stefan Gumbs, Mick Ferry, Ry...",...,Barry Ackroyd,George Fenton,2009-05-27,en,/fTHUY0E1FYMFKiQ8UDKx793LtW0.jpg,man trying put his life back track gets some a...,Drama Comedy,KenLoach,PaulLaverty,JustinMoorhouse StefanGumbs MickFerry RyanPope...
8075,8075,tt0212826,37691,Bread and Roses,Bread and Roses,Maya is a quick-witted young woman who comes o...,7.0,6249.0,"Comedy, Drama","Maria Orellana, Sherman Augustus, Tom Gilroy, ...",...,Barry Ackroyd,George Fenton,2000-09-14,en,/aNk0yx0xRcs66OdByiy2pJaFhI6.jpg,maya quickwitted young woman who comes over th...,Comedy Drama,KenLoach,PaulLaverty,MariaOrellana ShermanAugustus TomGilroy Cooper...
11356,11356,tt1924394,103747,The Angels' Share,The Angels' Share,"Narrowly avoiding jail, new dad Robbie vows to...",7.0,27609.0,"Drama, Comedy","Gary Maitland, Scott Dymond, Caz Dunlop, Finla...",...,"Robbie Ryan, Barry Ackroyd",George Fenton,2012-06-01,en,/bt9kJ8gwTMaDcmL0INlJYIZ887t.jpg,narrowly avoiding jail new dad robbie vows tur...,Drama Comedy,KenLoach,PaulLaverty,GaryMaitland ScottDymond CazDunlop FinlayHarri...
839,839,tt0380366,1404,Ae Fond Kiss...,Ae Fond Kiss...,A young man upsets his Punjabi family when he ...,7.1,7480.0,"Drama, Romance","Ahmad Riaz, Karen Fraser, John Yule, Shabana A...",...,Barry Ackroyd,George Fenton,2004-02-13,en,/tShULKbs8svdBcekljqGPO42PNj.jpg,young man upsets his punjabi family when falls...,Drama Romance,KenLoach,PaulLaverty,AhmadRiaz KarenFraser JohnYule ShabanaAkhtarBa...
823,823,tt0313670,1376,Sweet Sixteen,Sweet Sixteen,Determined to have a normal family life once h...,7.4,16120.0,"Crime, Drama","Junior Walker, William Ruane, Rikki Traynor, T...",...,Barry Ackroyd,George Fenton,2002-09-07,en,/p5h50WEZQ4pRFffnSULqL4cEuLp.jpg,determined have normal family life once his mo...,Crime Drama,KenLoach,PaulLaverty,JuniorWalker WilliamRuane RikkiTraynor TommyMc...
770,770,tt0460989,1116,The Wind That Shakes the Barley,The Wind That Shakes the Barley,In 1920s Ireland young doctor Damien O'Donovan...,7.5,56652.0,"Drama, War","Tim O'Mahon, Colin McClery, Aine O'Connor, Mar...",...,Barry Ackroyd,George Fenton,2006-06-23,en,/9XquDdOGrlC0EAbPoOXALqS2dDh.jpg,1920s ireland young doctor damien odonovan pre...,Drama War,KenLoach,PaulLaverty,TimO'Mahon ColinMcClery AineO'Connor MarkBryce...


### Exporting components

In [51]:
# Saving the TF-IDF vectorizer
with open('../backend/data/tfidf_vectorizer.pkl', 'wb') as f:
    pickle.dump(tfidf_matrix, f)

# Saving the cosine similarity matrix for fast lookups
with open('../backend/data/cosine_sim.pkl', 'wb') as f:
    pickle.dump(cosine_sim, f)