#### Importing relevant libraries

In [2]:
import os
import random
import re
import warnings

import nltk
import numpy as np
import pandas as pd
import requests
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.util import ngrams
from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import linear_kernel
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from wordcloud import WordCloud
from wordcloud import STOPWORDS

warnings.filterwarnings("ignore")
pd.set_option('display.max_colwidth', None)

In [3]:
# Location of netflix_titles.csv. Set the NETFLIX_TITLES_CSV environment variable to
# point at your own copy; the fallback is the path the notebook was originally run with.
NETFLIX_CSV_PATH = os.environ.get(
    "NETFLIX_TITLES_CSV",
    r"C:\Users\lily\Desktop\IT alapok privát\PRIV DATA SC NLP\netflix_titles.csv\netflix_titles.csv",
)
df = pd.read_csv(NETFLIX_CSV_PATH)

In [4]:
df.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmmaker Kirsten Johnson stages his death in inventive and comical ways to help them both face the inevitable."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thabang Molaba, Dillon Windvogel, Natasha Thahane, Arno Greeff, Xolile Tshabalala, Getmore Sithole, Cindy Mahlangu, Ryle De Morny, Greteli Fincham, Sello Maake Ka-Ncube, Odwa Gwanya, Mekaila Mathys, Sandi Schultz, Duane Williams, Shamilla Miller, Patrick Mofokeng",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town teen sets out to prove whether a private-school swimming star is her sister who was abducted at birth."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabiha Akkari, Sofia Lesaffre, Salim Kechiouche, Noureddine Farihi, Geert Van Rampelberg, Bakary Diombera",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Action & Adventure","To protect his family from a powerful drug lord, skilled thief Mehdi and his expert team of robbers are pulled into a violent and deadly turf war."
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down among the incarcerated women at the Orleans Justice Center in New Orleans on this gritty reality series."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam Khan, Ahsaas Channa, Revathi Pillai, Urvi Singh, Arun Kumar",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV Comedies","In a city of coaching centers known to train India’s finest collegiate minds, an earnest but unexceptional student and his friends navigate campus life."


#### Sanity check

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       8807 non-null   object
 1   type          8807 non-null   object
 2   title         8807 non-null   object
 3   director      6173 non-null   object
 4   cast          7982 non-null   object
 5   country       7976 non-null   object
 6   date_added    8797 non-null   object
 7   release_year  8807 non-null   int64 
 8   rating        8803 non-null   object
 9   duration      8804 non-null   object
 10  listed_in     8807 non-null   object
 11  description   8807 non-null   object
dtypes: int64(1), object(11)
memory usage: 825.8+ KB


In [7]:
df.isnull().sum()

show_id            0
type               0
title              0
director        2634
cast             825
country          831
date_added        10
release_year       0
rating             4
duration           3
listed_in          0
description        0
dtype: int64

In [8]:
len(df)

8807

In [9]:
# Drop the columns that the recommenders further down do not use.
df = df.drop(columns=['show_id', 'date_added', 'release_year', 'duration', 'country'])

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   type         8807 non-null   object
 1   title        8807 non-null   object
 2   director     6173 non-null   object
 3   cast         7982 non-null   object
 4   rating       8803 non-null   object
 5   listed_in    8807 non-null   object
 6   description  8807 non-null   object
dtypes: object(7)
memory usage: 481.8+ KB


In [11]:
#df = df.dropna()

In [12]:
len(df)

8807

#### Preprocessing

*Note: the title will be preprocessed once the OMDb API request is done.*

In [15]:
# Shared NLP resources: NLTK's English stop-word list and a WordNet lemmatizer.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

text_columns = ['description', 'listed_in']

# 1) Lower-case the raw text in place.
for name in text_columns:
    df[name] = df[name].str.lower()

# 2) Keep only ASCII letters and whitespace in a new `<column>_cl` column.
for name in text_columns:
    df[f'{name}_cl'] = df[name].map(lambda text: re.sub(r'[^a-zA-Z\s]', '', text))

# 3) Split on whitespace, drop stop words, then lemmatize what is left.
#    The three passes are kept separate so the new columns are created in the same
#    order as before (both *_cl columns first, then both *_tokens columns).
for name in text_columns:
    df[f'{name}_tokens'] = df[f'{name}_cl'].str.split().map(
        lambda words: [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    )


In [16]:
df.columns

Index(['type', 'title', 'director', 'cast', 'rating', 'listed_in',
       'description', 'description_cl', 'listed_in_cl', 'description_tokens',
       'listed_in_tokens'],
      dtype='object')

In [17]:
# The raw and cleaned text columns are no longer needed once the *_tokens columns exist.
df = df.drop(columns=['description','description_cl', 'listed_in', 'listed_in_cl'])

In [18]:
df['type'].value_counts()

Movie      6131
TV Show    2676
Name: type, dtype: int64

In [19]:
df['rating'].value_counts()

TV-MA       3207
TV-14       2160
TV-PG        863
R            799
PG-13        490
TV-Y7        334
TV-Y         307
PG           287
TV-G         220
NR            80
G             41
TV-Y7-FV       6
NC-17          3
UR             3
74 min         1
84 min         1
66 min         1
Name: rating, dtype: int64

Encoding the df['type']

In [21]:
# One-hot encode `type`, dropping the first category so that a single indicator
# column remains.
# The encoder's default sparse result is densified with .toarray() instead of passing
# `sparse=False`: that keyword was renamed to `sparse_output` in scikit-learn 1.2 and
# removed afterwards, so this form runs on both older and newer releases.
encoder = OneHotEncoder(drop='first')
type_encoded = encoder.fit_transform(df[['type']]).toarray()

# Reuse df's index so the join below lines up row-for-row even if df no longer has a
# default 0..n-1 index (for example after dropping rows).
type_encoded_df = pd.DataFrame(
    type_encoded,
    columns=encoder.get_feature_names_out(['type']),
    index=df.index,
)
df = df.drop(columns=['type']).join(type_encoded_df)

In [22]:
df.head()

Unnamed: 0,title,director,cast,rating,description_tokens,listed_in_tokens,type_TV Show
0,Dick Johnson Is Dead,Kirsten Johnson,,PG-13,"[father, nears, end, life, filmmaker, kirsten, johnson, stage, death, inventive, comical, way, help, face, inevitable]",[documentary],0.0
1,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thabang Molaba, Dillon Windvogel, Natasha Thahane, Arno Greeff, Xolile Tshabalala, Getmore Sithole, Cindy Mahlangu, Ryle De Morny, Greteli Fincham, Sello Maake Ka-Ncube, Odwa Gwanya, Mekaila Mathys, Sandi Schultz, Duane Williams, Shamilla Miller, Patrick Mofokeng",TV-MA,"[crossing, path, party, cape, town, teen, set, prove, whether, privateschool, swimming, star, sister, abducted, birth]","[international, tv, show, tv, drama, tv, mystery]",1.0
2,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabiha Akkari, Sofia Lesaffre, Salim Kechiouche, Noureddine Farihi, Geert Van Rampelberg, Bakary Diombera",TV-MA,"[protect, family, powerful, drug, lord, skilled, thief, mehdi, expert, team, robber, pulled, violent, deadly, turf, war]","[crime, tv, show, international, tv, show, tv, action, adventure]",1.0
3,Jailbirds New Orleans,,,TV-MA,"[feud, flirtation, toilet, talk, go, among, incarcerated, woman, orleans, justice, center, new, orleans, gritty, reality, series]","[docuseries, reality, tv]",1.0
4,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam Khan, Ahsaas Channa, Revathi Pillai, Urvi Singh, Arun Kumar",TV-MA,"[city, coaching, center, known, train, india, finest, collegiate, mind, earnest, unexceptional, student, friend, navigate, campus, life]","[international, tv, show, romantic, tv, show, tv, comedy]",1.0


#### Vectorization & similarity

##### Recommendation 1 // using df['description_tokens']

In [25]:
def _as_document(tokens):
    """Turn one cell of `description_tokens` into a single text document."""
    if isinstance(tokens, str):
        # Already joined (the cell was run before): joining again would put a space
        # between every character, so leave it untouched.
        return tokens
    if isinstance(tokens, (list, tuple)):
        return ' '.join(tokens)
    # Missing values (NaN / None) become an empty document.
    return ''


# TfidfVectorizer expects one string per document, so the token lists are joined
# back into space-separated text.
df['description_tokens'] = df['description_tokens'].apply(_as_document)

tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df['description_tokens'])

tfidf_matrix.shape

(8807, 17894)

In [26]:
# Pairwise similarity between every pair of descriptions.
# TfidfVectorizer L2-normalises each row by default, so the plain dot product that
# linear_kernel computes is the cosine similarity.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [27]:
# Lookup table: title -> row label of df (df has a default RangeIndex here, so the
# label is also the row/column position inside the similarity matrices).
indices = pd.Series(df.index, index=df['title'])
# Series.drop_duplicates() compares the *values* (the row labels, which are already
# unique), so it never removed repeated titles. De-duplicate on the index instead,
# keeping the first occurrence, so that indices[title] is always a single integer.
indices = indices[~indices.index.duplicated(keep='first')]

In [28]:
def get_recommendations(title, cosine_sim=cosine_sim, top_n=10):
    """Return the titles most similar to ``title`` according to ``cosine_sim``.

    Parameters
    ----------
    title : str
        Exact title to look up in the module-level ``indices`` Series.
    cosine_sim : array-like, optional
        Square similarity matrix whose rows/columns follow the row order of ``df``.
        Defaults to the description-based ``cosine_sim`` matrix.
    top_n : int, optional
        Maximum number of recommendations to return (default 10).

    Returns
    -------
    pandas.Series
        Up to ``top_n`` entries of ``df['title']``, most similar first.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    """
    idx = indices[title]
    # A title that occurs more than once yields a Series of row numbers; use the first.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]

    scores = np.asarray(cosine_sim[idx])
    # Stable descending sort: rows with equal scores keep their original order.
    ranked = np.argsort(-scores, kind='stable')
    # Remove the query row explicitly instead of assuming it is ranked first; that
    # assumption fails when another row ties with it (e.g. an identical description)
    # or when the query's own vector is empty.
    movie_indices = [i for i in ranked if i != idx][:top_n]

    return df['title'].iloc[movie_indices]

In [29]:
get_recommendations('Catch Me If You Can')

7148                            Julius Jr.
5644               Rampage: President Down
188                       2 Alone in Paris
3227                          The Irishman
7865                    Revolutionary Road
142                        Freedom Writers
8739    Why We Fight: The Battle of Russia
2214        Sinatra: All or Nothing at All
4352                    Angela's Christmas
5854                       Frank and Cindy
Name: title, dtype: object

In [30]:
cosine_sim 

array([[1.        , 0.        , 0.        , ..., 0.        , 0.01445954,
        0.03846602],
       [0.        , 1.        , 0.        , ..., 0.04238867, 0.        ,
        0.        ],
       [0.        , 0.        , 1.        , ..., 0.03181666, 0.        ,
        0.02181876],
       ...,
       [0.        , 0.04238867, 0.03181666, ..., 1.        , 0.        ,
        0.        ],
       [0.01445954, 0.        , 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.03846602, 0.        , 0.02181876, ..., 0.        , 0.        ,
        1.        ]])

#### Getting more data from omdb

In [32]:
# OMDb API configuration (`requests` and `pandas` are imported in the first cell).
# The key is read from the OMDB_API_KEY environment variable so it does not have to
# live in the notebook.
# FIXME: the literal fallback is the key that was saved in this notebook; rotate it
# and remove the fallback before sharing.
API_KEY = os.environ.get('OMDB_API_KEY', '4c81a644')
# HTTPS so the API key in the query string is not sent in clear text.
base_url = 'https://www.omdbapi.com/'


In [33]:
def get_movie_data(title, timeout=10):
    """Look up a single title in the OMDb API.

    Parameters
    ----------
    title : str
        Title sent as the ``t`` query parameter.
    timeout : float, optional
        Seconds to wait for the HTTP response (default 10). Without a timeout a
        stalled connection would block the whole enrichment loop.

    Returns
    -------
    dict or None
        A dict with the keys ``Title``, ``Year``, ``Rated``, ``Released``, ``Genre``,
        ``Director``, ``Actors``, ``Plot``, ``imdbRating`` and ``imdbID`` when the
        API answers with ``Response == 'True'``. ``None`` when the title is not
        found, the request fails, or the reply is not the expected JSON object.
    """
    params = {'t': title, 'apikey': API_KEY}

    try:
        response = requests.get(base_url, params=params, timeout=timeout)
        response.raise_for_status()
        movie_data = response.json()
    except (requests.RequestException, ValueError):
        # Network problem, HTTP error status or a non-JSON body: treat it as
        # "no data" so that one bad lookup does not abort a long batch run.
        return None

    # .get() instead of ['Response']: an unexpected payload must not raise KeyError.
    if not isinstance(movie_data, dict) or movie_data.get('Response') != 'True':
        return None

    wanted = (
        'Title', 'Year', 'Rated', 'Released', 'Genre',
        'Director', 'Actors', 'Plot', 'imdbRating', 'imdbID',
    )
    return {field: movie_data.get(field) for field in wanted}

In [34]:
def enrich_dataset_with_imdb_data(df):
    """Append OMDb fields to every row of ``df``.

    Each value of ``df['title']`` is looked up with ``get_movie_data``. Titles that
    are not found get a placeholder record holding the title and ``None`` for every
    other field.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``title`` column.

    Returns
    -------
    pandas.DataFrame
        ``df`` with the columns ``Title``, ``Year``, ``Rated``, ``Released``,
        ``Genre``, ``Director``, ``Actors``, ``Plot``, ``imdbRating`` and ``imdbID``
        added on the right, one row per input row.
    """
    blank_record = dict.fromkeys((
        'Title', 'Year', 'Rated', 'Released', 'Genre',
        'Director', 'Actors', 'Plot', 'imdbRating', 'imdbID',
    ))

    # Each distinct title is requested only once; repeated titles reuse the answer.
    lookups = {}
    movie_data_list = []
    for title in df['title']:
        if title not in lookups:
            lookups[title] = get_movie_data(title)
        movie_data = lookups[title]
        movie_data_list.append(movie_data if movie_data else {**blank_record, 'Title': title})

    # Build the OMDb frame on df's own index: concat(axis=1) aligns on index labels,
    # so a default 0..n-1 index would misalign (and add NaN rows) whenever df has been
    # filtered or re-indexed.
    movie_data_df = pd.DataFrame(movie_data_list, index=df.index)
    enriched_df = pd.concat([df, movie_data_df], axis=1)
    return enriched_df

In [None]:
# Slow cell: this issues OMDb HTTP requests for the titles in df.
enriched_df = enrich_dataset_with_imdb_data(df)

In [None]:
enriched_df.head()

In [None]:
enriched_df.info()

*Comment: Unfortunately, only 896 matches were found, but "Catch Me If You Can" was one of them.*

In [None]:
# These OMDb fields are not used by the recommenders below.
enriched_df = enriched_df.drop(columns=['imdbID', 'Rated', 'Released'])

In [None]:
# Stop-word set and lemmatizer read (as globals) by clean_and_process_column below.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# OMDb text columns that get a cleaned (`_cl`) and a tokenised (`_tokens`) companion column.
columns_to_process = ['Title', 'Genre', 'Director', 'Actors', 'Plot']

def clean_and_process_column(col):
    """Normalise one text column of the global ``enriched_df`` in place.

    ``enriched_df[col]`` is replaced by its string form with missing values as ``''``.
    Two columns are added: ``<col>_cl`` (lower-cased, only ASCII letters and
    whitespace kept) and ``<col>_tokens`` (whitespace-split words of ``<col>_cl``
    without stop words, each passed through the lemmatizer).

    Relies on the module-level ``enriched_df``, ``stop_words`` and ``lemmatizer``.

    Returns
    -------
    pandas.DataFrame
        The same (mutated) ``enriched_df`` object.
    """
    cleaned_name = f'{col}_cl'
    tokens_name = f'{col}_tokens'

    def _tokenize(cleaned_text):
        # Stop words are filtered on the raw word, before lemmatization.
        return [
            lemmatizer.lemmatize(word)
            for word in cleaned_text.split()
            if word not in stop_words
        ]

    raw_text = enriched_df[col].fillna('').astype(str)
    enriched_df[col] = raw_text
    enriched_df[cleaned_name] = raw_text.str.lower().map(
        lambda text: re.sub(r'[^a-zA-Z\s]', '', text)
    )
    enriched_df[tokens_name] = enriched_df[cleaned_name].map(_tokenize)
    return enriched_df

# Each call adds `<col>_cl` and `<col>_tokens` columns to enriched_df.
for col in columns_to_process:
    enriched_df = clean_and_process_column(col)

##### Recommendation 2 // using enriched_df['description_tokens']

In [None]:
# NOTE(review): 'description_tokens' is the same Netflix description text that
# Recommendation 1 was fitted on, so this matrix presumably equals tfidf_matrix.
# If the OMDb plot was meant, it is in 'Plot_tokens' (token lists) - confirm.
tfidf02 = TfidfVectorizer(stop_words='english')
tfidf_matrix02 = tfidf02.fit_transform(enriched_df['description_tokens'])

In [None]:
# Pairwise similarity of all rows of tfidf_matrix02.
cosine_sim02 = linear_kernel(tfidf_matrix02, tfidf_matrix02)

In [None]:
def get_recommendations02(title, cosine_sim=cosine_sim02, top_n=10):
    """Return the titles most similar to ``title`` according to ``cosine_sim02``.

    Parameters
    ----------
    title : str
        Exact title to look up in the module-level ``indices`` Series.
    cosine_sim : array-like, optional
        Square similarity matrix whose rows/columns follow the row order of
        ``enriched_df``. Defaults to ``cosine_sim02``; the previous default was the
        Recommendation 1 matrix, so ``cosine_sim02`` was never actually used.
    top_n : int, optional
        Maximum number of recommendations to return (default 10).

    Returns
    -------
    pandas.Series
        Up to ``top_n`` entries of ``enriched_df['title']``, most similar first.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    """
    idx = indices[title]
    # A title that occurs more than once yields a Series of row numbers; use the first.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]

    scores = np.asarray(cosine_sim[idx])
    # Stable descending sort: rows with equal scores keep their original order.
    ranked = np.argsort(-scores, kind='stable')
    # Remove the query row explicitly instead of assuming it is ranked first.
    movie_indices = [i for i in ranked if i != idx][:top_n]

    return enriched_df['title'].iloc[movie_indices]

In [None]:
get_recommendations02('Catch Me If You Can')

#### Using more columns as a basis

##### Recommendation 3 // using enriched_df['combined_features'] 

In [None]:
# Replace missing values with '' in every column that feeds the combined text,
# so that the string concatenation in the next cell does not produce NaN.
for name in ['description_tokens', 'rating', 'Director', 'Actors']:
    enriched_df[name] = enriched_df[name].fillna('')

In [None]:
# One space-separated text per row: description, rating, director and actors.
# NOTE(review): the `rating` value counts above include entries such as '74 min',
# which look like misplaced durations - confirm whether they should be cleaned first.
enriched_df['combined_features'] = (
    enriched_df['description_tokens']
    + ' ' + enriched_df['rating']
    + ' ' + enriched_df['Director']
    + ' ' + enriched_df['Actors']
)

In [None]:
# TF-IDF over the combined description / rating / director / actors text.
tfidf03 = TfidfVectorizer(stop_words='english')
tfidf_matrix03 = tfidf03.fit_transform(enriched_df['combined_features'])

In [None]:
# Pairwise similarity of all rows of tfidf_matrix03.
cosine_sim03 = linear_kernel(tfidf_matrix03, tfidf_matrix03)

In [None]:
def get_recommendations03(title, cosine_sim03=cosine_sim03, top_n=10):
    """Return the titles most similar to ``title`` according to ``cosine_sim03``.

    Parameters
    ----------
    title : str
        Exact title to look up in the module-level ``indices`` Series.
    cosine_sim03 : array-like, optional
        Square similarity matrix whose rows/columns follow the row order of
        ``enriched_df``. Defaults to the combined-features matrix ``cosine_sim03``.
    top_n : int, optional
        Maximum number of recommendations to return (default 10).

    Returns
    -------
    pandas.Series
        Up to ``top_n`` entries of ``enriched_df['title']``, most similar first.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    """
    idx = indices[title]
    # A title that occurs more than once yields a Series of row numbers; use the first.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]

    # Read the matrix that was passed in. The body used to read the global
    # `cosine_sim` (Recommendation 1), which silently ignored this parameter.
    scores = np.asarray(cosine_sim03[idx])
    # Stable descending sort: rows with equal scores keep their original order.
    ranked = np.argsort(-scores, kind='stable')
    # Remove the query row explicitly instead of assuming it is ranked first.
    movie_indices = [i for i in ranked if i != idx][:top_n]

    return enriched_df['title'].iloc[movie_indices]

In [None]:
get_recommendations03('Catch Me If You Can')

**Since the results do not seem very satisfying, the following techniques could be considered:**

- **Weighted Similarity**: Assigning weights to different features e.g.: more importance to genres and directors.
- **Latent Semantic Analysis (LSA)**: Reduce the dimensionality of the feature space using SVD (Singular Value Decomposition) for better similarity computation.
- **Collaborative Filtering**: Matrix factorization, e.g. Singular Value Decomposition (SVD) or Alternating Least Squares (ALS), to find latent factors for users and movies.
- **Deep Learning**: Deep neural networks (e.g., autoencoders or embeddings from recommendation models like Neural Collaborative Filtering (NCF)).