In [2]:
import nltk
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

from rake_nltk import Rake
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tag import pos_tag
import string

In [3]:
DATASET_PATH = './IMDBdata_MainData.csv'

df = pd.read_csv(DATASET_PATH)
df.head()

Unnamed: 0,Title,Year,Rated,Released,Runtime,Genre,Director,Writer,Actors,Plot,...,imdbRating,imdbVotes,imdbID,Type,DVD,BoxOffice,Production,Website,Response,tomatoURL
0,Code Name: K.O.Z.,2015,NOT RATED,13 Feb 2015,114 min,"Crime, Mystery",Celal Çimen,,"Cem Kurtoglu, Hakan Ural, Hazim Körmükçü, Tolg...",A look at the 17-25 December 2013 corruption s...,...,1.6,24600,tt4458206,movie,,,,,True,
1,Saving Christmas,2014,PG,14 Nov 2014,80 min,"Comedy, Family",Darren Doane,"Darren Doane, Cheston Hervey","Kirk Cameron, Darren Doane, Bridgette Cameron,...",Kirk is enjoying the annual Christmas party ex...,...,1.6,12686,tt4009460,movie,03 Nov 2015,"$2,778,297",IPD/Samuel Goldwyn Films,http://www.savingchristmas.com/,True,http://www.rottentomatoes.com/m/kirk_camerons_...
2,Superbabies: Baby Geniuses 2,2004,PG,27 Aug 2004,88 min,"Comedy, Family, Sci-Fi",Bob Clark,"Steven Paul (story), Gregory Poppen (screenplay)","Jon Voight, Scott Baio, Vanessa Angel, Skyler ...",A group of smart-talking toddlers find themsel...,...,2.0,26376,tt0270846,movie,04 Jan 2005,"$9,016,422",Sony,http://www.sonypictures.com/movies/superbabies...,True,http://www.rottentomatoes.com/m/super_babies_b...
3,Daniel der Zauberer,2004,NOT RATED,12 Aug 2004,81 min,"Comedy, Crime, Fantasy",Ulli Lommel,Ulli Lommel (screenplay),"Daniel Küblböck, Ulli Lommel, Rudolf Waldemar ...","Evil assassins want to kill Daniel Kublbock, t...",...,1.9,13060,tt0421051,movie,,,,,True,http://www.rottentomatoes.com/m/daniel_der_zau...
4,Manos: The Hands of Fate,1966,NOT RATED,15 Nov 1966,70 min,Horror,Harold P. Warren,Harold P. Warren (screenplay),"Tom Neyman, John Reynolds, Diane Adelson, Haro...",A family gets lost on the road and stumbles up...,...,1.9,31768,tt0060666,movie,07 Oct 2003,,Sinister Cinema,,True,http://www.rottentomatoes.com/m/manos-the-hand...


In [4]:
df = df[['Title', 'Genre', 'Director', 'Actors', 'Plot']]
df.head()

Unnamed: 0,Title,Genre,Director,Actors,Plot
0,Code Name: K.O.Z.,"Crime, Mystery",Celal Çimen,"Cem Kurtoglu, Hakan Ural, Hazim Körmükçü, Tolg...",A look at the 17-25 December 2013 corruption s...
1,Saving Christmas,"Comedy, Family",Darren Doane,"Kirk Cameron, Darren Doane, Bridgette Cameron,...",Kirk is enjoying the annual Christmas party ex...
2,Superbabies: Baby Geniuses 2,"Comedy, Family, Sci-Fi",Bob Clark,"Jon Voight, Scott Baio, Vanessa Angel, Skyler ...",A group of smart-talking toddlers find themsel...
3,Daniel der Zauberer,"Comedy, Crime, Fantasy",Ulli Lommel,"Daniel Küblböck, Ulli Lommel, Rudolf Waldemar ...","Evil assassins want to kill Daniel Kublbock, t..."
4,Manos: The Hands of Fate,Horror,Harold P. Warren,"Tom Neyman, John Reynolds, Diane Adelson, Haro...",A family gets lost on the road and stumbles up...


In [5]:
df = df.dropna()
df.isnull().sum()

Title       0
Genre       0
Director    0
Actors      0
Plot        0
dtype: int64

In [6]:
df.describe().T

Unnamed: 0,count,unique,top,freq
Title,5254,5242,Out of the Blue,2
Genre,5254,470,Drama,244
Director,5254,2708,Steven Spielberg,26
Actors,5254,5215,"William Shatner, Leonard Nimoy, DeForest Kelle...",6
Plot,5254,5254,A look at the 17-25 December 2013 corruption s...,1


In [7]:
eng_stopwords = stopwords.words('english')
punctuation_list = string.punctuation
wordnet_lemmatizer = WordNetLemmatizer()

def remove_stopwords(w_list):
    return [word for word in w_list if word not in eng_stopwords]

def remove_punctuation(w_list):
    return [word for word in w_list if word not in punctuation_list]

def get_pos_tag(tag):

    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

def lemmatize_text(w_list):

    lemmatized = []
    tags = pos_tag(w_list)

    for word, tag in tags:

        lemmatize_tag = get_pos_tag(tag)

        lemmatized.append(wordnet_lemmatizer.lemmatize(word, lemmatize_tag))


    lemmatized_string = ' '.join(lemmatized)
    
    return lemmatized_string

In [8]:
def preprocess(w_list):
    
    w_list = word_tokenize(w_list)

    w_list = remove_stopwords(w_list)
    w_list = remove_punctuation(w_list)
    w_list = lemmatize_text(w_list)

    return w_list

In [9]:
df['Preprocessed Plot'] = df['Plot'].apply(preprocess)

df.head()

Unnamed: 0,Title,Genre,Director,Actors,Plot,Preprocessed Plot
0,Code Name: K.O.Z.,"Crime, Mystery",Celal Çimen,"Cem Kurtoglu, Hakan Ural, Hazim Körmükçü, Tolg...",A look at the 17-25 December 2013 corruption s...,A look 17-25 December 2013 corruption scandal ...
1,Saving Christmas,"Comedy, Family",Darren Doane,"Kirk Cameron, Darren Doane, Bridgette Cameron,...",Kirk is enjoying the annual Christmas party ex...,Kirk enjoy annual Christmas party extravaganza...
2,Superbabies: Baby Geniuses 2,"Comedy, Family, Sci-Fi",Bob Clark,"Jon Voight, Scott Baio, Vanessa Angel, Skyler ...",A group of smart-talking toddlers find themsel...,A group smart-talking toddler find center medi...
3,Daniel der Zauberer,"Comedy, Crime, Fantasy",Ulli Lommel,"Daniel Küblböck, Ulli Lommel, Rudolf Waldemar ...","Evil assassins want to kill Daniel Kublbock, t...",Evil assassins want kill Daniel Kublbock third...
4,Manos: The Hands of Fate,Horror,Harold P. Warren,"Tom Neyman, John Reynolds, Diane Adelson, Haro...",A family gets lost on the road and stumbles up...,A family get lose road stumble upon hidden und...


In [10]:
rake = Rake()

df['Plot_key_words'] = ''

for index, row in df.iterrows():
    
    rake.extract_keywords_from_text(row['Preprocessed Plot'])
    key_word_dist_scores = rake.get_word_degrees()
    row['Plot_key_words'] = list(key_word_dist_scores.keys())

print(df['Plot_key_words'])


0       [look, 17, 25, december, 2013, corruption, sca...
1       [kirk, enjoy, annual, christmas, party, extrav...
2       [group, smart, talking, toddler, find, center,...
3       [evil, assassins, want, kill, daniel, kublbock...
4       [family, get, lose, road, stumble, upon, hidde...
                              ...                        
5267    [write, term, paper, woman, get, involve, myst...
5269    [two, hour, backdoor, pilot, tv, series, name,...
5270    [new, york, attorney, send, shanghai, business...
5271                       [surreal, take, zombie, genre]
5272    [``, n, take, risk, waste, soul, '', drew, bar...
Name: Plot_key_words, Length: 5254, dtype: object


In [11]:
df['Genre'] = df['Genre'].map(lambda x: x.split(','))
df['Actors'] = df['Actors'].map(lambda x: x.split(',')[:3])
df['Director'] = df['Director'].map(lambda x: x.split(','))

In [12]:
df['Genre']

0                 [Crime,  Mystery]
1                 [Comedy,  Family]
2        [Comedy,  Family,  Sci-Fi]
3        [Comedy,  Crime,  Fantasy]
4                          [Horror]
                   ...             
5267                [Drama,  Crime]
5269               [Comedy,  Drama]
5270     [Comedy,  Drama,  Romance]
5271    [Drama,  Horror,  Thriller]
5272                  [Documentary]
Name: Genre, Length: 5254, dtype: object

In [13]:
for _, row in df.iterrows():
    row['Genre'] = [word.lower().replace(' ','') for word in row['Genre']]
    row['Actors'] = [word.lower().replace(' ','') for word in row['Actors']]
    row['Director'] = [word.lower().replace(' ','') for word in row['Director']]

In [14]:
df.head()

Unnamed: 0,Title,Genre,Director,Actors,Plot,Preprocessed Plot,Plot_key_words
0,Code Name: K.O.Z.,"[crime, mystery]",[celalçimen],"[cemkurtoglu, hakanural, hazimkörmükçü]",A look at the 17-25 December 2013 corruption s...,A look 17-25 December 2013 corruption scandal ...,"[look, 17, 25, december, 2013, corruption, sca..."
1,Saving Christmas,"[comedy, family]",[darrendoane],"[kirkcameron, darrendoane, bridgettecameron]",Kirk is enjoying the annual Christmas party ex...,Kirk enjoy annual Christmas party extravaganza...,"[kirk, enjoy, annual, christmas, party, extrav..."
2,Superbabies: Baby Geniuses 2,"[comedy, family, sci-fi]",[bobclark],"[jonvoight, scottbaio, vanessaangel]",A group of smart-talking toddlers find themsel...,A group smart-talking toddler find center medi...,"[group, smart, talking, toddler, find, center,..."
3,Daniel der Zauberer,"[comedy, crime, fantasy]",[ullilommel],"[danielküblböck, ullilommel, rudolfwaldemarbrem]","Evil assassins want to kill Daniel Kublbock, t...",Evil assassins want kill Daniel Kublbock third...,"[evil, assassins, want, kill, daniel, kublbock..."
4,Manos: The Hands of Fate,[horror],[haroldp.warren],"[tomneyman, johnreynolds, dianeadelson]",A family gets lost on the road and stumbles up...,A family get lose road stumble upon hidden und...,"[family, get, lose, road, stumble, upon, hidde..."


In [15]:
df['Bag_of_words'] = ''

cols = ['Genre', 'Director', 'Actors', 'Plot_key_words']

for _, row in df.iterrows():

    words = ''

    for col in cols:
        
        words += ' '.join(row[col]) + ' '
        
    row['Bag_of_words'] = words

df = df[['Title', 'Bag_of_words']]

In [16]:
count_vectorizer = CountVectorizer()
count_matrix = count_vectorizer.fit_transform(df['Bag_of_words'])
cosine_sim = cosine_similarity(count_matrix, count_matrix)

print(cosine_sim)

[[1.         0.         0.         ... 0.         0.         0.        ]
 [0.         1.         0.07692308 ... 0.07692308 0.         0.        ]
 [0.         0.07692308 1.         ... 0.07692308 0.         0.        ]
 ...
 [0.         0.07692308 0.07692308 ... 1.         0.05661385 0.        ]
 [0.         0.         0.         ... 0.05661385 1.         0.06019293]
 [0.         0.         0.         ... 0.         0.06019293 1.        ]]


In [17]:
indices = pd.Series(df['Title'])

def recommend_movies(title, recommend_num= 10 , cosine_sim= cosine_sim):

    recommended_movies = []

    try:

        idx = indices[indices == title].index[0]
        series_score = pd.Series(cosine_sim[idx]).sort_values(ascending= False)
        top_recommended = list(series_score.iloc[1: recommend_num].index)

        for i in top_recommended:
            recommended_movies.append(list(df['Title'])[i])

      
        
        return recommended_movies

    except:
        print('No movie recommendations...')
        return None 


In [20]:
import json

def load_search_history(filename='search_history.json'):
    try:
        with open(filename, 'r') as file:
            return json.load(file)
    except FileNotFoundError:
        return []

def save_search_history(history, filename='search_history.json'):
    with open(filename, 'w') as file:
        json.dump(history, file)

def main_menu():
    search_history = load_search_history()
    
    while True:
        print("\n--- Movie Recommendation Menu ---")
        print("1. Search for a movie")
        print("2. View search history")
        print("3. Exit")
        
        choice = input("Please choose an option (1/2/3): ")
        
        if choice == '1':
            search_title = input("Enter the movie title to search: ")
            recommended_movies = recommend_movies(title=search_title)
            
            if recommended_movies:
                print(f'\nRecommended Movies Similar to "{search_title}"')

                search_history.append(search_title)
                save_search_history(search_history)

                for idx, movie in enumerate(recommended_movies):
                    print(f'{idx + 1}. {movie}')

            else:
                print(f'\nNo recommendations found for "{search_title}"')
            
        
        elif choice == '2':
            if search_history:
                print("\nSearch History:")
                for idx, title in enumerate(search_history):
                    print(f'{idx + 1}. {title}')
            else:
                print("\nNo search history available.")
        
        elif choice == '3':
            print("Exiting the menu. Goodbye!")
            break
        
        else:
            print("Invalid option, please choose again.")

main_menu()



--- Movie Recommendation Menu ---
1. Search for a movie
2. View search history
3. Exit
No movie recommendations...

No recommendations found for "1"

--- Movie Recommendation Menu ---
1. Search for a movie
2. View search history
3. Exit
Invalid option, please choose again.

--- Movie Recommendation Menu ---
1. Search for a movie
2. View search history
3. Exit
No movie recommendations...

No recommendations found for "Naruto"

--- Movie Recommendation Menu ---
1. Search for a movie
2. View search history
3. Exit

Search History:
1. The Avengers
2. Superman

--- Movie Recommendation Menu ---
1. Search for a movie
2. View search history
3. Exit

Recommended Movies Similar to "The Avengers"
1. Avengers: Age of Ultron
2. Universal Soldier: The Return
3. Battlefield Earth
4. The X Files
5. Independence Day
6. Guardians of the Galaxy Vol. 2
7. Prince of Space
8. Logan's Run
9. Batman & Robin

--- Movie Recommendation Menu ---
1. Search for a movie
2. View search history
3. Exit
Invalid optio