# Import Libraries

In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
import re

# Describe The Data

In [2]:
# Read the movie list from CSV and restrict it to seven named columns,
# in the order listed here.
df = pd.read_csv('data/list_movies.csv').loc[
    :, ['Title', 'Genre', 'Director', 'Writer', 'Actors', 'Released', 'Plot']
]
df.head()

Unnamed: 0,Title,Genre,Director,Writer,Actors,Released,Plot
0,The Shawshank Redemption,"Crime, Drama",Frank Darabont,"Stephen King (short story ""Rita Hayworth and S...","Tim Robbins, Morgan Freeman, Bob Gunton, Willi...",14 Oct 1994,Two imprisoned men bond over a number of years...
1,The Godfather,"Crime, Drama",Francis Ford Coppola,"Mario Puzo (screenplay), Francis Ford Coppola ...","Marlon Brando, Al Pacino, James Caan, Richard ...",24 Mar 1972,The aging patriarch of an organized crime dyna...
2,The Godfather: Part II,"Crime, Drama",Francis Ford Coppola,"Francis Ford Coppola (screenplay), Mario Puzo ...","Al Pacino, Robert Duvall, Diane Keaton, Robert...",20 Dec 1974,The early life and career of Vito Corleone in ...
3,The Dark Knight,"Action, Crime, Drama",Christopher Nolan,"Jonathan Nolan (screenplay), Christopher Nolan...","Christian Bale, Heath Ledger, Aaron Eckhart, M...",18 Jul 2008,When the menace known as the Joker emerges fro...
4,12 Angry Men,"Crime, Drama",Sidney Lumet,"Reginald Rose (story), Reginald Rose (screenplay)","Martin Balsam, John Fiedler, Lee J. Cobb, E.G....",01 Apr 1957,A jury holdout attempts to prevent a miscarria...


In [3]:
# (rows, columns) of the frame after column selection.
df.shape 

(250, 7)

In [4]:
# Clean the frame without mutating it in place, so re-running this cell is safe:
#   1. keep only rows that have at least 5 non-missing values,
#   2. forward-fill the remaining gaps from the row above,
#   3. drop fully duplicated rows.
# `.ffill()` replaces `fillna(method='ffill')`, whose `method` argument is
# deprecated in current pandas releases; the result is the same.
# NOTE(review): forward-filling copies the previous movie's text (e.g. its plot
# or director) into the gap, and leaves a gap in the very first row unfilled.
# Filling with an empty string may be the better choice - confirm intent.
df = (
    df
    .dropna(thresh=5)
    .ffill()
    .drop_duplicates()
)

# Training and Modeling

In [5]:
# Build one space-separated text field per movie from genre, director,
# actors and plot. A missing value in any part yields a missing result.
df['rangkuman'] = df['Genre'].str.cat(
    [df['Director'], df['Actors'], df['Plot']],
    sep=' ',
)

In [6]:
# Preview the frame, now including the combined 'rangkuman' column.
df.head()

Unnamed: 0,Title,Genre,Director,Writer,Actors,Released,Plot,rangkuman
0,The Shawshank Redemption,"Crime, Drama",Frank Darabont,"Stephen King (short story ""Rita Hayworth and S...","Tim Robbins, Morgan Freeman, Bob Gunton, Willi...",14 Oct 1994,Two imprisoned men bond over a number of years...,"Crime, Drama Frank Darabont Tim Robbins, Morga..."
1,The Godfather,"Crime, Drama",Francis Ford Coppola,"Mario Puzo (screenplay), Francis Ford Coppola ...","Marlon Brando, Al Pacino, James Caan, Richard ...",24 Mar 1972,The aging patriarch of an organized crime dyna...,"Crime, Drama Francis Ford Coppola Marlon Brand..."
2,The Godfather: Part II,"Crime, Drama",Francis Ford Coppola,"Francis Ford Coppola (screenplay), Mario Puzo ...","Al Pacino, Robert Duvall, Diane Keaton, Robert...",20 Dec 1974,The early life and career of Vito Corleone in ...,"Crime, Drama Francis Ford Coppola Al Pacino, R..."
3,The Dark Knight,"Action, Crime, Drama",Christopher Nolan,"Jonathan Nolan (screenplay), Christopher Nolan...","Christian Bale, Heath Ledger, Aaron Eckhart, M...",18 Jul 2008,When the menace known as the Joker emerges fro...,"Action, Crime, Drama Christopher Nolan Christi..."
4,12 Angry Men,"Crime, Drama",Sidney Lumet,"Reginald Rose (story), Reginald Rose (screenplay)","Martin Balsam, John Fiedler, Lee J. Cobb, E.G....",01 Apr 1957,A jury holdout attempts to prevent a miscarria...,"Crime, Drama Sidney Lumet Martin Balsam, John ..."


In [7]:
# Separator punctuation: / ( ) { } [ ] | @ , ;
# Written as a raw string so the regex backslashes are not parsed as
# (invalid) Python string escapes, which triggers a warning on modern Python.
clean_spcl = re.compile(r'[/(){}\[\]\|@,;]')
# Every character that is NOT a digit, a lowercase ASCII letter, a space,
# '#', '+' or '_'.
clean_symbol = re.compile(r'[^0-9a-z #+_]')
# English stop words from the NLTK corpus, held in a set for fast membership
# tests.
stopworda = set(stopwords.words('english'))

def clean_text(text):
    """Normalise one text value into space-separated, stop-word-free tokens.

    Steps, in order: lowercase the text; replace every character matched by
    ``clean_spcl`` with a space; delete every character matched by
    ``clean_symbol``; split on whitespace and drop tokens found in
    ``stopworda``.

    Parameters
    ----------
    text : str
        Raw text. Any non-string value (for example the float NaN that pandas
        uses for a missing cell) is treated as empty text.

    Returns
    -------
    str
        The remaining tokens joined by single spaces; ``''`` if none remain.
    """
    if not isinstance(text, str):
        # A missing cell arrives as float NaN, which has no .lower(); treat it
        # as empty instead of aborting the whole column transformation.
        return ''
    text = text.lower()  # lowercase text
    text = clean_spcl.sub(' ', text)
    text = clean_symbol.sub('', text)
    # remove stop words from the description text
    text = ' '.join(word for word in text.split() if word not in stopworda)
    return text
    
# Run clean_text over every combined summary and store the result.
df['desc_clean'] = df['rangkuman'].map(clean_text)

In [8]:
# Preview the frame, now including the cleaned 'desc_clean' column.
df.head()

Unnamed: 0,Title,Genre,Director,Writer,Actors,Released,Plot,rangkuman,desc_clean
0,The Shawshank Redemption,"Crime, Drama",Frank Darabont,"Stephen King (short story ""Rita Hayworth and S...","Tim Robbins, Morgan Freeman, Bob Gunton, Willi...",14 Oct 1994,Two imprisoned men bond over a number of years...,"Crime, Drama Frank Darabont Tim Robbins, Morga...",crime drama frank darabont tim robbins morgan ...
1,The Godfather,"Crime, Drama",Francis Ford Coppola,"Mario Puzo (screenplay), Francis Ford Coppola ...","Marlon Brando, Al Pacino, James Caan, Richard ...",24 Mar 1972,The aging patriarch of an organized crime dyna...,"Crime, Drama Francis Ford Coppola Marlon Brand...",crime drama francis ford coppola marlon brando...
2,The Godfather: Part II,"Crime, Drama",Francis Ford Coppola,"Francis Ford Coppola (screenplay), Mario Puzo ...","Al Pacino, Robert Duvall, Diane Keaton, Robert...",20 Dec 1974,The early life and career of Vito Corleone in ...,"Crime, Drama Francis Ford Coppola Al Pacino, R...",crime drama francis ford coppola al pacino rob...
3,The Dark Knight,"Action, Crime, Drama",Christopher Nolan,"Jonathan Nolan (screenplay), Christopher Nolan...","Christian Bale, Heath Ledger, Aaron Eckhart, M...",18 Jul 2008,When the menace known as the Joker emerges fro...,"Action, Crime, Drama Christopher Nolan Christi...",action crime drama christopher nolan christian...
4,12 Angry Men,"Crime, Drama",Sidney Lumet,"Reginald Rose (story), Reginald Rose (screenplay)","Martin Balsam, John Fiedler, Lee J. Cobb, E.G....",01 Apr 1957,A jury holdout attempts to prevent a miscarria...,"Crime, Drama Sidney Lumet Martin Balsam, John ...",crime drama sidney lumet martin balsam john fi...


In [9]:
# Use the movie title as the row label from here on.
df = df.set_index('Title')

In [17]:
# Count unigrams and bigrams in each cleaned summary.
# min_df=1 keeps every term that occurs in at least one document. That is the
# same vocabulary the previous min_df=0 produced, but recent scikit-learn
# releases reject an integer min_df below 1 during parameter validation.
count = CountVectorizer(analyzer='word', ngram_range=(1, 2), min_df=1, stop_words=stopwords.words('english'))
count_matrix = count.fit_transform(df['desc_clean'])
# Cosine similarity between every pair of rows of count_matrix; row and column
# i both correspond to the i-th row of df.
cosine_sim = cosine_similarity(count_matrix, count_matrix)
cosine_sim

array([[1.        , 0.08163265, 0.07087699, ..., 0.02083786, 0.02083786,
        0.02083786],
       [0.08163265, 1.        , 0.28350796, ..., 0.02083786, 0.02083786,
        0.02083786],
       [0.07087699, 0.28350796, 1.        , ..., 0.01809233, 0.01809233,
        0.01809233],
       ...,
       [0.02083786, 0.02083786, 0.01809233, ..., 1.        , 0.0212766 ,
        0.0212766 ],
       [0.02083786, 0.02083786, 0.01809233, ..., 0.0212766 , 1.        ,
        0.0212766 ],
       [0.02083786, 0.02083786, 0.01809233, ..., 0.0212766 , 0.0212766 ,
        1.        ]])

In [11]:
# Titles in row order, keyed by position 0..n-1, so a position in the
# similarity matrix can be mapped back to a title.
indices = df.index.to_series().reset_index(drop=True)

def recommendations(Title, cosine_sim=None, top_n=10):
    """Return the titles most similar to ``Title``.

    Parameters
    ----------
    Title : str
        Exact title to look up in the module-level ``indices`` Series.
    cosine_sim : 2-D array, optional
        Square similarity matrix whose row/column order matches ``indices``.
        When omitted, the module-level ``cosine_sim`` is looked up at call
        time, so re-running the cell that builds the matrix takes effect
        without having to redefine this function.
    top_n : int, default 10
        Maximum number of titles to return.

    Returns
    -------
    list
        Up to ``top_n`` titles ordered from most to least similar. The queried
        row itself is never part of the result.

    Raises
    ------
    IndexError
        If ``Title`` does not occur in ``indices``.
    """
    if cosine_sim is None:
        # The parameter shadows the global of the same name, so fetch the
        # current matrix from the module namespace explicitly.
        cosine_sim = globals()['cosine_sim']

    matches = indices[indices == Title].index
    if len(matches) == 0:
        raise IndexError(f"Title {Title!r} not found in the movie index")
    idx = matches[0]  # first match wins if a title occurs more than once

    # Remove the queried row by position instead of assuming it sorts first:
    # ties at the top score or an all-zero row would otherwise leak the query
    # into its own recommendations.
    scores = pd.Series(cosine_sim[idx]).drop(idx)
    # A stable sort keeps tied scores in a deterministic order.
    ranked = scores.sort_values(ascending=False, kind='mergesort')
    top_positions = ranked.index[:top_n].tolist()
    return indices.iloc[top_positions].tolist()

# Results

In [12]:
# Titles ranked most similar to 'Toy Story'.
recommendations('Toy Story')

['Toy Story 3',
 'Aladdin',
 'Inside Out',
 'Monsters, Inc.',
 'Finding Nemo',
 'Zootopia',
 'Up',
 'The Gold Rush',
 'Annie Hall',
 'Deadpool']

In [13]:
# Titles ranked most similar to 'The Dark Knight'.
recommendations('The Dark Knight')

['Batman Begins',
 'The Dark Knight Rises',
 'The Prestige',
 'The Green Mile',
 'L.A. Confidential',
 'Out of the Past',
 'Heat',
 'Reservoir Dogs',
 'The Godfather: Part II',
 'Interstellar']

In [14]:
# Titles ranked most similar to 'The Avengers'.
recommendations('The Avengers')

['Guardians of the Galaxy Vol. 2',
 'Guardians of the Galaxy',
 'Spider-Man: Homecoming',
 'Rush',
 'Terminator 2: Judgment Day',
 'Aliens',
 'Kill Bill: Vol. 1',
 'The Terminator',
 'Shutter Island',
 'The Matrix']

In [15]:
# Titles ranked most similar to 'La La Land'.
recommendations('La La Land')

['Whiplash',
 'The Help',
 'Monty Python and the Holy Grail',
 'The Graduate',
 'The Truman Show',
 'The Gold Rush',
 'Life of Brian',
 'His Girl Friday',
 'The Kid',
 'The Pianist']