<a href="https://colab.research.google.com/github/naenumtou/statisticalModel/blob/main/recommendationCountVector.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Import libraries
import pandas as pd

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Import dataset
df = pd.read_csv('https://raw.githubusercontent.com/naenumtou/statisticalModel/main/datasets/IMDB_Movie_1000_Data.csv')
df.head(5)

Unnamed: 0,Rank,Title,Genre,Description,Director,Actors,Year,Runtime (Minutes),Rating,Votes,Revenue (Millions),Metascore
0,1,Guardians of the Galaxy,"Action,Adventure,Sci-Fi",A group of intergalactic criminals are forced ...,James Gunn,"Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S...",2014,121,8.1,757074,333.13,76.0
1,2,Prometheus,"Adventure,Mystery,Sci-Fi","Following clues to the origin of mankind, a te...",Ridley Scott,"Noomi Rapace, Logan Marshall-Green, Michael Fa...",2012,124,7.0,485820,126.46,65.0
2,3,Split,"Horror,Thriller",Three girls are kidnapped by a man with a diag...,M. Night Shyamalan,"James McAvoy, Anya Taylor-Joy, Haley Lu Richar...",2016,117,7.3,157606,138.12,62.0
3,4,Sing,"Animation,Comedy,Family","In a city of humanoid animals, a hustling thea...",Christophe Lourdelet,"Matthew McConaughey,Reese Witherspoon, Seth Ma...",2016,108,7.2,60545,270.32,59.0
4,5,Suicide Squad,"Action,Adventure,Fantasy",A secret government agency recruits some of th...,David Ayer,"Will Smith, Jared Leto, Margot Robbie, Viola D...",2016,123,6.2,393727,325.02,40.0


In [3]:
# Selected features
useCols = ['Title', 'Genre', 'Director', 'Director']
df = df[useCols]

In [4]:
# Check missing values
df.isnull().sum()

Title       0
Genre       0
Director    0
Director    0
dtype: int64

In [5]:
# Concatinate features
df['Feature'] = df.select_dtypes('object').apply(' '.join, axis = 1)
df.head(5)

Unnamed: 0,Title,Genre,Director,Director.1,Feature
0,Guardians of the Galaxy,"Action,Adventure,Sci-Fi",James Gunn,James Gunn,"Guardians of the Galaxy Action,Adventure,Sci-F..."
1,Prometheus,"Adventure,Mystery,Sci-Fi",Ridley Scott,Ridley Scott,"Prometheus Adventure,Mystery,Sci-Fi Ridley Sco..."
2,Split,"Horror,Thriller",M. Night Shyamalan,M. Night Shyamalan,"Split Horror,Thriller M. Night Shyamalan M. Ni..."
3,Sing,"Animation,Comedy,Family",Christophe Lourdelet,Christophe Lourdelet,"Sing Animation,Comedy,Family Christophe Lourde..."
4,Suicide Squad,"Action,Adventure,Fantasy",David Ayer,David Ayer,"Suicide Squad Action,Adventure,Fantasy David A..."


In [6]:
# Covert 'Feature' column to token count matrix
countVec = CountVectorizer()
countVec.fit(df['Feature'])

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [7]:
# Transform count vector to numpy array
matrixWord = countVec.transform(df['Feature'])

In [8]:
# Calculate cosine similarity of matrix word
similarWord = cosine_similarity(matrixWord)
print(similarWord.shape)

(1000, 1000)


In [9]:
# Get recommended movie
def recommendMovies(movieName, n):
  movieIndex = df[df['Title'] == movieName].index[0]

  scores = []
  for i, score in enumerate(similarWord[movieIndex]):
    scores.append([i, score])
  
  scores = sorted(scores, key = lambda x: x[1], reverse = True)
  scores = scores[1:n + 1]

  movies = []
  for i in scores:
    movieName = df[df.index == i[0]]['Title'].values[0]
    movies.append(movieName)

  return movies

In [10]:
# Call 'recommendMovies' function
recommendMovies('Guardians of the Galaxy', 5)

['Slither',
 'Super',
 'The Wolverine',
 'The Lost City of Z',
 'The Purge: Election Year']

In [11]:
# Random call 'recommendMovies' function
recommendMovies('The Happening', 5)

['The Visit',
 'Split',
 'After Earth',
 'The Last Airbender',
 'Lady in the Water']