In [1]:
import pandas as pd
import numpy as np
import pickle

from sklearn.feature_extraction.text import TfidfVectorizer

import nltk #for stemming process
from nltk.stem.porter import PorterStemmer

from sklearn.metrics.pairwise import cosine_similarity


In [2]:
data = pd.read_csv('../data/train_data.csv')

In [3]:
data.head()

Unnamed: 0,movie_id,title,runtime,status,vote_average,overview_2,tags
0,19995,Avatar,2 h 42 min,Released: 2009,7.2,"In the 22nd century, a paraplegic Marine is di...","in the 22nd century, a paraplegic marine is di..."
1,285,Pirates of the Caribbean: At World's End,2 h 49 min,Released: 2007,6.9,"Captain Barbossa, long believed to be dead, ha...","captain barbossa, long believed to be dead, ha..."
2,206647,Spectre,2 h 28 min,Released: 2015,6.3,A cryptic message from Bond’s past sends him o...,a cryptic message from bond’s past sends him o...
3,49026,The Dark Knight Rises,2 h 45 min,Released: 2012,7.6,Following the death of District Attorney Harve...,following the death of district attorney harve...
4,49529,John Carter,2 h 12 min,Released: 2012,6.1,"John Carter is a war-weary, former military ca...","john carter is a war-weary, former military ca..."


## Text Vectorization

In [4]:
tfv = TfidfVectorizer(max_features=None,stop_words='english')

In [5]:
vectors = tfv.fit_transform(data['tags']).toarray()

In [6]:
vectors.shape

(1494, 18002)

In [7]:
tfv.get_feature_names()



['00',
 '000',
 '007',
 '10',
 '100',
 '1000',
 '101',
 '108',
 '10th',
 '11',
 '117',
 '11th',
 '12',
 '13',
 '14',
 '1408',
 '15',
 '150',
 '15th',
 '16',
 '1630s',
 '17',
 '1748',
 '17th',
 '17thcentury',
 '18',
 '1800',
 '1818',
 '1820',
 '1820s',
 '1831',
 '1845',
 '1856',
 '1863',
 '1885',
 '1890',
 '18th',
 '18thcentury',
 '19',
 '1905',
 '1910s',
 '1912',
 '1919',
 '1920',
 '1926',
 '1927',
 '1929',
 '1930s',
 '1937',
 '1940s',
 '1942',
 '1944',
 '1945',
 '1949',
 '1950',
 '1950s',
 '1952',
 '1955',
 '1957',
 '1958',
 '1959',
 '1960s',
 '1962',
 '1964',
 '1965',
 '1967',
 '1969',
 '1970s',
 '1972',
 '1973',
 '1974',
 '1976',
 '1977',
 '1978',
 '1979',
 '1980',
 '1980s',
 '1981',
 '1983',
 '1984',
 '1985',
 '1986',
 '1987',
 '1988',
 '1990s',
 '1991',
 '1992',
 '1994',
 '1995',
 '1996',
 '1997',
 '1998',
 '1999',
 '19th',
 '19thcentury',
 '20',
 '200',
 '2000',
 '2000ad',
 '2001',
 '2002',
 '2003',
 '2004',
 '2005',
 '2006',
 '2007',
 '2008',
 '2009',
 '2010',
 '2011',
 '2012',


In [8]:
vectors

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

## Stemming

In [9]:
import nltk #for stemming process
from nltk.stem.porter import PorterStemmer

In [10]:
ps = PorterStemmer()

#defining the stemming function
def stem(text):
    y=[]
    
    for i in text.split():
        y.append(ps.stem(i))
    return " ".join(y)

data['tags'] = data['tags'].apply(stem) #appplying stemming process on tags column

In [11]:
data.head()

Unnamed: 0,movie_id,title,runtime,status,vote_average,overview_2,tags
0,19995,Avatar,2 h 42 min,Released: 2009,7.2,"In the 22nd century, a paraplegic Marine is di...","in the 22nd century, a parapleg marin is dispa..."
1,285,Pirates of the Caribbean: At World's End,2 h 49 min,Released: 2007,6.9,"Captain Barbossa, long believed to be dead, ha...","captain barbossa, long believ to be dead, ha c..."
2,206647,Spectre,2 h 28 min,Released: 2015,6.3,A cryptic message from Bond’s past sends him o...,a cryptic messag from bond’ past send him on a...
3,49026,The Dark Knight Rises,2 h 45 min,Released: 2012,7.6,Following the death of District Attorney Harve...,follow the death of district attorney harvey d...
4,49529,John Carter,2 h 12 min,Released: 2012,6.1,"John Carter is a war-weary, former military ca...","john carter is a war-weary, former militari ca..."


## Cosine Similarity

In [12]:
similarity = cosine_similarity(vectors)

In [13]:
print(similarity)

[[1.         0.01415371 0.00774091 ... 0.         0.02417797 0.00436943]
 [0.01415371 1.         0.00877843 ... 0.00758872 0.         0.        ]
 [0.00774091 0.00877843 1.         ... 0.         0.         0.        ]
 ...
 [0.         0.00758872 0.         ... 1.         0.         0.01585106]
 [0.02417797 0.         0.         ... 0.         1.         0.        ]
 [0.00436943 0.         0.         ... 0.01585106 0.         1.        ]]


In [14]:
print(data[data['title'] == 'Spectre'].index)

Int64Index([2], dtype='int64')


## Recommendation Function

In [None]:
def recommend(movie):
    movie_index = data[data['title'] == movie].index[0]
    distances = similarity[movie_index]
    movies_list = sorted(list(enumerate(distances)),reverse=True, key=lambda x:x[1])[1:6]
    
    for i in movies_list:
        print(data.iloc[i[0]].title)

In [None]:
recommend('Spectre')

In [None]:
recommend('Spider-Man')

## Saving codes for deployment

In [None]:
pickle.dump(similarity,open('similarity.pkl','wb'))