In [30]:
import pandas as pd 
import numpy as np 
import ast

In [31]:
df = pd.read_csv("file2.csv")

In [32]:
df.drop(columns = ["keywords" , "overview" , "cast" , "crew" , "genres"] , inplace = True)

In [33]:
df.head()

Unnamed: 0,id,title,tags
0,19995,Avatar,"[['In', 'the', '22nd', 'century,', 'a', 'parap..."
1,285,Pirates of the Caribbean: At World's End,"[['Captain', 'Barbossa,', 'long', 'believed', ..."
2,206647,Spectre,"[['A', 'cryptic', 'message', 'from', 'Bond’s',..."
3,49026,The Dark Knight Rises,"[['Following', 'the', 'death', 'of', 'District..."
4,49529,John Carter,"[['John', 'Carter', 'is', 'a', 'war-weary,', '..."


In [34]:
df["tags"]  = df["tags"].str.replace("[^\w\s\,]" , "" , regex = True)

  df["tags"]  = df["tags"].str.replace("[^\w\s\,]" , "" , regex = True)


In [35]:
df["tags"] = df["tags"].apply(lambda x: " ".join(x.split(", ")))

In [36]:
df.head()

Unnamed: 0,id,title,tags
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha..."
2,206647,Spectre,A cryptic message from Bonds past sends him on...
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...
4,49529,John Carter,"John Carter is a warweary, former military cap..."


In [37]:
df["tags"] = df["tags"].apply(lambda x:x.lower())

In [38]:
df.head()

Unnamed: 0,id,title,tags
0,19995,Avatar,"in the 22nd century, a paraplegic marine is di..."
1,285,Pirates of the Caribbean: At World's End,"captain barbossa, long believed to be dead, ha..."
2,206647,Spectre,a cryptic message from bonds past sends him on...
3,49026,The Dark Knight Rises,following the death of district attorney harve...
4,49529,John Carter,"john carter is a warweary, former military cap..."


In [39]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=5000 , stop_words = "english")

In [40]:
cv.fit_transform(df["tags"]).toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [41]:
vector = cv.fit_transform(df["tags"]).toarray()

In [42]:
vector[0]

array([0, 0, 0, ..., 0, 0, 0])

In [43]:
cv.get_feature_names_out()

array(['000', '007', '10', ..., 'zone', 'zoo', 'zooeydeschanel'],
      dtype=object)

In [44]:
import nltk

In [45]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [46]:
def stem(text):
    y = []
    for i in text.split():
        y.append(ps.stem(i))

    return " ".join(y)
        

    

In [47]:
df["tags"].apply(stem)

0       in the 22nd century, a parapleg marin is dispa...
1       captain barbossa, long believ to be dead, ha c...
2       a cryptic messag from bond past send him on a ...
3       follow the death of district attorney harvey d...
4       john carter is a warweary, former militari cap...
                              ...                        
4801    el mariachi just want to play hi guitar and ca...
4802    a newlyw coupl honeymoon is upend by the arriv...
4803    signed, sealed, deliv introduc a dedic quartet...
4804    when ambiti new york attorney sam is sent to s...
4805    ever sinc the second grade when he first saw h...
Name: tags, Length: 4806, dtype: object

In [48]:
df.head()

Unnamed: 0,id,title,tags
0,19995,Avatar,"in the 22nd century, a paraplegic marine is di..."
1,285,Pirates of the Caribbean: At World's End,"captain barbossa, long believed to be dead, ha..."
2,206647,Spectre,a cryptic message from bonds past sends him on...
3,49026,The Dark Knight Rises,following the death of district attorney harve...
4,49529,John Carter,"john carter is a warweary, former military cap..."


In [49]:
from sklearn.metrics.pairwise import cosine_similarity

In [50]:
similarity = cosine_similarity(vector)

In [51]:
similarity[0] 

array([1.        , 0.09128709, 0.06189845, ..., 0.02541643, 0.02946278,
       0.        ])

In [52]:
def recommend(movie):
    index = df[df["title"] ==movie].index[0]
    distance = similarity[index]
    movie_list  =sorted(list(enumerate(distance)), reverse = True  , key = lambda x:x[1])[1:6]
    for i in movie_list:
        print(df.iloc[i[0]].title)

In [53]:
recommend("Batman Begins")

The Dark Knight
The Dark Knight Rises
Batman
Batman
Amidst the Devil's Wings


In [87]:
sorted(list(enumerate(similarity[0])), reverse = True  , key = lambda x:x[1])[1:6]

[(539, np.float64(0.2611164839335468)),
 (260, np.float64(0.2519763153394848)),
 (507, np.float64(0.24948506639458295)),
 (1192, np.float64(0.24873416908154547)),
 (1202, np.float64(0.24659848095803594))]

In [32]:
import joblib
joblib.dump(df , "df_model")

['df_model']

In [29]:
df["title"].values

array(['Avatar', "Pirates of the Caribbean: At World's End", 'Spectre',
       ..., 'Signed, Sealed, Delivered', 'Shanghai Calling',
       'My Date with Drew'], dtype=object)

In [54]:
import joblib
joblib.dump(similarity ,  "similarity")

['similarity']