### Running TF-IDF Vectorizer and Cosine Similarity

In [1]:
import pandas as pd
import numpy as np

In [19]:
# Load the movie table, skip the first CSV column (positional slice 1:),
# and key the remaining columns by movieId.
# NOTE(review): the skipped column is presumably a saved row index — confirm.
tf = (
    pd.read_csv('./data/moviesSuper.csv')
    .iloc[:, 1:]
    .set_index('movieId')
)
tf.shape

(62423, 5)

In [21]:
# Constrain dataset to conserve memory: keep only rows with releaseYr >= 2014.
# NOTE(review): output filenames in this notebook use a "14-18" suffix, but no
# upper bound on releaseYr is applied here — confirm that is intended.
tf = tf.loc[tf['releaseYr'].ge(2014)]
print(tf.shape)
tf.head()

(12225, 5)


Unnamed: 0_level_0,movieName,genres,releaseYr,rating,text
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
79607,"Millions Game, The (Das Millionenspie",Action|Drama|Sci-Fi|Thriller,2015,3.4,Modern Avg Action Drama Sci-Fi Thriller danger...
107516,Punk's Dead: SLC Punk! 2,Comedy,2014,2.392857,Modern belowAvg Comedy added
107769,Paranormal Activity: The Marked Ones,Horror|Thriller,2014,2.761905,Modern belowAvg Horror Thriller demon found fo...
107916,Yves Saint Laurent,Drama|Romance,2014,3.178571,Modern Avg Drama Romance biography fashion fas...
108156,Ride Along,Action|Comedy,2014,3.181733,Modern Avg Action Comedy Atlanta buddy movie p...


In [22]:
# Write the current tf frame to CSV; the index is written as the first column.
tf.to_csv(path_or_buf='./data/tf14-18.csv', index=True)

#### TF-IDF Vectorizer

In [23]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Build word-level unigram + bigram TF-IDF features from each movie's `text`
# field, dropping scikit-learn's built-in English stop words.
# min_df=1 (the library default) keeps every term that occurs in at least one
# document. An integer 0 would select the same vocabulary, but recent
# scikit-learn releases validate constructor parameters and reject an integer
# min_df below 1, so 1 is used here.
tfinf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=1, stop_words='english')
tfidf_matrix = tfinf.fit_transform(tf['text'])

In [24]:
# Output shape: one row per entry of tf['text'], one column per unigram/bigram
# term in the fitted vocabulary.
tfidf_matrix.shape

(12225, 87300)

#### Calculating Cosine Similarity on Vectorized Data 

In [25]:
from sklearn.metrics.pairwise import linear_kernel

# Pairwise dot products between every pair of TF-IDF rows. With
# TfidfVectorizer's default norm='l2' each row has unit length, so the dot
# product is the cosine similarity. dense_output=True (the default, spelled out
# here) yields a dense square array with one row and one column per movie.
cosine_sim = linear_kernel(X=tfidf_matrix, Y=tfidf_matrix, dense_output=True)

In [26]:
# Save the similarity matrix as an .npy file for more efficient streamlit ops,
# then show its shape.
np.save(file='./data/movieMatrix14-18.npy', arr=cosine_sim)
# Disabled reload of a previously saved matrix. NOTE(review): this path is not
# the file written above — confirm which file should be loaded before enabling.
#cosine_sim = np.load('./data/movieMatrix.npy')
cosine_sim.shape

(12225, 12225)

#### Recommendations Function 

In [27]:
# Move the current index back into a column so tf is labelled 0..n-1 by row
# position; those positions are what cosine_sim rows are addressed by below.
tf = tf.reset_index(drop=False)
# Title column, addressed by row position.
titles = tf.loc[:, 'movieName']
# Reverse lookup: movie title -> row position.
# NOTE(review): a title that occurs more than once makes indices[title] return
# a Series rather than a single position — verify titles are unique.
indices = pd.Series(data=tf.index, index=titles)

In [33]:
def get_recommendations(searchItem, top_n=20, movies=None, similarity=None):
    """Print the movies most similar to the first title starting with ``searchItem``.

    Parameters
    ----------
    searchItem : str
        Case-sensitive prefix of the title to look up. The first title, in row
        order, that starts with this prefix becomes the query movie.
    top_n : int, default 20
        Maximum number of recommendations to print.
    movies : pandas.DataFrame, optional
        Table with a ``movieName`` column whose row order matches the rows of
        ``similarity``. Defaults to the notebook-level ``tf`` frame.
    similarity : 2-D array, optional
        Pairwise similarity scores; row ``i`` holds the scores of movie ``i``
        against every movie. Defaults to the notebook-level ``cosine_sim``.

    Returns
    -------
    None
        The recommendations are printed, not returned.

    Raises
    ------
    ValueError
        If no title starts with ``searchItem``, or if the similarity row length
        does not match the number of movies.
    """
    movies = tf if movies is None else movies
    similarity = cosine_sim if similarity is None else similarity

    # Address movies by row position so labels line up with the similarity
    # matrix regardless of the index the frame currently carries.
    names = movies['movieName'].reset_index(drop=True)

    # Fail loudly on a miss instead of silently falling through to some
    # unrelated title.
    matches = names[names.str.startswith(searchItem, na=False)]
    if matches.empty:
        raise ValueError(f'No movie title starts with {searchItem!r}')
    idx, title = matches.index[0], matches.iloc[0]

    # Building the Series against names.index also rejects a matrix whose row
    # length does not match the movie table.
    scores = pd.Series(similarity[idx], index=names.index, name='sim_scores')

    # Drop the query movie by position rather than assuming it sorts first:
    # ties (e.g. movies with identical text) could otherwise push it down the
    # ranking and into its own recommendations.
    top = scores.drop(idx).nlargest(top_n)

    recs = pd.DataFrame({
        'movieName': names.loc[top.index],
        'sim_scores': top.to_numpy(),
    })
    print(f'--- Searched for: {title} ---\n\nRecommendations:\n{recs}')


In [34]:
get_recommendations('Ex M')

--- Searched for: Ex Machina ---

Recommendations:
                         movieName  sim_scores
513                        Chappie    0.513092
4871                     Pass-Thru    0.471539
9100                            Ta    0.449297
2724                       Uncanny    0.424846
6728                    Amelia 2.0    0.423528
261            Autómata (Automata)    0.412459
7222                   Singularity    0.407655
5962                        Somnio    0.384374
7751                       AlphaGo    0.379739
9095           2036 Origin Unknown    0.375084
68                   Transcendence    0.372376
1304                         Debug    0.362378
721                           Vice    0.361531
10498                A Crimson Man    0.342863
7779              The God Question    0.307450
5677                       Teleios    0.290876
8492   Do You Trust this Computer?    0.290820
9026                       Upgrade    0.285853
34                    Interstellar    0.277124
3997     