In [1]:
import pandas as pd
import numpy as np
import re
import unicodedata
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer, HashingVectorizer, CountVectorizer
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import euclidean_distances
from IPython.display import display, Markdown

In [2]:
df_ = pd.read_csv("/home/ninja/Dropbox/datasets/netflix/netflix_titles.csv")

In [5]:
def remove_stopwords(x):
    return " ".join([word for word in str(x).split() if word not in stopwords])

def strip_accents(s):
    return ''.join(c for c in unicodedata.normalize('NFKD', s)).encode('ascii', 'ignore').decode("utf-8")

def normalize(x, remove_accents=True):
    if x != None:
        x = str(x).lower()
        if remove_accents:
            x = strip_accents(x)
        x = remove_stopwords(x.lower())
        x = re.sub(r'[\-%+&|{}()\[\]^\'\"~*?:\\/!]', " ", x)
        x = re.sub(r'\s+', ' ', x).strip()
        return x
    else: return None

def get_duration_type(x):
    x = x.split(" ")[-1]
    if "season" in x:
        return "s"
    elif "min" in x:
        return "m"
    else:
        return ""

def _compute_embedding_matrix(df,vectorizer,tfidf_params):
    df = df.fillna("")
    X = vectorizer.fit_transform(df)
    tfidf_params["embedder"] = vectorizer
    return X.toarray()

def train_embedding_matrix(tfidf_params,tfidf_cols,df):
    Xs = []
    for item in tfidf_cols:
        Xs += [_compute_embedding_matrix(
            df[item],
            tfidf_params[item]["vectorizer"](
                max_features=tfidf_params[item]["max_features"]
            ),
            tfidf_params[item]
        )]
    
    Xs += [df["type"].values.reshape(-1,1)]
    Xs += [pd.get_dummies(df["rating"]).values[:,1:]]
    Xs += [pd.get_dummies(df["n_countries"]).values[:,1:]]

    return np.concatenate(Xs,axis=1)

def find_item_by_title(name,df,n=10):
    display(df[df["title"].str.contains(name)]["title"].iloc[:n].to_dict())

def recomend_by_id(idx,df):
    recommended_ids = model.kneighbors(X[input_id].reshape(1,-1),return_distance=False)[0].tolist()[1:]
    display(df.iloc[recommended_ids][tfidf_cols])

def get_recommendations_from_inputs(input_ids,df):
    recommendations_dict = {}
    for idx in input_ids:
        recommended_ids = model.kneighbors(X[idx].reshape(1,-1),return_distance=False)[0].tolist()
        x = df.iloc[recommended_ids]["title"].tolist()
        recommendations_dict[x[0]] = x[1:]
    return recommendations_dict

def display_recommendations(input_ids,df):
    recommendations_dict = get_recommendations_from_inputs(input_ids,df)
    msg = '| input | rec 01 | rec 02 | rec 03 | rec 04 | rec 05 | rec 06 | rec 07 | rec 08 | rec 09 |\n'
    msg += '| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |\n'
    for input_x, recs in recommendations_dict.items():
        msg += f"| **{input_x}** |{'|'.join(recs)}|\n"

    display(Markdown(msg))
    
def find_item_by_title_token(name,df,n=10):
    return df[df["title"].str.contains(name)]["title"].iloc[:n].to_dict()

def show_recommendations(title_token,df):
    retrieved_items = find_item_by_title_token(title_token,df,n=20)
    display_recommendations(list(retrieved_items.keys()),df)

In [9]:
# df_ = df.copy()
obj_cols = [item for item in df_.columns if df_[item].dtype == "O"]
tfidf_cols = ["title","director","cast","country","listed_in","description"]
tfidf_params = {
    "title":{
        "vectorizer":TfidfVectorizer,
        "max_features":150,
    },
    "director":{
        "vectorizer":CountVectorizer,
        "max_features":100,
    },
    "cast":{
        "vectorizer":CountVectorizer,
        "max_features":200,
    },
    "country":{
        "vectorizer":CountVectorizer,
        "max_features":30,
    },
    "listed_in":{
        "vectorizer":CountVectorizer,
        "max_features":30,
    },
    "description":{
        "vectorizer":TfidfVectorizer,
        "max_features":200,
    }
}
categorical_cols = ["type","rating","n_countries"]
minmax_cols = ["release_year"]
drop_cols = ["date_added","duration_type","show_id"]
stopwords = []

for item in obj_cols:
    df_[item] = df_[item].apply(normalize)

df_["duration_type"] = df_["duration"].apply(get_duration_type)
df_["duration"] = df_["duration"].apply(lambda x: x.split(" ")[0])
df_["type"] = df_["type"].apply(lambda x: 1 if x == "movie" else 0)
df_["n_countries"] = df_["country"].apply(lambda x: len(x.split(" ")) if len(x.split(" ")) < 3 else 3)
df_ = df_[df_["rating"].isin(["tv ma","tv 14","tv pg","r","pg 13"])]
df_ = df_[(~df_["title"].isna())&(df_["title"]!="")].reset_index()
df_.drop(drop_cols,axis=1,inplace=True)

In [10]:
X = train_embedding_matrix(tfidf_params,tfidf_cols,df_)

In [14]:
model = NearestNeighbors(n_neighbors=10,leaf_size=1000,radius=2.0)
model.fit(X)

In [7]:
# import pickle

# pickle.dump( model, open( "model.pickle", "wb" ) )
# model = pickle.load( open( "model.pickle", "rb" ) )
# df_.to_csv("/home/ninja/Dropbox/ml/recommender_systems/netflix/streamlit/normalized_netflix.csv",index=None)
# np.save("/home/ninja/Dropbox/ml/recommender_systems/netflix/streamlit/features_space.npy", X)

In [11]:
find_item_by_title("1922",df_,n=20)
input_dict = {
    "dark":1930,
    "elite":566,
    "sabrina":1225,
    "umbrella-academy":1810,
    "altered-carbon":2402,
    "altered-carbon-resleeved":2338,
    "after-life":2204,
    "one-punch-man":4830,
    "locke-key":2461,
    "hill-house":3881,
    "marianne":2966,
    "monday":4610,
    "1922":4508
}

{4508: '1922'}

In [17]:
show_recommendations("dark",df_)

| input | rec 01 | rec 02 | rec 03 | rec 04 | rec 05 | rec 06 | rec 07 | rec 08 | rec 09 |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| **dark skies** |sweetheart|dark light|await further instructions|assimilate|snervous tyler oakley|tremors 4 the legend begins|f.r.e.d.i.|tau|wildling|
| **resident evil infinite darkness** |cannon busters|kengan ashura|haunted|equinox|kabaneri of the iron fortress the battle of unato|ultraman|sirius the jaeger|santa clarita diet|parasyte the maxim|
| **trese after dark** |jugaad|audrey|i dream of dance|operation odessa|9to5 the story of a movement|black holes the edge of all we know|creating an army of the dead|fishpeople|chef s table france|
| **the sons of sam a descent into darkness** |cocaine cowboys the kings of miami|the pharmacist|the trials of gabriel fernandez|the innocence files|drug lords|night stalker the hunt for a serial killer|dope|evil genius|crime scene the vanishing at the cecil hotel|
| **the darkest hour** |dragonheart 3 the sorcerer|jupiter ascending|left behind|singularity|the lord of the rings the return of the king|stargate|the signal|incoming|hardcore henry|
| **dark city beneath the beat** |residente|trixie mattel moving parts|clarence clemons who do i think i am|woodstock|shawn mendes in wonder|the remix hip hop x fashion|lil peep everybodys everything|once in a lifetime sessions with noel gallagher|bikram yogi, guru, predator|
| **dark forces** |romina|the golem|rakkhosh|sinister circle|the maus|rencor tatuado|veronica|munafik 2|nang nak|
| **we summon the darkness** |polaroid|concrete cowboy|unfriended|case 39|in the deep|the strangers prey at night|american honey|candyman|sinister 2|
| **in the dark** |trinkets|unsolved|somewhere between|frequency|battle creek|women behind bars|footprints in the sand|everything sucks|scandal|
| **dark desire** |tijuana|elite|hache|la casa de papel|apaches|dandy|somos.|monzon a knockout blow|you cannot hide|
| **dark** |the woods|1983|ultraviolet|babylon berlin|sakho mangane|case|criminal france|deadwind|warrior|
| **dark waters** |cairo station|the blazing sun|amar s hands|alexandria ... why|step outside|flimflam|beirut oh beirut|komola rocket|the emigrant|
| **the darkness** |the detained|14 cameras|the haunting of molly hartley|terrifier|the ring|clinical|the hurt business|the witch files|desolation|
| **light in the dark** |road to yesterday|in line|catch.er|black rose|the bling lagosians|gagarin first in space|namaste wahala|the island|wives on strike|
| **dark light** |wildling|1st summoning|dark skies|st. agatha|twinsanity|cam|haunting on fraternity row|await further instructions|creep 2|
| **night on earth shot in the dark** |night on earth|diana in her own words|derren brown the push|influx|elizabeth and margaret love and loyalty|89|flinch|jackie a tale of two sisters|derren brown miracle|
| **monsters dark continent** |superfly|au coeur des gangs|into the wild|strange weather|monsoon|enola holmes|moonlight|paid in full|contract|
| **the crystal calls making the dark crystal age of resistance** |secrets of althorp the spencers|secrets of highclere castle|we, the marines|secrets of chatsworth|snervous tyler oakley|mitt|stealing history|zion|the memphis belle a story of a flying fortress|
| **the dark crystal age of resistance** |lost in space|the shannara chronicles|sweet tooth|dreamworks how to train your dragon legends|hit run|raising dion|the umbrella academy|the 4400|star trek the next generation|
| **hikaru utada laughter in the dark tour 2018** |theeya velai seyyanum kumaru|nasha natasha|minsara kanavu|gen hoshino stadium tour pop virus|takizawa kabuki zero 2020 the movie|amrapali|dtc yukemuri junjo hen from high low|maine pyar kiya|k on the movie|


In [None]:
# input_ids = [5163,1930,402,7187,484,6882,1207,647]
input_ids = list(input_dict.values())
display_reccomendations(input_ids)

In [None]:
input_id = 1930 # dark
recomend_by_id(input_id)

In [93]:
imdb_df = pd.read_table("/home/ninja/Dropbox/datasets/netflix/title.basics.tsv")
# imdb_df2 = pd.read_table("/home/ninja/Dropbox/datasets/netflix/title.ratings.tsv")

  exec(code_obj, self.user_global_ns, self.user_ns)


In [87]:
imdb_df["originalTitle"] = imdb_df["originalTitle"].apply(normalize)

In [88]:
net_titles = df_["title"].drop_duplicates().tolist()

In [91]:
imdb_df[imdb_df["originalTitle"].isin(net_titles)]["originalTitle"].value_counts()

alone                             388
the gift                          386
the end                           385
love                              361
the interview                     353
                                 ... 
monsters dark continent             1
house party 3                       1
ouran high school host club         1
andaz apna apna                     1
bangkok love stories innocence      1
Name: originalTitle, Length: 6112, dtype: int64

In [105]:
imdb_df = imdb_df[(imdb_df["startYear"]!="\\N")]
imdb_df["startYear"] = imdb_df["startYear"].astype(int)
imdb_df = imdb_df[(imdb_df["startYear"]>= 2002)]

In [106]:
imdb_df.shape

(5452248, 9)

In [107]:
imdb_df["titleType"].unique()

array(['movie', 'short', 'tvMovie', 'tvEpisode', 'video', 'tvSeries',
       'videoGame', 'tvMiniSeries', 'tvShort', 'tvSpecial', 'tvPilot'],
      dtype=object)

In [108]:
imdb_df[~imdb_df["titleType"].isin(["short","tvShort","tvPilot","tvSpecial","videoGame"])].shape

(4791117, 9)