### Imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')

### Data Cleaning

In [2]:
#It uses the database from https://datasets.imdbws.com/

ratings = pd.read_csv(r'datasets/ratings.tsv',sep='\t', low_memory=False)
akas = pd.read_csv(r'datasets/akas.tsv',sep='\t',low_memory=False)
basics = pd.read_csv(r'datasets/basics.tsv',sep='\t',low_memory=False)

In [3]:
df = pd.merge(basics,ratings,on='tconst')

In [4]:
df.drop(columns=['isAdult', 'runtimeMinutes'], axis=1, inplace=True)

In [7]:
df['startYear'] = [0 if year == '\\N' else int(year) for year in df['startYear']]
df['endYear'] = [year if year == '\\N' else int(year) for year in df['endYear']]
df = df[(df['numVotes']>=2500) & (df['averageRating']>=6.4) & (df['startYear']>=1960)]

In [53]:
df

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,startYear,endYear,genres,averageRating,numVotes
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,2001,\N,"Comedy,Fantasy,Romance",6.4,77701
1,tt0052698,movie,Classe Tous Risques,Classe tous risques,1960,\N,"Crime,Drama,Romance",7.5,3102
2,tt0052832,movie,The Fugitive Kind,The Fugitive Kind,1960,\N,"Drama,Romance",7.2,5398
3,tt0052997,movie,The League of Gentlemen,The League of Gentlemen,1960,\N,"Comedy,Crime,Thriller",7.3,3757
4,tt0053106,movie,Letter Never Sent,Neotpravlennoe pismo,1960,\N,"Adventure,Drama",7.9,2940
...,...,...,...,...,...,...,...,...,...
13940,tt9827854,series,Hollywood,Hollywood,2020,\N,Drama,7.6,24608
13941,tt9877170,movie,Malang,Malang - Unleash the Madness,2020,\N,"Action,Crime,Drama",6.5,6853
13942,tt9879074,series,The Disappearance of Madeleine McCann,The Disappearance of Madeleine McCann,2019,\N,"Crime,Documentary,Drama",6.6,7545
13943,tt9900092,series,Motherland: Fort Salem,Motherland: Fort Salem,2020,\N,"Drama,Fantasy,Sci-Fi",7.1,5111


In [55]:
df.to_csv('datasets/imdb_large.csv', index=False)

In [9]:
#Simplify the title_types
def select_titles(title_type):
    if title_type in ['movie','tvMovie','tvSpecial']:
        return 'movie'
    elif title_type in ['tvSeries','tvMiniSeries']:
        return 'series'
    else:
        return np.nan

In [10]:
df['titleType'] =  df['titleType'].apply(select_titles)

In [11]:
df.dropna(inplace=True)

In [12]:
df.reset_index(drop=True,inplace=True)

In [16]:
imdb['titleType'].value_counts()

movie     10793
series     3152
Name: titleType, dtype: int64

In [17]:
# Create a new column title with movie/tv name with date
title = []
for row in imdb[['titleType','primaryTitle','startYear','endYear']].itertuples():
    if row.titleType == 'movie':
        title.append('{} ({})'.format(row.primaryTitle,row.startYear))
    else:
        title.append( '{} ({} -)'.format(row.primaryTitle,row.startYear) if row.endYear == '\\N' else '{} ({} - {})'.format(row.primaryTitle,row.startYear, row.endYear))
imdb['title']=title

In [2]:
imdb.iloc[8838]['title']

NameError: name 'imdb' is not defined

In [None]:
imdb.to_csv('datasets/imdb.csv', index=False)

### Recomm Builder

In [6]:
imdb = pd.read_csv('datasets/imdb.csv', low_memory=False)

In [7]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity
cv = CountVectorizer(dtype=np.uint8)
MMS = MinMaxScaler()

In [15]:
imdb['genres'] = [genre.replace(',', ' ') for genre in imdb['genres']]
dtm = cv.fit_transform(imdb['genres']).toarray()

In [9]:
# reshape(-1,1) is equivalent to transpose
new_matrix = np.concatenate((dtm, np.array(imdb['averageRating']).reshape(-1,1)),axis=1)

In [10]:
numVotes = np.array(imdb['numVotes']).reshape(-1,1)
numVotes = MMS.fit_transform(numVotes)

In [11]:
new_matrix = np.concatenate((new_matrix,numVotes),axis=1)

In [12]:
similarities = cosine_similarity(new_matrix)

In [13]:
def get_recoms(titles_ids, type='movie'):
    number_of_titles = len(titles_ids)
    sim = similarities[[8838,7600]].sum(axis=0).argsort()[::-1][number_of_titles:500]   # To omit recommending the input movies
    tv_similar = [(imdb['title'].iloc[index], imdb['tconst'].iloc[index]) for index in sim if imdb['titleType'].iloc[index]=='series']
    movie_similar = [(imdb['title'].iloc[index], imdb['tconst'].iloc[index]) for index in sim if imdb['titleType'].iloc[index]=='movie']
    
    if type=='movie':
        return movie_similar[:10]
    else:
        return tv_similar[:10]
    

In [14]:
get_recoms([8838,7600,134344134,23442]) 

[('Jurassic Park (1993)', 'tt0107290'),
 ('Star Wars: Episode VII - The Force Awakens (2015)', 'tt2488496'),
 ('Avengers: Infinity War (2018)', 'tt4154756'),
 ('Captain America: The Winter Soldier (2014)', 'tt1843866'),
 ('X-Men: Days of Future Past (2014)', 'tt1877832'),
 ('Aliens (1986)', 'tt0090605'),
 ('The Hunger Games (2012)', 'tt1392170'),
 ('Captain America: Civil War (2016)', 'tt3498820'),
 ('X-Men: First Class (2011)', 'tt1270798'),
 ('Avengers: Age of Ultron (2015)', 'tt2395427')]

Unnamed: 0,tconst,titleType,startYear,endYear,genres,averageRating,numVotes,title
0,tt0035423,movie,2001,\N,Comedy Fantasy Romance,6.4,77701,Kate & Leopold (2001)
8711,tt1314652,movie,2010,\N,Drama Thriller,6.4,9126,The Housemaid (2010)
2809,tt0099819,movie,1990,\N,Comedy Crime,6.4,13124,I Love You to Death (1990)
2806,tt0099800,movie,1990,\N,Comedy Music Romance,6.4,12792,House Party (1990)
2805,tt0099797,movie,1990,\N,Crime Drama Romance,6.4,10212,The Hot Spot (1990)
...,...,...,...,...,...,...,...,...
5041,tt0244911,series,1987,2006,Drama,9.5,3139,Malgudi Days (1987 - 2006)
13378,tt7450814,series,2017,2018,Crime Drama Thriller,9.6,16061,Koombiyo (2017 - 2018)
8089,tt10680606,series,2019,\N,Action Adventure Comedy,9.7,3574,Minecraft Epic (2019 -)
13893,tt9471404,series,2017,\N,Drama History,9.8,7737,The Chosen (2017 -)


## Saving the data for future uses

In [57]:
# Create the sample data to be used later:
# imdb.drop(columns=['primaryTitle','originalTitle'], inplace=True)
# imdb.to_csv('datasets/imdb.csv',index=False)

In [None]:
## Below this are some test and trial stuff not really needed

In [68]:
arr

<PandasArray>
[                         'Kate & Leopold (2001)',
                     'Classe Tous Risques (1960)',
                       'The Fugitive Kind (1960)',
                 'The League of Gentlemen (1960)',
                       'Letter Never Sent (1960)',
                     'Eyes Without a Face (1960)',
                              'Breathless (1960)',
           'The Andy Griffith Show (1960 - 1968)',
              'The Bugs Bunny Show (1960 - 1975)',
                  'The Flintstones (1960 - 1966)',
 ...
                              'Isi & Ossi (2020)',
                       'Brave New World (2020 -)',
                       'Bandish Bandits (2020 -)',
                       'Unorthodox (2020 - 2020)',
                                'Just 6.5 (2019)',
                             'Hollywood (2020 -)',
                                  'Malang (2020)',
 'The Disappearance of Madeleine McCann (2019 -)',
                'Motherland: Fort Salem (2020 -)',
            

In [100]:
arr = list(imdb['title'])

In [83]:
import difflib

In [65]:
%timeit -n3 

429 ms ± 27.5 ms per loop (mean ± std. dev. of 7 runs, 3 loops each)


In [119]:
from rapidfuzz import process

In [123]:
from difflib import SequenceMatcher
from heapq import nlargest as _nlargest
def get_best_matches(word, possibilities, n=3, cutoff=0.6):
    """Use SequenceMatcher to return a list of the indexes of the best 
    "good enough" matches. word is a sequence for which close matches 
    are desired (typically a string).
    possibilities is a list of sequences against which to match word
    (typically a list of strings).
    Optional arg n (default 3) is the maximum number of close matches to
    return.  n must be > 0.
    Optional arg cutoff (default 0.6) is a float in [0, 1].  Possibilities
    that don't score at least that similar to word are ignored.
    """

    if not n >  0:
        raise ValueError("n must be > 0: %r" % (n,))
    if not 0.0 <= cutoff <= 1.0:
        raise ValueError("cutoff must be in [0.0, 1.0]: %r" % (cutoff,))
    result = []
    s = SequenceMatcher()
    s.set_seq2(word)
    for idx, x in enumerate(possibilities):
        s.set_seq1(x)
        if s.real_quick_ratio() >= cutoff and \
           s.quick_ratio() >= cutoff and \
           s.ratio() >= cutoff:
            result.append((s.ratio(), idx))

    # Move the best scorers to head of list
    result = _nlargest(n, result)

    # Strip scores for the best n matches
    return [x for score, x in result]

In [132]:
process.extractBests('jlksdfjldksjfslkaj;fdlkjsdfkljdsflkjdfsalkjdsflkdsjfa;lskdjsldak', imdb['title'])

[('Salako (1974)', 42.352941176470594, 5173),
 ('JFK (1991)', 40.0, 2949),
 ('Wadjda (2012)', 38.86363636363636, 10430),
 ('Dalida (2016)', 38.86363636363636, 12337),
 ('Leak (2000)', 38.57142857142857, 4731)]

In [128]:
get_close_matches_indexes('inception', imdb['title'], cutoff=0.5)

87.9 ms ± 3.59 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [129]:
imdb['title'].iloc[8838]

'Inception (2010)'

In [99]:
imdb.iloc[5173]

tconst                        tt0253623
titleType                         movie
startYear                          1974
endYear                              \N
genres           Adventure Comedy Crime
averageRating                       7.4
numVotes                           5484
title                     Salako (1974)
Name: 5173, dtype: object