In [1]:
import re
from urllib import request
import gzip
import shutil
import requests
import pyodbc
import urllib
from multiprocessing.pool import ThreadPool
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")
import time

**First let's get the data**

In [2]:
def download_url(url):
    print("downloading: ",url)
    file_title = re.split(pattern='/', string=url)[-1]
    urlrtv = request.urlretrieve(url=url, filename=file_title)
    
    title = re.split(pattern=r'\.tsv', string=file_title)[0] +".csv"
    
    with gzip.open(file_title, 'rb') as f_in:
        with open(title, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)

In [3]:
urls = ["https://datasets.imdbws.com/title.ratings.tsv.gz",
        "https://datasets.imdbws.com/title.basics.tsv.gz"]

results = ThreadPool(5).imap_unordered(download_url, urls)

downloading: downloading:  https://datasets.imdbws.com/title.basics.tsv.gz
 https://datasets.imdbws.com/title.ratings.tsv.gz


**The following datasets are large and might still be zipping open in to your folder from the last block. If it gives you an error, wait for a second and run this same block again (It's not our fault, it's your computer's. Don't penalise us! Also a reminder, if nothing else is run, then jupyter does not actually run the same block again if it failed, so throw in a cheeky block run here.).**

In [4]:
time.sleep(10)

In [5]:

ratings = pd.read_csv("title.ratings.csv", sep="\\t", na_values=["\\N"])

In [6]:
time.sleep(60)

In [7]:
basics = pd.read_csv("title.basics.csv" ,sep="\\t" ,na_values=["\\N"])

In [8]:
abstracts = pd.read_csv('movies_metadata.csv')
abstracts.rename(columns = {'imdb_id':'tconst'}, inplace = True)
abstracts = abstracts[["overview", "tconst"]]

data = pd.merge(basics, ratings, on='tconst', how='outer')
data = pd.merge(data, abstracts, on="tconst", how="inner")

**Some preprocessing**

In [9]:
data = data.drop(['originalTitle', 'isAdult', "endYear"], axis=1)
data = data.dropna(subset=['averageRating', 'primaryTitle', 'genres', 'titleType'])
data = data[data['titleType'] == "movie"]
data = data.reset_index()

In [10]:
genres = np.array(data['genres'])

for i in range(len(genres)):
    try:
        genres[i] = genres[i].split(",")
    except:
        continue
        
data["genres"] = genres
genres = np.array(data['genres'])

flat_genres = [j for i in genres for j in i]
allgenres = np.unique(flat_genres)

In [11]:
data = data.drop(['index', 'titleType'], axis=1)
data["averageRating"] = data["averageRating"].astype(float)

In [12]:
styear = np.array(data['startYear'])
rm = np.array(data['runtimeMinutes'])
        
data = data.drop(['startYear', 'runtimeMinutes'], axis=1)
data['startYear'] = styear.astype(int)
data['runtimeMinutes'] = rm.astype(int)

In [13]:
rid = []
for i in range(len(data['startYear'])):
    try:
        if data['startYear'][i] > 1940 and data['runtimeMinutes'][i] > 0 and data['averageRating'][i] > 0 and data["numVotes"][i]>10000:
            rid.append(1)
        else:
            rid.append(0)
    except:
        rid.append(0)
        
data["getrid"] = rid
data = data[data["getrid"] == 1]
data = data.reset_index()
data = data.drop(['getrid'], axis=1)
data = data.drop(['index'], axis=1)

In [14]:
data1=data.copy()

a=-1
for j in range(int(len(data1["overview"]))):
    b=[]
    a+=1
    try:
        for i in data1["overview"][j].split(" "):
            i=i.strip(".").strip().strip(",")
            i=i.lower()
            b.append(i)
    except:
        continue
    data1["overview"][j]=b

**Some nice functions that we use**

In [15]:
choices = ', '.join(allgenres[allgenres != 'Adult'])

def genre_q():
    print(f"\nWhat genre are you feelin rn? Your choices are: {choices}")
    while True:
        x1 = input("\nSelect one genre from the list: ")
        x1=x1.split(",")
        if len(x1) > 3:
            print("I said select three, you weasel!")
        elif (all(i in choices for i in x1)):
            genres=x1
            print(f"Yeaah, added {genres[0]} to your possible matches ;)")
            break
        else:
            print("Oopsie daisy! Those aren't real! Try again (Pls dude, write it EXACTLY as in the examples given).\n")
    return(genres)
 
def long_q():
    while True:
        x2 = input("\nHow much free time do you have (in minutes)? ")
        try:
            long = int(x2)
            break
        except:
            print("Maaan, gimme the minutes pls")
    return(long)

def dec_q():
    decades=["1940s", "1950s", "1960s", "1970s", "1980s", "1990s", "2000s", "2010s", "I DON'T CARE"]
    while True:
        x3 =str(input(f"So, hombre, do you want a new or an old movie? Pick your fav dec pls, thnx: {decades}: "))
        if x3.upper() in decades or x3.lower() in decades or x3 in decades:
            print("\nOkay, let me find the right movie for you...")
            if x3.upper()== "I DON'T CARE":
                decade=0
            else:
                List1 = [str(element) for element in x3]  
                decade = int(''.join(List1[:-1]))
            break
        else:
            print("I don't know what you mean there, babes... So come on baby hit me one more time:")
    return(decade)

**Let's start asking questions**

In [16]:
data2=data.copy()
s=[]

while True:
    genres2=genre_q()
    long=long_q()
    decade=dec_q()
    for i in range(len(data2)):
        genres2=list(genres2)
        if (all(e in data2["genres"][i] for e in genres2)):
            if decade==0:
                if data2["runtimeMinutes"][i] <=long:
                        s.append(i)
            elif data2["startYear"][i] > decade and data2["startYear"][i] < (decade+9):
                if data2["runtimeMinutes"][i] <=long:
                        s.append(i)
        else:
            continue

    if len(s)==0:
        new=input("\nayyy, we didn't find anything from the database for ya, buddy. But it's all right. Do you wanna try to find a different movie or something of the sorts? (Y/n) ")
        if new.upper()=="Y":
            continue
        elif new.lower()=="n":
            break
        else:
            print("YOU HAD TWO CHOICES SCOUNDREL. THIS WILL BE THE END OF YOU!")
            break
    else:
        print(f"Congrats, we found {len(s)} matches for ya. The most highly rated of them will appear on your screen soon (hopefully)!")
        data_sel=data2.iloc[s]
        display(data_sel[["primaryTitle", "genres", "averageRating", "runtimeMinutes"]].sort_values(by='averageRating', ascending=False).reset_index().drop(['index'], axis=1).head(16))
        break
        
     
time.sleep(min(len(data_sel), 10))       
        
fav=input("Did you find what you were looking for in the table already? (Y/n) ")
if fav.upper()=="Y":
    print("Well then, congrats. We are done here then!")
    
elif fav.lower()=="n":
    prompt=input("\nSo, help us to help you. Name us something you are into (e.g. woman, man, war etc.). Literally, give us anything!\n")
    words=[]
    for i in prompt.split(" "):
        i=i.strip(".").strip().strip(",")
        i=i.lower()
        words.append(i)        
        
    match=[]
    
    for i in range(len(data_sel)):
        d=0
        for j in range(len(words)):
            try:
                if words[j] in data_sel.iloc[i]["overview"]:
                    d+=1
            except:
                d=0
        if d <= len(words) and d > 0:
            match.append(i)
            
    if len(match) != 0:
        print("\nOkay, dokey, so this is the list that we could give ya from the word search (in the order of best to worst): ")
        data_found=data_sel.iloc[match]
        ordered=data_found.sort_values(by=['averageRating'], ascending=False).reset_index()
        display(ordered[["primaryTitle", "startYear", "genres", "averageRating", "runtimeMinutes"]])
    
    if len(match)==0:
        for i in range(int(len(data1))):
            d=0
            for j in range(int(len(words))):
                try:
                    if words[j] in data1["overview"][i]:
                        d+=1
                except:
                    d=0
            if d > 0:
                match.append(i)
                
        if len(match)!=0:
            print("\nOkay, dokey, so the keyword you specified did not fit any of the movies in the previous list, but hey, here are some other movies that do ;)")
            data_found=data1.iloc[match]
            ordered=data_found.sort_values(by=['averageRating'], ascending=False).reset_index()
            display(ordered[["primaryTitle", "startYear", "genres", "averageRating", "runtimeMinutes"]])
        
        else:
            print("Sorry mate, we did not find any movies with your prompt")



What genre are you feelin rn? Your choices are: Action, Adventure, Animation, Biography, Comedy, Crime, Documentary, Drama, Family, Fantasy, Film-Noir, History, Horror, Music, Musical, Mystery, News, Reality-TV, Romance, Sci-Fi, Short, Sport, Thriller, War, Western

Select one genre from the list: Action
Yeaah, added Action to your possible matches ;)

How much free time do you have (in minutes)? wer
Maaan, gimme the minutes pls

How much free time do you have (in minutes)? 123
So, hombre, do you want a new or an old movie? Pick your fav dec pls, thnx: ['1940s', '1950s', '1960s', '1970s', '1980s', '1990s', '2000s', '2010s', "I DON'T CARE"]: I don't care

Okay, let me find the right movie for you...
Congrats, we found 1498 matches for ya. The most highly rated of them will appear on your screen soon (hopefully)!


Unnamed: 0,primaryTitle,genres,averageRating,runtimeMinutes
0,Star Wars: Episode IV - A New Hope,"[Action, Adventure, Fantasy]",8.6,121
1,Léon: The Professional,"[Action, Crime, Drama]",8.5,110
2,Oldboy,"[Action, Drama, Mystery]",8.4,120
3,Indiana Jones and the Raiders of the Lost Ark,"[Action, Adventure]",8.4,115
4,"Lock, Stock and Two Smoking Barrels","[Action, Comedy, Crime]",8.2,107
5,Kill Bill: Vol. 1,"[Action, Crime, Drama]",8.2,111
6,Yojimbo,"[Action, Drama, Thriller]",8.2,110
7,Rush,"[Action, Biography, Drama]",8.1,123
8,Mad Max: Fury Road,"[Action, Adventure, Sci-Fi]",8.1,120
9,The Iron Giant,"[Action, Adventure, Animation]",8.1,86


Did you find what you were looking for in the table already? (Y/n) n

So, help us to help you. Name us something you are into (e.g. woman, man, war etc.). Literally, give us anything!
war

Okay, dokey, so this is the list that we could give ya from the word search (in the order of best to worst): 


Unnamed: 0,primaryTitle,startYear,genres,averageRating,runtimeMinutes
0,Yojimbo,1961,"[Action, Drama, Thriller]",8.2,110
1,How to Train Your Dragon,2010,"[Action, Adventure, Animation]",8.1,98
2,The Terminator,1984,"[Action, Sci-Fi]",8.1,107
3,Sanjuro,1962,"[Action, Drama, Thriller]",8.0,96
4,"Crouching Tiger, Hidden Dragon",2000,"[Action, Adventure, Drama]",7.9,120
...,...,...,...,...,...
168,Superman IV: The Quest for Peace,1987,"[Action, Adventure, Sci-Fi]",3.7,90
169,Mortal Kombat: Annihilation,1997,"[Action, Adventure, Fantasy]",3.6,95
170,Steel,1997,"[Action, Adventure, Crime]",2.9,97
171,Dragonball Evolution,2009,"[Action, Adventure, Fantasy]",2.6,85
