# Scrape IMDB to match movie IDs with movie titles

In [1]:
import pandas as pd
import numpy as np
from requests import get
from bs4 import BeautifulSoup
import re
import os
import sys
sys.path.append("../movielingo/")
%load_ext autoreload
%autoreload 2
import time
import requests
import json
import tqdm

In [2]:
def get_movie_characteristics(imdb_id):
    url = 'https://www.imdb.com/title/tt' + imdb_id + '/'
    response = get(url)
    html_soup = BeautifulSoup(response.text, 'html.parser')
    title = html_soup.title.text
    description = html_soup.find('script', type="application/ld+json").contents[0]
    description = json.loads(description)
    genre = description['genre']
    movie_or_tv = description['@type']
    keywords = description['keywords']
    rating = description['aggregateRating']['ratingValue']
    return title, genre, movie_or_tv, keywords, rating

In [3]:
from movielingo.config import subtitle_dir
files = os.listdir(subtitle_dir)

In [4]:
ids = []
for file in files:
    filename = subtitle_dir / file
    imdb_id = re.search("(\d{7})", file).group()
    ids.append(imdb_id)

In [None]:
movie_titles = []
unique_ids = list(set(ids))

for imdb_id in tqdm.tqdm(unique_ids[6000:]):
    try:
        title, genre, movie_or_tv, keywords, rating = get_movie_characteristics(imdb_id)
        time.sleep(5)
        with open('movie_characteristics.txt', 'a') as movie_characteristics_file:
            movie_characteristics_file.write(imdb_id + '\t' + 
                                      title + '\t' + 
                                      genre + '\t' +
                                      movie_or_tv + '\t' +
                                      keywords + '\t' +
                                      rating +
                                      '\n')
    except:
        pass

  3%|▎         | 205/6581 [19:07<10:34:28,  5.97s/it]

In [35]:
imdb = pd.read_csv('/Users/aglushko/Desktop/insight_fellows/insight-project/insight-data-project/notebooks/movie_characteristics.txt',
                   sep = '\t',
                   names = ['id', 'title', 'genre', 'movie_or_show', 'keywords', 'rating'],
                   dtype = {'id': str})

In [36]:
imdb['rating'] = imdb.rating.astype(float)
imdb['id'] = imdb.id.astype(str)

In [37]:
imdb_good = imdb[imdb.rating > 7].reset_index(drop=True).copy()

In [38]:
imdb_good

Unnamed: 0,id,title,genre,movie_or_show,keywords,rating
0,0067484,Nie lubie poniedzialku (1971) - IMDb,Comedy,Movie,"italian,statement in title,monday,thief,robbery",7.6
1,0109686,Dumb and Dumber (1994) - IMDb,Comedy,Movie,"road trip,briefcase of money,woman wearing a s...",7.3
2,0112220,The Wayans Bros. (TV Series 1995–1999) - IMDb,Comedy,TVSeries,"three word title,title ends with period,period...",7.2
3,0047238,Miseria e nobiltà (1954) - IMDb,Comedy,Movie,"false identity,italy,spaghetti,pasta,poverty",7.8
4,0268077,That's My Bush! (TV Series 2001) - IMDb,Comedy,TVSeries,"american politics,political leader,republican,...",7.3
...,...,...,...,...,...,...
137,0414773,The Smoking Room (TV Series 2004–2005) - IMDb,Comedy,TVSeries,"sitcom,british comedy,smoking room",7.9
138,0460091,My Name Is Earl (TV Series 2005–2009) - IMDb,Comedy,TVSeries,"heavy metal,crab,karma,list,low life",7.7
139,0795156,Monty Python's Personal Best (TV Series 2006) ...,Comedy,TVSeries,"compilation,flail,sketch comedy,comedy troupe,...",8.0
140,0081590,Sällskapsresan eller Finns det svenskt kaffe p...,Comedy,Movie,"or as title separator,first part,part of serie...",7.2


In [39]:
imdb_good.loc[:,'title'] = imdb_good.title.str.split(' \(.{4,40}\) - IMDb', expand=True)[0].str.strip()

In [40]:
from movielingo.config import processed_data_dir
imdb_good.to_csv(processed_data_dir / 'imdb_title_and_id_matches.csv', index = False)

In [16]:
IMDB = pd.read_csv(processed_data_dir / 'imdb_title_and_id_matches.csv')
IMDB.head()

Unnamed: 0,id,title,genre,movie_or_show,keywords,rating
0,67484,Nie lubie poniedzialku,Comedy,Movie,"italian,statement in title,monday,thief,robbery",7.6
1,109686,Dumb and Dumber,Comedy,Movie,"road trip,briefcase of money,woman wearing a s...",7.3
2,112220,The Wayans Bros.,Comedy,TVSeries,"three word title,title ends with period,period...",7.2
3,47238,Miseria e nobiltà,Comedy,Movie,"false identity,italy,spaghetti,pasta,poverty",7.8
4,268077,That's My Bush!,Comedy,TVSeries,"american politics,political leader,republican,...",7.3


In [41]:
from movielingo.movie_info_output import *

In [None]:
def show_difficulty(movie_title, subtitle_dir, model_dir, model = 'regression'):
    imdb_id = get_imdb_id_from_db(movie_title)
    html_soup = get_imdb_page_for_movie(imdb_id)
    movie_poster_link = get_link_to_movie_poster(html_soup)
    movie_title_correct = get_movie_title(html_soup)
    df = create_df_from_subtitles(imdb_id, subtitle_dir)
    loaded_model_name = model_dir / 'movielingo_model.sav'
    loaded_model = pickle.load(open(loaded_model_name, 'rb'))
    text_preds = []
    for text_id in df.text_id.unique():
        text_slice = df[df.text_id == text_id]
        text_slice = text_slice.drop(columns = ['text_id','L2_proficiency']).reset_index(drop=True)
        text_pred = loaded_model.predict(text_slice)
        text_preds.append(text_pred)
    if model == 'regression':
        prof_labels = [toeic2cefr(float(x)) for x in text_preds[0].tolist()]
    else:
        prof_labels = text_preds
    levels = Counter(prof_labels).keys()
    classified_as = list(Counter(prof_labels).values())
    n_windows = sum(Counter(prof_labels).values())
    results = []
    for level, label_count in zip(levels, classified_as):
        results.append([level, round(100*label_count/n_windows,2)])
    plot = plot_subtitle_difficulty(results, movie_title_correct)
    return results, movie_poster_link, plot

In [42]:
from movielingo.config import subtitle_dir, model_dir

In [43]:
results, movie_poster_link, plot = show_difficulty('Dumb and Dumber', subtitle_dir, model_dir, model = 'regression')

100%|██████████| 221/221 [00:01<00:00, 159.36it/s]


In [49]:
l2_level = 'UpperInterAdv'
results

[['A2', 92.31], ['B1', 7.69]]

In [52]:
def get_result(results, l2_level):
    labels = []
    vals = []
    for i in range(len(results)):
        labels.append(results[i][0])
        vals.append(results[i][1])
    b1_can_understand = 0
    if 'A2' in labels:
        i = labels.index('A2')
        b1_can_understand += vals[i]
    if 'B1' in labels:
        i = labels.index('B1')
        b1_can_understand += vals[i]
    if l2_level == 'BegInter':
        if b1_can_understand >= 75:
            result = 'is just right for you!'
        elif b1_can_understand >= 50:
            result = 'might be a bit too difficult for you!'
        else:
            'is probably too difficult for you.'
    if l2_level == 'UpperInterAdv':
        if b1_can_understand >= 75:
            result = 'is almost too easy for you!'
        else:
            result = 'is just right for you!'
    return result

In [53]:
get_result(results, l2_level)

'is almost too easy for you!'