First, I build a scraper that collects users' rating data, which is essential for building a recommendation system.


In [80]:
import requests
import requests.exceptions
from bs4 import BeautifulSoup
import pandas as pd
import time
import csv
import random
import re
import json

In [6]:


def fetch(url, retries=5, delay=5, backoff_factor=2, timeout=30):
    """GET ``url``, retrying failed requests with exponential backoff.

    Args:
        url: Address to request.
        retries: Maximum number of attempts before giving up.
        delay: Base wait, in seconds, before the first retry.
        backoff_factor: Multiplier applied to the wait after each failed attempt.
        timeout: Seconds to wait for the server before an attempt counts as
            failed. Without it a stalled connection would block forever.

    Returns:
        The ``requests.Response`` on success. ``None`` when the server answers
        403 (treated as a private ratings page, not retried) or when every
        attempt fails.
    """
    for attempt in range(retries):
        try:
            response = requests.get(url, timeout=timeout)
            if response.status_code == 403:
                print("Private ratings encountered. Skipping user.")
                return None
            response.raise_for_status()
            return response
        except requests.exceptions.RequestException as e:
            # A 403 returns above before raise_for_status(), so every error that
            # reaches this handler is retryable. Do not touch `response` here:
            # it is unbound (or stale) when requests.get() itself raised.
            if attempt + 1 >= retries:
                # Last attempt: report the error but skip the pointless sleep.
                print(f"Request failed with {e}.")
                break
            sleep_time = delay * (backoff_factor ** attempt) + random.uniform(0, 0.1)
            print(f"Request failed with {e}. Retrying in {sleep_time:.2f} seconds...")
            time.sleep(sleep_time)
    print("Failed to fetch the URL after multiple retries. Skipping user.")
    return None  # Return None when the request fails after all retries
    

def _ratings_url(user_url):
    """Return the ratings-page URL for an IMDb user URL.

    Accepts the profile URL with or without a trailing slash, with a query
    string or fragment, or a URL that already points at the ratings page.
    """
    base = user_url.split("?", 1)[0].split("#", 1)[0].rstrip("/")
    if base.endswith("/ratings"):
        return base
    return base + "/ratings"


def scrape_user_ratings(user_url):
    """Collect every rating listed on a user's public IMDb ratings pages.

    Args:
        user_url: Profile URL such as ``https://www.imdb.com/user/ur123/``.
            A URL that already ends in ``/ratings`` is accepted as well.

    Returns:
        A list of ``(user_id, imdb_id, rating)`` tuples, with ``rating`` kept as
        the text scraped from the page. The list is empty when the first page
        cannot be fetched (for example private ratings) or holds no items.
    """
    # Fifth "/"-separated piece of the URL, i.e. the "urNNN" id for
    # https://www.imdb.com/user/urNNN/ style links.
    user_id = user_url.split("/")[4]
    ratings = []
    # Normalise instead of blindly appending "ratings", which produced
    # ".../ratingsratings" for URLs that already pointed at the ratings page.
    user_url = _ratings_url(user_url)

    while user_url:
        response = fetch(user_url)

        if response is None:
            break

        soup = BeautifulSoup(response.content, "html.parser")

        rating_elements = soup.find_all("div", class_="lister-item")

        if not rating_elements:
            break

        for element in rating_elements:
            try:
                imdb_id = element.find("a", href=lambda href: href and "/title/" in href)["href"].split("/")[2]
                # NOTE(review): index 1 is presumably the user's own star rating
                # (index 0 looks like the IMDb average) - confirm against the markup.
                rating = element.find_all("span", class_="ipl-rating-star__rating")[1].text.strip()
                ratings.append((user_id, imdb_id, rating))
            except Exception as e:
                # Best effort: one malformed list item must not discard the rest.
                print(f"Error occurred while parsing a rating for user {user_id}: {e}. Skipping this rating.")

        # Follow the pagination link until the last page is reached.
        next_button = soup.find("a", class_="flat-button lister-page-next next-page")

        if next_button:
            user_url = "https://www.imdb.com" + next_button["href"]
        else:
            user_url = None

    return ratings



In [54]:
# Sanity check on a single public profile: scrape it and count the ratings.
# Pass the bare profile URL (trailing slash, no "ratings" suffix), because
# scrape_user_ratings adds the "ratings" path segment itself.
url = "https://www.imdb.com/user/ur115536310/"
ratings = scrape_user_ratings(url)
len(ratings)

2186

Now I need a way to collect a large number of user URLs; the best approach I could think of is to go through movie review pages.

In [3]:
def scrape_user_urls_from_movie_reviews(movie_id, max_reviews=10, timeout=30):
    """Collect reviewer profile URLs from a movie's IMDb reviews page.

    Args:
        movie_id: IMDb title id (for example ``"tt1001526"``).
        max_reviews: How many review containers, from the top of the page, to
            inspect for a user link.
        timeout: Seconds to wait for the server before giving up on the request.

    Returns:
        A list of absolute ``https://www.imdb.com/user/...`` URLs. The list is
        empty when the page cannot be fetched or holds no matching reviews.
    """
    user_urls = []
    url = f"https://www.imdb.com/title/{movie_id}/reviews"
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        # One unreachable movie page must not abort a crawl over many movies.
        print(f"Could not fetch reviews for {movie_id}: {e}. Skipping movie.")
        return user_urls
    soup = BeautifulSoup(response.content, "html.parser")

    # Each review is wrapped in a "review-container" div.
    review_elements = soup.find_all("div", class_="review-container")

    # Take the reviewer's profile link from each inspected review.
    for element in review_elements[:max_reviews]:
        user_url = element.find("a", href=lambda href: href and "/user/" in href)
        if user_url:
            user_urls.append("https://www.imdb.com" + user_url["href"])

    return user_urls

In [67]:
# Use my own movie watchlist as the list of titles whose review pages are
# scraped for user URLs (one IMDb title id per line).
with open('IMDb_ids.txt', mode='r', encoding='utf-8') as file:
    # Skip blank lines so an empty id never turns into a ".../title//reviews" request.
    movie_ids = [line.strip() for line in file if line.strip()]

In [68]:
# Scrape reviewer profile URLs for every movie and save them to a CSV file.

user_urls = []
# Use forward slashes: in a normal string literal "\u" starts a unicode escape,
# so a backslash-separated path such as 'data\user_...' does not even compile.
csv_file = 'data/user_ratings_data/user_urls.csv'

with open(csv_file, mode='w', encoding='utf-8', newline='') as csvfile:
    csv_writer = csv.writer(csvfile)
    csv_writer.writerow(['url'])

    # `movie_id` rather than `id`, which would shadow the builtin for the rest
    # of the notebook session.
    for movie_id in movie_ids:
        print(f'Now_scraping movie: {movie_id}')
        current_user_urls = scrape_user_urls_from_movie_reviews(movie_id)
        user_urls.extend(current_user_urls)

        for url in current_user_urls:
            csv_writer.writerow([url])

Now_scraping movie: tt1001526
Now_scraping movie: tt10095582
Now_scraping movie: tt1010048
Now_scraping movie: tt0101410
Now_scraping movie: tt1016150
Now_scraping movie: tt10185752
Now_scraping movie: tt1024648
Now_scraping movie: tt10272386
Now_scraping movie: tt1028528
Now_scraping movie: tt10288566
Now_scraping movie: tt1029234
Now_scraping movie: tt0102926
Now_scraping movie: tt10293406
Now_scraping movie: tt1034415
Now_scraping movie: tt10366460
Now_scraping movie: tt0103772
Now_scraping movie: tt1038919
Now_scraping movie: tt1038988
Now_scraping movie: tt0104431
Now_scraping movie: tt1045658
Now_scraping movie: tt0104652
Now_scraping movie: tt1049413
Now_scraping movie: tt1051906
Now_scraping movie: tt0105236
Now_scraping movie: tt0105323
Now_scraping movie: tt1060277
Now_scraping movie: tt0106308
Now_scraping movie: tt10633456
Now_scraping movie: tt10640346
Now_scraping movie: tt10648342
Now_scraping movie: tt1065073
Now_scraping movie: tt0106677
Now_scraping movie: tt0107048
N

Now_scraping movie: tt1748122
Now_scraping movie: tt0175142
Now_scraping movie: tt1772341
Now_scraping movie: tt1778304
Now_scraping movie: tt1790809
Now_scraping movie: tt1790864
Now_scraping movie: tt1791528
Now_scraping movie: tt1798709
Now_scraping movie: tt0180093
Now_scraping movie: tt0181689
Now_scraping movie: tt1821694
Now_scraping movie: tt1825683
Now_scraping movie: tt0183505
Now_scraping movie: tt1840309
Now_scraping movie: tt1843866
Now_scraping movie: tt1853728
Now_scraping movie: tt1856101
Now_scraping movie: tt0185937
Now_scraping movie: tt1860213
Now_scraping movie: tt1860353
Now_scraping movie: tt0187078
Now_scraping movie: tt1872181
Now_scraping movie: tt1877830
Now_scraping movie: tt1877832
Now_scraping movie: tt1878870
Now_scraping movie: tt1895587
Now_scraping movie: tt1912398
Now_scraping movie: tt1922777
Now_scraping movie: tt1924435
Now_scraping movie: tt1931533
Now_scraping movie: tt1935156
Now_scraping movie: tt1937390
Now_scraping movie: tt1950186
Now_scrapi

Now_scraping movie: tt0413267
Now_scraping movie: tt0413300
Now_scraping movie: tt4154664
Now_scraping movie: tt4154756
Now_scraping movie: tt4154796
Now_scraping movie: tt0416449
Now_scraping movie: tt0417741
Now_scraping movie: tt4178092
Now_scraping movie: tt0418279
Now_scraping movie: tt4209788
Now_scraping movie: tt0421715
Now_scraping movie: tt0425112
Now_scraping movie: tt0042619
Now_scraping movie: tt4263482
Now_scraping movie: tt0427312
Now_scraping movie: tt0430922
Now_scraping movie: tt0432348
Now_scraping movie: tt0433035
Now_scraping movie: tt0434409
Now_scraping movie: tt0435761
Now_scraping movie: tt4364194
Now_scraping movie: tt0438097
Now_scraping movie: tt0440963
Now_scraping movie: tt0441773
Now_scraping movie: tt4425200
Now_scraping movie: tt0443453
Now_scraping movie: tt0443706
Now_scraping movie: tt0446029
Now_scraping movie: tt0448115
Now_scraping movie: tt0448157
Now_scraping movie: tt4483220
Now_scraping movie: tt0448694
Now_scraping movie: tt0449059
Now_scrapi

Now_scraping movie: tt0948470
Now_scraping movie: tt9484998
Now_scraping movie: tt0095016
Now_scraping movie: tt0095953
Now_scraping movie: tt9620288
Now_scraping movie: tt0096283
Now_scraping movie: tt9639470
Now_scraping movie: tt9684220
Now_scraping movie: tt0974015
Now_scraping movie: tt0097576
Now_scraping movie: tt9764362
Now_scraping movie: tt0983193
Now_scraping movie: tt0985694
Now_scraping movie: tt0986263
Now_scraping movie: tt0988045
Now_scraping movie: tt0099077
Now_scraping movie: tt0993846
Now_scraping movie: tt0099423
Now_scraping movie: tt0099685
Now_scraping movie: tt0099785
Now_scraping movie: tt0099871


In [4]:
# Load the scraped profile URLs, dropping blank rows and duplicates.
# The path matches the location the scraping cell writes to.
# dict.fromkeys keeps first-seen file order, so list positions are the same on
# every run. set() order changes between kernel restarts (string hash
# randomisation), which would make the positional `continue_from` resume index
# point at different users.
with open('data/user_ratings_data/user_urls.csv', mode='r', encoding='utf-8', newline='') as csvfile:
    csv_reader = csv.reader(csvfile)
    next(csv_reader, None)  # skip the 'url' header row; tolerate an empty file
    user_urls = list(dict.fromkeys(row[0] for row in csv_reader if row))

total_users = len(user_urls)
total_users

3412

In [8]:
# Scrape the ratings of every collected user and write them to a CSV file.

ratings_csv_file = 'movie_ratings3.csv'
# 1-based position in user_urls to start from. When resuming an interrupted
# crawl, raise this AND point ratings_csv_file at a new file, because mode='w'
# below truncates whatever the file already contains.
continue_from = 2900

with open(ratings_csv_file, mode='w', encoding='utf-8', newline='') as csvfile:
    csv_writer = csv.writer(csvfile)
    csv_writer.writerow(['user_id', 'imdb_id', 'rating'])

    # Slice from the 1-based start position and number the progress output to match.
    for index, user_url in enumerate(user_urls[continue_from-1:], start=continue_from):
        print(f'Now scraping user {index}/{total_users}: {user_url}')
        user_ratings = scrape_user_ratings(user_url)
        print(f'Total reviews scraped: {len(user_ratings)}')
        csv_writer.writerows(user_ratings)
        # Flush per user so an interruption loses at most the user in progress.
        csvfile.flush()

Now scraping user 2900/3412: https://www.imdb.com/user/ur127235583/
Private ratings encountered. Skipping user.
Total reviews scraped: 0
Now scraping user 2901/3412: https://www.imdb.com/user/ur122400466/
Total reviews scraped: 743
Now scraping user 2902/3412: https://www.imdb.com/user/ur121409096/
Total reviews scraped: 26
Now scraping user 2903/3412: https://www.imdb.com/user/ur56938771/
Total reviews scraped: 1070
Now scraping user 2904/3412: https://www.imdb.com/user/ur30774546/
Private ratings encountered. Skipping user.
Total reviews scraped: 0
Now scraping user 2905/3412: https://www.imdb.com/user/ur4731393/
Private ratings encountered. Skipping user.
Total reviews scraped: 0
Now scraping user 2906/3412: https://www.imdb.com/user/ur39224458/
Private ratings encountered. Skipping user.
Total reviews scraped: 0
Now scraping user 2907/3412: https://www.imdb.com/user/ur19511193/
Private ratings encountered. Skipping user.
Total reviews scraped: 0
Now scraping user 2908/3412: https:/

Total reviews scraped: 175
Now scraping user 2973/3412: https://www.imdb.com/user/ur84948719/
Total reviews scraped: 301
Now scraping user 2974/3412: https://www.imdb.com/user/ur0600113/
Private ratings encountered. Skipping user.
Total reviews scraped: 0
Now scraping user 2975/3412: https://www.imdb.com/user/ur0902607/
Private ratings encountered. Skipping user.
Total reviews scraped: 0
Now scraping user 2976/3412: https://www.imdb.com/user/ur1257208/
Total reviews scraped: 780
Now scraping user 2977/3412: https://www.imdb.com/user/ur59754734/
Total reviews scraped: 253
Now scraping user 2978/3412: https://www.imdb.com/user/ur12164854/
Total reviews scraped: 265
Now scraping user 2979/3412: https://www.imdb.com/user/ur134158291/
Private ratings encountered. Skipping user.
Total reviews scraped: 0
Now scraping user 2980/3412: https://www.imdb.com/user/ur19009969/
Total reviews scraped: 22254
Now scraping user 2981/3412: https://www.imdb.com/user/ur34031905/
Private ratings encountered.

Total reviews scraped: 1483
Now scraping user 3043/3412: https://www.imdb.com/user/ur17582233/
Total reviews scraped: 1241
Now scraping user 3044/3412: https://www.imdb.com/user/ur75968031/
Total reviews scraped: 216
Now scraping user 3045/3412: https://www.imdb.com/user/ur46400079/
Total reviews scraped: 9
Now scraping user 3046/3412: https://www.imdb.com/user/ur15932214/
Total reviews scraped: 99
Now scraping user 3047/3412: https://www.imdb.com/user/ur1261820/
Private ratings encountered. Skipping user.
Total reviews scraped: 0
Now scraping user 3048/3412: https://www.imdb.com/user/ur14692191/
Total reviews scraped: 1076
Now scraping user 3049/3412: https://www.imdb.com/user/ur3223254/
Total reviews scraped: 2345
Now scraping user 3050/3412: https://www.imdb.com/user/ur5631672/
Private ratings encountered. Skipping user.
Total reviews scraped: 0
Now scraping user 3051/3412: https://www.imdb.com/user/ur24747273/
Total reviews scraped: 1318
Now scraping user 3052/3412: https://www.imd

Private ratings encountered. Skipping user.
Total reviews scraped: 0
Now scraping user 3119/3412: https://www.imdb.com/user/ur34632605/
Total reviews scraped: 451
Now scraping user 3120/3412: https://www.imdb.com/user/ur3663051/
Private ratings encountered. Skipping user.
Total reviews scraped: 0
Now scraping user 3121/3412: https://www.imdb.com/user/ur0897979/
Private ratings encountered. Skipping user.
Total reviews scraped: 0
Now scraping user 3122/3412: https://www.imdb.com/user/ur75672959/
Private ratings encountered. Skipping user.
Total reviews scraped: 0
Now scraping user 3123/3412: https://www.imdb.com/user/ur43980738/
Total reviews scraped: 217
Now scraping user 3124/3412: https://www.imdb.com/user/ur22779251/
Total reviews scraped: 90
Now scraping user 3125/3412: https://www.imdb.com/user/ur12680645/
Total reviews scraped: 270
Now scraping user 3126/3412: https://www.imdb.com/user/ur1195884/
Error occurred while parsing a rating for user ur1195884: list index out of range. S

Total reviews scraped: 1549
Now scraping user 3191/3412: https://www.imdb.com/user/ur15606002/
Private ratings encountered. Skipping user.
Total reviews scraped: 0
Now scraping user 3192/3412: https://www.imdb.com/user/ur0678462/
Private ratings encountered. Skipping user.
Total reviews scraped: 0
Now scraping user 3193/3412: https://www.imdb.com/user/ur39517558/
Total reviews scraped: 5068
Now scraping user 3194/3412: https://www.imdb.com/user/ur106007927/
Total reviews scraped: 610
Now scraping user 3195/3412: https://www.imdb.com/user/ur12142982/
Private ratings encountered. Skipping user.
Total reviews scraped: 0
Now scraping user 3196/3412: https://www.imdb.com/user/ur28009688/
Total reviews scraped: 2919
Now scraping user 3197/3412: https://www.imdb.com/user/ur8160890/
Total reviews scraped: 1692
Now scraping user 3198/3412: https://www.imdb.com/user/ur22487163/
Total reviews scraped: 1132
Now scraping user 3199/3412: https://www.imdb.com/user/ur35595984/
Private ratings encounte

Private ratings encountered. Skipping user.
Total reviews scraped: 0
Now scraping user 3263/3412: https://www.imdb.com/user/ur0950119/
Private ratings encountered. Skipping user.
Total reviews scraped: 0
Now scraping user 3264/3412: https://www.imdb.com/user/ur65195412/
Total reviews scraped: 2899
Now scraping user 3265/3412: https://www.imdb.com/user/ur23222655/
Total reviews scraped: 75
Now scraping user 3266/3412: https://www.imdb.com/user/ur0499651/
Total reviews scraped: 970
Now scraping user 3267/3412: https://www.imdb.com/user/ur45060413/
Total reviews scraped: 1519
Now scraping user 3268/3412: https://www.imdb.com/user/ur18244116/
Private ratings encountered. Skipping user.
Total reviews scraped: 0
Now scraping user 3269/3412: https://www.imdb.com/user/ur1538156/
Private ratings encountered. Skipping user.
Total reviews scraped: 0
Now scraping user 3270/3412: https://www.imdb.com/user/ur21384504/
Private ratings encountered. Skipping user.
Total reviews scraped: 0
Now scraping 

Error occurred while parsing a rating for user ur5424352: list index out of range. Skipping this rating.
Total reviews scraped: 3488
Now scraping user 3334/3412: https://www.imdb.com/user/ur13549369/
Total reviews scraped: 238
Now scraping user 3335/3412: https://www.imdb.com/user/ur2403537/
Total reviews scraped: 2828
Now scraping user 3336/3412: https://www.imdb.com/user/ur0024463/
Private ratings encountered. Skipping user.
Total reviews scraped: 0
Now scraping user 3337/3412: https://www.imdb.com/user/ur6157371/
Total reviews scraped: 1594
Now scraping user 3338/3412: https://www.imdb.com/user/ur1016453/
Total reviews scraped: 729
Now scraping user 3339/3412: https://www.imdb.com/user/ur41955249/
Total reviews scraped: 764
Now scraping user 3340/3412: https://www.imdb.com/user/ur55249699/
Total reviews scraped: 371
Now scraping user 3341/3412: https://www.imdb.com/user/ur98033888/
Total reviews scraped: 1027
Now scraping user 3342/3412: https://www.imdb.com/user/ur76298875/
Private

Total reviews scraped: 1088
Now scraping user 3408/3412: https://www.imdb.com/user/ur96934780/
Total reviews scraped: 97
Now scraping user 3409/3412: https://www.imdb.com/user/ur35544190/
Total reviews scraped: 1282
Now scraping user 3410/3412: https://www.imdb.com/user/ur5930448/
Total reviews scraped: 2
Now scraping user 3411/3412: https://www.imdb.com/user/ur22255390/
Private ratings encountered. Skipping user.
Total reviews scraped: 0
Now scraping user 3412/3412: https://www.imdb.com/user/ur1219578/
Total reviews scraped: 2204


In [98]:
# Also need a scraper to get the Metascore, review counts, budget and worldwide gross for each movie in the IMDb dataset (the implementation below is currently commented out).

# def scrape_movie_data(imdb_id):
#     url = f"https://www.imdb.com/title/{imdb_id}"
#     headers = {
#         "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36"}
#     response = requests.get(url, headers=headers)
#     soup = BeautifulSoup(response.content, "html.parser")
    
    
#     try:
#         review_counts = soup.find("ul", {"data-testid": "reviewContent-all-reviews"}).find_all("span", class_="score")
#         user_reviews = review_counts[0].text.strip()
#         critic_reviews = review_counts[1].text.strip()
#     except:
#         user_reviews = None
#         critic_reviews = None

#     try:
#         metascore = soup.find("span", class_="score-meta").text.strip()
#     except:
#         metascore = None

   
#     budget_match = re.search(r'Budget</span>\s*<span class="ipc-metadata-list-item__list-content-item">(\$[\d,]+)', response.content.decode("utf-8"))
#     worldwide_gross_match = re.search(r'Gross worldwide</span>\s*<span class="ipc-metadata-list-item__list-content-item">(\$[\d,]+)', response.content.decode("utf-8"))

#     budget = budget_match.group(1) if budget_match else None
#     worldwide_gross = worldwide_gross_match.group(1) if worldwide_gross_match else None


#     data = {
#         "imdb_id": imdb_id,
#         "critic_reviews": critic_reviews,
#         "user_reviews": user_reviews,
#         "metascore": metascore,
#         "budget": budget,
#         "worldwide_gross": worldwide_gross
#     }
#     df = pd.DataFrame(data, index=[0])

#     return df

In [99]:
# Load the IMDb ids to enrich, dropping blank rows and duplicates.
# dict.fromkeys keeps first-seen file order, so the list order is the same on
# every run (set() order varies between kernel restarts).
with open('data/user_ratings_data/imdb_ids.csv', mode='r', encoding='utf-8', newline='') as csvfile:
    csv_reader = csv.reader(csvfile)
    imdb_ids = list(dict.fromkeys(row[0] for row in csv_reader if row))
total_films = len(imdb_ids)

In [100]:
# Number of unique IMDb ids loaded from imdb_ids.csv.
total_films

287519

In [101]:
# metacritic_data = 'movie_metacritic_and_budget.csv'

# with open(metacritic_data, mode='w', encoding='utf-8', newline='') as csvfile:
#     csv_writer = csv.writer(csvfile)
#     csv_writer.writerow(['imdb_id', 'critic_reviews', 'user_reviews', 'metascore', 'budget', 'worldwide_gross'])

#     for index, film_id in enumerate(imdb_ids):  
#         print(f'Now scraping film {index}/{total_films}: {film_id}')
#         movie_data = scrape_movie_data(film_id)
#         print(f'Total reviews scraped: {len(movie_data)}')
#         for _, row in movie_data.iterrows():
#             csv_writer.writerow(row.tolist())

Now scraping film 0/287519: tt0044338
Total reviews scraped: 1
Now scraping film 1/287519: tt0497830
Total reviews scraped: 1
Now scraping film 2/287519: tt9598172
Total reviews scraped: 1
Now scraping film 3/287519: tt0378645
Total reviews scraped: 1
Now scraping film 4/287519: tt0155693
Total reviews scraped: 1
Now scraping film 5/287519: tt0071452
Total reviews scraped: 1
Now scraping film 6/287519: tt5129254
Total reviews scraped: 1
Now scraping film 7/287519: tt0021844
Total reviews scraped: 1
Now scraping film 8/287519: tt2842566
Total reviews scraped: 1
Now scraping film 9/287519: tt8461156
Total reviews scraped: 1
Now scraping film 10/287519: tt1110266
Total reviews scraped: 1
Now scraping film 11/287519: tt14715970
Total reviews scraped: 1
Now scraping film 12/287519: tt0041603
Total reviews scraped: 1
Now scraping film 13/287519: tt2162711
Total reviews scraped: 1
Now scraping film 14/287519: tt0052604
Total reviews scraped: 1
Now scraping film 15/287519: tt9676900
Total revi

Total reviews scraped: 1
Now scraping film 128/287519: tt6082156
Total reviews scraped: 1
Now scraping film 129/287519: tt1479398
Total reviews scraped: 1
Now scraping film 130/287519: tt8011328
Total reviews scraped: 1
Now scraping film 131/287519: tt1123948
Total reviews scraped: 1
Now scraping film 132/287519: tt0080551
Total reviews scraped: 1
Now scraping film 133/287519: tt0297244
Total reviews scraped: 1
Now scraping film 134/287519: tt0363771
Total reviews scraped: 1
Now scraping film 135/287519: tt1482101
Total reviews scraped: 1
Now scraping film 136/287519: tt0119622
Total reviews scraped: 1
Now scraping film 137/287519: tt0959518
Total reviews scraped: 1
Now scraping film 138/287519: tt0069013
Total reviews scraped: 1
Now scraping film 139/287519: tt1351665
Total reviews scraped: 1
Now scraping film 140/287519: tt6871426
Total reviews scraped: 1
Now scraping film 141/287519: tt7779818
Total reviews scraped: 1
Now scraping film 142/287519: tt0131043
Total reviews scraped: 1


Total reviews scraped: 1
Now scraping film 254/287519: tt5599536
Total reviews scraped: 1
Now scraping film 255/287519: tt21322850
Total reviews scraped: 1
Now scraping film 256/287519: tt1031658
Total reviews scraped: 1
Now scraping film 257/287519: tt9913872
Total reviews scraped: 1
Now scraping film 258/287519: tt0110216
Total reviews scraped: 1
Now scraping film 259/287519: tt1506990
Total reviews scraped: 1
Now scraping film 260/287519: tt0279162
Total reviews scraped: 1
Now scraping film 261/287519: tt0253757
Total reviews scraped: 1
Now scraping film 262/287519: tt0202167
Total reviews scraped: 1
Now scraping film 263/287519: tt3166926
Total reviews scraped: 1
Now scraping film 264/287519: tt10408656
Total reviews scraped: 1
Now scraping film 265/287519: tt0191242
Total reviews scraped: 1
Now scraping film 266/287519: tt4136696
Total reviews scraped: 1
Now scraping film 267/287519: tt1681372
Total reviews scraped: 1
Now scraping film 268/287519: tt1270707
Total reviews scraped: 

Total reviews scraped: 1
Now scraping film 380/287519: tt4857736
Total reviews scraped: 1
Now scraping film 381/287519: tt8168944
Total reviews scraped: 1
Now scraping film 382/287519: tt0799976
Total reviews scraped: 1
Now scraping film 383/287519: tt0318745
Total reviews scraped: 1
Now scraping film 384/287519: tt0064641
Total reviews scraped: 1
Now scraping film 385/287519: tt0140720
Total reviews scraped: 1
Now scraping film 386/287519: tt0070861
Total reviews scraped: 1
Now scraping film 387/287519: tt8688864
Total reviews scraped: 1
Now scraping film 388/287519: tt0071995
Total reviews scraped: 1
Now scraping film 389/287519: tt0181972
Total reviews scraped: 1
Now scraping film 390/287519: tt0199016
Total reviews scraped: 1
Now scraping film 391/287519: tt0052705
Total reviews scraped: 1
Now scraping film 392/287519: tt5864288
Total reviews scraped: 1
Now scraping film 393/287519: tt1198406
Total reviews scraped: 1
Now scraping film 394/287519: tt0310657
Total reviews scraped: 1


Total reviews scraped: 1
Now scraping film 506/287519: tt2848446
Total reviews scraped: 1
Now scraping film 507/287519: tt0166769
Total reviews scraped: 1
Now scraping film 508/287519: tt11891246
Total reviews scraped: 1
Now scraping film 509/287519: tt0291224
Total reviews scraped: 1
Now scraping film 510/287519: tt12414062
Total reviews scraped: 1
Now scraping film 511/287519: tt1233329
Total reviews scraped: 1
Now scraping film 512/287519: tt11426572
Total reviews scraped: 1
Now scraping film 513/287519: tt0368679
Total reviews scraped: 1
Now scraping film 514/287519: tt12885338
Total reviews scraped: 1
Now scraping film 515/287519: tt11807888
Total reviews scraped: 1
Now scraping film 516/287519: tt0086566
Total reviews scraped: 1
Now scraping film 517/287519: tt11378230
Total reviews scraped: 1
Now scraping film 518/287519: tt0042980
Total reviews scraped: 1
Now scraping film 519/287519: tt1959550
Total reviews scraped: 1
Now scraping film 520/287519: tt1671674
Total reviews scrap

Total reviews scraped: 1
Now scraping film 632/287519: tt27141553
Total reviews scraped: 1
Now scraping film 633/287519: tt2668914
Total reviews scraped: 1
Now scraping film 634/287519: tt0082262
Total reviews scraped: 1
Now scraping film 635/287519: tt0007473
Total reviews scraped: 1
Now scraping film 636/287519: tt17074540
Total reviews scraped: 1
Now scraping film 637/287519: tt1260680
Total reviews scraped: 1
Now scraping film 638/287519: tt3820948
Total reviews scraped: 1
Now scraping film 639/287519: tt0114149
Total reviews scraped: 1
Now scraping film 640/287519: tt0071647
Total reviews scraped: 1
Now scraping film 641/287519: tt0426332
Total reviews scraped: 1
Now scraping film 642/287519: tt0167787
Total reviews scraped: 1
Now scraping film 643/287519: tt0186714
Total reviews scraped: 1
Now scraping film 644/287519: tt0005739
Total reviews scraped: 1
Now scraping film 645/287519: tt0407887
Total reviews scraped: 1
Now scraping film 646/287519: tt0067095
Total reviews scraped: 

Total reviews scraped: 1
Now scraping film 758/287519: tt12145322
Total reviews scraped: 1
Now scraping film 759/287519: tt0040087
Total reviews scraped: 1
Now scraping film 760/287519: tt1948512
Total reviews scraped: 1
Now scraping film 761/287519: tt1617208
Total reviews scraped: 1
Now scraping film 762/287519: tt16526498
Total reviews scraped: 1
Now scraping film 763/287519: tt0096440
Total reviews scraped: 1
Now scraping film 764/287519: tt7897316
Total reviews scraped: 1
Now scraping film 765/287519: tt5974626
Total reviews scraped: 1
Now scraping film 766/287519: tt2948782
Total reviews scraped: 1
Now scraping film 767/287519: tt0069929
Total reviews scraped: 1
Now scraping film 768/287519: tt5051224
Total reviews scraped: 1
Now scraping film 769/287519: tt0116357
Total reviews scraped: 1
Now scraping film 770/287519: tt2081314
Total reviews scraped: 1
Now scraping film 771/287519: tt0095126
Total reviews scraped: 1
Now scraping film 772/287519: tt8822390
Total reviews scraped: 

Total reviews scraped: 1
Now scraping film 884/287519: tt2715590
Total reviews scraped: 1
Now scraping film 885/287519: tt0049153
Total reviews scraped: 1
Now scraping film 886/287519: tt2274052
Total reviews scraped: 1
Now scraping film 887/287519: tt5927562
Total reviews scraped: 1
Now scraping film 888/287519: tt1446838
Total reviews scraped: 1
Now scraping film 889/287519: tt4829436
Total reviews scraped: 1
Now scraping film 890/287519: tt10329842
Total reviews scraped: 1
Now scraping film 891/287519: tt0090565
Total reviews scraped: 1
Now scraping film 892/287519: tt1947969
Total reviews scraped: 1
Now scraping film 893/287519: tt0320136
Total reviews scraped: 1
Now scraping film 894/287519: tt14581444
Total reviews scraped: 1
Now scraping film 895/287519: tt4162012
Total reviews scraped: 1
Now scraping film 896/287519: tt0363096
Total reviews scraped: 1
Now scraping film 897/287519: tt0052919
Total reviews scraped: 1
Now scraping film 898/287519: tt0012484
Total reviews scraped: 

Total reviews scraped: 1
Now scraping film 1010/287519: tt0127272
Total reviews scraped: 1
Now scraping film 1011/287519: tt0211661
Total reviews scraped: 1
Now scraping film 1012/287519: tt0030118
Total reviews scraped: 1
Now scraping film 1013/287519: tt0201899
Total reviews scraped: 1
Now scraping film 1014/287519: tt6892064
Total reviews scraped: 1
Now scraping film 1015/287519: tt8174446
Total reviews scraped: 1
Now scraping film 1016/287519: tt0190319
Total reviews scraped: 1
Now scraping film 1017/287519: tt4321938
Total reviews scraped: 1
Now scraping film 1018/287519: tt0059805
Total reviews scraped: 1
Now scraping film 1019/287519: tt0156298
Total reviews scraped: 1
Now scraping film 1020/287519: tt0134771
Total reviews scraped: 1
Now scraping film 1021/287519: tt0229193
Total reviews scraped: 1
Now scraping film 1022/287519: tt2315152
Total reviews scraped: 1
Now scraping film 1023/287519: tt5126986
Total reviews scraped: 1
Now scraping film 1024/287519: tt1338636
Total revi

Total reviews scraped: 1
Now scraping film 1134/287519: tt0171469
Total reviews scraped: 1
Now scraping film 1135/287519: tt0302674
Total reviews scraped: 1
Now scraping film 1136/287519: tt0110480
Total reviews scraped: 1
Now scraping film 1137/287519: tt4242982
Total reviews scraped: 1
Now scraping film 1138/287519: tt0378697
Total reviews scraped: 1
Now scraping film 1139/287519: tt5250946
Total reviews scraped: 1
Now scraping film 1140/287519: tt0112428
Total reviews scraped: 1
Now scraping film 1141/287519: tt11722496
Total reviews scraped: 1
Now scraping film 1142/287519: tt10360860
Total reviews scraped: 1
Now scraping film 1143/287519: tt1858799
Total reviews scraped: 1
Now scraping film 1144/287519: tt20115096
Total reviews scraped: 1
Now scraping film 1145/287519: tt6809036
Total reviews scraped: 1
Now scraping film 1146/287519: tt5593444
Total reviews scraped: 1
Now scraping film 1147/287519: tt0091165
Total reviews scraped: 1
Now scraping film 1148/287519: tt0021377
Total r

Total reviews scraped: 1
Now scraping film 1259/287519: tt10524746
Total reviews scraped: 1
Now scraping film 1260/287519: tt0997170
Total reviews scraped: 1
Now scraping film 1261/287519: tt0327947
Total reviews scraped: 1
Now scraping film 1262/287519: tt1312954
Total reviews scraped: 1
Now scraping film 1263/287519: tt20231014
Total reviews scraped: 1
Now scraping film 1264/287519: tt2113638
Total reviews scraped: 1
Now scraping film 1265/287519: tt0370438
Total reviews scraped: 1
Now scraping film 1266/287519: tt0106308
Total reviews scraped: 1
Now scraping film 1267/287519: tt4464394
Total reviews scraped: 1
Now scraping film 1268/287519: tt0064757
Total reviews scraped: 1
Now scraping film 1269/287519: tt0180974
Total reviews scraped: 1
Now scraping film 1270/287519: tt0140003
Total reviews scraped: 1
Now scraping film 1271/287519: tt3228124
Total reviews scraped: 1
Now scraping film 1272/287519: tt1793239
Total reviews scraped: 1
Now scraping film 1273/287519: tt13073962
Total r

Total reviews scraped: 1
Now scraping film 1383/287519: tt4934886
Total reviews scraped: 1
Now scraping film 1384/287519: tt13761172
Total reviews scraped: 1
Now scraping film 1385/287519: tt8033592
Total reviews scraped: 1
Now scraping film 1386/287519: tt22360682
Total reviews scraped: 1
Now scraping film 1387/287519: tt1292642
Total reviews scraped: 1
Now scraping film 1388/287519: tt0461769
Total reviews scraped: 1
Now scraping film 1389/287519: tt1454700
Total reviews scraped: 1
Now scraping film 1390/287519: tt0357092
Total reviews scraped: 1
Now scraping film 1391/287519: tt0264883
Total reviews scraped: 1
Now scraping film 1392/287519: tt4653714
Total reviews scraped: 1
Now scraping film 1393/287519: tt0263017
Total reviews scraped: 1
Now scraping film 1394/287519: tt0038873
Total reviews scraped: 1
Now scraping film 1395/287519: tt0088678
Total reviews scraped: 1
Now scraping film 1396/287519: tt1219167
Total reviews scraped: 1
Now scraping film 1397/287519: tt1789814
Total re

Total reviews scraped: 1
Now scraping film 1507/287519: tt0078432
Total reviews scraped: 1
Now scraping film 1508/287519: tt0120883
Total reviews scraped: 1
Now scraping film 1509/287519: tt0058896
Total reviews scraped: 1
Now scraping film 1510/287519: tt0078986
Total reviews scraped: 1
Now scraping film 1511/287519: tt1326236
Total reviews scraped: 1
Now scraping film 1512/287519: tt4765870
Total reviews scraped: 1
Now scraping film 1513/287519: tt12466240
Total reviews scraped: 1
Now scraping film 1514/287519: tt5518358
Total reviews scraped: 1
Now scraping film 1515/287519: tt0290402
Total reviews scraped: 1
Now scraping film 1516/287519: tt0050743
Total reviews scraped: 1
Now scraping film 1517/287519: tt6113666
Total reviews scraped: 1
Now scraping film 1518/287519: tt2906280
Total reviews scraped: 1
Now scraping film 1519/287519: tt0469681
Total reviews scraped: 1
Now scraping film 1520/287519: tt10875196
Total reviews scraped: 1
Now scraping film 1521/287519: tt0275230
Total re

Total reviews scraped: 1
Now scraping film 1631/287519: tt0150079
Total reviews scraped: 1
Now scraping film 1632/287519: tt0258032
Total reviews scraped: 1
Now scraping film 1633/287519: tt0353010
Total reviews scraped: 1
Now scraping film 1634/287519: tt0100587
Total reviews scraped: 1
Now scraping film 1635/287519: tt2607890
Total reviews scraped: 1
Now scraping film 1636/287519: tt1878969
Total reviews scraped: 1
Now scraping film 1637/287519: tt0431498
Total reviews scraped: 1
Now scraping film 1638/287519: tt0445388
Total reviews scraped: 1
Now scraping film 1639/287519: tt0991010
Total reviews scraped: 1
Now scraping film 1640/287519: tt0240090
Total reviews scraped: 1
Now scraping film 1641/287519: tt12219112
Total reviews scraped: 1
Now scraping film 1642/287519: tt0052722
Total reviews scraped: 1
Now scraping film 1643/287519: tt0160624
Total reviews scraped: 1
Now scraping film 1644/287519: tt8171022
Total reviews scraped: 1
Now scraping film 1645/287519: tt0108672
Total rev

Total reviews scraped: 1
Now scraping film 1755/287519: tt3265542
Total reviews scraped: 1
Now scraping film 1756/287519: tt0132686
Total reviews scraped: 1
Now scraping film 1757/287519: tt10911998
Total reviews scraped: 1
Now scraping film 1758/287519: tt3331930
Total reviews scraped: 1
Now scraping film 1759/287519: tt1396235
Total reviews scraped: 1
Now scraping film 1760/287519: tt9788930
Total reviews scraped: 1
Now scraping film 1761/287519: tt0446686
Total reviews scraped: 1
Now scraping film 1762/287519: tt7820502
Total reviews scraped: 1
Now scraping film 1763/287519: tt7952200
Total reviews scraped: 1
Now scraping film 1764/287519: tt0354104
Total reviews scraped: 1
Now scraping film 1765/287519: tt0067676
Total reviews scraped: 1
Now scraping film 1766/287519: tt14057058
Total reviews scraped: 1
Now scraping film 1767/287519: tt0041746
Total reviews scraped: 1
Now scraping film 1768/287519: tt0051653
Total reviews scraped: 1
Now scraping film 1769/287519: tt16306290
Total r

Total reviews scraped: 1
Now scraping film 1879/287519: tt5786184
Total reviews scraped: 1
Now scraping film 1880/287519: tt3174292
Total reviews scraped: 1
Now scraping film 1881/287519: tt10434706
Total reviews scraped: 1
Now scraping film 1882/287519: tt0052617
Total reviews scraped: 1
Now scraping film 1883/287519: tt10642834
Total reviews scraped: 1
Now scraping film 1884/287519: tt11422894
Total reviews scraped: 1
Now scraping film 1885/287519: tt0011130
Total reviews scraped: 1
Now scraping film 1886/287519: tt0826796
Total reviews scraped: 1
Now scraping film 1887/287519: tt0115067
Total reviews scraped: 1
Now scraping film 1888/287519: tt0072590
Total reviews scraped: 1
Now scraping film 1889/287519: tt0039192
Total reviews scraped: 1
Now scraping film 1890/287519: tt11779042
Total reviews scraped: 1
Now scraping film 1891/287519: tt9860728
Total reviews scraped: 1
Now scraping film 1892/287519: tt0046996
Total reviews scraped: 1
Now scraping film 1893/287519: tt26595548
Total

Total reviews scraped: 1
Now scraping film 2003/287519: tt0198604
Total reviews scraped: 1
Now scraping film 2004/287519: tt0065610
Total reviews scraped: 1
Now scraping film 2005/287519: tt0089821
Total reviews scraped: 1
Now scraping film 2006/287519: tt0060730
Total reviews scraped: 1
Now scraping film 2007/287519: tt12738164
Total reviews scraped: 1
Now scraping film 2008/287519: tt4138306
Total reviews scraped: 1
Now scraping film 2009/287519: tt4292478
Total reviews scraped: 1
Now scraping film 2010/287519: tt14773866
Total reviews scraped: 1
Now scraping film 2011/287519: tt0251182
Total reviews scraped: 1
Now scraping film 2012/287519: tt0013093
Total reviews scraped: 1
Now scraping film 2013/287519: tt2430166
Total reviews scraped: 1
Now scraping film 2014/287519: tt0031420
Total reviews scraped: 1
Now scraping film 2015/287519: tt0031519
Total reviews scraped: 1
Now scraping film 2016/287519: tt0810822
Total reviews scraped: 1
Now scraping film 2017/287519: tt0037148
Total re

Total reviews scraped: 1
Now scraping film 2127/287519: tt0173686
Total reviews scraped: 1
Now scraping film 2128/287519: tt0196857
Total reviews scraped: 1
Now scraping film 2129/287519: tt3487306
Total reviews scraped: 1
Now scraping film 2130/287519: tt0075279
Total reviews scraped: 1
Now scraping film 2131/287519: tt1802529
Total reviews scraped: 1
Now scraping film 2132/287519: tt0054336
Total reviews scraped: 1
Now scraping film 2133/287519: tt0049492
Total reviews scraped: 1
Now scraping film 2134/287519: tt0259335
Total reviews scraped: 1
Now scraping film 2135/287519: tt1160012
Total reviews scraped: 1
Now scraping film 2136/287519: tt2597554
Total reviews scraped: 1
Now scraping film 2137/287519: tt7514302
Total reviews scraped: 1
Now scraping film 2138/287519: tt0046436
Total reviews scraped: 1
Now scraping film 2139/287519: tt3119050
Total reviews scraped: 1
Now scraping film 2140/287519: tt1297844
Total reviews scraped: 1
Now scraping film 2141/287519: tt2327439
Total revi

Total reviews scraped: 1
Now scraping film 2252/287519: tt0071963
Total reviews scraped: 1
Now scraping film 2253/287519: tt3679978
Total reviews scraped: 1
Now scraping film 2254/287519: tt4977540
Total reviews scraped: 1
Now scraping film 2255/287519: tt11409736
Total reviews scraped: 1
Now scraping film 2256/287519: tt2610798
Total reviews scraped: 1
Now scraping film 2257/287519: tt0014118
Total reviews scraped: 1
Now scraping film 2258/287519: tt14873836
Total reviews scraped: 1
Now scraping film 2259/287519: tt1800669
Total reviews scraped: 1
Now scraping film 2260/287519: tt0106332
Total reviews scraped: 1
Now scraping film 2261/287519: tt1227140
Total reviews scraped: 1
Now scraping film 2262/287519: tt6982588
Total reviews scraped: 1
Now scraping film 2263/287519: tt0008623
Total reviews scraped: 1
Now scraping film 2264/287519: tt15393652
Total reviews scraped: 1
Now scraping film 2265/287519: tt0470042
Total reviews scraped: 1
Now scraping film 2266/287519: tt2027237
Total r

Total reviews scraped: 1
Now scraping film 2376/287519: tt1155630
Total reviews scraped: 1
Now scraping film 2377/287519: tt14365408
Total reviews scraped: 1
Now scraping film 2378/287519: tt6553342
Total reviews scraped: 1
Now scraping film 2379/287519: tt0189603
Total reviews scraped: 1
Now scraping film 2380/287519: tt0425080
Total reviews scraped: 1
Now scraping film 2381/287519: tt3042800
Total reviews scraped: 1
Now scraping film 2382/287519: tt26084136
Total reviews scraped: 1
Now scraping film 2383/287519: tt0129455
Total reviews scraped: 1
Now scraping film 2384/287519: tt1733234
Total reviews scraped: 1
Now scraping film 2385/287519: tt1399565
Total reviews scraped: 1
Now scraping film 2386/287519: tt0451983
Total reviews scraped: 1
Now scraping film 2387/287519: tt4524792
Total reviews scraped: 1
Now scraping film 2388/287519: tt15700594
Total reviews scraped: 1
Now scraping film 2389/287519: tt8011654
Total reviews scraped: 1
Now scraping film 2390/287519: tt5094192
Total r

KeyboardInterrupt: 

In [21]:
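# TODO: Later, we could expand this to collect Metascore data as well, which may allow more accurate predictions.
# NOTE: The sketch below relies on DataFrame.append, which is deprecated (and removed in pandas 2.0);
# if this is revived, collect the per-film frames in a list and call pd.concat once instead.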

# movie_data = pd.DataFrame(columns=["imdb_id", "critic_reviews", "user_reviews", "metascore", "budget", "worldwide_gross"])

# for imdb_id in imdb_ids:
#     df = scrape_movie_data(imdb_id)
#     movie_data = movie_data.append(df, ignore_index=True)

  movie_data = movie_data.append(df, ignore_index=True)
