In [None]:
import requests
import random
import time
import csv
import os

API_KEY = '' #removed for publishing  
BASE_URL = 'https://api.themoviedb.org/3'
OUTPUT_FILE = 'random_movies_1000.csv'
TOTAL_MOVIES = 1000
MAX_ID = 1000000

MOVIES = []
UNIQUE_IDS = set()

if os.path.exists(OUTPUT_FILE):
    with open(OUTPUT_FILE, newline='', encoding='utf-8') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            MOVIES.append(row)
            UNIQUE_IDS.add(int(row['id']))
    print(f" Resuming with {len(MOVIES)} movies already collected.")

def get_movie_by_id(movie_id):
    url = f"{BASE_URL}/movie/{movie_id}"
    params = {
        'api_key': API_KEY,
        'language': 'en-US'
    }
    try:
        response = requests.get(url, params=params)
        if response.status_code == 200:
            movie = response.json()
            if movie.get("adult") is False and movie.get("title") and movie.get("vote_count", 0) > 0:
                return {
                    'id': movie['id'],
                    'title': movie.get('title'),
                    'release_date': movie.get('release_date'),
                    'overview': movie.get('overview'),
                    'vote_average': movie.get('vote_average'),
                    'vote_count': movie.get('vote_count'),
                    'popularity': movie.get('popularity'),
                    'original_language': movie.get('original_language')
                }
    except Exception as e:
        print(f" Error fetching movie ID {movie_id}: {e}")
    return None

def save_to_csv(movies, filename=OUTPUT_FILE):
    if not movies:
        return
    keys = movies[0].keys()
    with open(filename, 'w', newline='', encoding='utf-8') as output_file:
        writer = csv.DictWriter(output_file, fieldnames=keys)
        writer.writeheader()
        writer.writerows(movies)

def collect_random_movies(target=TOTAL_MOVIES):
    request_count = 0
    while len(MOVIES) < target:
        if request_count > 0 and request_count % 40 == 0:
            print(" Waiting 10 seconds to respect TMDB rate limits...")
            time.sleep(10)

        random_id = random.randint(1, MAX_ID)
        if random_id in UNIQUE_IDS:
            continue
        UNIQUE_IDS.add(random_id)

        movie = get_movie_by_id(random_id)
        request_count += 1

        if movie:
            MOVIES.append(movie)
            print(f" ({len(MOVIES)}/{target}): {movie['title']} ({movie['release_date']})")

            if len(MOVIES) % 100 == 0:
                print(" Saving checkpoint...")
                save_to_csv(MOVIES)

        time.sleep(0.25)

    print(" Finished collecting 1000 movies!")
    save_to_csv(MOVIES)

collect_random_movies()


 (1/1000): WEC 32: New Mexico (2008-02-13)
 (2/1000): In Concert at The Hollywood Bowl: Musicals and the Movies (2020-09-09)
 (3/1000): Every 9 Hours (2019-05-31)
 (4/1000): Time for Revenge (1981-07-30)
 (5/1000): Chvrches – Pitchfork Music Festival 2015 (2015-09-25)
 (6/1000): Japanese Borscht (2019-07-30)
 (7/1000): Taxi Ballad (2012-03-08)
 (8/1000): My Old Ass (2024-09-13)
 (9/1000): O.H.M.S. (1937-01-01)
 (10/1000): Double Platinum (1999-05-16)
 (11/1000): Dia Branco (2014-01-30)
 (12/1000): Rossini: La Donna del Lago (2015-03-14)
 (13/1000): The Colours of My Father: A Portrait of Sam Borenstein (1992-01-01)
 Waiting 10 seconds to respect TMDB rate limits...
 (14/1000): Song Service (1930-10-24)
 (15/1000): Sesame Street: Elmo Says BOO! (1997-07-08)
 (16/1000): Play Dead (2011-04-23)
 (17/1000): Planet Terror (2007-04-06)
 (18/1000): Asi mit Niwoh (2019-02-07)
 (19/1000): Buried Alive (2008-09-30)
 (20/1000): Another You (2017-05-15)
 (21/1000): Great Expectations (2019-06-26)
 

In [None]:
import requests
import random
import time
import csv
import os

API_KEY = '' # removed for publishing
BASE_URL = 'https://api.themoviedb.org/3'
MAX_ID = 1000000

# === File & chunk settings ===
NEW_OUTPUT_FILE = 'random_movies_1001_2000.csv'
NEW_TARGET = 1000  # number of new movies to fetch

NEW_MOVIES = []
EXISTING_IDS = set()

# === Load existing IDs from previous file ===
with open('random_movies_1000.csv', newline='', encoding='utf-8') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        EXISTING_IDS.add(int(row['id']))
print(f"Loaded {len(EXISTING_IDS)} existing movie IDs.")

# === Movie fetching function ===
def get_movie_by_id(movie_id):
    url = f"{BASE_URL}/movie/{movie_id}"
    params = {'api_key': API_KEY, 'language': 'en-US'}
    try:
        response = requests.get(url, params=params)
        if response.status_code == 200:
            movie = response.json()
            if movie.get("title") and movie.get("vote_count", 0) > 0:
                return {
                    'id': movie['id'],
                    'title': movie.get('title'),
                    'release_date': movie.get('release_date'),
                    'overview': movie.get('overview'),
                    'vote_average': movie.get('vote_average'),
                    'vote_count': movie.get('vote_count'),
                    'popularity': movie.get('popularity'),
                    'original_language': movie.get('original_language'),
                    'adult': movie.get('adult')  # ← keep this if you want to include adult flag
                }
    except Exception as e:
        print(f"Error fetching movie ID {movie_id}: {e}")
    return None

# === Save new movies to a CSV file ===
def save_to_csv(movies, filename):
    if not movies:
        return
    keys = movies[0].keys()
    with open(filename, 'w', newline='', encoding='utf-8') as output_file:
        writer = csv.DictWriter(output_file, fieldnames=keys)
        writer.writeheader()
        writer.writerows(movies)

# === Collect new unique movies ===
def collect_additional_movies(target=NEW_TARGET):
    request_count = 0
    while len(NEW_MOVIES) < target:
        if request_count > 0 and request_count % 40 == 0:
            print("Waiting 10 seconds for rate limit...")
            time.sleep(10)

        random_id = random.randint(1, MAX_ID)
        if random_id in EXISTING_IDS:
            continue
        EXISTING_IDS.add(random_id)

        movie = get_movie_by_id(random_id)
        request_count += 1

        if movie:
            NEW_MOVIES.append(movie)
            print(f"({len(NEW_MOVIES)}/{target}): {movie['title']} ({movie['release_date']})")

        time.sleep(0.25)

    print(f"Finished collecting {len(NEW_MOVIES)} additional movies.")
    save_to_csv(NEW_MOVIES, NEW_OUTPUT_FILE)

# === Run the new scrape ===
collect_additional_movies()


Loaded 1000 existing movie IDs.
(1/1000): The Tiger from Tjampa (1953-01-01)
(2/1000): Para Sonia (2015-03-04)
(3/1000): The Other Pompeii: Life & Death in Herculaneum (2013-04-01)
(4/1000): American Made Movie (2013-08-30)
(5/1000): Outerborough (2005-04-19)
(6/1000): Les Déferlantes (2013-09-12)
(7/1000): The Game Changer (2017-02-10)
(8/1000): Multiplex 10 (2018-10-26)
(9/1000): Silence of the Tides (2020-11-22)
(10/1000): Madness in Bloom (2002-10-19)
(11/1000): Three Poplars on Plyuschikha Street (1968-04-29)
(12/1000): The Execution of the Traitor to the Homeland Ernst S. (1976-09-26)
(13/1000): Storybook Friends: A Little Christmas Magic (1998-11-12)
(14/1000): Boss Engira Baskaran (2010-09-10)
(15/1000): Barrier Device (2002-05-11)
Waiting 10 seconds for rate limit...
(16/1000): Angel in a Devil's Body (1984-02-01)
(17/1000): Quarantine (2021-10-04)
(18/1000): Time to Kill (2018-02-23)
(19/1000): Crossroads (2013-10-25)
(20/1000): Who We Are: A Chronicle of Racism in America (2

In [None]:
import requests
import random
import time
import csv
import os

API_KEY = ''
BASE_URL = 'https://api.themoviedb.org/3'
MAX_ID = 1000000

# === File & chunk settings ===
NEW_OUTPUT_FILE = 'random_movies_2001_3000.csv'
NEW_TARGET = 1000  # number of new movies to fetch

NEW_MOVIES = []
EXISTING_IDS = set()

# === Load existing IDs from previous file ===
with open('random_movies_1000.csv', newline='', encoding='utf-8') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        EXISTING_IDS.add(int(row['id']))
print(f"Loaded {len(EXISTING_IDS)} existing movie IDs.")

# === Movie fetching function ===
def get_movie_by_id(movie_id):
    url = f"{BASE_URL}/movie/{movie_id}"
    params = {'api_key': API_KEY, 'language': 'en-US'}
    try:
        response = requests.get(url, params=params)
        if response.status_code == 200:
            movie = response.json()
            if movie.get("title") and movie.get("vote_count", 0) > 0:
                return {
                    'id': movie['id'],
                    'title': movie.get('title'),
                    'release_date': movie.get('release_date'),
                    'overview': movie.get('overview'),
                    'vote_average': movie.get('vote_average'),
                    'vote_count': movie.get('vote_count'),
                    'popularity': movie.get('popularity'),
                    'original_language': movie.get('original_language'),
                    'adult': movie.get('adult')  # ← keep this if you want to include adult flag
                }
    except Exception as e:
        print(f"Error fetching movie ID {movie_id}: {e}")
    return None

# === Save new movies to a CSV file ===
def save_to_csv(movies, filename):
    if not movies:
        return
    keys = movies[0].keys()
    with open(filename, 'w', newline='', encoding='utf-8') as output_file:
        writer = csv.DictWriter(output_file, fieldnames=keys)
        writer.writeheader()
        writer.writerows(movies)

# === Collect new unique movies ===
def collect_additional_movies(target=NEW_TARGET):
    request_count = 0
    while len(NEW_MOVIES) < target:
        if request_count > 0 and request_count % 40 == 0:
            print("Waiting 10 seconds for rate limit...")
            time.sleep(10)

        random_id = random.randint(1, MAX_ID)
        if random_id in EXISTING_IDS:
            continue
        EXISTING_IDS.add(random_id)

        movie = get_movie_by_id(random_id)
        request_count += 1

        if movie:
            NEW_MOVIES.append(movie)
            print(f"({len(NEW_MOVIES)}/{target}): {movie['title']} ({movie['release_date']})")

        time.sleep(0.25)

    print(f"Finished collecting {len(NEW_MOVIES)} additional movies into {NEW_OUTPUT_FILE}.")
    save_to_csv(NEW_MOVIES, NEW_OUTPUT_FILE)

# === Run the new scrape ===
collect_additional_movies()


Loaded 1000 existing movie IDs.
(1/1000): The Birth of "Princess Mononoke" Part 2: Life Has Been Breathed Into It! (1997-02-04)
(2/1000): El Gran Viaje - 1) Los últimos indígenas de Europa (2011-12-30)
(3/1000): The Unborn II (1994-04-20)
(4/1000): Bumper Draw (2015-10-16)
(5/1000): Brainiac (2004-01-01)
(6/1000): No Place for Fools (2015-01-24)
(7/1000): Ein Traum von Hochzeit (1997-01-01)
(8/1000): Nativity 3: Dude, Where's My Donkey?! (2014-11-14)
(9/1000): Listen to Me (1989-05-05)
(10/1000): Ugly Nasty People (2017-10-19)
(11/1000): Kidu (2018-06-29)
(12/1000): Bad is Bad (2011-08-06)
Waiting 10 seconds for rate limit...
(13/1000): Lightrapping (2016-09-20)
(14/1000): Ankebût (2020-03-28)
(15/1000): The Phoenix (1978-01-01)
(16/1000): Coyote (1992-06-26)
(17/1000): Luxury Nurse (1999-01-01)
(18/1000): Playboy: Sexy Lingerie V (1993-01-01)
(19/1000): Step Lively (1917-12-30)
(20/1000): The Story of Soaps (2020-05-19)
(21/1000): How the World Is Losing Poets (1982-11-12)
(22/1000): 

In [None]:
import requests
import random
import time
import csv
import os

API_KEY = ''
BASE_URL = 'https://api.themoviedb.org/3'
MAX_ID = 1000000

# === File & chunk settings ===
NEW_OUTPUT_FILE = 'random_movies_3001_4000.csv'
NEW_TARGET = 1000  # number of new movies to fetch

NEW_MOVIES = []
EXISTING_IDS = set()

# === Load existing IDs from ALL previous CSVs ===
for file in os.listdir():
    if file.startswith("random_movies_") and file.endswith(".csv") and file != NEW_OUTPUT_FILE:
        with open(file, newline='', encoding='utf-8') as csvfile:
            reader = csv.DictReader(csvfile)
            for row in reader:
                EXISTING_IDS.add(int(row['id']))
print(f"Loaded {len(EXISTING_IDS)} existing movie IDs from previous files.")

# === Movie fetching function ===
def get_movie_by_id(movie_id):
    url = f"{BASE_URL}/movie/{movie_id}"
    params = {'api_key': API_KEY, 'language': 'en-US'}
    try:
        response = requests.get(url, params=params)
        if response.status_code == 200:
            movie = response.json()
            if movie.get("title") and movie.get("vote_count", 0) > 0:
                return {
                    'id': movie['id'],
                    'title': movie.get('title'),
                    'release_date': movie.get('release_date'),
                    'overview': movie.get('overview'),
                    'vote_average': movie.get('vote_average'),
                    'vote_count': movie.get('vote_count'),
                    'popularity': movie.get('popularity'),
                    'original_language': movie.get('original_language'),
                    'adult': movie.get('adult')  # Include this if you want to filter or flag later
                }
    except Exception as e:
        print(f"Error fetching movie ID {movie_id}: {e}")
    return None

# === Save new movies to a CSV file ===
def save_to_csv(movies, filename):
    if not movies:
        return
    keys = movies[0].keys()
    with open(filename, 'w', newline='', encoding='utf-8') as output_file:
        writer = csv.DictWriter(output_file, fieldnames=keys)
        writer.writeheader()
        writer.writerows(movies)

# === Collect new unique movies ===
def collect_additional_movies(target=NEW_TARGET):
    request_count = 0
    while len(NEW_MOVIES) < target:
        if request_count > 0 and request_count % 40 == 0:
            print("Waiting 10 seconds for rate limit...")
            time.sleep(10)

        random_id = random.randint(1, MAX_ID)
        if random_id in EXISTING_IDS:
            continue
        EXISTING_IDS.add(random_id)

        movie = get_movie_by_id(random_id)
        request_count += 1

        if movie:
            NEW_MOVIES.append(movie)
            print(f"({len(NEW_MOVIES)}/{target}): {movie['title']} ({movie['release_date']})")

        time.sleep(0.25)

    print(f"Finished collecting {len(NEW_MOVIES)} additional movies into {NEW_OUTPUT_FILE}.")
    save_to_csv(NEW_MOVIES, NEW_OUTPUT_FILE)

# === Run the new scrape ===
collect_additional_movies()


Loaded 2999 existing movie IDs from previous files.
(1/1000): One More Flip (2021-10-01)
(2/1000): Car Trouble (1986-02-28)
(3/1000): Hello, Fred the Beard (1978-10-16)
(4/1000): After Life (2008-01-17)
(5/1000): The Case for Christ (2007-09-11)
(6/1000): Barrio (2006-01-01)
(7/1000): The $50,000,000 Cherry (1987-01-01)
(8/1000): Sweet Daddy (1921-12-11)
(9/1000): Pink, Plunk, Plink (1966-05-25)
Waiting 10 seconds for rate limit...
(10/1000): The Snowman (1985-03-22)
(11/1000): Il vegetale (2018-01-18)
(12/1000): The Happy Couple (1975-10-27)
(13/1000): Le chemin (2017-09-06)
(14/1000): Snail House (2017-10-22)
(15/1000): All Is One. Except 0 (2020-10-04)
(16/1000): Pavada (2016-01-15)
(17/1000): Cat Fishin' (1947-02-22)
(18/1000): Tokyo Woes (1945-11-15)
Waiting 10 seconds for rate limit...
(19/1000): Ponnapuram Kotta (1974-01-23)
(20/1000): The Goob (2014-09-25)
(21/1000): Namo OK (2014-11-06)
(22/1000): Hamlet (2008-02-18)
(23/1000): Ceiling (1962-03-15)
(24/1000): Fuckland (2000-09

In [None]:
import requests
import random
import time
import csv
import os

API_KEY = ''
BASE_URL = 'https://api.themoviedb.org/3'
MAX_ID = 1000000

# === File & chunk settings ===
NEW_OUTPUT_FILE = 'random_movies_4001_5000.csv'
NEW_TARGET = 1000  # number of new movies to fetch

NEW_MOVIES = []
EXISTING_IDS = set()

# === Load existing IDs from ALL previous CSVs ===
for file in os.listdir():
    if file.startswith("random_movies_") and file.endswith(".csv") and file != NEW_OUTPUT_FILE:
        with open(file, newline='', encoding='utf-8') as csvfile:
            reader = csv.DictReader(csvfile)
            for row in reader:
                try:
                    EXISTING_IDS.add(int(row['id']))
                except ValueError:
                    continue
print(f"Loaded {len(EXISTING_IDS)} existing movie IDs from previous files.")

# === Movie fetching function ===
def get_movie_by_id(movie_id):
    url = f"{BASE_URL}/movie/{movie_id}"
    params = {'api_key': API_KEY, 'language': 'en-US'}
    try:
        response = requests.get(url, params=params)
        if response.status_code == 200:
            movie = response.json()
            if movie.get("title") and movie.get("vote_count", 0) > 0:
                return {
                    'id': movie['id'],
                    'title': movie.get('title'),
                    'release_date': movie.get('release_date'),
                    'overview': movie.get('overview'),
                    'vote_average': movie.get('vote_average'),
                    'vote_count': movie.get('vote_count'),
                    'popularity': movie.get('popularity'),
                    'original_language': movie.get('original_language'),
                    'adult': movie.get('adult')  # Keep adult flag if you want to use/filter it later
                }
    except Exception as e:
        print(f"Error fetching movie ID {movie_id}: {e}")
    return None

# === Save new movies to a CSV file ===
def save_to_csv(movies, filename):
    if not movies:
        return
    keys = movies[0].keys()
    with open(filename, 'w', newline='', encoding='utf-8') as output_file:
        writer = csv.DictWriter(output_file, fieldnames=keys)
        writer.writeheader()
        writer.writerows(movies)

# === Collect new unique movies ===
def collect_additional_movies(target=NEW_TARGET):
    request_count = 0
    while len(NEW_MOVIES) < target:
        if request_count > 0 and request_count % 40 == 0:
            print("Waiting 10 seconds for rate limit...")
            time.sleep(10)

        random_id = random.randint(1, MAX_ID)
        if random_id in EXISTING_IDS:
            continue
        EXISTING_IDS.add(random_id)

        movie = get_movie_by_id(random_id)
        request_count += 1

        if movie:
            NEW_MOVIES.append(movie)
            print(f"({len(NEW_MOVIES)}/{target}): {movie['title']} ({movie['release_date']})")

        time.sleep(0.25)

    print(f"Finished collecting {len(NEW_MOVIES)} additional movies into {NEW_OUTPUT_FILE}.")
    save_to_csv(NEW_MOVIES, NEW_OUTPUT_FILE)

# === Run the new scrape ===
collect_additional_movies()


Loaded 3999 existing movie IDs from previous files.
(1/1000): A Country More Beautiful Than Before (2018-12-05)
(2/1000): Be Here to Love Me (2004-09-13)
(3/1000): Кто пасётся на лугу? (1973-01-01)
(4/1000): Robert et Robert (1978-06-13)
(5/1000): Wouter Deprez: War (2008-03-30)
(6/1000): Stone Locals - Rediscovering the Soul of Climbing (2020-08-28)
(7/1000): Puño de lodo (1999-01-01)
(8/1000): Fragments of Conversations with Jean-Luc Godard (2007-08-05)
(9/1000): Delizia (1986-12-17)
Waiting 10 seconds for rate limit...
(10/1000): Crisis Hotline: Veterans Press 1 (2013-11-11)
(11/1000): Father Knows Best (2007-01-01)
(12/1000): $9.99 (2009-04-29)
(13/1000): Beyond the Seventh Door (1987-06-28)
(14/1000): The Prodigy Live at Brixton Academy (2006-01-01)
(15/1000): Super Bowl XXXIX Champions: New England Patriots (2005-03-01)
(16/1000): True Anal Training 2 (2019-03-04)
(17/1000): To Hell with Women (1955-02-17)
(18/1000): Luther (1974-01-21)
(19/1000): The 4th Company (2017-01-13)
(20

In [None]:
import requests
import random
import time
import csv
import os

API_KEY = ''
BASE_URL = 'https://api.themoviedb.org/3'
MAX_ID = 1000000

# === File & chunk settings ===
NEW_OUTPUT_FILE = 'random_movies_5001_6000.csv'
NEW_TARGET = 1000  # number of new movies to fetch

NEW_MOVIES = []
EXISTING_IDS = set()

# === Load existing IDs from ALL previous CSVs ===
for file in os.listdir():
    if file.startswith("random_movies_") and file.endswith(".csv") and file != NEW_OUTPUT_FILE:
        with open(file, newline='', encoding='utf-8') as csvfile:
            reader = csv.DictReader(csvfile)
            for row in reader:
                try:
                    EXISTING_IDS.add(int(row['id']))
                except ValueError:
                    continue
print(f"Loaded {len(EXISTING_IDS)} existing movie IDs from previous files.")

# === Movie fetching function ===
def get_movie_by_id(movie_id):
    url = f"{BASE_URL}/movie/{movie_id}"
    params = {'api_key': API_KEY, 'language': 'en-US'}
    try:
        response = requests.get(url, params=params)
        if response.status_code == 200:
            movie = response.json()
            if movie.get("title") and movie.get("vote_count", 0) > 0:
                return {
                    'id': movie['id'],
                    'title': movie.get('title'),
                    'release_date': movie.get('release_date'),
                    'overview': movie.get('overview'),
                    'vote_average': movie.get('vote_average'),
                    'vote_count': movie.get('vote_count'),
                    'popularity': movie.get('popularity'),
                    'original_language': movie.get('original_language'),
                    'adult': movie.get('adult')  # include if you're tracking it
                }
    except Exception as e:
        print(f"Error fetching movie ID {movie_id}: {e}")
    return None

# === Save new movies to a CSV file ===
def save_to_csv(movies, filename):
    if not movies:
        return
    keys = movies[0].keys()
    with open(filename, 'w', newline='', encoding='utf-8') as output_file:
        writer = csv.DictWriter(output_file, fieldnames=keys)
        writer.writeheader()
        writer.writerows(movies)

# === Collect new unique movies ===
def collect_additional_movies(target=NEW_TARGET):
    request_count = 0
    while len(NEW_MOVIES) < target:
        if request_count > 0 and request_count % 40 == 0:
            print("Waiting 10 seconds for rate limit...")
            time.sleep(10)

        random_id = random.randint(1, MAX_ID)
        if random_id in EXISTING_IDS:
            continue
        EXISTING_IDS.add(random_id)

        movie = get_movie_by_id(random_id)
        request_count += 1

        if movie:
            NEW_MOVIES.append(movie)
            print(f"({len(NEW_MOVIES)}/{target}): {movie['title']} ({movie['release_date']})")

        time.sleep(0.25)

    print(f"Finished collecting {len(NEW_MOVIES)} additional movies into {NEW_OUTPUT_FILE}.")
    save_to_csv(NEW_MOVIES, NEW_OUTPUT_FILE)

# === Run the new scrape ===
collect_additional_movies()


Loaded 4999 existing movie IDs from previous files.
(1/1000): Seduce Me Tonight (1982-01-01)
(2/1000): Dance of a Dream (2001-12-21)
(3/1000): Orphée aux enfers (1997-01-01)
(4/1000): Gone Wednesday (2020-05-15)
(5/1000): 69 Scenes: Superstars of the 90's (2012-05-21)
(6/1000): I Am Somebody (1970-01-01)
(7/1000): Worm (2013-04-18)
(8/1000): A Story from Echigo (1964-05-09)
(9/1000): Against the Law (1934-10-25)
(10/1000): Thigh High Thrills (2004-11-23)
(11/1000): Monsieur Beaucaire (1946-09-04)
(12/1000): George Lopez: Why You Crying? (2005-09-23)
(13/1000): Film sans caméra (F.S.C.) n°1 (1973-03-23)
(14/1000): Açaí (2020-02-07)
(15/1000): Harry Knuckles (1998-06-01)
(16/1000): Advanced Style (2014-05-09)
(17/1000): Mining Operations, Pennsylvania Coal Fields (1904-12-22)
(18/1000): Crime Ring (1938-07-08)
(19/1000): Attack of the 50 Foot Cheerleader (2012-08-25)
(20/1000): Ma fille n'épousera qu'un médium (1909-09-28)
(21/1000): Backfield in Motion (1995-01-01)
(22/1000): Double Pen

In [None]:
import requests
import random
import time
import csv
import os

API_KEY = ''
BASE_URL = 'https://api.themoviedb.org/3'
MAX_ID = 1000000

# === File & chunk settings ===
NEW_OUTPUT_FILE = 'random_movies_6001_7000.csv'
NEW_TARGET = 1000  # number of new movies to fetch

NEW_MOVIES = []
EXISTING_IDS = set()

# === Load existing IDs from ALL previous CSVs ===
for file in os.listdir():
    if file.startswith("random_movies_") and file.endswith(".csv") and file != NEW_OUTPUT_FILE:
        with open(file, newline='', encoding='utf-8') as csvfile:
            reader = csv.DictReader(csvfile)
            for row in reader:
                try:
                    EXISTING_IDS.add(int(row['id']))
                except ValueError:
                    continue
print(f"Loaded {len(EXISTING_IDS)} existing movie IDs from previous files.")

# === Movie fetching function ===
def get_movie_by_id(movie_id):
    url = f"{BASE_URL}/movie/{movie_id}"
    params = {'api_key': API_KEY, 'language': 'en-US'}
    try:
        response = requests.get(url, params=params)
        if response.status_code == 200:
            movie = response.json()
            if movie.get("title") and movie.get("vote_count", 0) > 0:
                return {
                    'id': movie['id'],
                    'title': movie.get('title'),
                    'release_date': movie.get('release_date'),
                    'overview': movie.get('overview'),
                    'vote_average': movie.get('vote_average'),
                    'vote_count': movie.get('vote_count'),
                    'popularity': movie.get('popularity'),
                    'original_language': movie.get('original_language'),
                    'adult': movie.get('adult')
                }
    except Exception as e:
        print(f"Error fetching movie ID {movie_id}: {e}")
    return None

# === Save new movies to a CSV file ===
def save_to_csv(movies, filename):
    if not movies:
        return
    keys = movies[0].keys()
    with open(filename, 'w', newline='', encoding='utf-8') as output_file:
        writer = csv.DictWriter(output_file, fieldnames=keys)
        writer.writeheader()
        writer.writerows(movies)

# === Collect new unique movies ===
def collect_additional_movies(target=NEW_TARGET):
    request_count = 0
    while len(NEW_MOVIES) < target:
        if request_count > 0 and request_count % 40 == 0:
            print("Waiting 10 seconds for rate limit...")
            time.sleep(10)

        random_id = random.randint(1, MAX_ID)
        if random_id in EXISTING_IDS:
            continue
        EXISTING_IDS.add(random_id)

        movie = get_movie_by_id(random_id)
        request_count += 1

        if movie:
            NEW_MOVIES.append(movie)
            print(f"({len(NEW_MOVIES)}/{target}): {movie['title']} ({movie['release_date']})")

        time.sleep(0.25)

    print(f"Finished collecting {len(NEW_MOVIES)} additional movies into {NEW_OUTPUT_FILE}.")
    save_to_csv(NEW_MOVIES, NEW_OUTPUT_FILE)

# === Run the new scrape ===
collect_additional_movies()


Loaded 5999 existing movie IDs from previous files.
(1/1000): Frat House (1979-03-22)
(2/1000): Fucking Pissed Off 2 (2018-02-12)
(3/1000): Blood on the Asphalt (1992-12-14)
(4/1000): Sex and the Silver Gays (2016-09-12)
(5/1000): Under the Sun of Satan (1987-08-01)
(6/1000): Swamp Shark (2011-06-25)
(7/1000): Mother (1979-01-01)
(8/1000): Léo e Bia (2010-05-04)
(9/1000): Kuchh Meetha Ho Jaye (2005-04-15)
(10/1000): The Shining Hour (1938-11-18)
(11/1000): The Farmer's Wife (1941-04-20)
(12/1000): Wing Chun (1994-03-24)
(13/1000): Luces de Nueva York (2001-01-01)
(14/1000): Taandro (2015-11-16)
(15/1000): Something Real (2012-07-22)
(16/1000): Emil (2020-09-09)
Waiting 10 seconds for rate limit...
(17/1000): Flow (2014-10-26)
(18/1000): Pussy Lust 2 (2018-05-29)
(19/1000): Sociopathia (2015-11-19)
(20/1000): Annie O (1996-03-12)
(21/1000): Spinach Greetings (1960-11-15)
(22/1000): Hype Williams: The Videos Vol. 1 (2002-01-01)
(23/1000): Memory of Berlin (1998-11-09)
(24/1000): Der Mann

In [None]:
import requests
import random
import time
import csv
import os

API_KEY = ''
BASE_URL = 'https://api.themoviedb.org/3'
MAX_ID = 1000000

# === File & chunk settings ===
NEW_OUTPUT_FILE = 'random_movies_7001_8000.csv'
NEW_TARGET = 1000  # number of new movies to fetch

NEW_MOVIES = []
EXISTING_IDS = set()

# === Load existing IDs from ALL previous CSVs ===
for file in os.listdir():
    if file.startswith("random_movies_") and file.endswith(".csv") and file != NEW_OUTPUT_FILE:
        with open(file, newline='', encoding='utf-8') as csvfile:
            reader = csv.DictReader(csvfile)
            for row in reader:
                try:
                    EXISTING_IDS.add(int(row['id']))
                except ValueError:
                    continue
print(f"Loaded {len(EXISTING_IDS)} existing movie IDs from previous files.")

# === Movie fetching function ===
def get_movie_by_id(movie_id):
    url = f"{BASE_URL}/movie/{movie_id}"
    params = {'api_key': API_KEY, 'language': 'en-US'}
    try:
        response = requests.get(url, params=params)
        if response.status_code == 200:
            movie = response.json()
            if movie.get("title") and movie.get("vote_count", 0) > 0:
                return {
                    'id': movie['id'],
                    'title': movie.get('title'),
                    'release_date': movie.get('release_date'),
                    'overview': movie.get('overview'),
                    'vote_average': movie.get('vote_average'),
                    'vote_count': movie.get('vote_count'),
                    'popularity': movie.get('popularity'),
                    'original_language': movie.get('original_language'),
                    'adult': movie.get('adult')
                }
    except Exception as e:
        print(f"Error fetching movie ID {movie_id}: {e}")
    return None

# === Save new movies to a CSV file ===
def save_to_csv(movies, filename):
    if not movies:
        return
    keys = movies[0].keys()
    with open(filename, 'w', newline='', encoding='utf-8') as output_file:
        writer = csv.DictWriter(output_file, fieldnames=keys)
        writer.writeheader()
        writer.writerows(movies)

# === Collect new unique movies ===
def collect_additional_movies(target=NEW_TARGET):
    request_count = 0
    while len(NEW_MOVIES) < target:
        if request_count > 0 and request_count % 40 == 0:
            print("Waiting 10 seconds for rate limit...")
            time.sleep(10)

        random_id = random.randint(1, MAX_ID)
        if random_id in EXISTING_IDS:
            continue
        EXISTING_IDS.add(random_id)

        movie = get_movie_by_id(random_id)
        request_count += 1

        if movie:
            NEW_MOVIES.append(movie)
            print(f"({len(NEW_MOVIES)}/{target}): {movie['title']} ({movie['release_date']})")

        time.sleep(0.25)

    print(f"Finished collecting {len(NEW_MOVIES)} additional movies into {NEW_OUTPUT_FILE}.")
    save_to_csv(NEW_MOVIES, NEW_OUTPUT_FILE)

# === Run the new scrape ===
collect_additional_movies()


Loaded 6999 existing movie IDs from previous files.
(1/1000): Something Borrowed (2020-10-24)
(2/1000): NOM (2019-11-09)
(3/1000): Naughty Couple (1994-06-10)
(4/1000): The Man Outside (1933-06-01)
(5/1000): Bella und der Feigenbaum (2013-08-22)
(6/1000): Oh slippery trail (2001-01-01)
(7/1000): Saturday Night Live: The Best of Steve Martin (2000-04-25)
(8/1000): Children's Marriages (1950-12-31)
(9/1000): Dhinamdhorum (1998-02-13)
(10/1000): Memories Within Miss Aggie (1974-05-01)
(11/1000): Thief or Reality (2001-08-05)
(12/1000): La sonrisa de mamá (1972-03-02)
(13/1000): Akram Khan's Giselle (2018-04-25)
(14/1000): Bugs Bunny's Looney Christmas Tales (1979-11-27)
(15/1000): Our Town (1956-08-28)
Waiting 10 seconds for rate limit...
(16/1000): Mga babae sa Isla Azul (1998-07-30)
(17/1000): Mom’s Song (2021-10-28)
(18/1000): VIZ: Oh Lordy! It's The Fat Slags: In Slags At Large (1990-07-15)
(19/1000): Homely Meals (2014-10-03)
(20/1000): Persian Series #8 (2000-03-28)
(21/1000): Help 

In [None]:
import requests
import random
import time
import csv
import os

API_KEY = ''
BASE_URL = 'https://api.themoviedb.org/3'
MAX_ID = 1000000

# === File & chunk settings ===
NEW_OUTPUT_FILE = 'random_movies_8001_9000.csv'
NEW_TARGET = 1000  # number of new movies to fetch

NEW_MOVIES = []
EXISTING_IDS = set()

# === Load existing IDs from ALL previous CSVs ===
for file in os.listdir():
    if file.startswith("random_movies_") and file.endswith(".csv") and file != NEW_OUTPUT_FILE:
        with open(file, newline='', encoding='utf-8') as csvfile:
            reader = csv.DictReader(csvfile)
            for row in reader:
                try:
                    EXISTING_IDS.add(int(row['id']))
                except ValueError:
                    continue
print(f"Loaded {len(EXISTING_IDS)} existing movie IDs from previous files.")

# === Movie fetching function ===
def get_movie_by_id(movie_id):
    url = f"{BASE_URL}/movie/{movie_id}"
    params = {'api_key': API_KEY, 'language': 'en-US'}
    try:
        response = requests.get(url, params=params)
        if response.status_code == 200:
            movie = response.json()
            if movie.get("title") and movie.get("vote_count", 0) > 0:
                return {
                    'id': movie['id'],
                    'title': movie.get('title'),
                    'release_date': movie.get('release_date'),
                    'overview': movie.get('overview'),
                    'vote_average': movie.get('vote_average'),
                    'vote_count': movie.get('vote_count'),
                    'popularity': movie.get('popularity'),
                    'original_language': movie.get('original_language'),
                    'adult': movie.get('adult')
                }
    except Exception as e:
        print(f"Error fetching movie ID {movie_id}: {e}")
    return None

# === Save new movies to a CSV file ===
def save_to_csv(movies, filename):
    if not movies:
        return
    keys = movies[0].keys()
    with open(filename, 'w', newline='', encoding='utf-8') as output_file:
        writer = csv.DictWriter(output_file, fieldnames=keys)
        writer.writeheader()
        writer.writerows(movies)

# === Collect new unique movies ===
def collect_additional_movies(target=NEW_TARGET):
    request_count = 0
    while len(NEW_MOVIES) < target:
        if request_count > 0 and request_count % 40 == 0:
            print("Waiting 10 seconds for rate limit...")
            time.sleep(10)

        random_id = random.randint(1, MAX_ID)
        if random_id in EXISTING_IDS:
            continue
        EXISTING_IDS.add(random_id)

        movie = get_movie_by_id(random_id)
        request_count += 1

        if movie:
            NEW_MOVIES.append(movie)
            print(f"({len(NEW_MOVIES)}/{target}): {movie['title']} ({movie['release_date']})")

        time.sleep(0.25)

    print(f"Finished collecting {len(NEW_MOVIES)} additional movies into {NEW_OUTPUT_FILE}.")
    save_to_csv(NEW_MOVIES, NEW_OUTPUT_FILE)

# === Run the new scrape ===
collect_additional_movies()


Loaded 7999 existing movie IDs from previous files.
(1/1000): Don vs. Raph (2016-07-21)
(2/1000): Space Warriors (2013-04-26)
(3/1000): The Saga of Tanegashima (1968-05-18)
(4/1000): Offending Women Is Not Recommended (2000-10-01)
(5/1000): Choking On Fat Tool (2021-01-01)
(6/1000): The Life Story of John Lee, or The Man They Could Not Hang (1921-12-23)
(7/1000): Hell or Tidewater (2020-09-19)
(8/1000): The Seed (1974-02-28)
(9/1000): Boxeurs (1896-05-17)
(10/1000): Frankenstein Unbound (1990-11-02)
(11/1000): Mi niña mi vida (2014-01-17)
(12/1000): Death of a Princess (1980-04-09)
(13/1000): Michael Ian Black: Very Famous (2011-08-06)
Waiting 10 seconds for rate limit...
(14/1000): Death Shock (1981-01-01)
(15/1000): Triple Crossed (2013-11-12)
(16/1000): In Search of the Wild Kingdom (2007-11-17)
(17/1000): Viralukketha Veekkam (1999-07-16)
(18/1000): Mauerpark (2014-11-21)
(19/1000): La ferme des humains (2014-01-17)
(20/1000): Bertram & Co (2002-12-25)
(21/1000): Through the Black 

In [None]:
import requests
import random
import time
import csv
import os

API_KEY = ''
BASE_URL = 'https://api.themoviedb.org/3'
MAX_ID = 1000000

# === File & chunk settings ===
NEW_OUTPUT_FILE = 'random_movies_9001_10000.csv'
NEW_TARGET = 1000  # number of new movies to fetch

NEW_MOVIES = []
EXISTING_IDS = set()

# === Load existing IDs from ALL previous CSVs ===
for file in os.listdir():
    if file.startswith("random_movies_") and file.endswith(".csv") and file != NEW_OUTPUT_FILE:
        with open(file, newline='', encoding='utf-8') as csvfile:
            reader = csv.DictReader(csvfile)
            for row in reader:
                try:
                    EXISTING_IDS.add(int(row['id']))
                except ValueError:
                    continue
print(f"Loaded {len(EXISTING_IDS)} existing movie IDs from previous files.")

# === Movie fetching function ===
def get_movie_by_id(movie_id):
    url = f"{BASE_URL}/movie/{movie_id}"
    params = {'api_key': API_KEY, 'language': 'en-US'}
    try:
        response = requests.get(url, params=params)
        if response.status_code == 200:
            movie = response.json()
            if movie.get("title") and movie.get("vote_count", 0) > 0:
                return {
                    'id': movie['id'],
                    'title': movie.get('title'),
                    'release_date': movie.get('release_date'),
                    'overview': movie.get('overview'),
                    'vote_average': movie.get('vote_average'),
                    'vote_count': movie.get('vote_count'),
                    'popularity': movie.get('popularity'),
                    'original_language': movie.get('original_language'),
                    'adult': movie.get('adult')
                }
    except Exception as e:
        print(f"Error fetching movie ID {movie_id}: {e}")
    return None

# === Save new movies to a CSV file ===
def save_to_csv(movies, filename):
    if not movies:
        return
    keys = movies[0].keys()
    with open(filename, 'w', newline='', encoding='utf-8') as output_file:
        writer = csv.DictWriter(output_file, fieldnames=keys)
        writer.writeheader()
        writer.writerows(movies)

# === Collect new unique movies ===
def collect_additional_movies(target=NEW_TARGET):
    request_count = 0
    while len(NEW_MOVIES) < target:
        if request_count > 0 and request_count % 40 == 0:
            print("Waiting 10 seconds for rate limit...")
            time.sleep(10)

        random_id = random.randint(1, MAX_ID)
        if random_id in EXISTING_IDS:
            continue
        EXISTING_IDS.add(random_id)

        movie = get_movie_by_id(random_id)
        request_count += 1

        if movie:
            NEW_MOVIES.append(movie)
            print(f"({len(NEW_MOVIES)}/{target}): {movie['title']} ({movie['release_date']})")

        time.sleep(0.25)

    print(f"Finished collecting {len(NEW_MOVIES)} additional movies into {NEW_OUTPUT_FILE}.")
    save_to_csv(NEW_MOVIES, NEW_OUTPUT_FILE)

# === Run the new scrape ===
collect_additional_movies()


Loaded 8999 existing movie IDs from previous files.
(1/1000): Bathroom Hooligan (2019-10-10)
(2/1000): I Never Sang for My Father (1970-10-18)
(3/1000): 116 Cameras (2017-04-06)
(4/1000): Eh Al-Nizam (2010-01-13)
(5/1000): Mobile Suit Gundam: Char's Counterattack (1988-03-12)
(6/1000): The Duck Hunt (1932-01-28)
(7/1000): Comedown (2012-08-23)
(8/1000): Глухарь. Приходи, Новый год! (2009-12-31)
(9/1000): Maria Adelaide (2017-01-11)
(10/1000): Loony in the Woods (2006-08-07)
(11/1000): The Loom (1986-12-30)
(12/1000): Dirty Laundry ()
(13/1000): Anal Workout (2011-05-26)
(14/1000): Wild Wind (1985-11-01)
(15/1000): Ismail Yassine in the House of Ghosts (1951-01-01)
(16/1000): ADN (2022-03-16)
(17/1000): A Trip (2011-10-04)
(18/1000): Los niños de Rusia (2001-11-30)
Waiting 10 seconds for rate limit...
(19/1000): Antariksham 9000 KMPH (2018-12-20)
(20/1000): Guts & Gore 2 (2009-06-17)
(21/1000): Welcome Aboard (2012-06-13)
(22/1000): Mujhse Shaadi Karogi (2004-07-30)
(23/1000): Everythin

In [5]:
import pandas as pd
import os

# Just use the filenames 
file_names = [
    "random_movies_1000.csv",
    "random_movies_1001_2000.csv",
    "random_movies_2001_3000.csv",
    "random_movies_3001_4000.csv",
    "random_movies_4001_5000.csv",
    "random_movies_5001_6000.csv",
    "random_movies_6001_7000.csv",
    "random_movies_7001_8000.csv",
    "random_movies_8001_9000.csv",
    "random_movies_9001_10000.csv"
]

# Read and concatenate all dataframes
dfs = [pd.read_csv(file) for file in file_names]
combined_df = pd.concat(dfs, ignore_index=True)

# Save the merged file
combined_df.to_csv("10k_movies.csv", index=False)

print(" Merged into 10k_movies.csv successfully.")


 Merged into 10k_movies.csv successfully.
