In [12]:
import requests
import json
import os
from pymongo import MongoClient
import time
from datetime import datetime, timedelta
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

In [13]:
# Connect to the local MongoDB server and bind the collections that hold TMDB data
client = MongoClient(host='localhost', port=27017)
db = client["project"]
Movies_col = db.Movies
Genre_col = db.Genre

In [None]:
# Fetch movies from TMDB

def month_range(start_date, end_date):
    """Yield consecutive (window_start, first_day_of_following_month) pairs.

    The first pair starts at ``start_date`` exactly as given; every later pair
    starts where the previous one ended. Pairs are produced for as long as the
    window start is strictly earlier than ``end_date``, so the last pair may
    end after ``end_date``. Any time-of-day component is carried through.
    """
    window_start = start_date
    while True:
        if window_start >= end_date:
            return
        # Every month has a day 28, and 28 + 4 days always lands in the
        # following month, whatever the month length.
        somewhere_in_next_month = window_start.replace(day=28) + timedelta(days=4)
        following_month_start = somewhere_in_next_month.replace(day=1)
        yield window_start, following_month_start
        window_start = following_month_start
        
def fetch_and_save_movies_by_month(api_key, start_date_str="2018-01-01", end_date_str="2025-05-01", pages_per_month=500):
    """Fetch TMDB "discover" results month by month and insert them into ``Movies_col``.

    The span from ``start_date_str`` to ``end_date_str`` is split into windows
    by ``month_range``. For each window, result pages are requested in
    descending popularity order until a page comes back empty, a request
    fails, or ``pages_per_month`` pages have been read. Every movie is tagged
    with a ``release_month`` field ("YYYY-MM" of the window start) before it
    is inserted.

    Args:
        api_key: TMDB API read access token, sent as a Bearer token.
        start_date_str: First release date to include, formatted "YYYY-MM-DD".
        end_date_str: End of the span, formatted "YYYY-MM-DD". Windows are
            generated while their start is earlier than this date.
        pages_per_month: Maximum number of result pages to request per window.

    Raises:
        ValueError: If either date string does not match "%Y-%m-%d".

    Note:
        Documents are inserted with ``insert_many``, so running this twice over
        the same span stores the same movies twice.
    """
    base_url = "https://api.themoviedb.org/3/discover/movie"
    headers = {
        "Authorization": f"Bearer {api_key}"
    }

    start_date = datetime.strptime(start_date_str, "%Y-%m-%d")
    end_date = datetime.strptime(end_date_str, "%Y-%m-%d")

    for gte_date, next_window_start in month_range(start_date, end_date):
        # The `.lte` filter is an inclusive bound, so the window has to end on
        # the last day of the month. Passing the first day of the next month
        # would fetch movies released on the 1st twice, once in each window.
        lte_date = next_window_start - timedelta(days=1)
        gte_str = gte_date.strftime("%Y-%m-%d")
        lte_str = lte_date.strftime("%Y-%m-%d")
        print(f"\n📅 Fetching movies from {gte_str} to {lte_str}")

        for page in range(1, pages_per_month + 1):
            params = {
                'include_adult': False,
                'include_video': False,
                'language': 'en-US',
                'sort_by': 'popularity.desc',
                'primary_release_date.gte': gte_str,
                'primary_release_date.lte': lte_str,
                'page': page
            }

            try:
                # Without a timeout a stalled connection would block the whole run.
                response = requests.get(base_url, headers=headers, params=params, timeout=30)
            except requests.RequestException as e:
                # Handled like a non-200 response: give up on this window and
                # continue with the next one instead of aborting the run.
                print(f" Request error for {gte_str} Page {page}: {e}")
                break

            if response.status_code == 200:
                data = response.json()
                movies_batch = data.get('results', [])

                if not movies_batch:
                    print(f" No more movies on page {page} for {gte_str} to {lte_str}")
                    break

                for movie in movies_batch:
                    movie["release_month"] = gte_str[:7]

                try:
                    # ordered=False lets the remaining documents be written
                    # even if one of them is rejected.
                    Movies_col.insert_many(movies_batch, ordered=False)
                    print(f" {gte_str[:7]} - Page {page}: Inserted {len(movies_batch)} movies")
                except Exception as e:
                    # Best effort: report the failed batch and keep paging.
                    print(f" Insert error for {gte_str[:7]} Page {page}: {e}")
            else:
                print(f" Failed to fetch {gte_str} Page {page}: Status {response.status_code}")
                break

            # Short pause between requests to stay polite to the API.
            time.sleep(0.3)

In [None]:
# The TMDB API read access token is taken from the environment rather than
# embedded in the notebook; a KeyError here means TMDB_API_TOKEN is not set.
# NOTE(review): the token that used to be hardcoded in this cell has been
# exposed and should be revoked in the TMDB account settings.
api_key = os.environ["TMDB_API_TOKEN"]
fetch_and_save_movies_by_month(api_key, start_date_str="2018-01-01", end_date_str="2025-05-01", pages_per_month=500)

In [75]:
# Fetch Genres
# The TMDB API read access token is taken from the environment rather than
# embedded in the notebook; a KeyError here means TMDB_API_TOKEN is not set.
api_key = os.environ["TMDB_API_TOKEN"]
headers = {"Authorization": f"Bearer {api_key}"}
url = "https://api.themoviedb.org/3/genre/movie/list?language=en"
# Without a timeout a stalled connection would block the cell indefinitely.
response = requests.get(url, headers=headers, timeout=30)
if response.status_code == 200:
    genre_data = response.json()
    # Take the list before insert_one, which adds an "_id" key to genre_data.
    genres = genre_data.get("genres", [])
    Genre_col.insert_one(genre_data)
else:
    genres = []
    print(f" Failed to fetch genres: Status {response.status_code}")

# Display the fetched genre list as the cell output.
genres


In [14]:
Movie_Albums_col = db["Movie_Albums"]
Albumns_Info_col = db["Albums_Info"]

# Spotify API credentials
# Read from the environment rather than embedded in the notebook; a KeyError
# here means the corresponding variable is not set.
# NOTE(review): the client id/secret that used to be hardcoded in this cell
# have been exposed and should be rotated in the Spotify developer dashboard.
client_id = os.environ["SPOTIPY_CLIENT_ID"]
client_secret = os.environ["SPOTIPY_CLIENT_SECRET"]

# Auth manager
sp = spotipy.Spotify(auth_manager=SpotifyClientCredentials(
    client_id=client_id,
    client_secret=client_secret))

In [15]:
def get_all_movie_titles():
    """Return the ``title`` of every ``top_200_movies`` document that has one."""
    titles = []
    # The projection drops `_id` and keeps only `title`.
    for doc in db["top_200_movies"].find({}, {"_id": 0, "title": 1}):
        if "title" in doc:
            titles.append(doc["title"])
    return titles


def search_and_store_spotify_album(term):
    """Search Spotify for a soundtrack album matching ``term`` and store the hit.

    At most one album is requested (``limit=1``). Each returned album is
    inserted into ``Movie_Albums_col`` as ``{"search_term": term, "raw_data": album}``.
    Nothing is stored when the search returns no albums.

    Args:
        term: Movie title to search for.
    """
    # The space after the title matters: plain concatenation would fuse the
    # title with the next word (e.g. "InceptionMovie Sound Track").
    query = f"{term} Movie Sound Track"
    results = sp.search(q=query, type='album', limit=1)
    albums = results['albums']['items']

    for album in albums:
        Movie_Albums_col.insert_one({
            "search_term": term,
            "raw_data": album })

def fetch_spotify_albums_for_all_movies():
    """Run the Spotify album search for every title from ``get_all_movie_titles``.

    A failure for one title is printed and does not stop the remaining titles.
    """
    for movie_title in get_all_movie_titles():
        try:
            search_and_store_spotify_album(movie_title)
            print(f"Fetched album(s) for: {movie_title}")
        except Exception as err:
            print(f"Error fetching for '{movie_title}': {err}")

fetch_spotify_albums_for_all_movies()

In [16]:
def get_album_ids_from_raw_data():
    """Return ``raw_data.id`` from every document in the ``Movie_Albums`` collection."""
    # The projection limits each returned document to the nested album id.
    stored_albums = db["Movie_Albums"].find({}, {"raw_data.id": 1})
    return [entry["raw_data"]["id"] for entry in stored_albums]
    
def fetch_and_store_raw_album_info(album_ids):
    """Fetch each album from Spotify and upsert it into the ``Albums_Info`` collection.

    Args:
        album_ids: Iterable of Spotify album IDs.
    """
    for current_id in album_ids:
        full_album = sp.album(current_id)

        # Keyed on album_id with upsert=True, so an existing record for the
        # same album is overwritten rather than duplicated.
        selector = {"album_id": current_id}
        payload = {"album_id": current_id, "raw_data": full_album}
        db["Albums_Info"].update_one(selector, {"$set": payload}, upsert=True)

        print(f"Stored raw album info for ID: {current_id}")
    print("Done")

In [None]:
# Collect the album IDs stored in Movie_Albums, then fetch and upsert the full
# Spotify record for each one into Albums_Info.
album_ids = get_album_ids_from_raw_data()
fetch_and_store_raw_album_info(album_ids)