<a href="https://colab.research.google.com/github/msaantonova/IR-IE-project---Musicals-Recommendation-System/blob/main/Musicals_recommendation_system.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**IR/ IE Project** - Musical recommendation system

Workflow:

1. Finding the corpus

Possible corpus: https://docs.google.com/document/d/1TqIBg_3frwDtaE518Cq_AxAdtSsM93OWp8MTfJq0iPE/edit?tab=t.0

POr existing dataset with movie musicals reviews: https://www.kaggle.com/datasets/bwandowando/rotten-tomatoes-best-musicals-of-all-time/data
2. Indexing
3. Knowledge graphs (where do they go?)
4. Create query examples
5. Build an extraction pipeline for recognition of names etc.

**1. Working with the dataset**

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import pandas as pd
import zipfile
import os
from google.colab import files

In [4]:
file_path1 = '/content/movies.csv'
with open(file_path1, 'r', encoding='utf-8') as f:
    for i in range(5):
        print(f.readline())

"movieId","movieTitle","movieYear","movieURL","critic_score","audience_score"

"6da4d992-97ca-3acc-834c-37c1ad34872c","Singin' in the Rain",1952,"https://www.rottentomatoes.com/m/singin_in_the_rain","100%","95%"

"56b1624a-1ceb-3598-8ee2-62236554094a","Top Hat",1935,"https://www.rottentomatoes.com/m/top_hat","100%","90%"

"718dd15b-50ee-3090-84c7-13244b587b00","Meet Me in St. Louis",1944,"https://www.rottentomatoes.com/m/meet_me_in_st_louis","100%","87%"

"80f30474-3ac3-36c7-a43b-648d1a627432","The Wizard of Oz",1939,"https://www.rottentomatoes.com/m/the_wizard_of_oz_1939","98%","89%"



**`3. Template for possible knowledge graph `**

In [None]:
def add_musical_to_graph(G, title, release_date, genre, location, director, actors, characters, songs, spotify_link, time_period, composer, source_material, plot):
    G.add_node(title, type='musical')

    # release date
    G.add_node(release_date, type='date')
    G.add_edge(title, release_date, relationship='released_on')

    # genre
    G.add_node(genre, type='genre')
    G.add_edge(title, genre, relationship='has_genre')

    # location
    G.add_node(location, type='place')
    G.add_edge(title, location, relationship='set_in')

    # director
    G.add_node(director, type='director')
    G.add_edge(title, director, relationship='directed_by')

    # actors
    for actor in actors:
        G.add_node(actor, type='actor')
        G.add_edge(title, actor, relationship='features_actor')

    # characters
    for character in characters:
        G.add_node(character, type='character')
        G.add_edge(title, character, relationship='features_character')
        G.add_edge(actor, character, relationship='played_by_actor')

    # songs
    for song in songs:
        G.add_node(song, type='song')
        G.add_edge(title, song, relationship='includes_song')
        G.add_edge(song, composer, relationship='written_by')
        G.add_edge(song, actor, relationship='performed_by')
        G.add_edge(song, character, relationship='performed_by')

    # Spotify or apple music
    G.add_node(spotify_link, type='link')
    G.add_edge(title, spotify_link, relationship='has_music_link')

    #time_period
    G.add_node(time_period, type='time_period')
    G.add_edge(title, time_period, relationship='set_in_time_period')

    #composer
    G.add_node(composer, type='composer')
    G.add_edge(composer, title, relationship='created_music_for')

    #source_material
    G.add_node(source_material, type='source_material')
    G.add_edge(title, source_material, relationship='is_based_on_source')

    #plot
    G.add_node(plot, type='plot')
    G.add_edge(title, plot, relationship='has_plot')


    return G


4. How to get information? Manuelly or parsing

EXAMPLE - Parsing from IMDB

In [6]:
!pip install IMDbPY
from imdb import IMDb

ia = IMDb()

results = ia.search_movie("Les Misérables")

def get_imdb_data(title):
    results = ia.search_movie(title)
    if not results:
        return {
            "imdb_title": None,
            "release_date": None,
            "directors": [],
            "genres": [],
            "actors": [],
            "characters": [],
            "plot": None
        }

    movie = results[0]
    ia.update(movie)

    return {
        "imdb_title": movie.get('title'),
        "release_date": movie.get('year'),
        "directors": [d['name'] for d in movie.get('directors', [])],
        "genres": movie.get('genres', []),
        "actors": [a['name'] for a in movie.get('cast', [])[:5]],
        "characters": [a.currentRole['name'] if a.currentRole else None for a in movie.get('cast', [])[:5]],
        "plot": movie.get('plot', [])

    }


Collecting IMDbPY
  Downloading IMDbPY-2022.7.9-py3-none-any.whl.metadata (498 bytes)
Collecting cinemagoer (from IMDbPY)
  Downloading cinemagoer-2023.5.1-py3-none-any.whl.metadata (2.9 kB)
Downloading IMDbPY-2022.7.9-py3-none-any.whl (1.2 kB)
Downloading cinemagoer-2023.5.1-py3-none-any.whl (297 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m297.2/297.2 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: cinemagoer, IMDbPY
Successfully installed IMDbPY-2022.7.9 cinemagoer-2023.5.1


In [29]:
!pip install IMDbPY

from imdb import IMDb

ia = IMDb()

results = ia.search_movie("Les Misérables")

movie = results[0]
ia.update(movie)

print("Title:", movie['title'])
print("Release date:", movie['year'])
print("Director(s):", [d['name'] for d in movie.get('directors', [])])
print("Genres:", movie.get('genres', []))
print("Actors:", [a['name'] for a in movie.get('cast', [])[:5]])
print("Characters:", [a.currentRole['name'] for a in movie.get('cast', [])[:5]])
print("Plot:", movie.get('plot', []))



Title: Les Misérables
Release date: 2012
Director(s): ['Tom Hooper']
Genres: ['Drama', 'Musical', 'Romance']
Actors: ['Hugh Jackman', 'Russell Crowe', 'Anne Hathaway', 'Amanda Seyfried', 'Sacha Baron Cohen']
Characters: ['Jean Valjean', 'Javert', 'Fantine', 'Cosette', 'Thénardier']
Plot: ["In 19th-century France, Jean Valjean, who, for decades, has been hunted by the ruthless policeman Javert after breaking parole, agrees to care for a factory worker's daughter. The decision changes their lives forever.", 'Jean Valjean, known as Prisoner 24601, is released from prison and breaks parole to create a new life for himself while evading the grip of the persistent Inspector Javert. Set in post-revolutionary France, the story reaches resolution against the background of the June Rebellion.—Anonymous', "Based on the novel by Victor Hugo, 'Les Miserables' travels with prisoner-on-parole, 24601, Jean Valjean, as he runs from the ruthless Inspector Javert on a journey beyond the barricades, at th

Parsing from wikipedia

In [30]:
!pip install wikipedia
import wikipedia
import re

wikipedia.set_lang("en")

movie = "Les Misérables (musical)"

try:
    page = wikipedia.page(movie)
    content = page.content

    composer_match = re.search(r"[Mm]usic by ([A-Z][\w\-\'é]+(?: [A-Z][\w\-\'é]+)+)", content)
    composer = composer_match.group(1) if composer_match else "Unknown"
    print("🎹 Composer:", composer)


    source_match = re.search(r"(?:based on|adapted from).*?novel.*?by ([A-Z][\w\-\'é]+(?: [A-Z][\w\-\'é]+)*)", content, re.IGNORECASE)
    source_material = f"Novel by {source_match.group(1)}" if source_match else "Unknown"
    print("📚 Source material:", source_material)

except wikipedia.exceptions.DisambiguationError as e:
    print("Unknown", e.options)
except wikipedia.exceptions.PageError:
    print("Unknown")

🎹 Composer: Claude-Michel Schönberg
📚 Source material: Novel by Victor Hugo


In [31]:
import requests
from bs4 import BeautifulSoup
import re

title = "Les Misérables (2012 film)"
url = "https://en.wikipedia.org/wiki/Les_Mis%C3%A9rables_(2012_film)"

response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')

songs = []
found_section = False

for tag in soup.find_all(['h2', 'h3', 'ul', 'ol', 'div', 'p']):
    # Проверка заголовка
    if tag.name in ['h2', 'h3'] and "Musical numbers" in tag.get_text():
        found_section = True
        continue

    if found_section:
        # Если это снова заголовок — значит секция закончилась
        if tag.name in ['h2', 'h3']:
            break

        # Если это список, ищем песни
        if tag.name in ['ul', 'ol']:
            for li in tag.find_all('li'):
                text = li.get_text(strip=True)
                match = re.match(r'["“](.*?)["”]', text)
                songs.append(match.group(1) if match else text)

        # Если это div, тоже смотрим на его li
        elif tag.name == 'div':
            for li in tag.find_all('li'):
                text = li.get_text(strip=True)
                match = re.match(r'["“](.*?)["”]', text)
                songs.append(match.group(1) if match else text)

print("🎵 Songs:", songs if songs else "Unknown")



🎵 Songs: ['Look Down', 'The Bishop', "Valjean's Soliloquy", 'At the End of the Day', 'The Runaway Cart', 'The Docks (Lovely Ladies)', 'I Dreamed a Dream', "Fantine's Arrest", 'Who Am I?', "Fantine's Death", 'The Confrontation', 'Castle on a Cloud', 'Master of the House', 'The Well Scene', 'The Bargain', 'The Thénardier Waltz of Treachery', 'Suddenly', 'The Convent', 'Stars', 'Paris/Look Down', 'The Robbery', "Javert's Intervention", "Éponine's Errand", 'ABC Café/Red and Black', 'In My Life', 'A Heart Full of Love', 'The Attack on Rue Plumet', 'On My Own', 'One Day More', 'Do You Hear the People Sing?', 'Building the Barricade (Upon These Stones)', "Javert's Arrival", 'Little People', 'A Little Fall of Rain', 'Night of Anguish', 'Drink With Me', 'Bring Him Home', 'Dawn of Anguish', 'The Second Attack (Death of Gavroche)', 'The Sewers', "Javert's Suicide", 'Turning', 'Empty Chairs at Empty Tables', 'A Heart Full of Love [Reprise]', "Valjean's Confession", 'Suddenly [Reprise]', 'Wedding C

In [32]:
import requests
from bs4 import BeautifulSoup
import re

def get_songs_from_wikipedia(title):
    url_title = title.replace(" ", "_")
    url = f"https://en.wikipedia.org/wiki/{url_title}"

    response = requests.get(url)
    if response.status_code != 200:
        return f"❌ Could not load page: {response.status_code}"

    soup = BeautifulSoup(response.text, 'html.parser')

    section_titles = ["Musical numbers", "Soundtrack", "Track listing", "Songs"]
    songs = []
    found_section = False

    for tag in soup.find_all(['h2', 'h3', 'ul', 'ol', 'div']):
        if tag.name in ['h2', 'h3']:
            header_text = tag.get_text().strip()
            if any(section in header_text for section in section_titles):
                found_section = True
                continue

        if found_section:
            if tag.name in ['h2', 'h3']:
                break
            if tag.name in ['ul', 'ol', 'div']:
                for li in tag.find_all('li'):
                    text = li.get_text(strip=True)
                    match = re.match(r'["“](.*?)["”]', text)
                    songs.append(match.group(1) if match else text)

    return songs if songs else "Unknown"


Getting the time period and location from the plot

In [33]:
!pip install IMDbPY
import re
from imdb import IMDb
import spacy

ia = IMDb()

results = ia.search_movie("Les Misérables")
movie = results[0]
ia.update(movie)

def extract_time_period(text):
    if not text:
        return None

    patterns = [
    r'\b\d{1,2}(st|nd|rd|th)?-century\b',  # ←  "19th-century"
    r'\b(1[5-9]|20|21)(st|nd|rd|th)?\s*century\b',  # "18th century"
    r'\b(19|18|17|20)\d{2}s\b',  # "1920s", "1880s"
    r'\b(during|in|around)\s+(World War I|World War II|Cold War)\b',
    r'\b(medieval|renaissance|victorian|industrial|modern|future|ancient)\s+era\b',
]


    for pattern in patterns:
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            return match.group(0)

    return "Unknown"

nlp = spacy.load("en_core_web_sm")

def extract_location(text):
    doc = nlp(text)
    locations = [ent.text for ent in doc.ents if ent.label_ in ("GPE", "LOC")]
    return list(set(locations)) if locations else ["Unknown"]

# get (plot)
plot_text = movie.get('plot')

if plot_text:
    #print("📘 Plot:\n", plot_text[0])
    time_period = extract_time_period(plot_text[0])
    print("Time period:", time_period)
else:
    print("Unknown")

location = extract_location(plot_text[0])
print("Location:", location)


Time period: 19th-century
Location: ['France']


There is a kaggle dataset that contains 100 most popular musicals. However there is not enough information forr the knowledge graph. Therefore the parsing could be done on thhe basis of the mentioned datased on the Wiipedia or IMDB website to get additional nessesary information.

In [113]:
import csv
from imdb import IMDb

ia = IMDb()

# Save results
musicals_data = []

input_file = 'movies.csv'
output_file = 'output_musicals.csv'

with open(input_file, 'r', encoding='utf-8') as f:
    reader = csv.DictReader(f)
    for i, row in enumerate(reader):
        if i >= 5:
            break

        title = row['movieTitle'].strip()

        try:
            results = ia.search_movie(title)
            if not results:
                print(f"Not found: {title}")
                continue

            # The get_imdb_data function should be defined outside the loop
            def get_imdb_data(title):
                results = ia.search_movie(title)
                if not results:
                    return {
                        "title": None,
                        "release_date": None,
                        "directors": [],
                        "genres": [],
                        "actors": [],
                        "characters": []
                    }

                movie = results[0]
                ia.update(movie)

                # Corrected indentation for the return statement
                return {
                    "title": movie.get('title'),
                    "release_date": movie.get('year'),
                    "directors": [d['name'] for d in movie.get('directors', [])],
                    "genres": movie.get('genres', []),
                    "actors": [a['name'] for a in movie.get('cast', [])],
                    "characters": [a.currentRole[0]['name'] if a.currentRole else None for a in movie.get('cast', [])],
                }

            # Call the get_imdb_data function and append its result
            data = get_imdb_data(title)
            musicals_data.append(data)

        except Exception as e:
            print(f"Error {title}: {e}")

# Save the results to a new CSV
with open(output_file, 'w', newline='', encoding='utf-8') as f:
    fieldnames = ["title", "release_date", "directors", "genres", "actors", "characters"]
    writer = csv.DictWriter(f, fieldnames=fieldnames)
    writer.writeheader()
    for data in musicals_data:
        writer.writerow(data)

Error Singin' in the Rain: 0
Error Top Hat: 0
Error Meet Me in St. Louis: 0
Error The Wizard of Oz: 0
Error A Hard Day's Night: 0


5. Next steps:

    1) Fix the knowledge graph according to the plan ✅

    2) Figure out how to get time period ✅

    3) Figure out how to save combined csv ✅

    4) Figure out how to add links (spotify/ apple music) - 🌀(set aside for now)

    5) Check requirements + testing queries

**6. Implementing everything to save as csv**

In [34]:
import wikipedia
import re

wikipedia.set_lang("en")

def get_composer_and_source(title):
    try:
        page = wikipedia.page(title)
        content = page.content

        composer_match = re.search(r"[Mm]usic by ([A-Z][\w\-\'é]+(?: [A-Z][\w\-\'é]+)+)", content)
        composer = composer_match.group(1) if composer_match else "Unknown"

        source_match = re.search(r"(?:based on|adapted from).*?novel.*?by ([A-Z][\w\-\'é]+(?: [A-Z][\w\-\'é]+)*)", content, re.IGNORECASE)
        source_material = f"Novel by {source_match.group(1)}" if source_match else "Unknown"

        return composer, source_material

    except (wikipedia.exceptions.DisambiguationError, wikipedia.exceptions.PageError):
        return "Unknown", "Unknown"


In [35]:
import requests
from bs4 import BeautifulSoup

def get_songs_from_wikipedia(title):
    url_title = title.replace(" ", "_")
    url = f"https://en.wikipedia.org/wiki/{url_title}"

    response = requests.get(url)
    if response.status_code != 200:
        return ["Unknown"]

    soup = BeautifulSoup(response.text, 'html.parser')
    section_titles = ["Musical numbers", "Soundtrack", "Track listing", "Songs"]
    songs = []
    found_section = False

    for tag in soup.find_all(['h2', 'h3', 'ul', 'ol', 'div']):
        if tag.name in ['h2', 'h3']:
            header_text = tag.get_text().strip()
            if any(section in header_text for section in section_titles):
                found_section = True
                continue

        if found_section:
            if tag.name in ['h2', 'h3']:
                break
            if tag.name in ['ul', 'ol', 'div']:
                for li in tag.find_all('li'):
                    text = li.get_text(strip=True)
                    match = re.match(r'["“](.*?)["”]', text)
                    songs.append(match.group(1) if match else text)

    return songs if songs else ["Unknown"]


In [36]:
import re
import spacy
from imdb import IMDb

nlp = spacy.load("en_core_web_sm")
ia = IMDb()

def extract_time_period(text):
    if not text:
        return "Unknown"

    patterns = [
        r'\b\d{1,2}(st|nd|rd|th)?-century\b',
        r'\b(1[5-9]|20|21)(st|nd|rd|th)?\s*century\b',
        r'\b(19|18|17|20)\d{2}s\b',
        r'\b(during|in|around)\s+(World War I|World War II|Cold War)\b',
        r'\b(medieval|renaissance|victorian|industrial|modern|future|ancient)\s+era\b',
    ]

    for pattern in patterns:
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            return match.group(0)

    return "Unknown"

def extract_location(text):
    doc = nlp(text)
    locations = [ent.text for ent in doc.ents if ent.label_ in ("GPE", "LOC")]
    return list(set(locations)) if locations else ["Unknown"]

def get_plot_data(title):
    try:
        results = ia.search_movie(title)
        if not results:
            return "Unknown", ["Unknown"]
        movie = results[0]
        ia.update(movie)
        plot = movie.get('plot')
        if plot:
            return extract_time_period(plot[0]), extract_location(plot[0])
        else:
            return "Unknown", ["Unknown"]
    except Exception:
        return "Unknown", ["Unknown"]

In [16]:
def process_row(title):
    imdb_data = get_imdb_data(title)
    if not imdb_data:
        return None

    composer, source_material = get_composer_and_source(title)
    songs = get_songs_from_wikipedia(title)
    time_period, locations = get_plot_data(title)

    return {
        **imdb_data,
        "composer": composer,
        "source_material": source_material,
        "songs": songs,
        "time_period": time_period,
        "locations": locations
    }


In [37]:
def get_imdb_data(title):
    results = ia.search_movie(title)
    if not results:
        return {
            "title": None,
            "release_date": None,
            "directors": [],
            "genres": [],
            "actors": [],
            "characters": [],
            "plot": None
        }

    movie = results[0]
    ia.update(movie)

    # Check if currentRole exists and is not empty before accessing elements
    return {
        "title": movie.get('title'),
        "release_date": movie.get('year'),
        "directors": [d['name'] for d in movie.get('directors', [])],
        "genres": movie.get('genres', []),
        "actors": [a['name'] for a in movie.get('cast', [])],
        "characters": [a.currentRole[0].get('name') if a.currentRole and isinstance(a.currentRole, list) and len(a.currentRole) > 0 and isinstance(a.currentRole[0], dict) else None for a in movie.get('cast', [])],
        "plot": movie.get('plot', [])
    }

In [39]:
import csv

def process_musicals(input_file, output_file, critic_file, limit=5):
    results = []

    # Сначала считываем отзывы критиков в словарь {movieId: quote}
    critic_quotes = {}
    with open(critic_file, 'r', encoding='utf-8') as f:
        reader = csv.DictReader(f)
        for row in reader:
            movie_id = row['movieId'].strip()
            quote = row['quote'].strip()
            if movie_id and quote:
                critic_quotes[movie_id] = quote

    # Обрабатываем мюзиклы и добавляем цитаты
    with open(input_file, 'r', encoding='utf-8') as f:
        reader = csv.DictReader(f)
        for i, row in enumerate(reader):
            title = row['movieTitle'].strip()
            movie_id = row['movieId'].strip()
            data = process_row(title)
            if data:
                data['id'] = i + 1               # Внутренний порядковый номер
                data['movieId'] = movie_id       # Внешний ID фильма
                data['quote'] = critic_quotes.get(movie_id, '')  # Цитата если есть
                results.append(data)

    # Собираем все возможные ключи
    all_keys = set()
    for r in results:
        all_keys.update(r.keys())
    fieldnames = ['id', 'movieId'] + sorted(k for k in all_keys if k not in ['id', 'movieId'])

    # Записываем в файл
    with open(output_file, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        for row in results:
            writer.writerow(row)

# Вызов
input_file = 'movies.csv'
critic_file = 'critic_reviews.csv'
output_file = 'output_musicals.csv'

process_musicals(input_file, output_file, critic_file)



In [40]:
import networkx as nx
import csv
import ast

G = nx.DiGraph()  # важно!

def add_musical_to_graph(graph, title, release_date, genre, location,
                         director, actors, characters, songs,
                         spotify_link, time_period, composer, source_material, plot, quote):
    musical_node = f"MUSICAL: {title}"
    graph.add_node(musical_node, type='musical', release_date=release_date)

    for g in genre:
        graph.add_node(g, type='genre')
        graph.add_edge(musical_node, g, relationship='has_genre')

    for actor in actors:
        graph.add_node(actor, type='actor')
        graph.add_edge(musical_node, actor, relationship='features_actor')

    graph.add_node(director, type='director')
    graph.add_edge(musical_node, director, relationship='directed_by')

    graph.add_node(release_date, type='date')
    graph.add_edge(musical_node, release_date, relationship='released_on')

    graph.add_node(location, type='place')
    graph.add_edge(musical_node, location, relationship='set_in')

    graph.add_node(composer, type='composer')
    graph.add_edge(musical_node, composer, relationship='created_music_for')

    graph.add_node(source_material, type='source_material')
    graph.add_edge(musical_node, source_material, relationship='is_based_on_source')

    for character in characters:
        # Check if character is not None before adding as node
        if character is not None:
            graph.add_node(character, type='character')
            graph.add_edge(musical_node, character, relationship='features_character')

    for song in songs:
        song_node = f"SONG: {song}"
        graph.add_node(song_node, type='song')
        graph.add_edge(musical_node, song_node, relationship='includes_song')

    graph.add_node(spotify_link, type='link')
    graph.add_edge(musical_node, spotify_link, relationship='has_music_link')

    graph.add_node(time_period, type='time_period')
    graph.add_edge(musical_node, time_period, relationship='set_in_time_period')

    G.add_node(plot, type='plot')
    G.add_edge(title, plot, relationship='has_plot')

    G.add_node(quote, type='quote')
    G.add_edge(title, quote, relationship='has_quote')

    print(f"✅ Added musical: {title}")

def load_musicals_to_graph(csv_file):
    with open(csv_file, 'r', encoding='utf-8') as f:
        reader = csv.DictReader(f)
        for row in reader:
            title = row['title']
            release_date = row['release_date']
            genre = ast.literal_eval(row['genres']) if row['genres'] else ["Unknown"]
            location = ast.literal_eval(row['locations'])[0] if row['locations'] else "Unknown"
            director = ast.literal_eval(row['directors'])[0] if row['directors'] and ast.literal_eval(row['directors']) else "Unknown"
            actors = ast.literal_eval(row['actors'])
            characters = ast.literal_eval(row['characters'])
            songs = ast.literal_eval(row['songs']) if row['songs'] else []
            spotify_link = row.get('spotify_link', 'Unknown')
            time_period = row.get('time_period', 'Unknown')
            composer = row.get('composer', 'Unknown')
            source_material = row.get('source_material', 'Unknown')
            plot = row.get('plot', 'Unknown')
            quote = row.get('quote', 'Unknown')

            add_musical_to_graph(G, title, release_date, genre, location,
                                 director, actors, characters, songs,
                                 spotify_link, time_period, composer, source_material, plot, quote)

    print("✅ All musicals added to the graph!")
    return G

# Запуск:
load_musicals_to_graph('output_musicals.csv')
print("Graph nodes:", G.nodes(data=True))


✅ Added musical: Singin' in the Rain
✅ Added musical: Top Hat
✅ Added musical: Meet Me in St. Louis
✅ Added musical: The Wizard of Oz
✅ Added musical: A Hard Day's Night
✅ Added musical: Gentlemen Prefer Blondes
✅ Added musical: A Star Is Born
✅ Added musical: The Young Girls of Rochefort
✅ Added musical: The Umbrellas of Cherbourg
✅ Added musical: Once Upon a Time... in Hollywood
✅ Added musical: A Night at the Opera
✅ Added musical: An American in Paris
✅ Added musical: Mary Poppins
✅ Added musical: The Muppets
✅ Added musical: Sing Street
✅ Added musical: Lagaan: Once Upon a Time in India
✅ Added musical: In the Heights
✅ Added musical: Funny Girl
✅ Added musical: My Fair Lady
✅ Added musical: Enchanted
✅ Added musical: Cabaret
✅ Added musical: West Side Story
✅ Added musical: Hairspray
✅ Added musical: Hedwig and the Angry Inch
✅ Added musical: La La Land
✅ Added musical: West Side Story
✅ Added musical: The Sapphires
✅ Added musical: Willy Wonka & the Chocolate Factory
✅ Added mus

In [41]:
import pickle

with open("musical_graph.gpickle", "wb") as f:
    pickle.dump(G, f)


In [24]:
def filter_graph_by_query_all(graph, query):
    query_parts = [q.strip().title() for q in query.split(",")]

    for part in query_parts:
        if not graph.has_node(part):
            print(f"No match")
            return []

    filtered_musicals = []

    for node in graph.nodes:
        if graph.nodes[node].get('type') == 'musical':
            if all(
                graph.has_edge(node, part) or graph.has_edge(part, node)
                for part in query_parts
            ):
                filtered_musicals.append(node)

    return filtered_musicals



In [25]:
query = "Lin-Manuel miranda"
filtered_results = filter_graph_by_query_all(G, query)
print("Recommended musicals:", filtered_results)

Recommended musicals: ['MUSICAL: In the Heights', 'MUSICAL: tick, tick... BOOM!', 'MUSICAL: Mary Poppins Returns']


In [26]:
query = "Italy, comedy"
filtered_results = filter_graph_by_query_all(G, query)
print("Recommended musicals:", filtered_results)

No match
Recommended musicals: []


In [27]:
query = "2011"
filtered_results = filter_graph_by_query_all(G, query)
print("Recommended musicals:", filtered_results)

Recommended musicals: ['MUSICAL: The Muppets', 'MUSICAL: The Help']


In [28]:
def find_musicals_by_location(G, location_name):
    results = []
    for musical in G.nodes:
        if G.nodes[musical].get("type") == "musical":
            for neighbor in G.neighbors(musical):
                if (G.nodes[neighbor].get("type") == "place" and
                        neighbor.lower() == location_name.lower()):
                    for edge in G.edges(musical, data=True):
                        if edge[1] == neighbor and edge[2].get("relationship") == "set_in":
                            results.append(musical)
    return results

# Пример использования:
query_location = "New York"
found_musicals = find_musicals_by_location(G, query_location)
print(f"Найдено мюзиклов, действие в '{query_location}':", found_musicals)

# Проверка результата
expected = set(["Hamilton", "Rent", "West Side Story"])  # Подставь реальные названия из графа
found = set(found_musicals)
print("✅ Совпадение:", found == expected)
print("🔍 Пропущенные:", expected - found)
print("🎯 Лишние:", found - expected)


Найдено мюзиклов, действие в 'New York': ['MUSICAL: Everyone Says I Love You']
✅ Совпадение: False
🔍 Пропущенные: {'Rent', 'Hamilton', 'West Side Story'}
🎯 Лишние: {'MUSICAL: Everyone Says I Love You'}


**Next steps:**

1. Check and correct the parsing for imdb and wikipedia (check csv for empty entries)
2. Fix the parameters for queries
3. Figure out adding the link
4. Recommens similar movies in the output
5. ***Extra create a separate csv for broadway musical (manually with all information)
