# Italian Cinema Data Mining & Geocoding

| Section | Description | Output |
|---------|-------------|--------|
| **1. Setup** | Install dependencies, load libraries | - |
| **2. Scraping** | Extract film data from website | `filmitalia_raw.csv` |
| **3. Cleanup** | Clean and enrich data | `filmitalia_cleaned.csv` |
| **4. Location Extraction** | Extract from **synopsis** AND **wikipedia_summary** | `filmitalia_locations.csv` |
| **5. Geocoding** | Geocode ALL locations via Nominatim | `filmitalia_geocoded.csv` |
| **6. Visualization** | Maps and comparative analysis | `database.csv` |

**Output columns:** `locations_synopsis`, `coordinates_synopsis`, `locations_wiki`, `coordinates_wiki`

---
## 1. SETUP & IMPORTS

In [None]:
import pandas as pd
import numpy as np
import re
import json
import time
import os
import ast
from collections import Counter

import requests
from bs4 import BeautifulSoup
from tqdm.auto import tqdm

import spacy
try:
    nlp = spacy.load('it_core_news_sm')
except:
    !python -m spacy download it_core_news_sm
    nlp = spacy.load('it_core_news_sm')

print("Setup complete!")

---
## 2. WEB SCRAPING

**⚠️ Set `RUN_SCRAPING = True` to execute**

In [None]:
RUN_SCRAPING = False

In [None]:
if RUN_SCRAPING:
    
    def scrape_film_urls(page_num, retries=3, delay=2):
        """Return the unique film-page URLs linked from one listing page.

        Args:
            page_num: 1-based index of the listing page to fetch.
            retries: Maximum number of attempts before giving up.
            delay: Seconds to wait between failed attempts.

        Returns:
            A sorted list of absolute film URLs, or an empty list when the
            page holds no film links or every attempt failed.
        """
        base_url = "https://filmitalia.org"
        url = f"{base_url}/it/film/pag-{page_num}/"
        for attempt in range(retries):
            try:
                response = requests.get(url, timeout=10)
                response.raise_for_status()
                soup = BeautifulSoup(response.content, 'html.parser')
                film_links = set()
                for link in soup.find_all('a', href=True):
                    href = link['href']
                    if '/it/film/' not in href or not href.endswith('/'):
                        continue
                    # The listing pages themselves live under /it/film/
                    # ("/it/film/" and "/it/film/pag-N/"), so they match the
                    # filter above; skip them so they are not stored as films.
                    slug = href.split('/it/film/', 1)[1]
                    if not slug or slug.startswith('pag-'):
                        continue
                    if href.startswith('/'):
                        href = base_url + href
                    film_links.add(href)
                # Sorted so repeated runs yield the same order.
                return sorted(film_links)
            except Exception as e:
                print(f"Attempt {attempt + 1} failed for page {page_num}: {e}")
                if attempt < retries - 1:
                    time.sleep(delay)
        return []

    # Walk the paginated listing (pages 1 to 499 at most) and gather film URLs.
    print("Scraping film URLs...")
    all_film_urls = []
    for page in tqdm(range(1, 500)):
        urls = scrape_film_urls(page)
        # An empty result (no links found, or every retry failed) ends the crawl.
        if not urls:
            print(f"No URLs found on page {page}. Stopping.")
            break
        all_film_urls.extend(urls)
        time.sleep(1)  # pause between listing requests

    # De-duplicate across pages and persist the URL list for the detail crawl.
    df_urls = pd.DataFrame({'film_url': list(set(all_film_urls))})
    df_urls.to_csv('filmitalia_urls.csv', index=False)
    print(f"Scraped {len(df_urls)} unique film URLs")

    def scrape_film_details(film_url, retries=3, delay=2):
        """Fetch one film page and return its fields as a dict.

        The result always carries 'film_url'. On success it also carries
        'title' (text of the first <h1>, or None), one entry per labelled
        'field' div, and 'synopsis' when a 'field-name-body' div exists.
        After `retries` failed attempts it carries an 'error' entry instead.
        """
        for attempt_no in range(1, retries + 1):
            try:
                page = requests.get(film_url, timeout=10)
                page.raise_for_status()
                soup = BeautifulSoup(page.content, 'html.parser')

                heading = soup.find('h1')
                film_data = {
                    'film_url': film_url,
                    'title': heading.text.strip() if heading else None,
                }

                # Each labelled field becomes a column named after its label.
                for field in soup.find_all('div', class_='field'):
                    label = field.find('div', class_='field-label')
                    value = field.find('div', class_='field-items')
                    if not (label and value):
                        continue
                    field_name = label.text.strip().rstrip(':')
                    film_data[field_name] = value.text.strip()

                body = soup.find('div', class_='field-name-body')
                if body:
                    film_data['synopsis'] = body.text.strip()
                return film_data
            except Exception as e:
                print(f"Attempt {attempt_no} failed for {film_url}: {e}")
                if attempt_no < retries:
                    time.sleep(delay)
        return {'film_url': film_url, 'error': 'Failed to scrape'}

    # Fetch the detail page of every URL saved by the listing crawl.
    print("\nScraping film details...")
    df_urls = pd.read_csv('filmitalia_urls.csv')
    films_data = []
    for url in tqdm(df_urls['film_url']):
        films_data.append(scrape_film_details(url))
        time.sleep(1)  # pause between detail requests

    # One row per film; failed pages keep their URL plus an 'error' column.
    df = pd.DataFrame(films_data)
    df.to_csv('filmitalia_raw.csv', index=False)
    print(f"Scraped details for {len(df)} films")

else:
    print("Scraping inactive. Set RUN_SCRAPING = True to execute.")

---
## 3. DATA CLEANUP

**⚠️ Set `RUN_CLEANUP = True` to execute**

In [None]:
RUN_CLEANUP = False

In [None]:
# [OPTIONAL LOAD] df = pd.read_csv('filmitalia_raw.csv')

In [None]:
if RUN_CLEANUP:
    print("Loading raw scraped data...")
    df = pd.read_csv('filmitalia_raw.csv')

    def parse_duration(duration_str):
        """Convert a free-text duration into whole minutes.

        Returns None for missing values and for text without digits. When
        the text contains 'h' or 'ore', the first number is read as hours
        and the second (if any) as minutes; otherwise the first number is
        taken as minutes.
        """
        if pd.isna(duration_str):
            return None
        text = str(duration_str).strip()
        digits = re.findall(r'\d+', text)
        if not digits:
            return None

        first = int(digits[0])
        lowered = text.lower()
        if 'h' not in lowered and 'ore' not in lowered:
            return first

        minutes = first * 60
        if len(digits) >= 2:
            minutes += int(digits[1])
        return minutes

    if 'duration' in df.columns:
        df['duration_minutes'] = df['duration'].apply(parse_duration)

    if 'wikidata_id' in df.columns:
        def get_wikidata_summary(qid):
            """Return the Italian Wikidata description for `qid`, or None.

            The lookup is best-effort: missing ids, network or HTTP errors,
            invalid JSON and unexpected payload shapes all yield None.
            """
            if pd.isna(qid):
                return None
            try:
                url = f"https://www.wikidata.org/wiki/Special:EntityData/{qid}.json"
                response = requests.get(url, timeout=10)
                response.raise_for_status()
                data = response.json()
                entity = data.get('entities', {}).get(qid, {})
                return entity.get('descriptions', {}).get('it', {}).get('value')
            except (requests.RequestException, ValueError, AttributeError):
                # Named exceptions only, so Ctrl-C can still stop the long
                # progress_apply run (a bare except swallowed it).
                return None

        print("Enriching with Wikidata summaries...")
        tqdm.pandas()
        df['wikidata_summary'] = df['wikidata_id'].progress_apply(get_wikidata_summary)

    df.to_csv('filmitalia_cleaned.csv', index=False)
    print(f"Cleaned data saved: {len(df)} films")
else:
    print("Cleanup inactive. Set RUN_CLEANUP = True to execute.")

---
## 4. LOCATION EXTRACTION (spaCy NER)

Extracts from **BOTH** `synopsis` and `wikipedia_summary`

In [None]:
# Load data from GitHub
GITHUB_DATA_URL = "https://raw.githubusercontent.com/lucagiovannini7/modelling-italian-films/refs/heads/main/filmitalia_details_enriched.csv"
df = pd.read_csv(GITHUB_DATA_URL)
print(f"Loaded {len(df)} films from GitHub")
print(f"Columns: {list(df.columns)}")

In [None]:
def extract_locations(text):
    """Extract location entities (GPE, LOC) from Italian text using spaCy.

    Args:
        text: Text to analyse; missing values (None/NaN) are accepted.

    Returns:
        The distinct entity strings labelled GPE or LOC, in order of first
        mention. Missing input gives an empty list.
    """
    if pd.isna(text):
        return []
    doc = nlp(str(text))
    wanted_labels = ('GPE', 'LOC')
    # dict.fromkeys de-duplicates while keeping first-mention order, so the
    # saved CSVs are identical between runs (set order is not).
    return list(dict.fromkeys(
        ent.text for ent in doc.ents if ent.label_ in wanted_labels
    ))

In [None]:
# Load stopwords
# Comma-separated stopword list used to discard non-location entities.
STOPWORDS_URL = "https://raw.githubusercontent.com/lucagiovannini7/modelling-italian-films/refs/heads/main/stopwords.txt"
# Explicit timeout (as in the other requests in this notebook) so a stalled
# connection cannot hang the cell indefinitely.
response = requests.get(STOPWORDS_URL, timeout=10)
response.raise_for_status()
stopwords = [w.strip() for w in response.text.split(",") if w.strip()]
print(f"Loaded {len(stopwords)} stopwords")

In [None]:
# Entities shorter than this are discarded.
MIN_LENGTH = 3
# All-uppercase entities shorter than this are discarded as acronyms.
MIN_ACRONYM_LENGTH = 5
# Leading Italian prepositions/articles (followed by whitespace) to strip.
ITALIAN_PREFIXES = r'^(di|a|da|in|nel|nell|nella|negli|per|verso|il|la|lo|le|i|gli|del|della|dei|degli|al|alla|ai|agli)\s+'

def clean_locations(location_list):
    """Remove stopwords, invalid entries, and Italian prefixes.

    Args:
        location_list: Raw entity strings for one film.

    Returns:
        The cleaned names in their original order, without duplicates.
        Anything that is not a non-empty list gives an empty list.
    """
    if not location_list or not isinstance(location_list, list):
        return []
    cleaned = []
    seen = set()
    for loc in location_list:
        if loc in stopwords or len(loc) < MIN_LENGTH or (loc.isupper() and len(loc) < MIN_ACRONYM_LENGTH):
            continue
        loc = re.sub(ITALIAN_PREFIXES, '', loc, flags=re.IGNORECASE)
        # Drop a lowercase fragment glued to the front of a capitalised name.
        if match := re.match(r'^[a-z]+([A-Z].*)$', loc):
            loc = match.group(1)
        # Normalisation can map distinct raw entities ("Roma", "a Roma") to
        # the same name; keep one so a film counts each location once.
        if loc and loc not in seen:
            seen.add(loc)
            cleaned.append(loc)
    return cleaned

In [None]:
# Load silver standard
# One accepted location name per line; used as an allow-list.
SILVER_STANDARD_URL = "https://raw.githubusercontent.com/lucagiovannini7/modelling-italian-films/refs/heads/main/silver_standard.txt"
# Explicit timeout (as in the other requests in this notebook) so a stalled
# connection cannot hang the cell indefinitely.
response = requests.get(SILVER_STANDARD_URL, timeout=10)
response.raise_for_status()
silver_standard = set(line.strip() for line in response.text.splitlines() if line.strip())
print(f"Loaded {len(silver_standard)} locations from silver_standard.txt")

In [None]:
def collect_discrepancies(entities, standards_set, discrepancy_collector):
    """Add every entity missing from `standards_set` to `discrepancy_collector`.

    Inputs that are not lists are ignored. The collector is mutated in
    place and nothing is returned.
    """
    if isinstance(entities, list):
        unknown = (item for item in entities if item not in standards_set)
        for item in unknown:
            discrepancy_collector.add(item)

def filter_by_silver_standard(entities):
    """Return the entities present in `silver_standard`, in input order.

    Inputs that are not lists give an empty list.
    """
    kept = []
    if isinstance(entities, list):
        for candidate in entities:
            if candidate in silver_standard:
                kept.append(candidate)
    return kept

### 4.1 Extract from SYNOPSIS

In [None]:
# Run NER over every synopsis; each cell becomes a list of entity strings.
print("Extracting locations from SYNOPSIS...")
tqdm.pandas()
df['locations_synopsis'] = df['synopsis'].progress_apply(extract_locations)

# Total entity mentions across all films, before any cleaning.
raw_count = sum(len(x) for x in df['locations_synopsis'])
print(f"Raw extraction: {raw_count} mentions")

# Drop stopwords/short entries and strip Italian prefixes, then recount.
df['locations_synopsis'] = df['locations_synopsis'].apply(clean_locations)
clean_count = sum(len(x) for x in df['locations_synopsis'])
print(f"After cleaning: {clean_count} mentions")

In [None]:
# Check discrepancies BEFORE filtering
# Gather cleaned synopsis entities that the silver standard does not list,
# so they can be reviewed by hand before the filter removes them.
discrepancies_synopsis = set()
for film_locations in df['locations_synopsis']:
    collect_discrepancies(film_locations, silver_standard, discrepancies_synopsis)
print(f"\nSYNOPSIS: {len(discrepancies_synopsis)} unique elements NOT in silver_standard.txt")
if discrepancies_synopsis:
    print("\n--- Elements in locations_synopsis NOT in silver_standard ---")
    for element in sorted(discrepancies_synopsis):
        print(element)

In [None]:
# Apply silver standard filter
# Keep only synopsis entities listed in the silver standard, then recount.
df['locations_synopsis'] = df['locations_synopsis'].apply(filter_by_silver_standard)
final_count = sum(len(x) for x in df['locations_synopsis'])
print(f"After silver standard filter: {final_count} mentions")

### 4.2 Extract from WIKIPEDIA_SUMMARY

In [None]:
# Run NER over every Wikipedia summary; each cell becomes a list of entity strings.
print("Extracting locations from WIKIPEDIA_SUMMARY...")
tqdm.pandas()
df['locations_wiki'] = df['wikipedia_summary'].progress_apply(extract_locations)

# Total entity mentions across all films, before any cleaning.
raw_count = sum(len(x) for x in df['locations_wiki'])
print(f"Raw extraction: {raw_count} mentions")

# Drop stopwords/short entries and strip Italian prefixes, then recount.
df['locations_wiki'] = df['locations_wiki'].apply(clean_locations)
clean_count = sum(len(x) for x in df['locations_wiki'])
print(f"After cleaning: {clean_count} mentions")

In [None]:
# Check discrepancies BEFORE filtering
# Gather cleaned Wikipedia entities that the silver standard does not list,
# so they can be reviewed by hand before the filter removes them.
discrepancies_wiki = set()
for film_locations in df['locations_wiki']:
    collect_discrepancies(film_locations, silver_standard, discrepancies_wiki)
print(f"\nWIKIPEDIA: {len(discrepancies_wiki)} unique elements NOT in silver_standard.txt")
if discrepancies_wiki:
    print("\n--- Elements in locations_wiki NOT in silver_standard ---")
    for element in sorted(discrepancies_wiki):
        print(element)

In [None]:
# Apply silver standard filter
# Keep only Wikipedia entities listed in the silver standard, then recount.
df['locations_wiki'] = df['locations_wiki'].apply(filter_by_silver_standard)
final_count = sum(len(x) for x in df['locations_wiki'])
print(f"After silver standard filter: {final_count} mentions")

In [None]:
# Summary statistics
# Flatten the per-film lists into one list of mentions per source.
synopsis_locs = []
for film_locations in df['locations_synopsis']:
    synopsis_locs.extend(film_locations)
wiki_locs = []
for film_locations in df['locations_wiki']:
    wiki_locs.extend(film_locations)

banner = "=" * 50
print("\n" + banner)
print("LOCATION EXTRACTION SUMMARY")
print(banner)
print(f"Synopsis:  {len(synopsis_locs)} mentions, {len(set(synopsis_locs))} unique")
print(f"Wikipedia: {len(wiki_locs)} mentions, {len(set(wiki_locs))} unique")
print(f"Combined unique: {len(set(synopsis_locs).union(wiki_locs))}")

In [None]:
# [SAVE]
df.to_csv('filmitalia_locations.csv', index=False)
print("Saved: filmitalia_locations.csv")

---
## 5. GEOCODING (Nominatim)

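- `USE_GEOCACHE = True` → Load cached coordinates from `geocache.txt` (if the file exists) and geocode only the locations missing from the cache
- `USE_GEOCACHE = False` → Geocode all locations from scratch (the cache file is then overwritten)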

In [None]:
# [OPTIONAL LOAD]
# df = pd.read_csv('filmitalia_locations.csv')
# df['locations_synopsis'] = df['locations_synopsis'].apply(lambda x: ast.literal_eval(x) if pd.notna(x) and x != '[]' else [])
# df['locations_wiki'] = df['locations_wiki'].apply(lambda x: ast.literal_eval(x) if pd.notna(x) and x != '[]' else [])

In [None]:
USE_GEOCACHE = False
GEOCACHE_FILE = 'geocache.txt'

In [None]:
def geocode_location(location_name):
    """Geocode a location using Nominatim.

    Args:
        location_name: Free-text place name sent as the search query.

    Returns:
        A `(latitude, longitude)` tuple of floats for the first result, or
        `(None, None)` when there is no result or the request fails.
    """
    try:
        response = requests.get(
            "https://nominatim.openstreetmap.org/search",
            params={'q': location_name, 'format': 'json', 'limit': 1},
            headers={'User-Agent': 'ItalianCinemaResearch/1.0'},
            timeout=10
        )
        # Report HTTP failures (e.g. throttling) as such rather than as a
        # confusing parse error on the error payload.
        response.raise_for_status()
        data = response.json()
        if data:
            return (float(data[0]['lat']), float(data[0]['lon']))
    except Exception as e:
        print(f"Error geocoding {location_name}: {e}")
    return (None, None)

def load_geocache(filepath):
    """Read a coordinate cache CSV into a `{location: [lat, lon]}` dict.

    Columns are looked up by name ('location', 'latitude', 'longitude'),
    falling back to the first three positions. Rows without a location or
    with a missing coordinate are skipped. A missing file gives an empty
    dict; a read error prints a warning and returns what was loaded so far.
    """
    cache = {}
    if not os.path.exists(filepath):
        return cache
    try:
        for _, row in pd.read_csv(filepath).iterrows():
            width = len(row)
            loc = row.get('location', row.iloc[0] if width > 0 else None)
            lat = row.get('latitude', row.iloc[1] if width > 1 else None)
            lon = row.get('longitude', row.iloc[2] if width > 2 else None)
            if not loc or pd.isna(lat) or pd.isna(lon):
                continue
            cache[loc] = [float(lat), float(lon)]
    except Exception as e:
        print(f"Warning: Could not load geocache: {e}")
    return cache

def save_geocache(cache, filepath):
    """Write the successfully geocoded entries of `cache` to a CSV file.

    Args:
        cache: Mapping of location name to `[lat, lon]`, or to a falsy
            value for failed lookups (those are not written).
        filepath: Destination CSV path; an existing file is overwritten.
    """
    rows = [{'location': k, 'latitude': v[0], 'longitude': v[1]}
            for k, v in cache.items() if v]
    # Explicit columns keep the header even with zero rows, so the file can
    # always be read back as a CSV.
    pd.DataFrame(rows, columns=['location', 'latitude', 'longitude']).to_csv(filepath, index=False)
    print(f"Saved {len(rows)} locations to {filepath}")

In [None]:
# Collect ALL unique locations from BOTH sources
# Pool the distinct location names of both sources, synopsis first, so the
# Wikipedia contribution can be reported as "additional".
all_unique_locations = set()
for film_locations in df['locations_synopsis']:
    if isinstance(film_locations, list):
        all_unique_locations.update(film_locations)
synopsis_unique = len(all_unique_locations)

for film_locations in df['locations_wiki']:
    if isinstance(film_locations, list):
        all_unique_locations.update(film_locations)
wiki_added = len(all_unique_locations) - synopsis_unique

print(f"Unique from synopsis: {synopsis_unique}")
print(f"Additional from wikipedia: {wiki_added}")
print(f"Total to geocode: {len(all_unique_locations)}")

In [None]:
# Decide which locations still need a Nominatim lookup.
if USE_GEOCACHE and os.path.exists(GEOCACHE_FILE):
    print(f"Loading geocache from {GEOCACHE_FILE}...")
    location_coords = load_geocache(GEOCACHE_FILE)
    print(f"Loaded {len(location_coords)} cached coordinates")
    # Only names absent from the cache are queried.
    to_geocode = all_unique_locations - set(location_coords.keys())
    print(f"Need to geocode {len(to_geocode)} new locations")
else:
    # No cache requested (or no cache file): start empty and query everything.
    location_coords = {}
    to_geocode = all_unique_locations
    print(f"Starting fresh geocoding for {len(to_geocode)} locations")

In [None]:
# Query Nominatim for every pending location, then persist the merged cache.
if to_geocode:
    print(f"\nGeocoding {len(to_geocode)} locations...")
    new_coords = {}
    for location in tqdm(to_geocode):
        lat, lon = geocode_location(location)
        # Explicit None test: 0.0 is a valid latitude/longitude and must not
        # be mistaken for a failed lookup.
        if lat is not None and lon is not None:
            new_coords[location] = [lat, lon]
        else:
            new_coords[location] = None
        time.sleep(1)  # one request per second

    location_coords.update(new_coords)
    save_geocache(location_coords, GEOCACHE_FILE)

# Count only locations of the current dataset: a loaded cache may also hold
# names that are not in `all_unique_locations`.
success_count = sum(1 for loc in all_unique_locations if location_coords.get(loc))
print(f"\nSuccessfully geocoded: {success_count}/{len(all_unique_locations)} locations")

In [None]:
def get_coordinates(location_list):
    """Map each location name to its geocoded `[lat, lon]` pair.

    The result has the same length and order as `location_list`, with None
    where no coordinates are known. Keeping the positions aligned matters
    because later cells pair the two columns with `zip`; dropping failed
    lookups would attach coordinates to the wrong names.

    Inputs that are not lists give an empty list.
    """
    if not isinstance(location_list, list):
        return []
    return [location_coords.get(loc) or None for loc in location_list]

# Add one coordinate-list column per source.
df['coordinates_synopsis'] = df['locations_synopsis'].apply(get_coordinates)
df['coordinates_wiki'] = df['locations_wiki'].apply(get_coordinates)

# Quick visual check of the first rows (requires an 'original_title' column).
print("Sample (synopsis):")
print(df[['original_title', 'locations_synopsis', 'coordinates_synopsis']].head(3))
print("\nSample (wikipedia):")
print(df[['original_title', 'locations_wiki', 'coordinates_wiki']].head(3))

In [None]:
# [SAVE]
df.to_csv('filmitalia_geocoded.csv', index=False)
print("Saved: filmitalia_geocoded.csv")

---
## 6. VISUALIZATION & ANALYSIS

In [None]:
# [OPTIONAL LOAD]
# df = pd.read_csv('filmitalia_geocoded.csv')
# for col in ['locations_synopsis', 'locations_wiki', 'coordinates_synopsis', 'coordinates_wiki']:
#     df[col] = df[col].apply(lambda x: ast.literal_eval(x) if pd.notna(x) and x != '[]' else [])

In [None]:
import matplotlib.pyplot as plt
import folium
from folium.plugins import HeatMap

In [None]:
# Prepare data for visualizations
# Flatten the per-film lists (skipping non-list cells) and tally mentions.
synopsis_locs = []
for film_locations in df['locations_synopsis']:
    if isinstance(film_locations, list):
        synopsis_locs.extend(film_locations)

wiki_locs = []
for film_locations in df['locations_wiki']:
    if isinstance(film_locations, list):
        wiki_locs.extend(film_locations)

synopsis_counts = Counter(synopsis_locs)
wiki_counts = Counter(wiki_locs)

print(f"Total synopsis mentions: {len(synopsis_locs)}, unique: {len(set(synopsis_locs))}")
print(f"Total wiki mentions: {len(wiki_locs)}, unique: {len(set(wiki_locs))}")

### 6.1 Venn-style Overlap

In [None]:
# Distinct location names per source.
synopsis_set = set(synopsis_locs)
wiki_set = set(wiki_locs)

# Partition the names into source-exclusive and shared groups.
only_synopsis = synopsis_set - wiki_set
only_wiki = wiki_set - synopsis_set
both = synopsis_set & wiki_set

print(f"Only in synopsis: {len(only_synopsis)}")
print(f"Only in wiki: {len(only_wiki)}")
print(f"In both: {len(both)}")

# Pie chart of the three group sizes (counts of unique names, not mentions).
fig, ax = plt.subplots(figsize=(8, 8))
ax.pie([len(only_synopsis), len(only_wiki), len(both)],
       labels=[f'Synopsis only\n({len(only_synopsis)})',
               f'Wiki only\n({len(only_wiki)})',
               f'Both\n({len(both)})'],
       colors=['steelblue', 'darkorange', 'purple'],
       autopct='%1.1f%%')
ax.set_title('Location Overlap between Sources')
plt.savefig('location_overlap.png', dpi=150)
plt.show()

### 6.2 Top Locations Comparison (Percentages)

In [None]:
total_synopsis = len(synopsis_locs)
total_wiki = len(wiki_locs)

# Union of each source's 20 most frequent locations.
all_top = set([x[0] for x in synopsis_counts.most_common(20)] +
              [x[0] for x in wiki_counts.most_common(20)])
top_locations = list(all_top)

# A source with no mentions has only zero counts; dividing by 1 then gives
# 0% for every location instead of raising ZeroDivisionError.
synopsis_denominator = total_synopsis or 1
wiki_denominator = total_wiki or 1

compare_df = pd.DataFrame({
    'location': top_locations,
    'synopsis': [synopsis_counts.get(loc, 0) / synopsis_denominator * 100 for loc in top_locations],
    'wiki': [wiki_counts.get(loc, 0) / wiki_denominator * 100 for loc in top_locations]
}).sort_values('synopsis', ascending=True)

# Paired horizontal bars: synopsis just below, Wikipedia just above each tick.
fig, ax = plt.subplots(figsize=(12, 10))
y = range(len(compare_df))
ax.barh([i - 0.2 for i in y], compare_df['synopsis'], 0.4, label='Synopsis', color='steelblue')
ax.barh([i + 0.2 for i in y], compare_df['wiki'], 0.4, label='Wikipedia', color='darkorange')
ax.set_yticks(y)
ax.set_yticklabels(compare_df['location'])
ax.set_xlabel('Percentage of mentions')
ax.set_title('Top Locations Comparison (% of total mentions)')
ax.legend()
plt.tight_layout()
plt.savefig('location_comparison_pct.png', dpi=150)
plt.show()

### 6.3 Frequency Correlation Scatter

In [None]:
# Share of mentions (in %) per source for locations present in both.
common_locs = list(both)
x = [synopsis_counts[loc] / total_synopsis * 100 for loc in common_locs]
y = [wiki_counts[loc] / total_wiki * 100 for loc in common_locs]

fig, ax = plt.subplots(figsize=(8, 8))
ax.scatter(x, y, alpha=0.6)
# Label only locations above 2% in either source to keep the plot readable.
for i, loc in enumerate(common_locs):
    if x[i] > 2 or y[i] > 2:
        ax.annotate(loc, (x[i], y[i]), fontsize=8)
ax.set_xlabel('Synopsis frequency (%)')
ax.set_ylabel('Wiki frequency (%)')
ax.set_title('Location frequency correlation (% of total mentions)')
# Identity line spanning the larger of the two axes, so it is not cut short
# when Wikipedia frequencies exceed the synopsis ones.
diagonal_end = max(x + y) if x else 1
ax.plot([0, diagonal_end], [0, diagonal_end], 'k--', alpha=0.3)
plt.tight_layout()
plt.savefig('location_correlation.png', dpi=150)
plt.show()

### 6.4 Donut Charts

In [None]:
def prepare_donut_data(counts, total):
    """Split location shares into labelled slices and an 'others' remainder.

    Args:
        counts: Mapping of location name to number of mentions.
        total: Denominator used to turn counts into percentages.

    Returns:
        `(top, others_pct)`: `top` maps each location with a share of at
        least 1% to that share; `others_pct` is the summed share of the rest.
    """
    top = {}
    others_pct = 0
    for loc, count in counts.items():
        share = count / total * 100
        if share >= 1:
            top[loc] = share
        elif share < 1:
            others_pct += share
    return top, others_pct

# Shares of at least 1% get their own slice; the rest is pooled per source.
synopsis_top, synopsis_others = prepare_donut_data(synopsis_counts, len(synopsis_locs))
wiki_top, wiki_others = prepare_donut_data(wiki_counts, len(wiki_locs))

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 10))

# Synopsis donut
labels1 = list(synopsis_top.keys()) + ['Others (<1%)']
sizes1 = list(synopsis_top.values()) + [synopsis_others]
explode1 = [0.02] * len(sizes1)
# One shade per slice, sampled from the 0.3-0.8 range of the colormap.
colors1 = plt.cm.Blues([0.3 + 0.5 * i / len(sizes1) for i in range(len(sizes1))])
wedges1, texts1, autotexts1 = ax1.pie(sizes1, labels=labels1, autopct='%1.1f%%',
                                       explode=explode1, colors=colors1, pctdistance=0.75, rotatelabels=True)
# A white disc over the centre turns the pie into a donut.
centre_circle1 = plt.Circle((0, 0), 0.50, fc='white')
ax1.add_artist(centre_circle1)
ax1.set_title('Synopsis locations')

# Wiki donut
labels2 = list(wiki_top.keys()) + ['Others (<1%)']
sizes2 = list(wiki_top.values()) + [wiki_others]
explode2 = [0.02] * len(sizes2)
# One shade per slice, sampled from the 0.3-0.8 range of the colormap.
colors2 = plt.cm.Oranges([0.3 + 0.5 * i / len(sizes2) for i in range(len(sizes2))])
wedges2, texts2, autotexts2 = ax2.pie(sizes2, labels=labels2, autopct='%1.1f%%',
                                       explode=explode2, colors=colors2, pctdistance=0.75, rotatelabels=True)
# A white disc over the centre turns the pie into a donut.
centre_circle2 = plt.Circle((0, 0), 0.50, fc='white')
ax2.add_artist(centre_circle2)
ax2.set_title('Wikipedia locations')

plt.suptitle('Location distribution (≥1% labelled)', fontsize=14)
plt.tight_layout()
plt.savefig('location_donuts.png', dpi=150)
plt.show()

### 6.5 Interactive Map

In [None]:
def create_locations_map(df, source='both'):
    """Build a folium map with a heat layer and one marker per coordinate.

    Args:
        df: DataFrame with the list columns `locations_synopsis`,
            `coordinates_synopsis`, `locations_wiki` and `coordinates_wiki`
            (only the columns of the selected source are read).
        source: 'synopsis', 'wiki' or 'both'.

    Returns:
        The folium map. Markers are red for synopsis-only coordinates, blue
        for wiki-only ones and purple when both sources mention the place.
    """
    # (source tag, location column, coordinate column) for each selected source.
    column_sets = []
    if source in ['synopsis', 'both']:
        column_sets.append(('synopsis', 'locations_synopsis', 'coordinates_synopsis'))
    if source in ['wiki', 'both']:
        column_sets.append(('wiki', 'locations_wiki', 'coordinates_wiki'))

    coords_with_labels = []
    for _, row in df.iterrows():
        for tag, loc_col, coord_col in column_sets:
            locs, coords = row[loc_col], row[coord_col]
            if not (isinstance(locs, list) and isinstance(coords, list)):
                continue
            # NOTE(review): pairing by position assumes both lists are
            # aligned entry for entry; confirm against how they are built.
            for loc, coord in zip(locs, coords):
                # Explicit None test: 0.0 is a valid coordinate component.
                if coord and coord[0] is not None and coord[1] is not None:
                    coords_with_labels.append((coord[0], coord[1], loc, tag))

    print(f"Plotting {len(coords_with_labels)} location instances")

    m = folium.Map(location=[42.5, 12.5], zoom_start=6)
    heat_data = [[lat, lon] for lat, lon, _, _ in coords_with_labels]
    # Skip the heat layer entirely when there is nothing to plot.
    if heat_data:
        HeatMap(heat_data, radius=15, blur=25, max_zoom=13).add_to(m)

    # Merge instances that share a coordinate (to 4 decimals) into one marker.
    unique_coords = {}
    for lat, lon, label, src in coords_with_labels:
        key = (round(lat, 4), round(lon, 4))
        if key not in unique_coords:
            unique_coords[key] = {'labels': [], 'sources': set()}
        unique_coords[key]['labels'].append(label)
        unique_coords[key]['sources'].add(src)

    for (lat, lon), data in unique_coords.items():
        color = 'red' if 'synopsis' in data['sources'] else 'blue'
        if len(data['sources']) > 1:
            color = 'purple'
        # Sorted so the popup text is identical between runs.
        names = ', '.join(sorted(set(data['labels'])))
        sources = ', '.join(sorted(data['sources']))
        folium.CircleMarker(
            location=[lat, lon],
            radius=4,
            popup=f"<b>{names}</b><br>Count: {len(data['labels'])}<br>Source: {sources}",
            tooltip=data['labels'][0],
            color=color,
            fill=True,
            fillOpacity=0.7
        ).add_to(m)

    print(f"Added {len(unique_coords)} unique markers")
    return m

map_obj = create_locations_map(df, source='both')
map_obj.save('locations_map.html')
print("Saved: locations_map.html")

In [None]:
map_obj

### 6.6 Final Outputs

In [None]:
# Create flattened location table
location_data = []
for _, row in df.iterrows():
    title = row.get('original_title', row.get('title', 'Unknown'))
    year = row.get('year')
    
    if isinstance(row['locations_synopsis'], list) and isinstance(row['coordinates_synopsis'], list):
        for loc, coord in zip(row['locations_synopsis'], row['coordinates_synopsis']):
            if coord:
                location_data.append({
                    'film_title': title, 'year': year, 'location': loc,
                    'latitude': coord[0], 'longitude': coord[1], 'source': 'synopsis'
                })
    
    if isinstance(row['locations_wiki'], list) and isinstance(row['coordinates_wiki'], list):
        for loc, coord in zip(row['locations_wiki'], row['coordinates_wiki']):
            if coord:
                location_data.append({
                    'film_title': title, 'year': year, 'location': loc,
                    'latitude': coord[0], 'longitude': coord[1], 'source': 'wikipedia'
                })

df_locations = pd.DataFrame(location_data)
print(f"Flattened table: {len(df_locations)} entries")
print(f"  - From synopsis: {len(df_locations[df_locations['source']=='synopsis'])}")
print(f"  - From wikipedia: {len(df_locations[df_locations['source']=='wikipedia'])}")

In [None]:
# Grouped location summary
# Mentions per distinct (location, latitude, longitude), most frequent first.
locations_summary = (
    df_locations
    .groupby(['location', 'latitude', 'longitude'])
    .size()
    .reset_index(name='count')
    .sort_values('count', ascending=False)
)
print(f"\nUnique location-coordinate pairs: {len(locations_summary)}")
top_twenty = locations_summary.head(20)
print(top_twenty.to_string(index=False))

In [None]:
# [FINAL SAVES]
# Write the three result tables: the full film table, the flattened
# film-location pairs, and the per-location counts.
df.to_csv('database.csv', index=False)
df_locations.to_csv('locations_flat.csv', index=False)
locations_summary.to_csv('locations_coordinates.csv', index=False)

# Recap of every artefact; the map and the geocache are written by earlier cells.
print("="*60)
print("FINAL OUTPUTS SAVED")
print("="*60)
print(f"database.csv          - {len(df)} films with all data")
print(f"locations_flat.csv    - {len(df_locations)} film-location pairs")
print(f"locations_coordinates.csv - {len(locations_summary)} unique locations with counts")
print(f"locations_map.html    - Interactive map")
print(f"geocache.txt          - Coordinate cache for reuse")