# Italian Cinema Data Mining & Geocoding

A comprehensive pipeline for extracting, geocoding, and analyzing locations mentioned in Italian film synopses and Wikipedia summaries.

## Table of Contents

| Section | Description | Output |
|---------|-------------|--------|
| **1. Setup** | Install dependencies, load libraries | — |
| **2. Scraping** *(optional)* | Extract film data from filmitalia.org | `filmitalia_raw.csv` |
| **3. Cleanup** *(optional)* | Clean and enrich data with Wikidata | `filmitalia_cleaned.csv` |
| **4. Location Extraction** | NER-based extraction from synopsis & wikipedia | `filmitalia_locations.csv` |
| — 4.1 Synopsis | Extract from `synopsis` column | — |
| — 4.2 Wikipedia | Extract from `wikipedia_summary` column | — |
| **5. Geocoding** | Geocode locations via Nominatim API | `filmitalia_geocoded.csv` |
| **6. Semantic Analysis** | Lexicon-based spatial semantics | — |
| — 6.1 Settlement/Nature | Urban vs. natural landscape terms | — |
| — 6.2 Mobility/Static | Movement vs. sedentary lexicon | — |
| **7. Visualization & Analysis** | Maps, charts, and comparative analysis | `database.csv` |
| — 7.1 Location Overlap | Venn-style overlap between sources | — |
| — 7.2 Frequency Analysis | Top locations, correlation plots | — |
| — 7.3 Interactive Map | Folium heatmap | `locations_map.html` |
| — 7.4 Final Outputs | Export all datasets | Multiple CSVs |

---

**Key output columns:** `locations_synopsis`, `coordinates_synopsis`, `locations_wiki`, `coordinates_wiki`, `settlement_*`, `nature_*`, `stanziale_*`, `movimento_*`

---
## 1. Setup & Imports

In [None]:
import pandas as pd
import numpy as np
import re
import json
import time
import os
import ast
from collections import Counter

import requests
from bs4 import BeautifulSoup
from tqdm.auto import tqdm

import spacy
try:
    nlp = spacy.load('it_core_news_sm')
except:
    !python -m spacy download it_core_news_sm
    nlp = spacy.load('it_core_news_sm')

print("Setup complete!")

---
## 2. Web Scraping *(Optional)*

**⚠️ Set `RUN_SCRAPING = True` to execute.** This section scrapes filmitalia.org and takes several hours.

In [None]:
RUN_SCRAPING = False

In [None]:
if RUN_SCRAPING:

    def scrape_film_urls(page_num, retries=3, delay=2):
        """Collect the film detail URLs linked from one listing page.

        Args:
            page_num: Index of the listing page to fetch.
            retries: Maximum number of attempts before giving up.
            delay: Seconds to wait after a failed attempt.

        Returns:
            A de-duplicated list of absolute film URLs, or an empty list when
            every attempt failed or the page links to no films.
        """
        url = f"https://filmitalia.org/it/film/pag-{page_num}/"
        for attempt in range(retries):
            try:
                response = requests.get(url, timeout=10)
                response.raise_for_status()
                soup = BeautifulSoup(response.content, 'html.parser')
                film_links = set()
                for link in soup.find_all('a', href=True):
                    href = link['href']
                    if '/it/film/' not in href or not href.endswith('/'):
                        continue
                    # Pagination links (".../it/film/pag-N/") and the bare
                    # listing root satisfy the filter above but are not film
                    # pages, so they must not be returned as film URLs.
                    if '/it/film/pag-' in href or href.endswith('/it/film/'):
                        continue
                    if href.startswith('/'):
                        href = 'https://filmitalia.org' + href
                    film_links.add(href)
                return list(film_links)
            except Exception as e:
                # Best-effort scraping: report the failure and retry.
                print(f"Attempt {attempt + 1} failed for page {page_num}: {e}")
                if attempt < retries - 1:
                    time.sleep(delay)
        return []

    # Walk the paginated listing until a page yields no film links.
    print("Scraping film URLs...")
    all_film_urls = []
    for page in tqdm(range(1, 500)):  # hard upper bound: pages 1..499
        urls = scrape_film_urls(page)
        if not urls:
            # An empty result is treated as "past the last page". A page whose
            # every retry failed also returns [] and therefore stops the crawl.
            print(f"No URLs found on page {page}. Stopping.")
            break
        all_film_urls.extend(urls)
        time.sleep(1)  # pause between listing requests

    # Persist the de-duplicated URLs; the detail scrape reloads this file.
    df_urls = pd.DataFrame({'film_url': list(set(all_film_urls))})
    df_urls.to_csv('filmitalia_urls.csv', index=False)
    print(f"Scraped {len(df_urls)} unique film URLs")

    def scrape_film_details(film_url, retries=3, delay=2):
        """Scrape one film page into a flat dict.

        Args:
            film_url: Absolute URL of the film page.
            retries: Maximum number of attempts before giving up.
            delay: Seconds to wait after a failed attempt.

        Returns:
            A dict holding ``film_url``, ``title`` (None when the page has no
            ``<h1>``), one entry per labelled ``field`` div and, when present,
            ``synopsis``. If every attempt fails, a dict with ``film_url`` and
            an ``error`` message.
        """
        final_attempt = retries - 1
        for attempt in range(retries):
            try:
                page = requests.get(film_url, timeout=10)
                page.raise_for_status()
                soup = BeautifulSoup(page.content, 'html.parser')

                heading = soup.find('h1')
                record = {
                    'film_url': film_url,
                    'title': heading.text.strip() if heading else None,
                }

                # Each "field" div pairs a label with its value(s).
                for field in soup.find_all('div', class_='field'):
                    label_node = field.find('div', class_='field-label')
                    items_node = field.find('div', class_='field-items')
                    if not (label_node and items_node):
                        continue
                    key = label_node.text.strip().rstrip(':')
                    record[key] = items_node.text.strip()

                body_node = soup.find('div', class_='field-name-body')
                if body_node:
                    record['synopsis'] = body_node.text.strip()
                return record
            except Exception as e:
                print(f"Attempt {attempt + 1} failed for {film_url}: {e}")
                if attempt < final_attempt:
                    time.sleep(delay)
        return {'film_url': film_url, 'error': 'Failed to scrape'}

    # Reload the URL list written above and scrape each film page in turn.
    print("\nScraping film details...")
    df_urls = pd.read_csv('filmitalia_urls.csv')
    films_data = []
    for url in tqdm(df_urls['film_url']):
        # Failed pages still contribute a row (with an 'error' key).
        films_data.append(scrape_film_details(url))
        time.sleep(1)  # pause between page requests

    # One row per film; columns are the union of all scraped field names.
    df = pd.DataFrame(films_data)
    df.to_csv('filmitalia_raw.csv', index=False)
    print(f"Scraped details for {len(df)} films")

else:
    print("Scraping inactive. Set RUN_SCRAPING = True to execute.")

---
## 3. Data Cleanup *(Optional)*

**⚠️ Set `RUN_CLEANUP = True` to execute.** Parses duration, enriches with Wikidata.

In [None]:
RUN_CLEANUP = False

In [None]:
# [OPTIONAL LOAD] df = pd.read_csv('filmitalia_raw.csv')

In [None]:
if RUN_CLEANUP:
    print("Loading raw scraped data...")
    df = pd.read_csv('filmitalia_raw.csv')

    def parse_duration(duration_str):
        """Convert a free-text duration into whole minutes.

        When the text contains "h" or "ore" (case-insensitive), the first
        number is read as hours and an optional second number as minutes.
        Otherwise the first number is returned as minutes.

        Returns:
            The duration in minutes, or None for missing values and for text
            that contains no digits.
        """
        if pd.isna(duration_str):
            return None
        raw = str(duration_str).strip()
        digit_runs = re.findall(r'\d+', raw)
        if not digit_runs:
            return None

        leading = int(digit_runs[0])
        lowered = raw.lower()
        if 'h' not in lowered and 'ore' not in lowered:
            return leading

        # Hours form: first number is hours, second (if any) is minutes.
        minutes = leading * 60
        if len(digit_runs) >= 2:
            minutes += int(digit_runs[1])
        return minutes

    if 'duration' in df.columns:
        df['duration_minutes'] = df['duration'].apply(parse_duration)

    if 'wikidata_id' in df.columns:
        def get_wikidata_summary(qid):
            """Fetch the Italian Wikidata description of an entity.

            Args:
                qid: Wikidata entity id; missing values (NaN/None) are skipped.

            Returns:
                The Italian description string, or None when the id is
                missing, the request or JSON decoding fails, or the entity has
                no Italian description.
            """
            if pd.isna(qid):
                return None
            try:
                url = f"https://www.wikidata.org/wiki/Special:EntityData/{qid}.json"
                response = requests.get(url, timeout=10)
                data = response.json()
                entity = data.get('entities', {}).get(qid, {})
                return entity.get('descriptions', {}).get('it', {}).get('value')
            except Exception:
                # Best-effort enrichment: any ordinary failure yields None.
                # Unlike a bare `except:`, this does not swallow
                # KeyboardInterrupt, so a long run can still be stopped.
                return None

        print("Enriching with Wikidata summaries...")
        tqdm.pandas()
        df['wikidata_summary'] = df['wikidata_id'].progress_apply(get_wikidata_summary)

    df.to_csv('filmitalia_cleaned.csv', index=False)
    print(f"Cleaned data saved: {len(df)} films")
else:
    print("Cleanup inactive. Set RUN_CLEANUP = True to execute.")

---
## 4. Location Extraction (spaCy NER)

Extracts geographic entities (`GPE`, `LOC`) from both `synopsis` and `wikipedia_summary` columns.

- Applies stopword filtering
- Removes Italian article prefixes
- Validates against silver standard

In [None]:
# Load data from GitHub repository
# The enriched CSV is read straight from the raw GitHub URL into `df`,
# which every later cell reads and extends.
GITHUB_DATA_URL = "https://raw.githubusercontent.com/lucagiovannini7/modelling-italian-films/refs/heads/main/filmitalia_details_enriched.csv"
df = pd.read_csv(GITHUB_DATA_URL)
print(f"Loaded {len(df)} films from GitHub")
print(f"Columns: {list(df.columns)}")

In [None]:
# Core NER extraction and cleaning functions

def extract_locations(text):
    """Extract location entities (GPE, LOC) from Italian text using spaCy.

    Args:
        text: Raw text to analyse; missing values (NaN/None) are allowed.

    Returns:
        The distinct entity strings labelled GPE or LOC, in order of first
        appearance in the text. An empty list for missing input.
    """
    if pd.isna(text):
        return []
    doc = nlp(str(text))
    # dict.fromkeys de-duplicates while keeping document order.
    # list(set(...)) ordered the result by string hash, which changes between
    # interpreter runs and made the saved outputs non-reproducible.
    return list(dict.fromkeys(
        ent.text for ent in doc.ents if ent.label_ in ('GPE', 'LOC')
    ))

# Load stopwords and silver standard
STOPWORDS_URL = "https://raw.githubusercontent.com/lucagiovannini7/modelling-italian-films/refs/heads/main/stopwords.txt"
SILVER_STANDARD_URL = "https://raw.githubusercontent.com/lucagiovannini7/modelling-italian-films/refs/heads/main/silver_standard.txt"


def _fetch_text(url, timeout=30):
    """Download *url* and return its body as text, failing loudly on errors."""
    response = requests.get(url, timeout=timeout)
    # Without this check an HTTP error page would be parsed as if it were the
    # word list and would silently corrupt the filters below.
    response.raise_for_status()
    return response.text


# The stopword file is split on commas; the silver standard is read one
# entry per line. Blank entries are dropped in both.
stopwords = [w.strip() for w in _fetch_text(STOPWORDS_URL).split(",") if w.strip()]
silver_standard = set(line.strip() for line in _fetch_text(SILVER_STANDARD_URL).splitlines() if line.strip())
print(f"Loaded {len(stopwords)} stopwords, {len(silver_standard)} silver standard locations")

# Cleaning parameters
MIN_LENGTH = 3  # candidates shorter than this many characters are dropped
MIN_ACRONYM_LENGTH = 5  # all-uppercase candidates shorter than this are dropped
# A leading Italian preposition/article followed by whitespace. clean_locations
# removes it from the start of a candidate, matching case-insensitively.
ITALIAN_PREFIXES = r'^(di|a|da|in|nel|nell|nella|negli|per|verso|il|la|lo|le|i|gli|del|della|dei|degli|al|alla|ai|agli)\s+'

def clean_locations(location_list):
    """Remove stopwords, invalid entries, and Italian prefixes.

    Each candidate is rejected if it is a stopword, shorter than MIN_LENGTH,
    or an all-uppercase string shorter than MIN_ACRONYM_LENGTH. Survivors have
    a leading Italian article/preposition removed, and a lowercase run glued
    to a capitalised name (e.g. "aRoma") is cut back to the capitalised part.

    Args:
        location_list: Raw entity strings for one text.

    Returns:
        The cleaned names, each listed once, in input order. An empty list
        for empty or non-list input.
    """
    if not location_list or not isinstance(location_list, list):
        return []

    def is_rejected(candidate):
        return (
            candidate in stopwords
            or len(candidate) < MIN_LENGTH
            or (candidate.isupper() and len(candidate) < MIN_ACRONYM_LENGTH)
        )

    cleaned = []
    seen = set()
    for loc in location_list:
        if is_rejected(loc):
            continue
        loc = re.sub(ITALIAN_PREFIXES, '', loc, flags=re.IGNORECASE)
        if match := re.match(r'^[a-z]+([A-Z].*)$', loc):
            loc = match.group(1)
        # Validate again after stripping: the remainder can itself be a
        # stopword, too short, or a short acronym ("di USA" -> "USA").
        if not loc or is_rejected(loc):
            continue
        # Stripping can map several raw entities to one name ("Roma" and
        # "a Roma"); keep one so per-text mention counts are not inflated.
        if loc in seen:
            continue
        seen.add(loc)
        cleaned.append(loc)
    return cleaned

def filter_by_silver_standard(entities):
    """Return the entries of *entities* found in the silver standard.

    Input order and duplicates are preserved. Anything that is not a list
    yields an empty list.
    """
    if not isinstance(entities, list):
        return []
    kept = []
    for candidate in entities:
        if candidate in silver_standard:
            kept.append(candidate)
    return kept

### 4.1 Extract from Synopsis

In [None]:
# Three-stage pipeline on the `synopsis` column; the same output column is
# overwritten at each stage and the running mention total is printed.
print("Extracting locations from SYNOPSIS...")
tqdm.pandas()  # registers `progress_apply` on pandas objects
# Stage 1: raw NER entities per film.
df['locations_synopsis'] = df['synopsis'].progress_apply(extract_locations)
print(f"Raw extraction: {sum(len(x) for x in df['locations_synopsis'])} mentions")

# Stage 2: drop stopwords/short strings and strip leading prefixes.
df['locations_synopsis'] = df['locations_synopsis'].apply(clean_locations)
print(f"After cleaning: {sum(len(x) for x in df['locations_synopsis'])} mentions")

# Stage 3: keep only names present in the silver standard.
df['locations_synopsis'] = df['locations_synopsis'].apply(filter_by_silver_standard)
print(f"After silver standard filter: {sum(len(x) for x in df['locations_synopsis'])} mentions")

### 4.2 Extract from Wikipedia Summary

In [None]:
# Same three-stage pipeline as for the synopsis, applied to the
# `wikipedia_summary` column and stored in `locations_wiki`.
print("Extracting locations from WIKIPEDIA_SUMMARY...")
tqdm.pandas()  # registers `progress_apply` on pandas objects
# Stage 1: raw NER entities per film.
df['locations_wiki'] = df['wikipedia_summary'].progress_apply(extract_locations)
print(f"Raw extraction: {sum(len(x) for x in df['locations_wiki'])} mentions")

# Stage 2: drop stopwords/short strings and strip leading prefixes.
df['locations_wiki'] = df['locations_wiki'].apply(clean_locations)
print(f"After cleaning: {sum(len(x) for x in df['locations_wiki'])} mentions")

# Stage 3: keep only names present in the silver standard.
df['locations_wiki'] = df['locations_wiki'].apply(filter_by_silver_standard)
print(f"After silver standard filter: {sum(len(x) for x in df['locations_wiki'])} mentions")

In [None]:
# Location extraction summary
# Flatten the per-film lists into one list of mentions per source.
synopsis_locs = [loc for locs in df['locations_synopsis'] for loc in locs]
wiki_locs = [loc for locs in df['locations_wiki'] for loc in locs]

print("\n" + "="*50)
print("LOCATION EXTRACTION SUMMARY")
print("="*50)
print(f"Synopsis:  {len(synopsis_locs)} mentions, {len(set(synopsis_locs))} unique")
print(f"Wikipedia: {len(wiki_locs)} mentions, {len(set(wiki_locs))} unique")
# Union of the two sets: names seen in either source.
print(f"Combined unique: {len(set(synopsis_locs) | set(wiki_locs))}")

# Save checkpoint
# List columns are written as their string repr; reloading the file needs
# ast.literal_eval to turn them back into lists.
df.to_csv('filmitalia_locations.csv', index=False)
print("\nSaved: filmitalia_locations.csv")

---
## 5. Geocoding (Nominatim)

Geocode extracted locations using OpenStreetMap's Nominatim API.

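- `USE_GEOCACHE = True` → Load previously geocoded coordinates from the cache file (if it exists) and geocode only the locations missing from it
- `USE_GEOCACHE = False` → Geocode every location from scratch; this overwrites the existing cache file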

In [None]:
# Configuration
USE_GEOCACHE = False
GEOCACHE_FILE = 'geocache.txt'

# [OPTIONAL LOAD]
# df = pd.read_csv('filmitalia_locations.csv')
# for col in ['locations_synopsis', 'locations_wiki']:
#     df[col] = df[col].apply(lambda x: ast.literal_eval(x) if pd.notna(x) and x != '[]' else [])

In [None]:
# Geocoding functions
def geocode_location(location_name):
    """Geocode a location using Nominatim.

    Sends a free-text search limited to one result.

    Args:
        location_name: Place name to look up.

    Returns:
        ``(lat, lon)`` as floats for the first hit, or ``(None, None)`` when
        there is no hit or the request/parsing fails (the error is printed).
    """
    not_found = (None, None)
    query = {'q': location_name, 'format': 'json', 'limit': 1}
    try:
        hits = requests.get(
            "https://nominatim.openstreetmap.org/search",
            params=query,
            headers={'User-Agent': 'ItalianCinemaResearch/1.0'},
            timeout=10
        ).json()
        if not hits:
            return not_found
        best = hits[0]
        return (float(best['lat']), float(best['lon']))
    except Exception as e:
        print(f"Error geocoding {location_name}: {e}")
        return not_found

def load_geocache(filepath):
    """Read a coordinate cache CSV into a ``{location: [lat, lon]}`` dict.

    Values are looked up by the column names ``location``, ``latitude`` and
    ``longitude``, falling back to the first three columns by position. Rows
    with a falsy location or a missing coordinate are skipped.

    Returns:
        The loaded mapping. A missing file gives an empty dict; a read or
        parse error prints a warning and returns whatever was loaded so far.
    """
    cache = {}
    if not os.path.exists(filepath):
        return cache
    try:
        for _, record in pd.read_csv(filepath).iterrows():
            width = len(record)
            name = record.get('location', record.iloc[0] if width > 0 else None)
            lat = record.get('latitude', record.iloc[1] if width > 1 else None)
            lon = record.get('longitude', record.iloc[2] if width > 2 else None)
            if not name or pd.isna(lat) or pd.isna(lon):
                continue
            cache[name] = [float(lat), float(lon)]
    except Exception as e:
        print(f"Warning: Could not load geocache: {e}")
    return cache

def save_geocache(cache, filepath):
    """Write the successfully geocoded entries of *cache* to a CSV file.

    Entries whose value is falsy (e.g. None for a failed lookup) are left
    out. The file has the columns ``location``, ``latitude``, ``longitude``.
    """
    records = []
    for name, coords in cache.items():
        if not coords:
            continue
        records.append({'location': name, 'latitude': coords[0], 'longitude': coords[1]})
    pd.DataFrame(records).to_csv(filepath, index=False)
    print(f"Saved {len(records)} locations to {filepath}")

In [None]:
# Collect all unique locations and geocode
synopsis_locations = {
    loc
    for locs in df['locations_synopsis'] if isinstance(locs, list)
    for loc in locs
}
wiki_locations = {
    loc
    for locs in df['locations_wiki'] if isinstance(locs, list)
    for loc in locs
}
all_unique_locations = synopsis_locations | wiki_locations
synopsis_unique = len(synopsis_locations)
print(f"Unique from synopsis: {synopsis_unique}")
print(f"Additional from wikipedia: {len(all_unique_locations) - synopsis_unique}")
print(f"Total to geocode: {len(all_unique_locations)}")

# Load cache or start fresh
if USE_GEOCACHE and os.path.exists(GEOCACHE_FILE):
    location_coords = load_geocache(GEOCACHE_FILE)
    print(f"Loaded {len(location_coords)} cached coordinates")
    to_geocode = all_unique_locations - set(location_coords)
else:
    location_coords = {}
    to_geocode = all_unique_locations

# Geocode new locations
if to_geocode:
    print(f"\nGeocoding {len(to_geocode)} locations...")
    # sorted() makes the request order reproducible between runs.
    for location in tqdm(sorted(to_geocode)):
        lat, lon = geocode_location(location)
        # Compare against None: a legitimate 0.0 latitude or longitude is
        # falsy and must not be recorded as a failed lookup.
        found = lat is not None and lon is not None
        location_coords[location] = [lat, lon] if found else None
        time.sleep(1)  # one request per second
    save_geocache(location_coords, GEOCACHE_FILE)

# Count only the locations needed by this run; a loaded cache may hold
# extra entries that would otherwise inflate the numerator.
geocoded_count = sum(1 for loc in all_unique_locations if location_coords.get(loc))
print(f"\nSuccessfully geocoded: {geocoded_count}/{len(all_unique_locations)} locations")

In [None]:
# Apply coordinates to dataframe
def get_coordinates(location_list):
    """Look up coordinates for each location, position by position.

    Args:
        location_list: Location names for one film; non-list input is
            treated as "no locations".

    Returns:
        A list of the same length as ``location_list`` whose i-th item is the
        ``[lat, lon]`` pair stored in ``location_coords`` for the i-th name,
        or None when that name has no coordinates.
    """
    if not isinstance(location_list, list):
        return []
    # Keep a None placeholder for unresolved names. Dropping them shortened
    # the list, so code that zips a film's locations with its coordinates
    # paired names with the wrong coordinates.
    return [location_coords.get(loc) for loc in location_list]

# One coordinates column per source, derived from the matching locations column.
df['coordinates_synopsis'] = df['locations_synopsis'].apply(get_coordinates)
df['coordinates_wiki'] = df['locations_wiki'].apply(get_coordinates)

print("Sample (synopsis):")
# NOTE(review): assumes the loaded dataset has an `original_title` column — confirm.
print(df[['original_title', 'locations_synopsis', 'coordinates_synopsis']].head(3))

# Save checkpoint
# List columns are written as their string repr; reloading needs ast.literal_eval.
df.to_csv('filmitalia_geocoded.csv', index=False)
print("\nSaved: filmitalia_geocoded.csv")

---
## 6. Semantic Analysis

Lexicon-based analysis of spatial semantics in film descriptions. Two complementary approaches:

1. **Settlement/Nature lexicon** — Identifies urban vs. natural landscape vocabulary
2. **Mobility/Static lexicon** — Detects movement-related vs. sedentary verbs and nouns

Both analyses are applied to `synopsis` and `wikipedia_summary` columns.

In [None]:
# [OPTIONAL LOAD] - Start here if you have filmitalia_geocoded.csv
# df = pd.read_csv('filmitalia_geocoded.csv', engine='python')
# for col in ['locations_synopsis', 'locations_wiki', 'coordinates_synopsis', 'coordinates_wiki']:
#     df[col] = df[col].apply(lambda x: ast.literal_eval(x) if pd.notna(x) and x != '[]' else [])

### 6.1 Settlement/Nature Lexicon

Identifies mentions of urban settlements (città, quartiere, piazza...) vs. natural landscapes (mare, montagna, campagna...).

In [None]:
# Define settlement/nature lexicons
settlement_lexicon = [
    'città', 'paese', 'paesino', 'provincia', 'regione', 'nazione',
    'periferia', 'quartiere', 'rione', 'piazza', 'piazzetta',
    'lungomare', 'porto', 'borgo', 'centro storico'
]

nature_lexicon = [
    'mare', 'spiaggia', 'costa', 'isola', 'isole', 'laguna',
    'montagna', 'montagne', 'collina', 'colline', 'valle', 'pianura',
    'bosco', 'boschi', 'foresta', 'pineta',
    'fiume', 'fiumi', 'lago', 'laghi', 'torrente',
    'campagna', 'campagne', 'vigneto', 'entroterra'
]

def extract_lexicon(text, lexicon):
    """Return the lexicon terms that occur in *text* as whole words or phrases.

    Matching is case-insensitive on the text side; lexicon terms are expected
    to be lowercase already.

    Args:
        text: Text to search. Missing values (NaN/None) give an empty result;
            other non-string values are converted with ``str()``.
        lexicon: Iterable of terms (single words or multi-word phrases).

    Returns:
        The matching terms in lexicon order, each listed once however often
        it occurs in the text.
    """
    if pd.isna(text):
        return []
    # str() guards against non-string cells (e.g. numbers), which have no .lower().
    text_lower = str(text).lower()
    # re.escape keeps every term literal even if it contains regex metacharacters.
    return [
        term for term in lexicon
        if re.search(r'\b' + re.escape(term) + r'\b', text_lower)
    ]

# Extract from both columns
# Each new column holds, per film, the list of lexicon terms found in that text.
df['settlement_synopsis'] = df['synopsis'].apply(lambda x: extract_lexicon(x, settlement_lexicon))
df['nature_synopsis'] = df['synopsis'].apply(lambda x: extract_lexicon(x, nature_lexicon))
df['settlement_wiki'] = df['wikipedia_summary'].apply(lambda x: extract_lexicon(x, settlement_lexicon))
df['nature_wiki'] = df['wikipedia_summary'].apply(lambda x: extract_lexicon(x, nature_lexicon))

# Statistics
# Flattened term lists. A term counts once per film in which it appears,
# because extract_lexicon reports presence rather than frequency.
settlement_syn = [t for terms in df['settlement_synopsis'] for t in terms]
nature_syn = [t for terms in df['nature_synopsis'] for t in terms]
settlement_wiki = [t for terms in df['settlement_wiki'] for t in terms]
nature_wiki = [t for terms in df['nature_wiki'] for t in terms]

print("=== SETTLEMENT LEXICON ===")
print(f"Synopsis: {len(settlement_syn)} mentions | Wikipedia: {len(settlement_wiki)} mentions")
print(f"Top terms (synopsis): {Counter(settlement_syn).most_common(5)}")

print("\n=== NATURE LEXICON ===")
print(f"Synopsis: {len(nature_syn)} mentions | Wikipedia: {len(nature_wiki)} mentions")
print(f"Top terms (synopsis): {Counter(nature_syn).most_common(5)}")

In [None]:
# Visualization: Settlement/Nature frequency comparison
import matplotlib.pyplot as plt

# Denominators: number of films that actually have each kind of text.
n_synopsis = df['synopsis'].notna().sum()
n_wiki = df['wikipedia_summary'].notna().sum()

settlement_syn_counts = Counter(settlement_syn)
settlement_wiki_counts = Counter(settlement_wiki)
nature_syn_counts = Counter(nature_syn)
nature_wiki_counts = Counter(nature_wiki)

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
ax1, ax2 = axes

# One grouped-bar panel per lexicon: (axis, terms, synopsis counts, wiki counts, title)
panels = [
    (ax1, settlement_lexicon, settlement_syn_counts, settlement_wiki_counts,
     'Settlement Lexicon Frequency'),
    (ax2, nature_lexicon, nature_syn_counts, nature_wiki_counts,
     'Nature Lexicon Frequency'),
]
for panel_ax, terms, syn_counter, wiki_counter, panel_title in panels:
    x = range(len(terms))
    # Films mentioning the term, per 100 films that have the text.
    syn_vals = [syn_counter.get(t, 0) / n_synopsis * 100 for t in terms]
    wiki_vals = [wiki_counter.get(t, 0) / n_wiki * 100 for t in terms]
    panel_ax.bar([i - 0.2 for i in x], syn_vals, 0.4, label='Synopsis', color='steelblue')
    panel_ax.bar([i + 0.2 for i in x], wiki_vals, 0.4, label='Wikipedia', color='coral')
    panel_ax.set_xticks(x)
    panel_ax.set_xticklabels(terms, rotation=45, ha='right')
    panel_ax.set_ylabel('Mentions per 100 entries')
    panel_ax.set_title(panel_title)
    panel_ax.legend()

plt.tight_layout()
plt.show()

### 6.2 Mobility/Static Lexicon

Uses spaCy lemmatization for verbs, and whole-word matching for nouns and noun phrases, to identify:
- **Stanziale (static)**: verbs like *vivere, abitare, restare, rimanere*
- **Movimento (mobile)**: verbs like *partire, arrivare, fuggire, viaggiare*

In [None]:
# Define mobility/static lexicons
stanziale_verbs = ['ambientare', 'svolgere', 'vivere', 'abitare', 'risiedere',
                   'stabilire', 'restare', 'rimanere', 'rifugiare']
stanziale_nouns = ['città natale', 'paese natale', 'casa natale']

movimento_verbs = ['partire', 'arrivare', 'raggiungere', 'tornare', 'ritornare',
                   'rientrare', 'lasciare', 'abbandonare', 'fuggire', 'scappare',
                   'trasferire', 'spostare', 'andare', 'recare', 'dirigere',
                   'viaggiare', 'attraversare', 'percorrere', 'sbarcare',
                   'approdare', 'migrare', 'inseguire', 'seguire']
movimento_nouns = ['viaggio', 'fuga', 'ritorno', 'trasferimento', 'rientro', 'transito']

def count_semantic_field(text, verb_lemmas, noun_lemmas):
    """Count occurrences of a semantic field in *text*.

    Verbs are matched on the spaCy lemma of tokens tagged ``VERB``; nouns and
    noun phrases are matched as whole words in the lowercased text.

    Args:
        text: Text to analyse. Missing values (NaN/None) count as 0; other
            non-string values are converted with ``str()``.
        verb_lemmas: Lowercase verb lemmas belonging to the field.
        noun_lemmas: Lowercase nouns or phrases belonging to the field.

    Returns:
        Number of matching verb tokens plus number of noun/phrase matches.
    """
    if pd.isna(text):
        return 0
    # Convert once so the spaCy pass and the regex pass see the same string;
    # calling .lower() on the raw value failed for non-string cells.
    text = str(text)
    doc = nlp(text)
    verb_set = set(verb_lemmas)
    count = sum(
        1 for token in doc
        if token.pos_ == 'VERB' and token.lemma_.lower() in verb_set
    )
    text_lower = text.lower()
    for noun in noun_lemmas:
        # re.escape keeps the phrase literal inside the pattern.
        count += len(re.findall(r'\b' + re.escape(noun) + r'\b', text_lower))
    return count

tqdm.pandas()  # registers `progress_apply` on pandas objects

# Each call below runs the spaCy pipeline over the whole column, so every
# text is parsed once per semantic field.
print("Processing synopsis...")
df['stanziale_synopsis'] = df['synopsis'].progress_apply(lambda x: count_semantic_field(x, stanziale_verbs, stanziale_nouns))
df['movimento_synopsis'] = df['synopsis'].progress_apply(lambda x: count_semantic_field(x, movimento_verbs, movimento_nouns))

print("Processing wikipedia_summary...")
df['stanziale_wiki'] = df['wikipedia_summary'].progress_apply(lambda x: count_semantic_field(x, stanziale_verbs, stanziale_nouns))
df['movimento_wiki'] = df['wikipedia_summary'].progress_apply(lambda x: count_semantic_field(x, movimento_verbs, movimento_nouns))

# Corpus-level totals of the raw (un-normalized) counts.
print("\nExtraction complete.")
print(f"Synopsis - Stanziale: {df['stanziale_synopsis'].sum()}, Movimento: {df['movimento_synopsis'].sum()}")
print(f"Wiki - Stanziale: {df['stanziale_wiki'].sum()}, Movimento: {df['movimento_wiki'].sum()}")

In [None]:
# Visualization: Mobility/Static distribution
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Index distribution
# One panel per source: (axis, panel label, text column, static column, movement column)
for ax, source, text_col, col_stan, col_mov in [
    (axes[0], 'Synopsis', 'synopsis', 'stanziale_synopsis', 'movimento_synopsis'),
    (axes[1], 'Wikipedia', 'wikipedia_summary', 'stanziale_wiki', 'movimento_wiki'),
]:
    # Denominator: films that have this kind of text. The text column is
    # named explicitly instead of being derived from the score column name
    # (the earlier derivation was dead code, overwritten on the next line).
    n = df[text_col].notna().sum()

    for idx, (col, label, color) in enumerate([(col_stan, 'Stanziale', 'steelblue'), (col_mov, 'Movimento', 'coral')]):
        # Share of films per index value; only the 8 smallest values are drawn.
        counts = df[col].value_counts().sort_index()
        pcts = counts / n * 100
        # Offset the second series by one bar width so the pairs sit side by side.
        ax.bar([i + idx*0.4 for i in pcts.index[:8]], pcts.values[:8], 0.4, label=label, color=color)
    ax.set_xlabel('Index value')
    ax.set_ylabel('% of entries')
    ax.set_title(f'{source}: Index Distribution')
    ax.legend()

plt.tight_layout()
plt.show()

In [None]:
# Top films by mobility/static scores (normalized)
def count_tokens(text):
    """Return the number of spaCy tokens in *text* (0 for missing values)."""
    if pd.isna(text):
        return 0
    return len(nlp(str(text)))

# Token counts are computed only once; re-running the cell reuses the columns.
if 'tokens_synopsis' not in df.columns:
    tqdm.pandas()
    print("Counting tokens...")
    df['tokens_synopsis'] = df['synopsis'].progress_apply(count_tokens)
    df['tokens_wiki'] = df['wikipedia_summary'].progress_apply(count_tokens)

# Normalized scores (per 100 tokens)
# The +1 in the denominator avoids division by zero for empty texts.
df['stanziale_syn_norm'] = df['stanziale_synopsis'] / (df['tokens_synopsis'] + 1) * 100
df['movimento_syn_norm'] = df['movimento_synopsis'] / (df['tokens_synopsis'] + 1) * 100

# NOTE(review): assumes the dataset has `reconciled_title`, `year` and
# `director` columns — confirm against the loaded CSV.
cols_display = ['reconciled_title', 'year', 'director', 'tokens_synopsis', 'stanziale_syn_norm', 'movimento_syn_norm']

# Only synopses longer than 50 tokens are ranked.
print("=== TOP 5 MOVIMENTO (Synopsis, normalized) ===")
print(df[df['tokens_synopsis'] > 50].nlargest(5, 'movimento_syn_norm')[cols_display].to_string(index=False))

print("\n=== TOP 5 STANZIALE (Synopsis, normalized) ===")
print(df[df['tokens_synopsis'] > 50].nlargest(5, 'stanziale_syn_norm')[cols_display].to_string(index=False))

---
## 7. Visualization & Analysis

Comparative analysis of location extraction from both sources.

In [None]:
import matplotlib.pyplot as plt
import folium
from folium.plugins import HeatMap

# Prepare data
# Flatten the per-film lists into one list of mentions per source; cells that
# are not lists (e.g. after a CSV reload without parsing) contribute nothing.
synopsis_locs = [loc for locs in df['locations_synopsis'] for loc in (locs if isinstance(locs, list) else [])]
wiki_locs = [loc for locs in df['locations_wiki'] for loc in (locs if isinstance(locs, list) else [])]

# Mention frequency per location name, per source.
synopsis_counts = Counter(synopsis_locs)
wiki_counts = Counter(wiki_locs)

print(f"Total synopsis mentions: {len(synopsis_locs)}, unique: {len(set(synopsis_locs))}")
print(f"Total wiki mentions: {len(wiki_locs)}, unique: {len(set(wiki_locs))}")

### 7.1 Location Overlap Analysis

In [None]:
# Distinct location names per source.
synopsis_set = set(synopsis_locs)
wiki_set = set(wiki_locs)

# Three disjoint groups: exclusive to each source, and shared by both.
only_synopsis = synopsis_set - wiki_set
only_wiki = wiki_set - synopsis_set
both = synopsis_set & wiki_set

print(f"Only in synopsis: {len(only_synopsis)}")
print(f"Only in wiki: {len(only_wiki)}")
print(f"In both: {len(both)}")

# Pie chart of the three group sizes (counts of unique names, not mentions).
fig, ax = plt.subplots(figsize=(8, 8))
ax.pie([len(only_synopsis), len(only_wiki), len(both)],
       labels=[f'Synopsis only\n({len(only_synopsis)})', f'Wiki only\n({len(only_wiki)})', f'Both\n({len(both)})'],
       colors=['steelblue', 'darkorange', 'purple'], autopct='%1.1f%%')
ax.set_title('Location Overlap between Sources')
plt.savefig('location_overlap.png', dpi=150)
plt.show()

### 7.2 Frequency Analysis

In [None]:
# Top locations comparison
# Totals are used as denominators for each source's share of mentions.
# NOTE(review): a source with zero mentions makes the divisions below raise
# ZeroDivisionError.
total_synopsis = len(synopsis_locs)
total_wiki = len(wiki_locs)

# Union of each source's 20 most frequent locations.
all_top = set([x[0] for x in synopsis_counts.most_common(20)] + [x[0] for x in wiki_counts.most_common(20)])

# Share of each location among all mentions of its source, in percent.
# Sorted ascending so the largest synopsis share ends up at the top of the chart.
compare_df = pd.DataFrame({
    'location': list(all_top),
    'synopsis': [synopsis_counts.get(loc, 0) / total_synopsis * 100 for loc in all_top],
    'wiki': [wiki_counts.get(loc, 0) / total_wiki * 100 for loc in all_top]
}).sort_values('synopsis', ascending=True)

# Paired horizontal bars, one pair per location.
fig, ax = plt.subplots(figsize=(12, 10))
y = range(len(compare_df))
ax.barh([i - 0.2 for i in y], compare_df['synopsis'], 0.4, label='Synopsis', color='steelblue')
ax.barh([i + 0.2 for i in y], compare_df['wiki'], 0.4, label='Wikipedia', color='darkorange')
ax.set_yticks(y)
ax.set_yticklabels(compare_df['location'])
ax.set_xlabel('Percentage of mentions')
ax.set_title('Top Locations Comparison (% of total mentions)')
ax.legend()
plt.tight_layout()
plt.savefig('location_comparison_pct.png', dpi=150)
plt.show()

In [None]:
# Frequency correlation scatter
# One point per location found in both sources: its share of synopsis
# mentions (x) against its share of Wikipedia mentions (y), in percent.
common_locs = list(both)
x = [synopsis_counts[loc] / total_synopsis * 100 for loc in common_locs]
y = [wiki_counts[loc] / total_wiki * 100 for loc in common_locs]

fig, ax = plt.subplots(figsize=(8, 8))
ax.scatter(x, y, alpha=0.6)
# Label only the locations above 2% in either source to keep the plot readable.
for loc, x_val, y_val in zip(common_locs, x, y):
    if x_val > 2 or y_val > 2:
        ax.annotate(loc, (x_val, y_val), fontsize=8)
ax.set_xlabel('Synopsis frequency (%)')
ax.set_ylabel('Wiki frequency (%)')
ax.set_title('Location Frequency Correlation')
# y = x reference line. It spans the larger of the two axis maxima; using
# only max(x) left it short whenever the Wikipedia shares were higher.
diag_max = max(x + y) if common_locs else 1
ax.plot([0, diag_max], [0, diag_max], 'k--', alpha=0.3)
plt.tight_layout()
plt.savefig('location_correlation.png', dpi=150)
plt.show()

### 7.3 Interactive Map

In [None]:
def create_locations_map(df, source='both'):
    """Build a Folium heatmap of the geocoded location mentions.

    Args:
        df: DataFrame with the ``locations_*`` and ``coordinates_*`` list
            columns for the requested source(s).
        source: ``'synopsis'``, ``'wiki'`` or ``'both'``.

    Returns:
        A ``folium.Map`` centred on [42.5, 12.5] with a HeatMap layer holding
        one point per location/coordinate pair whose coordinate is a list of
        two numbers.
    """
    def _is_lat_lon(coord):
        return (
            isinstance(coord, list)
            and len(coord) == 2
            and all(isinstance(c, (int, float)) for c in coord)
        )

    # (source tag, locations column, coordinates column)
    column_sets = [
        ('synopsis', 'locations_synopsis', 'coordinates_synopsis'),
        ('wiki', 'locations_wiki', 'coordinates_wiki'),
    ]
    active = [cs for cs in column_sets if source in (cs[0], 'both')]

    coords_with_labels = []
    for _, row in df.iterrows():
        for tag, loc_col, coord_col in active:
            names = row[loc_col] if isinstance(row[loc_col], list) else []
            points = row[coord_col] if isinstance(row[coord_col], list) else []
            for name, point in zip(names, points):
                if _is_lat_lon(point):
                    coords_with_labels.append((point[0], point[1], name, tag))

    print(f"Plotting {len(coords_with_labels)} location instances")

    m = folium.Map(location=[42.5, 12.5], zoom_start=6)
    heat_data = [[lat, lon] for lat, lon, _, _ in coords_with_labels]
    HeatMap(heat_data, radius=10, blur=15).add_to(m)

    return m

map_obj = create_locations_map(df)
map_obj.save('locations_map.html')
print("Saved: locations_map.html")
map_obj

### 7.4 Final Outputs

In [None]:
# Create flattened location table
# One output row per (film, location) pair that has coordinates, tagged with
# the text source it came from.
location_data = []
for _, row in df.iterrows():
    title = row.get('original_title', row.get('title', 'Unknown'))
    year = row.get('year')

    for source, loc_col, coord_col in [
        ('synopsis', 'locations_synopsis', 'coordinates_synopsis'),
        ('wikipedia', 'locations_wiki', 'coordinates_wiki'),
    ]:
        locations, coordinates = row[loc_col], row[coord_col]
        if not (isinstance(locations, list) and isinstance(coordinates, list)):
            continue
        for loc, coord in zip(locations, coordinates):
            if coord:  # skip entries without coordinates
                location_data.append({'film_title': title, 'year': year, 'location': loc,
                                      'latitude': coord[0], 'longitude': coord[1], 'source': source})

# Explicit columns keep the frame well-formed when nothing was geocoded;
# without them an empty frame has no 'source' column and the filters below
# raise KeyError.
df_locations = pd.DataFrame(
    location_data,
    columns=['film_title', 'year', 'location', 'latitude', 'longitude', 'source'],
)
print(f"Flattened table: {len(df_locations)} entries")
print(f"  - From synopsis: {len(df_locations[df_locations['source']=='synopsis'])}")
print(f"  - From wikipedia: {len(df_locations[df_locations['source']=='wikipedia'])}")

In [None]:
# Grouped location summary
# Number of flattened rows per (location, latitude, longitude) triple,
# across both sources, sorted from most to least frequent.
locations_summary = df_locations.groupby(['location', 'latitude', 'longitude']).size().reset_index(name='count')
locations_summary = locations_summary.sort_values('count', ascending=False)
print(f"Unique location-coordinate pairs: {len(locations_summary)}")
print(locations_summary.head(20).to_string(index=False))

In [None]:
# Final saves
# Three CSVs: the full per-film table, the flattened film-location pairs,
# and the per-location counts.
df.to_csv('database.csv', index=False)
df_locations.to_csv('locations_flat.csv', index=False)
locations_summary.to_csv('locations_coordinates.csv', index=False)

print("="*60)
print("FINAL OUTPUTS SAVED")
print("="*60)
print(f"database.csv          - {len(df)} films with all data")
print(f"locations_flat.csv    - {len(df_locations)} film-location pairs")
print(f"locations_coordinates.csv - {len(locations_summary)} unique locations with counts")
# The two lines below only report files written by earlier cells.
# NOTE(review): the cache name is hard-coded here rather than taken from GEOCACHE_FILE.
print(f"locations_map.html    - Interactive map")
print(f"geocache.txt          - Coordinate cache for reuse")