In [211]:
import pandas as pd
import pandas as pd
import numpy as np
import warnings
from sklearn.manifold import TSNE
import plotly.express as px
from helpers import load_data, get_embedding
import tqdm
import plotly.graph_objects as go
import json

# Notebook-wide configuration.
# Every warning is suppressed for the rest of the session.
warnings.filterwarnings("ignore")
USE_CITIES = True # If False, we use countries
DATA_PATH = 'data/'  # directory handed to load_data below

# Load the data
# `load_data` is imported from the project-local `helpers` module; its result
# is indexed by dataset name below.
loaded_data = load_data(DATA_PATH)

# Bind each dataset to its own module-level name for the cells that follow.
character_metadata = loaded_data['character_metadata']
movie_metadata = loaded_data['movie_metadata']
plot_summaries = loaded_data['plot_summaries']
embeddings = loaded_data['embeddings']
combined_plot_summaries = loaded_data['combined_plot_summaries']

In [156]:
def movie_id_to_name(wikipedia_movie_id):
    """Look up a movie title in ``movie_metadata`` by Wikipedia movie ID.

    Returns ``None`` when the ID itself is ``None``, the string ``"Unknown"``
    when no row of ``movie_metadata`` carries that ID, and otherwise the
    'Movie name' of the first matching row.
    """
    if wikipedia_movie_id is None:
        return None

    id_matches = movie_metadata['Wikipedia movie ID'] == wikipedia_movie_id
    matched_titles = movie_metadata.loc[id_matches, 'Movie name']

    return "Unknown" if matched_titles.empty else matched_titles.values[0]

# Title of every movie listed in combined_plot_summaries, resolved through
# movie_id_to_name (so IDs absent from movie_metadata become "Unknown").
movie_names = combined_plot_summaries['Wikipedia movie ID'].apply(movie_id_to_name)

#### City representation from foreign countries 

In [157]:
# Load the raw per-movie analysis from movie_analysis.json into a dict.
# The encoding is pinned to UTF-8 so that decoding does not depend on the
# platform's default locale encoding.
with open('data/movie_analysis.json', 'r', encoding='utf-8') as f:
    analysis = json.load(f)

In [158]:
# Each value of `analysis` is itself a JSON document stored as a string:
# decode it in place. Values that are no longer strings are left untouched so
# that re-running this cell does not call json.loads on already-decoded
# objects (which would raise TypeError).
for movie_key, raw_value in analysis.items():
    if isinstance(raw_value, str):
        analysis[movie_key] = json.loads(raw_value)

In [159]:
def get_cities(one_analysis):
    """Return the 'cities' entry of one movie analysis.

    An empty list is returned when the analysis is ``None`` or has no
    'cities' key; otherwise the stored value is returned unchanged.
    """
    has_cities = one_analysis is not None and 'cities' in one_analysis
    return one_analysis['cities'] if has_cities else []
    
# Alphabetically sorted list of the distinct city names found across all
# movie analyses.
cities = sorted({item for value in analysis.values() for item in get_cities(value)})

In [160]:
def get_countries(one_analysis):
    """Return the 'countries' entry of one movie analysis.

    An empty list is returned when the analysis is ``None`` or has no
    'countries' key; otherwise the stored value is returned unchanged.
    """
    has_countries = one_analysis is not None and 'countries' in one_analysis
    return one_analysis['countries'] if has_countries else []
    
# Alphabetically sorted list of the distinct country names found across all
# movie analyses.
countries = sorted({item for value in analysis.values() for item in get_countries(value)})

In [202]:
import ast

def extract_country(country_string):
    """Return the first country name encoded in a Freebase ``{id: name}`` string.

    Parameters
    ----------
    country_string : str
        Text of a Python dict literal whose values are country names,
        e.g. ``'{"/m/0f8l9c": "France"}'``.

    Returns
    -------
    object or None
        The first value of the parsed dict (in the order written in the
        string). ``None`` when the input is empty, is not a string, cannot be
        parsed as a literal, does not evaluate to a dict, or evaluates to an
        empty dict.
    """
    if not country_string or not isinstance(country_string, str):
        return None

    try:
        parsed = ast.literal_eval(country_string)
    except (ValueError, SyntaxError, TypeError):
        # TypeError covers literals that parse but cannot be built, such as a
        # dict with an unhashable key; all malformed inputs mean "no country".
        return None

    if not isinstance(parsed, dict):
        return None

    # next(..., None) yields None for an empty dict.
    return next(iter(parsed.values()), None)

# Add a 'Country' column to movie_metadata holding the first country extracted
# from each row's Freebase ID:name string. Rows for which extract_country
# finds nothing get None.
movie_metadata['Country'] = movie_metadata['Movie countries (Freebase ID:name tuples)'].apply(extract_country)

In [161]:
# Inverted indexes: place name -> list of movie IDs (as ints) whose analysis
# mentions that place. Every known place starts with an empty list.
cities_movies = {city_name: list() for city_name in cities}
countries_movies = {country_name: list() for country_name in countries}

for movie_key, movie_analysis in analysis.items():
    # Pair each kind of mention with the index it feeds; cities are handled
    # before countries for every movie.
    mentions = (
        (get_cities(movie_analysis), cities_movies),
        (get_countries(movie_analysis), countries_movies),
    )
    for place_names, movies_by_place in mentions:
        for place in place_names:
            if place in movies_by_place:
                movies_by_place[place].append(int(movie_key))

In [162]:
# Keep only the countries mentioned by at least 10 movies. The dict is
# filtered first and the list rebuilt from its keys, so `countries` and
# `countries_movies` stay in sync and keep the same order.
countries_movies = {
    country: countries_movies[country]
    for country in countries
    if len(countries_movies[country]) >= 10
}
countries = list(countries_movies)

In [163]:
# Keep only the cities mentioned by at least 10 movies. The dict is filtered
# first and the list rebuilt from its keys, so `cities` and `cities_movies`
# stay in sync and keep the same order.
cities_movies = {
    city: cities_movies[city]
    for city in cities
    if len(cities_movies[city]) >= 10
}
cities = list(cities_movies)

#### Select a list of cities we are interested in for the proof of concept

Check whether a city's representation (for example, how violent it is shown to be) differs significantly between movies from two different producing countries.

Les pays qui font le plus de films sur Paris

Majoritairement, la vision qu'ils en ont

Que ce soit le genre du film ou autre, et si ça diffère de la norme ou du pays d'origine de la ville

In [198]:
# Cities studied in this proof of concept.
city_list = ['Paris', 'Berlin', 'Marseille']


In [199]:
# Movie IDs mentioning each city of interest; a city that is missing from
# cities_movies maps to an empty list.
city_lists = {city: cities_movies.get(city, []) for city in city_list}

# Show the IDs collected for every selected city.
for city in city_lists:
    print(f"{city}: {city_lists[city]}")

Paris: [24041333, 156632, 30637679, 5935181, 901907, 26553506, 8030277, 27260560, 11399740, 18423431, 15941612, 17733263, 6542034, 1586285, 19069597, 33179749, 16644875, 61323, 1976777, 15998246, 29236626, 29267698, 2730191, 28510605, 3644779, 737840, 5151199, 2571942, 4339865, 27070440, 23924255, 11654754, 44302, 25017192, 4183527, 12583457, 22374429, 33270747, 8476961, 16731651, 1823450, 8557673, 17055534, 18418904, 26304712, 26503648, 3027029, 22376055, 1623252, 7096573, 5925279, 3116779, 32186653, 23847707, 1587300, 7578779, 20954169, 5826081, 8921046, 899442, 33089019, 805120, 13048086, 8695943, 16113730, 20168380, 18918169, 190405, 417891, 24494541, 1906459, 14924899, 27902570, 29163831, 8523825, 171621, 18333832, 3328486, 26443837, 25400635, 2812718, 13333148, 4073679, 5971762, 27492406, 8418734, 15052685, 12633541, 32472119, 2285357, 225505, 34954266, 61494, 2728419, 11020901, 26378018, 80916, 28413162, 35627063, 25896181, 33567266, 15428140, 4734122, 23367560, 4372744, 339526,

In [166]:
# Number of movie IDs collected for Paris.
len(city_lists['Paris'])

851

In [200]:
# Country -> selected cities located in it; used to decide whether a movie's
# producing country is "foreign" to a city.
cities_in_country = {'France': ['Paris', 'Marseille'], 'Germany': ['Berlin']}

In [201]:
# Quick check of the cities registered under France.
cities_in_country['France']

['Paris', 'Marseille']

In [204]:
# For every selected city, keep only the movies whose producing country
# (the 'Country' column, i.e. the first listed country) is known and differs
# from the country the city belongs to.
#
# The movie-ID -> country lookup is built once instead of scanning the whole
# movie_metadata frame for every movie ID. Only the first row per ID is kept,
# which matches taking `.iloc[0]` of the matching rows.
country_by_movie_id = (
    movie_metadata
    .drop_duplicates(subset='Wikipedia movie ID')
    .set_index('Wikipedia movie ID')['Country']
    .to_dict()
)

foreign_movies = {}
for city, movie_ids in city_lists.items():
    # The city's own country does not depend on the movie, so resolve it once
    # per city. It is None for a city missing from cities_in_country, in which
    # case every movie with a known country counts as foreign.
    city_country = next(
        (home_country
         for home_country, member_cities in cities_in_country.items()
         if city in member_cities),
        None,
    )
    # .get() returns None both for IDs absent from the metadata and for movies
    # without a known country; both are excluded.
    foreign_movies[city] = [
        movie_id
        for movie_id in movie_ids
        if country_by_movie_id.get(movie_id) is not None
        and country_by_movie_id[movie_id] != city_country
    ]

# Print the filtered lists for each city
for city, movie_ids in foreign_movies.items():
    print(f"{city}: {movie_ids}")

Paris: [156632, 901907, 26553506, 8030277, 27260560, 17733263, 1586285, 19069597, 61323, 15998246, 29267698, 3644779, 2571942, 27070440, 23924255, 11654754, 4183527, 16731651, 1823450, 8557673, 17055534, 26304712, 26503648, 3027029, 22376055, 1623252, 5925279, 3116779, 1587300, 5826081, 8921046, 899442, 805120, 18918169, 190405, 24494541, 1906459, 14924899, 27902570, 29163831, 171621, 2812718, 13333148, 4073679, 5971762, 27492406, 8418734, 15052685, 12633541, 2285357, 61494, 11020901, 80916, 28413162, 35627063, 33567266, 23367560, 339526, 35527324, 2137591, 16473441, 19524834, 35012794, 34749923, 2347609, 145892, 1364238, 512161, 8004988, 18056742, 5278636, 6552222, 1059280, 18358585, 10206784, 5746114, 5427176, 15541969, 10782724, 19167990, 10812453, 21744986, 14795399, 3469003, 20481209, 5461236, 13951439, 1627058, 3645005, 8562282, 14415759, 5215498, 9706751, 8049233, 14693181, 7518798, 24962756, 1371186, 9461535, 583932, 34683152, 8625710, 14704654, 16264226, 6238106, 12800555, 136

There are some data issues. For example, movie 30637679 ('I Want to Go Home', row index 30513) takes place in Paris and is a French production, but `movie_metadata['Movie countries (Freebase ID:name tuples)'][30513]` returns `'{}'`, so its production country is missing from the metadata.

In [172]:
# Spot check: resolve one Wikipedia movie ID to its title.
movie_id_to_name(12752698)

'The Crown of the Russian Empire/Once again the Elusive Avengers'

In [173]:
# Inspect the extracted 'Country' value for movie 30637679.
movie_metadata.loc[movie_metadata['Wikipedia movie ID'] == 30637679]['Country']

30513    None
Name: Country, dtype: object

In [174]:
# Raw countries string of the row labelled 30513 (the row shown for movie 30637679).
movie_metadata['Movie countries (Freebase ID:name tuples)'][30513]

'{}'

In [206]:
# Pool the foreign-produced movie IDs of all selected cities by producing
# country (the 'Country' column of the first metadata row matching the ID).
# NOTE(review): an ID that appears in several cities' lists is appended once
# per city, so a country's list can contain the same ID more than once.
regrouped_by_country = {}

for movie_ids in foreign_movies.values():
    for movie_id in movie_ids:
        id_mask = movie_metadata['Wikipedia movie ID'] == movie_id
        matching_movies = movie_metadata[id_mask]
        if matching_movies.empty:
            continue
        movie_country = matching_movies.iloc[0]['Country']
        if movie_country is None:
            continue
        regrouped_by_country.setdefault(movie_country, []).append(movie_id)


In [207]:
# Group the foreign-produced movie IDs first by city, then by producing
# country: {city: {country: [movie_id, ...]}}.
regrouped_by_city_and_country = {}

for city, movie_ids in foreign_movies.items():
    movies_by_country = {}
    for movie_id in movie_ids:
        matching_movies = movie_metadata[movie_metadata['Wikipedia movie ID'] == movie_id]
        if matching_movies.empty:
            continue
        movie_country = matching_movies.iloc[0]['Country']
        if movie_country is not None:
            movies_by_country.setdefault(movie_country, []).append(movie_id)
    regrouped_by_city_and_country[city] = movies_by_country

# Example usage, kept as real comments rather than a bare string literal
# (a trailing string expression is echoed as the cell's output).
#
# To access movies in Paris from different countries:
# paris_movies = regrouped_by_city_and_country.get('Paris', {})
# for country, movie_ids in paris_movies.items():
#     print(f"Movies in Paris from {country}: {movie_ids}")
#
# To access movies in Berlin from different countries:
# berlin_movies = regrouped_by_city_and_country.get('Berlin', {})
# for country, movie_ids in berlin_movies.items():
#     print(f"Movies in Berlin from {country}: {movie_ids}")


' # Now you can access the movie IDs grouped by city and country\n# For example, to access movies in Paris from different countries:\nparis_movies = regrouped_by_city_and_country.get(\'Paris\', {})\nfor country, movie_ids in paris_movies.items():\n    print(f"Movies in Paris from {country}: {movie_ids}")\n\n# To access movies in Berlin from different countries:\nberlin_movies = regrouped_by_city_and_country.get(\'Berlin\', {})\nfor country, movie_ids in berlin_movies.items():\n    print(f"Movies in Berlin from {country}: {movie_ids}") '

In [209]:
# Foreign-produced movies mentioning Marseille, grouped by producing country.
regrouped_by_city_and_country['Marseille']

{'United States of America': [872497,
  339526,
  2350630,
  6616626,
  12213090,
  2476317,
  8011558,
  4949692,
  9162690,
  4578771,
  963986,
  1343245],
 'Austria': [19197890]}

TODO: Scale these counts by the total number of movies each country has produced.

In [179]:
# Show, for every producing country, the pooled movie IDs gathered in
# regrouped_by_country.
for country in regrouped_by_country:
    print(f"{country}: {regrouped_by_country[country]}")

United Kingdom: [156632, 3027029, 3116779, 4073679, 23367560, 19524834, 5746114, 14693181, 1371186, 9461535, 34683152, 13643392, 28913353, 26608090, 28902041, 31658770, 13527082, 586023, 17897345, 594422, 28900288, 26551226, 13665311, 29268574, 20079078, 30500657, 4926317, 9239665, 31122996, 1540319, 17890259, 28997526, 28809499, 1873637, 2648352, 97740, 11089906, 2199348, 3044685, 12811780, 20723352, 20635941, 6941316, 3450162, 203993, 9378717, 16185034, 16973860, 21606161, 35243445, 19136103, 11341007, 7088713, 9097942, 9155539]
United States of America: [901907, 26553506, 8030277, 27260560, 17733263, 19069597, 61323, 15998246, 29267698, 2571942, 27070440, 23924255, 11654754, 4183527, 16731651, 1823450, 8557673, 17055534, 26304712, 26503648, 5925279, 1587300, 899442, 805120, 18918169, 190405, 24494541, 1906459, 29163831, 171621, 2812718, 13333148, 5971762, 8418734, 15052685, 2285357, 61494, 11020901, 80916, 28413162, 35627063, 339526, 35527324, 2137591, 2347609, 145892, 1364238, 5121

#### This could be used for many other variables besides the number or percentage of movies (e.g. genres, good vs. evil, ...?)

In [180]:
import googlemaps
import pandas as pd
import os

# The API key is read from the environment rather than stored in the notebook.
gmaps = googlemaps.Client(key=os.environ['GOOGLE_MAPS_API_KEY'])


def _geocode_into(coordinates, place_names):
    """Geocode every name with `gmaps` and store the result in `coordinates`.

    Each name maps to the (lat, lng) of its first geocoding hit, or to
    (None, None) when the lookup returns nothing. The dict is filled in place,
    one entry per lookup.
    """
    for place_name in place_names:
        geocode_result = gmaps.geocode(place_name)
        if geocode_result:
            location = geocode_result[0]["geometry"]["location"]
            coordinates[place_name] = (location["lat"], location["lng"])
        else:
            coordinates[place_name] = (None, None)


city_coordinates = {}
country_coordinates = {}

# Only the level selected by USE_CITIES is geocoded; the other dict stays
# empty. USE_CITIES is defined in the configuration cell at the top of the
# notebook, which must have been run first.
if USE_CITIES:
    _geocode_into(city_coordinates, cities)
else:
    _geocode_into(country_coordinates, countries)


NameError: name 'USE_CITIES' is not defined

In [None]:
# Creating the initial map: one marker per row of `df`, coloured by the scaled
# similarity, with "<place>: <similarity>" as hover text.
# NOTE(review): `df` is not defined in the cells visible here; confirm which
# cell builds it before running this one.
label_column = 'City' if USE_CITIES else 'Country'

marker_style = dict(
    color=df['Similarity_Scaled'],
    line_color='rgb(40,40,40)',
    line_width=0.5,
    sizemode='diameter'
)

fig = go.Figure(go.Scattergeo(
    lat=df['Latitude'],
    lon=df['Longitude'],
    text=df[label_column] + ": " + df['Similarity'].astype(str),
    marker=marker_style,
    hoverinfo='text'
))