# Merge GeoJSON Data into CSV

In [1]:
# Import libraries and databases
import pandas as pd
import os
import json
from pycountry import countries

# Project root folder. Defaults to the original author's local directory; set
# the UFC_ANALYSIS_DIR environment variable to run this notebook on another
# machine without editing the cell.
path = os.environ.get(
    "UFC_ANALYSIS_DIR",
    r"D:\0 - Data Analytics\6 - Advanced Analytics & Dashboard Design\UFC Analysis",
)

# Load the cleaned UFC dataframe (ufc_cleaned_V1.csv) from the prepared-data folder
UFC_df = pd.read_csv(
    os.path.join(path, "02 Data", "Prepared Data", "ufc_cleaned_V1.csv")
)


In [2]:
# Location of the merged GeoJSON file inside the prepared-data folder
geojson_path = os.path.join(path, "02 Data", "Prepared Data", "merged_geojson.json")

# Read the file as UTF-8 text and parse it into a Python object
with open(geojson_path, "r", encoding="utf-8") as geojson_file:
    geojson = json.loads(geojson_file.read())

# Sanity checks: container type, top-level keys, and the first three features
first_features = geojson["features"][:3]
print(type(geojson))  # expected: <class 'dict'>
print(geojson.keys())  # expected to include "features"
print(json.dumps(first_features, indent=2))

<class 'dict'>
dict_keys(['type', 'features'])
[
  {
    "type": "Feature",
    "id": "AFG",
    "properties": {
      "name": "Afghanistan"
    },
    "geometry": {
      "type": "Polygon",
      "coordinates": [
        [
          [
            61.210817,
            35.650072
          ],
          [
            62.230651,
            35.270664
          ],
          [
            62.984662,
            35.404041
          ],
          [
            63.193538,
            35.857166
          ],
          [
            63.982896,
            36.007957
          ],
          [
            64.546479,
            36.312073
          ],
          [
            64.746105,
            37.111818
          ],
          [
            65.588948,
            37.305217
          ],
          [
            65.745631,
            37.661164
          ],
          [
            66.217385,
            37.39379
          ],
          [
            66.518607,
            37.362784
          ],
       

In [3]:
# Lookup table: feature name (properties.name) -> that feature's geometry object.
# If two features share a name, the later one wins.
geo_dict = dict(
    (feat["properties"]["name"], feat["geometry"]) for feat in geojson["features"]
)


### Wrangling and cleaning

In [4]:
# ISO alpha-3 code -> country name lookup built from pycountry.
# NOTE(review): no later cell visible in this notebook reads this mapping —
# confirm whether it is still needed.
iso3_to_country = {c.alpha_3: c.name for c in countries}

# Manual renames so the location names in the UFC dataframe match the names
# used as keys in the GeoJSON dictionary (found by inspecting the data).
# Keys must be spelled the way a value looks AFTER `.str.strip().str.title()`,
# because the fixes are applied to the title-cased column (hence "Usa", "Uk").
country_fixes = {
    "Usa": "United States of America",
    "Uae": "United Arab Emirates",
    "Uk": "United Kingdom",
    # Title-casing turns "Korea, Republic of" into "Korea, Republic Of", so the
    # lower-case "of" spelling alone could never match. Both are listed.
    "Korea, Republic Of": "South Korea",
    "Korea, Republic of": "South Korea",
    "Russian Federation": "Russia",
    "Czechia": "Czech Republic",
}


In [5]:
# Pull the U.S. state out of 'Location' values such as "Las Vegas, Nevada, USA":
# the captured group is the letters/spaces between a comma and ", USA".
# Rows that do not match the pattern get NaN.
us_state_pattern = r",\s*([A-Za-z\s]+),\s*USA"
UFC_df["State"] = UFC_df["Location"].str.extract(us_state_pattern, expand=False)

In [6]:
# Use the extracted U.S. state where there is one, otherwise the country
location_raw = UFC_df["State"].fillna(UFC_df["Country"])

# Trim whitespace and title-case, then apply the manual renames in country_fixes
location_tidy = location_raw.str.strip().str.title()
UFC_df["Location_Cleaned"] = location_tidy.replace(country_fixes)

In [7]:
# Attach a geometry to every row by looking up its cleaned location name
# (country or U.S. state) in geo_dict; names with no entry become NaN.
cleaned_locations = UFC_df["Location_Cleaned"]
UFC_df["geometry"] = cleaned_locations.map(geo_dict)

In [8]:
# 'State' and 'Location_Cleaned' were already derived in the cells above, so
# they are not rebuilt here: rebuilding them without country_fixes would throw
# away the standardized names that the 'geometry' column was mapped from.

# Rows whose cleaned location found no geometry in geo_dict
geometry_missing = UFC_df["geometry"].isna()
missing_geometry_count = geometry_missing.sum()
missing_locations = UFC_df.loc[geometry_missing, "Location_Cleaned"].unique()

# Display the number of unmatched rows and the distinct unmatched names
missing_geometry_count, missing_locations

(89, array(['Singapore', 'Dc'], dtype=object))

In [9]:
# Register a geometry for Washington, D.C. unless geo_dict already has one.
# The polygon is a simple rectangle (closed ring of four corners), not the
# real district outline.
geo_dict.setdefault(
    "Washington, D.C.",
    {
        "type": "Polygon",
        "coordinates": [
            [
                [-77.119759, 38.791645],
                [-76.909393, 38.791645],
                [-76.909393, 38.995251],
                [-77.119759, 38.995251],
                [-77.119759, 38.791645],
            ]
        ],
    },
)

In [10]:
# Is there an exact "Singapore" key in the GeoJSON dictionary?
has_singapore = "Singapore" in geo_dict
print(has_singapore)

# Look for any key that contains "singapore", ignoring case
similar_names = [key for key in geo_dict if "singapore" in key.lower()]
print("Possible alternative names for Singapore in GeoJSON:", similar_names)

False
Possible alternative names for Singapore in GeoJSON: []


In [11]:
# Register a geometry for Singapore unless geo_dict already has one.
# The polygon is a hand-entered four-vertex closed ring.
geo_dict.setdefault(
    "Singapore",
    {
        "type": "Polygon",
        "coordinates": [
            [
                [103.64024, 1.26241],
                [103.99717, 1.24419],
                [104.00799, 1.44517],
                [103.86867, 1.47066],
                [103.64024, 1.26241],
            ]
        ],
    },
)

In [12]:
# Add a rename for the title-cased state value "Dc" so it matches the
# "Washington, D.C." key in geo_dict
country_fixes["Dc"] = "Washington, D.C."

# Re-apply all renames, then look the geometries up again
relabeled_locations = UFC_df["Location_Cleaned"].replace(country_fixes)
UFC_df["Location_Cleaned"] = relabeled_locations
UFC_df["geometry"] = relabeled_locations.map(geo_dict)

In [13]:
# Final check: count rows still lacking a geometry and list their location names
still_missing = UFC_df["geometry"].isna()
missing_geometry_count = still_missing.sum()
missing_locations = UFC_df.loc[still_missing, "Location_Cleaned"].unique()

print(f"✅ Final Missing Locations After Fix: {missing_geometry_count}")
print("Remaining Missing Locations:", missing_locations)

✅ Final Missing Locations After Fix: 0
Remaining Missing Locations: []


### Exporting the new dataframe

In [14]:
# Save the updated dataframe.
# The 'geometry' column holds Python dicts; written as-is they would land in
# the CSV as Python reprs (single-quoted keys), which JSON/GeoJSON parsers
# reject. Each geometry is serialized to a JSON string on an export copy, so
# UFC_df itself keeps the dict objects. Missing geometries stay empty.
export_df = UFC_df.assign(
    geometry=UFC_df["geometry"].map(json.dumps, na_action="ignore")
)
export_df.to_csv(
    os.path.join(path, "02 Data", "Prepared Data", "ufc_cleaned_V2.csv"),
    index=False,
)