## Import lib

In [1]:
import geopandas as gpd
import pandas as pd
import folium
import mapclassify
import matplotlib.pyplot as plt

## Import japan shp map

In [2]:
japan_shp = gpd.read_file('gadm41_JPN_shp/gadm41_JPN_1.shp')
japan_shp

Unnamed: 0,GID_1,GID_0,COUNTRY,NAME_1,VARNAME_1,NL_NAME_1,TYPE_1,ENGTYPE_1,CC_1,HASC_1,ISO_1,geometry
0,JPN.1_1,JPN,Japan,Aichi,Aiti,愛知県,Ken,Prefecture,,JP.AI,JP-23,"MULTIPOLYGON (((137.09743 34.65121, 137.09773 ..."
1,JPN.2_1,JPN,Japan,Akita,,秋田県,Ken,Prefecture,,JP.AK,JP-05,"MULTIPOLYGON (((140.70844 38.92136, 140.69861 ..."
2,JPN.3_1,JPN,Japan,Aomori,,青森県,Ken,Prefecture,,JP.AO,JP-02,"MULTIPOLYGON (((140.95625 40.27003, 140.95795 ..."
3,JPN.4_1,JPN,Japan,Chiba,Tiba|Tsiba,千葉県,Ken,Prefecture,,JP.CH,JP-12,"MULTIPOLYGON (((139.82417 34.91861, 139.82465 ..."
4,JPN.5_1,JPN,Japan,Ehime,,愛媛県,Ken,Prefecture,,JP.EH,JP-38,"MULTIPOLYGON (((132.56189 32.91159, 132.56241 ..."
5,JPN.6_1,JPN,Japan,Fukui,Hukui,福井県,Ken,Prefecture,,JP.FI,JP-18,"MULTIPOLYGON (((135.77737 35.36200, 135.77370 ..."
6,JPN.7_1,JPN,Japan,Fukuoka,Hukuoka,福岡県,Ken,Prefecture,,JP.FO,JP-40,"MULTIPOLYGON (((130.88498 33.18366, 130.88748 ..."
7,JPN.8_1,JPN,Japan,Fukushima,Hukusima,福島県,Ken,Prefecture,,JP.FS,JP-07,"MULTIPOLYGON (((140.26530 36.93347, 140.26071 ..."
8,JPN.9_1,JPN,Japan,Gifu,Gihu,岐阜県,Ken,Prefecture,,JP.GF,JP-21,"POLYGON ((136.67628 35.17009, 136.67532 35.165..."
9,JPN.10_1,JPN,Japan,Gunma,GunmaGumma,群馬県,Ken,Prefecture,,JP.GM,JP-10,"POLYGON ((138.94455 36.09058, 138.94102 36.088..."


In [3]:
# fix NA

iso_1_replacements = {
    12: 'JP-28',  # Replace 'NA' with 'JP-28' for index 12
    26: 'JP-42',  # Replace 'NA' with 'JP-42' for index 26
}

# Fill NA values in ISO_1 column based on the dictionary
for index, iso_code in iso_1_replacements.items():
    japan_shp.at[index, 'ISO_1'] = iso_code

In [4]:
def convert_iso_to_region_code(iso_code):
    if '-' in iso_code:
        numeric_part = iso_code.split('-')[1]
        # Convert to the format XX000
        region_code = numeric_part + '000'
    else:
        # Handle unexpected formats
        region_code = '00000'
    return region_code

# Apply the conversion function to the ISO_1 column and create the REGION_CODE column
japan_shp['REGION_CODE'] = japan_shp['ISO_1'].apply(convert_iso_to_region_code)

## Select column

In [5]:
japan_map = japan_shp[['NAME_1', 'REGION_CODE', 'geometry']]
japan_map

Unnamed: 0,NAME_1,REGION_CODE,geometry
0,Aichi,23000,"MULTIPOLYGON (((137.09743 34.65121, 137.09773 ..."
1,Akita,5000,"MULTIPOLYGON (((140.70844 38.92136, 140.69861 ..."
2,Aomori,2000,"MULTIPOLYGON (((140.95625 40.27003, 140.95795 ..."
3,Chiba,12000,"MULTIPOLYGON (((139.82417 34.91861, 139.82465 ..."
4,Ehime,38000,"MULTIPOLYGON (((132.56189 32.91159, 132.56241 ..."
5,Fukui,18000,"MULTIPOLYGON (((135.77737 35.36200, 135.77370 ..."
6,Fukuoka,40000,"MULTIPOLYGON (((130.88498 33.18366, 130.88748 ..."
7,Fukushima,7000,"MULTIPOLYGON (((140.26530 36.93347, 140.26071 ..."
8,Gifu,21000,"POLYGON ((136.67628 35.17009, 136.67532 35.165..."
9,Gunma,10000,"POLYGON ((138.94455 36.09058, 138.94102 36.088..."


In [None]:
m = japan_map.explore()
m

## Hospital per area

In [7]:
df_city_area = pd.read_csv('data/city_area.csv')
df_hospital_clinic_number = pd.read_csv('data/hospital_clinic_number.csv')

In [8]:
# convert ha to km2
df_city_area['area_number'] = df_city_area['Total land area (Incl. the northern territories and Takeshima)【ha】'] / 100

# Filter the data for the year 2022 and exclude the "Japan" region
df_city_area = df_city_area[(df_city_area["Time"] == 2022) & (df_city_area["Region"] != "Japan")]

df_city_area.head()

Unnamed: 0,Time,Region Code,Region,Total land area (Incl. the northern territories and Takeshima)【ha】,Annotation,area_number
2257,2022,1000,Hokkaido,8342381,,83423.81
2258,2022,2000,Aomori-ken,964595,,9645.95
2259,2022,3000,Iwate-ken,1527501,,15275.01
2260,2022,4000,Miyagi-ken,728229,,7282.29
2261,2022,5000,Akita-ken,1163752,,11637.52


In [9]:
def clean_area_name(area_name):
    """Remove specific suffixes from area names and strip whitespace."""
    suffixes = ['-ken', '-fu', '-to', '-do']
    for suffix in suffixes:
        area_name = area_name.replace(suffix, '')
    return area_name.strip()

# Create a dictionary for the specific mismatched area names corrections
area_corrections = {
    'Gumma': 'Gunma',
    'Hyogo': 'Hyōgo',
    'Nagasaki': 'Naoasaki'
}

df_city_area = df_city_area.rename(columns={'Region': 'NAME_1'})
df_city_area.loc[:, 'NAME_1'] = df_city_area['NAME_1'].apply(clean_area_name)
df_city_area.loc[:, 'NAME_1'] = df_city_area['NAME_1'].replace(area_corrections)
df_city_area

Unnamed: 0,Time,Region Code,NAME_1,Total land area (Incl. the northern territories and Takeshima)【ha】,Annotation,area_number
2257,2022,1000,Hokkaido,8342381,,83423.81
2258,2022,2000,Aomori,964595,,9645.95
2259,2022,3000,Iwate,1527501,,15275.01
2260,2022,4000,Miyagi,728229,,7282.29
2261,2022,5000,Akita,1163752,,11637.52
2262,2022,6000,Yamagata,932315,,9323.15
2263,2022,7000,Fukushima,1378414,,13784.14
2264,2022,8000,Ibaraki,609754,,6097.54
2265,2022,9000,Tochigi,640809,,6408.09
2266,2022,10000,Gunma,636228,,6362.28


In [10]:
df_hospital_clinic_number = df_hospital_clinic_number.rename(columns={'AREA': 'NAME_1'})
df_hospital_clinic_number.head()

Unnamed: 0.1,Unnamed: 0,NAME_1,NUMBER
0,1,Hokkaido,144
1,2,Aomori,25
2,3,Iwate,25
3,4,Miyagi,41
4,5,Akita,25


In [11]:
# Merge the datasets on AREA
merged_data = pd.merge(df_city_area, df_hospital_clinic_number, on='NAME_1', how='inner')

# Calculate the ratio
merged_data['area_ratio'] = merged_data['NUMBER'] / merged_data['area_number']

# Select only the relevant columns for the final dataframe
df_hospital_area_ratio = merged_data[['NAME_1', 'Region Code', 'area_ratio']]
df_hospital_area_ratio.head()

Unnamed: 0,NAME_1,Region Code,area_ratio
0,Hokkaido,1000,0.001726
1,Aomori,2000,0.002592
2,Iwate,3000,0.001637
3,Miyagi,4000,0.00563
4,Akita,5000,0.002148


In [12]:
japan_map['REGION_CODE'] = japan_map['REGION_CODE'].astype(int)
df_hospital_area_ratio['Region Code'] = df_hospital_area_ratio['Region Code'].astype(int)

# Perform the merge operation
japan_area_hospital = pd.merge(japan_map, df_hospital_area_ratio[['Region Code', 'area_ratio']], 
                     left_on='REGION_CODE', 
                     right_on='Region Code', 
                     how='left')

japan_area_hospital.drop(columns=['Region Code'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_hospital_area_ratio['Region Code'] = df_hospital_area_ratio['Region Code'].astype(int)


In [13]:
japan_area_hospital

Unnamed: 0,NAME_1,REGION_CODE,geometry,area_ratio
0,Aichi,23000,"MULTIPOLYGON (((137.09743 34.65121, 137.09773 ...",0.020877
1,Akita,5000,"MULTIPOLYGON (((140.70844 38.92136, 140.69861 ...",0.002148
2,Aomori,2000,"MULTIPOLYGON (((140.95625 40.27003, 140.95795 ...",0.002592
3,Chiba,12000,"MULTIPOLYGON (((139.82417 34.91861, 139.82465 ...",0.017453
4,Ehime,38000,"MULTIPOLYGON (((132.56189 32.91159, 132.56241 ...",0.006871
5,Fukui,18000,"MULTIPOLYGON (((135.77737 35.36200, 135.77370 ...",0.009068
6,Fukuoka,40000,"MULTIPOLYGON (((130.88498 33.18366, 130.88748 ...",0.03228
7,Fukushima,7000,"MULTIPOLYGON (((140.26530 36.93347, 140.26071 ...",0.004208
8,Gifu,21000,"POLYGON ((136.67628 35.17009, 136.67532 35.165...",0.003201
9,Gunma,10000,"POLYGON ((138.94455 36.09058, 138.94102 36.088...",0.005816


In [None]:
m = japan_area_hospital.explore('area_ratio', cmap='Purples')
m

## Ratio of Lung Cancer to japan population

In [21]:
df_population = pd.read_csv('data/japanese_population.csv')
df_lung_cancer = pd.read_csv('data/lung_cancer.csv')

In [22]:
# Filter the population data for the year 2023 and exclude the "Japan" region
population = df_population[(df_population["Time"] == '2023') & (df_population["Region"] != "Japan")]

# Ensure the columns are of integer type
population['Total population (Both sexes)【person】'] = population['Total population (Both sexes)【person】'].astype(int)
population = population.rename(columns={'Region': 'AREA'})

# Process lung cancer data
df_lung_cancer['NUMBER'] = df_lung_cancer['NUMBER'].str.replace(',', '').astype(int)

# Merge the datasets on AREA
merged_data = pd.merge(population, df_lung_cancer, on='AREA', how='inner')

# Calculate the ratio
merged_data['Lung Cancer Ratio'] = merged_data['NUMBER'] / merged_data['Total population (Both sexes)【person】']

# Select only the relevant columns for the final dataframe
df_population_ratio = merged_data[['AREA', 'Region Code',  'Lung Cancer Ratio']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  population['Total population (Both sexes)【person】'] = population['Total population (Both sexes)【person】'].astype(int)


In [25]:
df_population_ratio = df_population_ratio.rename(columns={'AREA': 'NAME_1', 'Region Code':'REGION_CODE', 'Lung Cancer Ratio':'lung_cancer_ratio'})
df_population_ratio

Unnamed: 0,NAME_1,REGION_CODE,lung_cancer_ratio
0,Hokkaido,1000,0.000162
1,Aomori-ken,2000,0.000209
2,Iwate-ken,3000,0.000202
3,Miyagi-ken,4000,0.000213
4,Akita-ken,5000,0.00021
5,Yamagata-ken,6000,0.000245
6,Fukushima-ken,7000,0.000206
7,Ibaraki-ken,8000,0.000175
8,Tochigi-ken,9000,0.000197
9,Gumma-ken,10000,0.000195


In [28]:
japan_map['REGION_CODE'] = japan_map['REGION_CODE'].astype(int)
df_population_ratio['REGION_CODE'] = df_population_ratio['REGION_CODE'].astype(int)

# Perform the merge operation
japan_population_ratio = pd.merge(japan_map, df_population_ratio[['REGION_CODE', 'lung_cancer_ratio']],
                               left_on='REGION_CODE',
                               right_on='REGION_CODE',
                               how='left')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


In [29]:
japan_population_ratio

Unnamed: 0,NAME_1,REGION_CODE,geometry,lung_cancer_ratio
0,Aichi,23000,"MULTIPOLYGON (((137.09743 34.65121, 137.09773 ...",0.000181
1,Akita,5000,"MULTIPOLYGON (((140.70844 38.92136, 140.69861 ...",0.00021
2,Aomori,2000,"MULTIPOLYGON (((140.95625 40.27003, 140.95795 ...",0.000209
3,Chiba,12000,"MULTIPOLYGON (((139.82417 34.91861, 139.82465 ...",0.000165
4,Ehime,38000,"MULTIPOLYGON (((132.56189 32.91159, 132.56241 ...",0.000175
5,Fukui,18000,"MULTIPOLYGON (((135.77737 35.36200, 135.77370 ...",0.000198
6,Fukuoka,40000,"MULTIPOLYGON (((130.88498 33.18366, 130.88748 ...",0.000161
7,Fukushima,7000,"MULTIPOLYGON (((140.26530 36.93347, 140.26071 ...",0.000206
8,Gifu,21000,"POLYGON ((136.67628 35.17009, 136.67532 35.165...",0.000185
9,Gunma,10000,"POLYGON ((138.94455 36.09058, 138.94102 36.088...",0.000195


In [33]:
m = japan_population_ratio.explore('lung_cancer_ratio', cmap='Wistia')
m