In [1]:
import os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import preprocessing
import plotly.express as px
import plotly


import japanize_matplotlib
import warnings
# Suppress every warning from this point on (this hides deprecation warnings too).
warnings.filterwarnings('ignore')
# Allow pandas to display up to 300 rows before truncating a rendered frame.
pd.set_option('display.max_rows', 300)

%matplotlib inline

# Fix NumPy's global random seed so that runs are repeatable.
np.random.seed(seed=123)



The createFontList function was deprecated in Matplotlib 3.2 and will be removed two minor releases later. Use FontManager.addfont instead.



In [2]:
# Load the base COVID country table, then drop the first CSV column
# (presumably a saved index column — confirm against the file).
df = pd.read_csv("../../data/data_covid_fix_name_code_coordinate_ltaly.csv")
df = df.iloc[:, 1:]

In [3]:
# Preview the first five rows of the base table.
df.head(n=5)

Unnamed: 0,country,pop,urb,gdp,dist,hf,pf,ef,date_first,detection,status,cumul,air,code3,code2,lat,lon,dist_italy
0,Albania,2866376,60.319,13364.155397,6996524.0,7.84,8.005411,7.67,70,74.3,1,108641,303.14,ALB,AL,41.327546,19.818698,613446.0
1,Algeria,42228429,72.629,15481.78762,9108277.0,4.99,5.201489,4.77,58,12.0,1,80272,6442.44,DZA,DZ,36.753768,3.058756,993497.7
2,Angola,30809762,65.514,6452.355165,10490120.0,5.4,5.979936,4.83,83,17.9,1,303691,76.94,AGO,AO,-8.839988,13.289437,5618975.0
3,Argentina,44494502,91.87,20610.56855,19025620.0,6.86,8.0446,5.67,65,74.9,1,92122,1516.63,ARG,AR,-34.603684,-58.381559,11131590.0
4,Armenia,2951776,63.149,10343.17559,5064044.0,7.42,7.145017,7.7,62,60.8,1,86276,,ARM,AM,40.179186,44.499103,2682066.0


# 年齢中央値に関する前処理  
- 人口の年齢の中央値。  
- 2015年までの過去の推定値。  
- 2020年以降は国連の将来推計（中位推計）  

In [4]:
# Load the median-age table and give its four columns short, join-friendly names.
median_age_df = pd.read_csv(
    "../../data/median-age.csv",
)
median_age_df.columns = [
    "country",
    "code3",
    "year",
    "median_age",
]
median_age_df.head(n=2)

Unnamed: 0,country,code3,year,median_age
0,Afghanistan,AFG,1950,19.4
1,Afghanistan,AFG,1955,19.200001


In [5]:
# Count missing values per column (isna is the same check as isnull).
median_age_df.isna().sum()

country          0
code3         1147
year             0
median_age       0
dtype: int64

In [6]:
# The code3 recorded for Bahamas in this table is questionable.
# The base data uses BHM, so overwrite it here to keep the later join on code3 consistent.
median_age_df.loc[median_age_df["country"].eq("Bahamas"), "code3"] = "BHM"

In [7]:
# Number of distinct country names (missing values are not counted).
median_age_df["country"].nunique(dropna=True)

241

In [8]:
# Number of distinct 3-letter codes (missing codes are not counted).
median_age_df["code3"].nunique(dropna=True)

204

In [9]:
# Keep only rows with year < 2020, i.e. drop the projection years;
# the latest remaining year is then displayed as a sanity check.
median_age_df = median_age_df.loc[median_age_df["year"].lt(2020)]
median_age_df["year"].max()

2015

In [10]:
# Keep one row per country: the last occurrence in file order.
# NOTE(review): "last" equals "latest year" only if rows are ordered by year
# within each country — confirm against the CSV.
median_age_df = (
    median_age_df
    .drop_duplicates(subset=["country"], keep="last")
    .reset_index(drop=True)
)
median_age_df.head(n=2)

Unnamed: 0,country,code3,year,median_age
0,Afghanistan,AFG,2015,17.299999
1,Africa,,2015,19.4


In [11]:
# Left-join the per-country median age onto df using the 3-letter code.
# Print the row count before the join for reference.
print(len(df))
# Rows without a code3 (e.g. the "Africa" aggregate) can never identify a
# country, and pandas matches null keys to each other, so drop them first.
# validate="many_to_one" makes pandas raise MergeError if a code appears more
# than once on the right, instead of silently duplicating rows of df.
df = pd.merge(
    df,
    median_age_df.dropna(subset=["code3"])[["code3", "median_age"]],
    on="code3",
    how="left",
    validate="many_to_one",
)
# Number of rows that did not receive a median age from the join.
df["median_age"].isnull().sum()

155


0

# 人間開発指数(HDI)  

In [12]:
# Load the HDI table, keep the rank, country name and the 2018 value,
# and rename those three columns to the names used in the rest of the notebook.
human_development_index_df = (
    pd.read_csv("../../data/human_development_index.csv", engine='python')
    [["HDI Rank", "Country", "2018"]]
    .rename(columns={"HDI Rank": "HDI_Rank", "Country": "country", "2018": "HDI"})
)
print(human_development_index_df.shape)
human_development_index_df.head(n=5)

(206, 3)


Unnamed: 0,HDI_Rank,country,HDI
0,169,Afghanistan,0.509
1,69,Albania,0.792
2,91,Algeria,0.746
3,36,Andorra,0.867
4,148,Angola,0.582


In [13]:
# Remove leading whitespace from the country names so they can be matched by name.
human_development_index_df = human_development_index_df.assign(
    country=human_development_index_df["country"].str.lstrip()
)

In [14]:
# Trial join into a scratch frame to list the countries whose names
# have no match in the HDI table (HDI is null after the left join).
_df = df.merge(
    human_development_index_df[["country", "HDI"]],
    how="left",
    on="country",
)
_df.loc[_df["HDI"].isnull(), ["country", "HDI"]]

Unnamed: 0,country,HDI
17,Bolivia,
21,Brunei,
28,Cape Verde,
35,Ivory Coast,
38,Czech Republic,
39,Democratic Republic of the Congo,
66,Iran,
78,Laos,
93,Moldova,
106,Macedonia,


In [15]:
# Map the country spellings used in the HDI table onto the spellings used in df,
# so that the two frames can be joined on "country".
# Laos was previously left unmatched (blank placeholder key). It is mapped below
# from "Lao People's Democratic Republic".
# NOTE(review): that is the UNDP spelling — confirm it matches the CSV; if the
# key is absent, replace() simply leaves the data unchanged.
change_name={
    # name in human_development_index_df : name in df
    "Bolivia (Plurinational State of)":"Bolivia",
    "Brunei Darussalam":"Brunei",
    "Cabo Verde":"Cape Verde",
    # NOTE(review): this key contains a replacement character, presumably because
    # the CSV was decoded with a mismatched encoding; it is kept exactly as-is so
    # it still matches the loaded data — confirm before changing the read encoding.
    "C�te d'Ivoire":"Ivory Coast",
    "Czechia":"Czech Republic",
    "Congo (Democratic Republic of the)":"Democratic Republic of the Congo",
    "Iran (Islamic Republic of)":"Iran",
    "Lao People's Democratic Republic":"Laos",
    "Moldova (Republic of)":"Moldova",
    "North Macedonia":"Macedonia",
    "Russian Federation":"Russia",
    "Korea (Republic of)":"South Korea",
    "Trinidad and Tobago":"Tobago",
    "United Kingdom":"UK",
    "United States":"USA",
    "Tanzania (United Republic of)":"Tanzania",
    "Venezuela (Bolivarian Republic of)":"Venezuela",
    "Viet Nam":"Vietnam"
}
# Exact-value replacement; names not listed above are left untouched.
human_development_index_df["country"] = human_development_index_df["country"].replace(change_name)

In [16]:
# Attach the HDI value to df by country name (left join keeps every row of df).
df = df.merge(
    human_development_index_df[["country", "HDI"]],
    how="left",
    on="country",
)

# 人口密度　　

In [17]:
# Load the population-density table, keep the name, code and the 2017/2018 values,
# and rename those columns to the names used in the rest of the notebook.
population_density = (
    pd.read_csv("../../data/population_density.csv")
    [["Country Name", "Country Code", "2017", "2018"]]
    .rename(columns={
        "Country Name": "country",
        "Country Code": "code3",
        "2017": "population_density_2017",
        "2018": "population_density_2018",
    })
)
# Prefer the 2018 value and fall back to 2017 where 2018 is missing.
population_density["population_density"] = (
    population_density["population_density_2018"]
    .combine_first(population_density["population_density_2017"])
)
population_density.head(n=2)

Unnamed: 0,country,code3,population_density_2017,population_density_2018,population_density
0,Aruba,ABW,585.366667,588.027778,588.027778
1,Afghanistan,AFG,55.595993,56.93776,56.93776


In [18]:
# The base data uses the code BHM, so rewrite BHS to BHM before joining on code3.
population_density["code3"] = population_density["code3"].replace({"BHS": "BHM"})

In [19]:
# Count missing values per column (isna is the same check as isnull).
population_density.isna().sum()

country                    0
code3                      0
population_density_2017    4
population_density_2018    9
population_density         4
dtype: int64

In [20]:
# Attach the population density to df by 3-letter code (left join keeps every row of df).
df = df.merge(
    population_density[["code3", "population_density"]],
    how="left",
    on="code3",
)

In [21]:
# Fill Sudan's population density by hand.
# NOTE(review): this fills EVERY row whose population_density is still missing
# with 836.00, not only Sudan — confirm Sudan is the only such row and that
# 836.00 is the intended value.
df["population_density"] = df["population_density"].fillna(836.00)

In [22]:
# Confirm no population_density values are missing after the manual fill.
df["population_density"].isna().sum()

0

In [24]:
#df.to_csv("../../data/data_covid_fix_name_code_coordinate_ltaly_medianage_hdi_density.csv", index=False)

# 国際インバウンド観光客

In [438]:
# Disabled cell: the loading code is wrapped in a string literal, so no file is
# read and the cell only evaluates to that string. The table displayed under
# this cell presumably comes from an earlier run, when the code was live.
# NOTE(review): the file name contains a space before ".csv" — confirm it
# matches the file on disk before re-enabling.
"""international_inbound_tourists_df = pd.read_csv("../../data/international_inbound_tourists .csv")
international_inbound_tourists_df = international_inbound_tourists_df[["Country Name", "Country Code", "2018"]]
international_inbound_tourists_df.columns= ["country", "code3", "international_inbound_tourists"]
international_inbound_tourists_df.head()"""

Unnamed: 0,country,code3,international_inbound_tourists
0,Aruba,ABW,1082000.0
1,Afghanistan,AFG,
2,Angola,AGO,218000.0
3,Albania,ALB,5340000.0
4,Andorra,AND,3042000.0


In [439]:
# The base data uses the code BHM, so BHS would be replaced here (currently disabled).
# NOTE(review): the disabled line below edits median_age_df, although this section is about
# international_inbound_tourists_df — looks like a copy-paste slip; confirm before re-enabling.
#median_age_df.loc[median_age_df["code3"]=="BHS", "code3"] = "BHM"

In [440]:
# Disabled cell: the trial merge is wrapped in a string literal, so nothing is
# merged and the cell only evaluates to that string. The null counts displayed
# under this cell presumably come from an earlier run, when the code was live.
"""_df = pd.merge(df, international_inbound_tourists_df[["code3", "international_inbound_tourists"]], on="code3", how="left")
_df.isnull().sum()"""

country                            0
pop                                0
urb                                0
gdp                                0
dist                               0
hf                                 0
pf                                 0
ef                                 0
date_first                         0
detection                          0
status                             0
cumul                              0
air                                5
code3                              0
code2                              1
lat                                0
lon                                0
dist_italy                         0
median_age                         0
HDI                                1
population_density                 0
international_inbound_tourists    27
dtype: int64

In [441]:
#_df[_df["international_inbound_tourists"].isnull()][["country", "code3", "international_inbound_tourists"]]

Unnamed: 0,country,code3,international_inbound_tourists
8,Bahamas,BHM,
10,Bangladesh,BGD,
19,Botswana,BWA,
24,Burundi,BDI,
26,Cameroon,CMR,
29,Central African Republic,CAF,
30,Chad,TCD,
39,Democratic Republic of the Congo,COD,
50,Gabon,GAB,
54,Ghana,GHA,
