# Show Country Change Rates between 2019 and 2020

In [None]:
import pandas as pd

# Collapse each yearly table to one row per country (iso_country) and export it.
# numeric_only=True limits the sum to numeric columns: the tables also carry text
# columns such as icao_code, which pandas >= 2.0 would otherwise string-concatenate
# (or reject with a TypeError) instead of silently dropping.

# 2019
table_2019 = pd.read_csv("2019_table_calculate.csv")
group_2019 = table_2019.groupby(["iso_country"]).sum(numeric_only=True)
group_2019.to_csv("2019_table_calculate_country.csv")  # 143 countries

# 2020
table_2020 = pd.read_csv("2020_table_calculate.csv")
group_2020 = table_2020.groupby(["iso_country"]).sum(numeric_only=True)
group_2020.to_csv("2020_table_calculate_country.csv")  # 143 countries

In [None]:
import pandas as pd

# Load the per-country totals of both years, keeping only the country key and
# sum_dn, renamed so the two years can sit side by side after the merge.
group_2019 = pd.read_csv("2019_table_calculate_country.csv")
df_left = (
    group_2019
    .loc[:, ["iso_country", "sum_dn"]]
    .rename(columns={"sum_dn": "2019_sum_dn"})
)

group_2020 = pd.read_csv("2020_table_calculate_country.csv")
df_right = (
    group_2020
    .loc[:, ["iso_country", "sum_dn"]]
    .rename(columns={"sum_dn": "2020_sum_dn"})
)

# Default (inner) merge: only countries present in both years are kept.
df_merge = df_left.merge(df_right, on="iso_country")

# Relative change of 2020 with respect to 2019: (2020 - 2019) / 2019
df_merge["country_change_rate"] = (
    df_merge["2020_sum_dn"]
    .sub(df_merge["2019_sum_dn"])
    .div(df_merge["2019_sum_dn"])
)

df_merge.to_csv("country_change_rate.csv", index=False)

# KMeans with 4 Clusters

In [None]:
import pandas as pd

# adjustable item 1: csv
yearlyTableDf = pd.read_csv("2019_table_calculate.csv")

# Keep a row as soon as any one of the listed columns is non-zero;
# rows where all of them equal 0 are dropped.
nonZeroCheckColumns = ["domestic", "international", "day", "night", "long", "medium", "short"]
hasNonZeroValue = (yearlyTableDf[nonZeroCheckColumns] != 0).any(axis=1)
yearlyTableDf = yearlyTableDf[hasNonZeroValue]

print(yearlyTableDf.shape)
yearlyTableDf.head()

In [None]:
# Display pandas' default summary statistics for the filtered table
yearlyTableDf.describe()

In [None]:
from sklearn import preprocessing
from sklearn.cluster import KMeans
import numpy as np

# adjustable item 2: selected fields for clustering
selectedFields = ["domestic", "international", "day", "night", "long", "medium", "short"]

# Standardize the selected fields before clustering
scaler = preprocessing.StandardScaler().fit(yearlyTableDf[selectedFields])
standardizedVariables = scaler.transform(yearlyTableDf[selectedFields])

# standardizedVariables2 starts as the standardized rows and gets the cluster
# centers of every fitted model appended below, so that rows and centers can be
# projected together afterwards.
standardizedVariables2 = standardizedVariables

# adjustable item 3: number of clusters (inclusive range)
minNumCluster = 4
maxNumCluster = 4

for i in range(minNumCluster, maxNumCluster + 1):
    # n_init is pinned to 10 (the value used for init="random" both before and
    # after scikit-learn changed its default to "auto"), so results stay the same
    # across versions and no FutureWarning is raised.
    mdl1 = KMeans(n_clusters=i, init="random", n_init=10, random_state=0)
    mdl1.fit(standardizedVariables)
    # One label column per cluster count, e.g. "Cluster(n=4)"
    yearlyTableDf["Cluster(n=" + str(i) + ")"] = mdl1.labels_
    standardizedVariables2 = np.vstack((standardizedVariables2, mdl1.cluster_centers_))

yearlyTableDf.head()

In [None]:
from sklearn.metrics import silhouette_score

# For each cluster count that was fitted, print the row count of every group
# followed by the average silhouette score of that clustering.
for numClusters in range(minNumCluster, maxNumCluster + 1):
    labelColumn = "Cluster(n={})".format(numClusters)
    clusterLabel = yearlyTableDf[labelColumn]
    silhouette_avg = silhouette_score(standardizedVariables, clusterLabel)
    for groupId in range(numClusters):
        groupRowCount = yearlyTableDf[clusterLabel == groupId].shape[0]
        print("Cluster(n={}) Group{} Row Counts: {}".format(numClusters, groupId, groupRowCount))
    print("For Cluster(n={}), the Average Silhouette Score:".format(numClusters), silhouette_avg)
    print("----------------------------------------------------------------------------------------------------")

In [None]:
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

# Project the standardized rows (with the cluster centers appended after them)
# onto two principal components so they can be drawn in one 2-D scatter plot.
mdl2 = PCA(n_components=2)
mdl2.fit(standardizedVariables2)
standardizedVariablesPCA = mdl2.transform(standardizedVariables2)

# The first numRows projected rows are the data points; centers follow.
numRows = yearlyTableDf.shape[0]
pointsX = standardizedVariablesPCA[:numRows, 0]
pointsY = standardizedVariablesPCA[:numRows, 1]

# k is the row offset at which the centers of the current cluster count start
k = numRows
for numClusters in range(minNumCluster, maxNumCluster + 1):
    clusterLabel = yearlyTableDf["Cluster(n={})".format(numClusters)]
    centers = standardizedVariablesPCA[k:(k + numClusters)]

    plt.title("Cluster(n={})".format(numClusters))
    # Small markers: data points coloured by cluster label
    plt.scatter(pointsX, pointsY, c=clusterLabel, s=10, cmap="viridis")
    # Large translucent markers: the projected cluster centers
    plt.scatter(centers[:, 0], centers[:, 1], c=sorted(clusterLabel.unique()), s=150, alpha=0.5, cmap="viridis")
    plt.show()

    # Advance past this model's centers
    k += numClusters
    silhouette_avg = silhouette_score(standardizedVariables, clusterLabel)
    print("For Cluster(n={}), the Average Silhouette Score:".format(numClusters), silhouette_avg, "\n")

# Share of variance captured by each component, and the absolute component weights
print(mdl2.explained_variance_ratio_)
print(abs(mdl2.components_))

In [None]:
# Export the filtered table together with its cluster-label column(s); the index is not written
yearlyTableDf.to_csv("2019_table_calculate_clustering.csv", index=False)

# Show Descriptive Statistics of the 4 Clusters for 2019 and 2020

In [None]:
import pandas as pd

# 2019 table: already carries the cluster labels
clustering_2019 = pd.read_csv("2019_table_calculate_clustering.csv")

# 2020 table: attach the 2019 cluster label to the 2020 rows via icao_code.
# how="right" keeps every icao_code that has a 2019 cluster label, even when
# the 2020 table has no matching row.
table_2020 = pd.read_csv("2020_table_calculate.csv")
df_right = clustering_2019.loc[:, ["icao_code", "Cluster(n=4)"]]
clustering_2020 = table_2020.merge(df_right, on="icao_code", how="right")

In [None]:
# 2019 table group 0
# Summary statistics of the rows labelled cluster 0, limited to the listed columns
summaryColumns = [
    "domestic", "international",
    "day", "night", "long",
    "medium", "short",
    "AVG_TEMP", "AVG_VISIB", "AVG_WDSP", "AVG_MAX", "AVG_MIN", "AVG_PRCP",
    "domestic_ratio", "day_ratio", "long_ratio", "short_ratio",
]
isGroup0 = clustering_2019["Cluster(n=4)"] == 0
clustering_2019[isGroup0].describe()[summaryColumns]

In [None]:
# 2019 table group 1
# Summary statistics of the rows labelled cluster 1, limited to the listed columns
summaryColumns = [
    "domestic", "international",
    "day", "night", "long",
    "medium", "short",
    "AVG_TEMP", "AVG_VISIB", "AVG_WDSP", "AVG_MAX", "AVG_MIN", "AVG_PRCP",
    "domestic_ratio", "day_ratio", "long_ratio", "short_ratio",
]
isGroup1 = clustering_2019["Cluster(n=4)"] == 1
clustering_2019[isGroup1].describe()[summaryColumns]

In [None]:
# 2019 table group 2
# Summary statistics of the rows labelled cluster 2, limited to the listed columns
summaryColumns = [
    "domestic", "international",
    "day", "night", "long",
    "medium", "short",
    "AVG_TEMP", "AVG_VISIB", "AVG_WDSP", "AVG_MAX", "AVG_MIN", "AVG_PRCP",
    "domestic_ratio", "day_ratio", "long_ratio", "short_ratio",
]
isGroup2 = clustering_2019["Cluster(n=4)"] == 2
clustering_2019[isGroup2].describe()[summaryColumns]

In [None]:
# 2019 table group 3
# Summary statistics of the rows labelled cluster 3, limited to the listed columns
summaryColumns = [
    "domestic", "international",
    "day", "night", "long",
    "medium", "short",
    "AVG_TEMP", "AVG_VISIB", "AVG_WDSP", "AVG_MAX", "AVG_MIN", "AVG_PRCP",
    "domestic_ratio", "day_ratio", "long_ratio", "short_ratio",
]
isGroup3 = clustering_2019["Cluster(n=4)"] == 3
clustering_2019[isGroup3].describe()[summaryColumns]

In [None]:
# 2020 table group 0
# Summary statistics of the 2020 rows whose 2019 cluster label is 0, limited to the listed columns
summaryColumns = [
    "domestic", "international",
    "day", "night", "long",
    "medium", "short",
    "AVG_TEMP", "AVG_VISIB", "AVG_WDSP", "AVG_MAX", "AVG_MIN", "AVG_PRCP",
    "domestic_ratio", "day_ratio", "long_ratio", "short_ratio",
]
isGroup0 = clustering_2020["Cluster(n=4)"] == 0
clustering_2020[isGroup0].describe()[summaryColumns]

In [None]:
# 2020 table group 1
# Summary statistics of the 2020 rows whose 2019 cluster label is 1, limited to the listed columns
summaryColumns = [
    "domestic", "international",
    "day", "night", "long",
    "medium", "short",
    "AVG_TEMP", "AVG_VISIB", "AVG_WDSP", "AVG_MAX", "AVG_MIN", "AVG_PRCP",
    "domestic_ratio", "day_ratio", "long_ratio", "short_ratio",
]
isGroup1 = clustering_2020["Cluster(n=4)"] == 1
clustering_2020[isGroup1].describe()[summaryColumns]

In [None]:
# 2020 table group 2
# Summary statistics of the 2020 rows whose 2019 cluster label is 2, limited to the listed columns
summaryColumns = [
    "domestic", "international",
    "day", "night", "long",
    "medium", "short",
    "AVG_TEMP", "AVG_VISIB", "AVG_WDSP", "AVG_MAX", "AVG_MIN", "AVG_PRCP",
    "domestic_ratio", "day_ratio", "long_ratio", "short_ratio",
]
isGroup2 = clustering_2020["Cluster(n=4)"] == 2
clustering_2020[isGroup2].describe()[summaryColumns]

In [None]:
# 2020 table group 3
# Summary statistics of the 2020 rows whose 2019 cluster label is 3, limited to the listed columns
summaryColumns = [
    "domestic", "international",
    "day", "night", "long",
    "medium", "short",
    "AVG_TEMP", "AVG_VISIB", "AVG_WDSP", "AVG_MAX", "AVG_MIN", "AVG_PRCP",
    "domestic_ratio", "day_ratio", "long_ratio", "short_ratio",
]
isGroup3 = clustering_2020["Cluster(n=4)"] == 3
clustering_2020[isGroup3].describe()[summaryColumns]

# Show Airport Change Rates between 2019 and 2020

In [None]:
import pandas as pd

# 2019 side: airport code, cluster label and sum_dn (renamed to carry the year)
clustering_2019 = pd.read_csv("2019_table_calculate_clustering.csv")
df_left = (
    clustering_2019
    .loc[:, ["icao_code", "Cluster(n=4)", "sum_dn"]]
    .rename(columns={"sum_dn": "2019_sum_dn"})
)

# 2020 side: airport code and sum_dn (renamed to carry the year)
table_2020 = pd.read_csv("2020_table_calculate.csv")
df_right = (
    table_2020
    .loc[:, ["icao_code", "sum_dn"]]
    .rename(columns={"sum_dn": "2020_sum_dn"})
)

# Left merge: every clustered 2019 row is kept; rows without a 2020 match get
# a missing 2020_sum_dn and therefore a missing change rate.
df_merge = df_left.merge(df_right, on="icao_code", how="left")

# Relative change of 2020 with respect to 2019: (2020 - 2019) / 2019
df_merge["airport_change_rate"] = (
    df_merge["2020_sum_dn"]
    .sub(df_merge["2019_sum_dn"])
    .div(df_merge["2019_sum_dn"])
)

df_merge.to_csv("airport_change_rate.csv", index=False)