In [None]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import seaborn as sns

from matplotlib import pyplot as plt
from pathlib import Path

from autumn.settings import INPUT_DATA_PATH


If you are not in an 'AuTuMN' Python environment, edit and run the cell below to manually set the path to the input files.

In [None]:
# Base path from which the input and output file locations are built.
p = Path(".")
# Manual override for the input data directory: uncomment and edit to match the local machine.
# INPUT_DATA_PATH = p.joinpath('c:\\', "Users","maba0001", "AuTuMN", "data", "inputs")

In [None]:

# Raw mobility extracts, both stored in the "mobility" folder of the input data directory.
fb = p / INPUT_DATA_PATH / "mobility" / "movement-range-2021.txt"
gg = p / INPUT_DATA_PATH / "mobility" / "Google_Mobility_Report.csv"

# Regions for which a separate regression is fitted further down.
REGIONS = ["Hanoi", "Ho Chi Minh City"]


In [None]:
# The Facebook movement-range extract is tab-separated.
fb_mob = pd.read_csv(fb, sep="\t")


In [None]:
# Load the Google mobility report (comma-separated, default parser settings).
gg_mob = pd.read_csv(gg)

Get the Facebook mobility data for the country and regions of interest.

In [None]:
# Keep only the Vietnamese rows. The explicit copy makes the column assignments
# below act on an independent frame instead of a slice of the raw data, which
# would otherwise trigger pandas' SettingWithCopyWarning.
country = fb_mob["country"] == "VNM"
fb_mob = fb_mob[country].copy()

# Literal substring match: with the default regex matching, the "." in the id
# would match any character. Rows without a polygon_id count as non-matching.
fb_hcmc = fb_mob["polygon_id"].str.contains("VNM.25", regex=False, na=False)
fb_hanoi = fb_mob["polygon_id"].str.contains("VNM.27", regex=False, na=False)

# Collapse every matching polygon onto a single region label.
fb_mob.loc[fb_hanoi, "polygon_name"] = "Hanoi"
fb_mob.loc[fb_hcmc, "polygon_name"] = "Ho Chi Minh City"

# Restrict to the two regions, shorten the metric names and keep only the
# columns used downstream. Built as a new frame rather than renamed in place.
fb_mob = (
    fb_mob.loc[fb_hcmc | fb_hanoi]
    .rename(
        columns={
            "all_day_bing_tiles_visited_relative_change": "visited_tiles",
            "all_day_ratio_single_tile_users": "single_tiles",
        }
    )[["ds", "polygon_name", "visited_tiles", "single_tiles"]]
    .copy()
)
fb_mob["ds"] = pd.to_datetime(fb_mob["ds"], format="%Y-%m-%d")


For now, take the unweighted mean. Ideally, a population-weighted average should be used instead.<br>
To implement this, the population of each sub-region (VNM.25, VNM.27) is required.
 

In [None]:
# One row per date and region: average the metrics over all polygons that
# share the same region label.
fb_mob = fb_mob.groupby(["ds", "polygon_name"]).mean().reset_index()

In [None]:
# Google mobility categories, named as they appear once the
# "_percent_change_from_baseline" suffix has been stripped.
MOB_COL = [
    "retail_and_recreation",
    "grocery_and_pharmacy",
    "parks",
    "transit_stations",
    "workplaces",
    "residential",
]

# Columns kept from the Google report: merge keys followed by the categories.
GG_COLS = ["date", "sub_region_1"] + MOB_COL

# Literal, NaN-safe match so that any rows without a sub_region_1 value are
# simply excluded instead of putting missing values into the boolean mask.
gg_hcmc = gg_mob["sub_region_1"].str.contains("Ho Chi Minh City", regex=False, na=False)
gg_hanoi = gg_mob["sub_region_1"].str.contains("Hanoi", regex=False, na=False)

# Build a new, independent frame instead of renaming and assigning in place on
# a filtered slice (which triggers pandas' SettingWithCopyWarning).
gg_mob = (
    gg_mob.loc[gg_hcmc | gg_hanoi]
    .rename(columns=lambda x: str(x).replace("_percent_change_from_baseline", ""))[GG_COLS]
    .copy()
)
gg_mob["date"] = pd.to_datetime(gg_mob["date"], format="%Y-%m-%d")


In [None]:
# Last date (inclusive) of the training period; later dates are predicted.
split_date = "2021-10-20"

In [None]:
# Left join: keep every Facebook row and attach the Google categories where a
# row exists for the same date and region.
df_mob = pd.merge(
    fb_mob,
    gg_mob,
    how="left",
    left_on=["ds", "polygon_name"],
    right_on=["date", "sub_region_1"],
)
# Keep dates from 2021 onwards. The copy makes df_mob an independent frame,
# because later cells assign into it and would otherwise be writing to a
# filtered slice (SettingWithCopyWarning).
df_mob = df_mob[df_mob["ds"] >= "2021-01-01"].copy()

In [None]:
# Pairwise correlation between the two Facebook metrics and the Google categories.
corr_plot = df_mob[["visited_tiles", "single_tiles", *MOB_COL]].corr()
plt.figure(figsize=(5, 5))
hm = sns.heatmap(corr_plot, annot=True)

In [None]:
def get_xs(region, split_date, df):
    """Build the test and train design matrices for one region.

    Rows of ``df`` whose ``polygon_name`` equals ``region`` are split on the
    ``ds`` column: dates up to and including ``split_date`` form the training
    set, later dates form the test set. The predictors are the
    ``visited_tiles`` and ``single_tiles`` columns, with an intercept column
    of ones prepended.

    Parameters
    ----------
    region : str
        Value of ``polygon_name`` to select.
    split_date : str or datetime-like
        Last date (inclusive) of the training period.
    df : pandas.DataFrame
        Frame containing ``polygon_name``, ``ds``, ``visited_tiles`` and
        ``single_tiles``.

    Returns
    -------
    tuple of numpy.ndarray
        ``(x_test, x_train)`` -- note that the test matrix comes first. Each
        array has three columns: intercept, visited_tiles, single_tiles.
    """
    predictors = ["visited_tiles", "single_tiles"]
    in_region = df["polygon_name"] == region

    def _with_intercept(rows):
        values = df.loc[rows, predictors].to_numpy()
        # Prepend the intercept unconditionally. sm.add_constant defaults to
        # has_constant="skip", which silently omits it whenever a predictor
        # column is itself constant (e.g. a single-row test set), leaving the
        # matrix one column short of the fitted model.
        return np.column_stack([np.ones(len(values)), values])

    x_train = _with_intercept(in_region & (df["ds"] <= split_date))
    x_test = _with_intercept(in_region & (df["ds"] > split_date))

    return x_test, x_train

In [None]:
def get_train_y(region, split_date, df, y_col):
    """Return the training-period values of ``y_col`` for one region.

    Selects the rows of ``df`` whose ``polygon_name`` equals ``region`` and
    whose ``ds`` is on or before ``split_date``, and returns the ``y_col``
    column as a two-dimensional array with a single column.
    """
    in_training_set = (df["polygon_name"] == region) & (df["ds"] <= split_date)
    return df.loc[in_training_set, [y_col]].to_numpy()



In [None]:
# For each region, regress every Google mobility category on the two Facebook
# metrics over the training period, then use the fit to fill in the Google
# values that are missing after the split date.
for region in REGIONS:

    x_test, x_train = get_xs(region, split_date, df_mob)

    # The rows get_xs used to build x_test, selected with the same conditions
    # and therefore in the same order as the predictions.
    test_rows = (df_mob["polygon_name"] == region) & (df_mob["ds"] > split_date)

    for mobility_type in MOB_COL:

        y_train = get_train_y(region, split_date, df_mob, mobility_type)

        # Drop training rows that contain NaN so gaps in the data do not turn
        # every coefficient (and hence every prediction) into NaN.
        model = sm.OLS(y_train, x_train, missing="drop")
        results = model.fit()
        print(results.summary())

        y_pred = results.predict(x_test)

        # Fill only values that are missing within the prediction period.
        # Selecting by the split date rather than by isna() alone keeps the
        # number of target rows equal to the number of predictions even when
        # the training period has gaps or some later values were observed.
        fill_rows = test_rows & df_mob[mobility_type].isna()
        df_mob.loc[fill_rows, mobility_type] = y_pred[fill_rows[test_rows].to_numpy()]



In [None]:
# Index by date for plotting, while keeping "ds" available as a regular column.
df_mob = df_mob.set_index("ds", drop=False)

In [None]:
# Titled so this figure can be told apart from the other region's plot; the
# trailing semicolon suppresses the Axes repr in the cell output.
df_mob.loc[df_mob["polygon_name"] == "Hanoi", MOB_COL].plot(figsize=(20, 6), title="Hanoi");

In [None]:
# Titled so this figure can be told apart from the other region's plot; the
# trailing semicolon suppresses the Axes repr in the cell output.
df_mob.loc[df_mob["polygon_name"] == "Ho Chi Minh City", MOB_COL].plot(
    figsize=(20, 6), title="Ho Chi Minh City"
);

Create a CSV file for the input database.

In [None]:
# Rows after the split date, restricted to the keys and the mobility
# categories. Copied so that the renaming and column assignment that follow
# modify an independent frame rather than a slice of df_mob.
input_df = df_mob.loc[df_mob["ds"] > split_date, ["ds", "polygon_name"] + MOB_COL].copy()

In [None]:
# Restore the Google report's column names. An explicit mapping is used so that
# re-running this cell is a no-op; a rule that suffixes every column other than
# the keys would also suffix "date", "sub_region_1" and "country_region" on a
# second run.
input_df = input_df.rename(
    columns={
        **{col: f"{col}_percent_change_from_baseline" for col in MOB_COL},
        "polygon_name": "sub_region_1",
        "ds": "date",
    }
)
input_df["country_region"] = "Vietnam"

In [None]:
# Output location, alongside the raw mobility inputs.
vnm_csv = p / INPUT_DATA_PATH / "mobility" / "VNM_mobility.csv"

In [None]:
# Write the table to disk; index=False leaves the frame's index out of the file.
input_df.to_csv(vnm_csv, index=False)