In [None]:
import os, itertools, csv, ast
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE
from tqdm import tqdm

In [None]:
# Folder and file names of the two migration tables.
dir = "Helge_code"
inMigration_file = "Sudan_states_inmigration_2023-10-12.csv"
outMigration_file = "Sudan_states_outmigration_2023-10-12.csv"

in_path = os.path.join(dir, inMigration_file)
out_path = os.path.join(dir, outMigration_file)

# Give both tables a common "location" column: the in-migration file's "to"
# column and the out-migration file's "from" column are renamed to it.
with open(in_path, "r") as in_handle, \
        open(out_path, "r") as out_handle:
    in_m = pd.read_csv(in_handle).rename(columns={"to": "location"})
    out_m = pd.read_csv(out_handle).rename(columns={"from": "location"})

# Convert the "date_later" column of each table to datetime values.
for table in (in_m, out_m):
    table["date_later"] = pd.to_datetime(table["date_later"])

In [None]:
# Largest shift to try in either direction; only its absolute value is used.
time_delta_dev = 5
# Every integer shift from -|time_delta_dev| to +|time_delta_dev|, inclusive.
time_deltas = list(range(-abs(time_delta_dev), abs(time_delta_dev) + 1))

def time_shift(df, td, date_col = "date_later", val_col = "val", loc_col = "location"):
    """Pair each value with the other columns taken ``td`` time steps away.

    The distinct dates in ``df[date_col]`` are put in ascending order and
    treated as consecutive time steps. Every non-value column of a row is
    re-labelled with the date ``td`` steps later, then joined back onto the
    values by date and location. The value at step ``t`` therefore ends up
    next to the remaining columns from step ``t - td`` (``td`` may be
    negative or zero).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``date_col``, ``val_col`` and ``loc_col``; every other
        column is carried along as a covariate.
    td : int
        Number of time steps to shift by.
    date_col, val_col, loc_col : str
        Names of the date, value and location columns.

    Returns
    -------
    pandas.DataFrame
        ``val_col`` as the first column followed by the remaining columns of
        ``df``; the date and location columns are dropped. Rows whose
        shifted date falls outside the observed dates are discarded, so the
        result may be empty.
    """
    # Sort explicitly: shifting by position is only meaningful when the
    # positions follow chronological order, not the order of appearance.
    dates = pd.Index(df[date_col].dropna().unique()).sort_values()

    left = df[[date_col, val_col, loc_col]].copy()
    right = df[[elem for elem in df.columns if elem != val_col]].copy()

    # Position of each row's date among the sorted dates (-1 for missing).
    pos = dates.get_indexer(right[date_col])
    target = pos + td
    # Drop out-of-range shifts instead of tagging them with a placeholder
    # date, which could collide with a real date in the data.
    valid = (pos >= 0) & (target >= 0) & (target < len(dates))

    right = right.loc[valid].copy()
    right[date_col] = dates.take(target[valid])

    merged = pd.merge(left, right, on = [date_col, loc_col], how = "inner")
    return merged.drop(columns = [date_col, loc_col])

In [None]:
def rm_0_rows(df):
    """Return ``df`` without the rows that are zero in every trailing column.

    "Trailing" means all columns from the third one (position 2) onward; the
    first two columns are ignored by the check.
    """
    # NOTE(review): if any column at position >= 2 is non-numeric (e.g. a
    # text column), it never equals 0 and no row is removed -- confirm the
    # column order of the input tables.
    trailing = df.loc[:, df.columns[2:]]
    is_all_zero = trailing.eq(0).all(axis=1)
    return df.loc[~is_all_zero]

# Manual switch: change the condition to False to keep all-zero rows.
if True:
    in_m, out_m = rm_0_rows(in_m), rm_0_rows(out_m)

In [None]:
# Display the in-migration table as the cell output.
in_m

In [None]:
# Display the out-migration table as the cell output.
out_m

In [222]:
# Covariate columns: everything after the first three columns of in_m.
all_covars = list(in_m.columns[3:])

df_types = [("IN",in_m),("OUT",out_m)]
# "all" is a pseudo-location meaning "do not filter by location".
locations = ["all"] + list(in_m["location"].unique())

model = LinearRegression()

# Largest number of covariates RFE is asked to keep (inclusive).
k_features = 10

# Upper bound on the number of fits: one per (migration table, indicator
# category, location, time shift, number of selected covariates).
length = len(df_types)*3*len(locations)*len(time_deltas)*k_features

cat_corrs = "correlations/cat_corrs.csv"
# Create the output folder up front so open() below cannot fail on it.
os.makedirs(os.path.dirname(cat_corrs), exist_ok=True)

constant_cols = ["val","date_later","location"]

# newline="" is what the csv module asks for (prevents blank rows on
# Windows); using tqdm as a context manager closes the bar even when the
# cell is interrupted.
with open(cat_corrs, "w", newline="") as o, \
        tqdm(total = length, desc = "correlations") as prog_bar:
    writer = csv.writer(o)

    header = ["migration","indicator","location","time_shift (time chunk)",
              "n","var_num","covars","R^2"]
    writer.writerow(header)

    for df_type in df_types:
        name = df_type[0]
        df = df_type[1]

        # Three views of the table: all covariates, those whose name does
        # not contain "sentiment" ("emo"), and those whose name does ("senti").
        df_all = df.copy()
        df_emo = df[constant_cols + [c for c in df.columns[3:] if "sentiment" not in c]].copy()
        df_senti = df[constant_cols + [c for c in df.columns[3:] if "sentiment" in c]].copy()

        for cat, data in zip(["all","emo","senti"],[df_all, df_emo, df_senti]):
            for loc in locations:
                data_loc = data[data["location"]==loc] if loc != "all" else data.copy()

                for td in time_deltas:
                    data_loc_td = time_shift(data_loc, td)
                    if len(data_loc_td) == 0:
                        # Nothing to fit; still advance the bar by the fits
                        # budgeted for this combination.
                        prog_bar.update(k_features)
                        continue

                    # time_shift returns "val" first, covariates after it.
                    X = data_loc_td[data_loc_td.columns[1:]]
                    Y = data_loc_td["val"]

                    # Inclusive upper bound: try 1..k_features covariates,
                    # or 1..all of them when fewer are available.
                    max_vars = min(X.shape[1], k_features)
                    for var_num in range(1, max_vars + 1):
                        rfe = RFE(model, n_features_to_select=var_num)
                        rfe.fit(X,Y)
                        relevant_features = rfe.support_
                        X_fit = X[X.columns[relevant_features]]

                        # Refit on the selected covariates only and score
                        # on the same data (in-sample R^2).
                        model.fit(X_fit,Y)
                        r_squared = round(model.score(X_fit,Y), 4)

                        row = [name,cat,loc,td,len(data_loc_td),var_num,
                               str(list(X_fit.columns)),r_squared]
                        writer.writerow(row)
                        prog_bar.update(1)

                    # Account for the fits that could not be run because
                    # fewer than k_features covariates exist.
                    prog_bar.update(k_features - max_vars)

correlations:   2%|▏         | 199/12540 [00:28<29:29,  6.97it/s]


KeyboardInterrupt: 

In [None]:
# Read the results CSV at `cat_corrs` back into a DataFrame and show it.
with open(cat_corrs, mode="r") as corr_file:
    corrs = pd.read_csv(corr_file)
corrs

In [None]:
# For every (migration, indicator, location) combination keep the time shift
# and covariate list of the row with the highest R^2.
# Resulting layout: best_corrs[migration][indicator][location] = {"td", "cv"}.

# Rows without a usable score are ignored: idxmax() cannot pick a winner
# from a subset whose R^2 values are all NaN.
scored = corrs.dropna(subset=["R^2"])

best_corrs = {}
for io in corrs["migration"].unique():
    best_corrs[io] = {}
    for indic in corrs["indicator"].unique():
        best_corrs[io][indic] = {}
        for loc in corrs["location"].unique():
            subset = scored[(scored["migration"] == io) &
                            (scored["indicator"] == indic) &
                            (scored["location"] == loc)]

            # Combination absent from the results, or never scored.
            if subset.empty:
                continue

            row = subset.loc[subset["R^2"].idxmax()]

            best_corrs[io][indic][loc] = {
                "td":row["time_shift (time chunk)"],
                # "covars" was written as str(list); parse it back safely.
                "cv":ast.literal_eval(row["covars"])
            }

In [None]:
for io in best_corrs:
    for indic in best_corrs[io]:
        for loc in best_corrs[io][indic]:
            print(loc)

            if io == "IN":
                reg_df = in_m.copy()
            else:
                reg_df = out_m.copy()
            
            if loc != "all":
                reg_df = reg_df[reg_df["location"] == loc]
            reg_df = reg_df[best_corrs[io][indic][loc]["cv"]]