In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the poverty table. The code below requires a 'diaDanh' column (location
# name) and one column for each year 2015-2021, labelled '2015' or 'y2015'.
df_sheet4 = pd.read_csv('data/poor.csv')

# Prefix all-digit column labels with 'y' ('2015' -> 'y2015'); every other
# label (including ones that already start with 'y') is left unchanged.
df_sheet4_renamed = df_sheet4.rename(columns=lambda x: 'y' + str(x) if str(x).isdigit() else x)

# Observed years (2015-2021 inclusive) that the linear trend is fitted on.
years_columns_renamed = ['y' + str(year) for year in range(2015, 2022)]
years_2015_to_2021 = df_sheet4_renamed[years_columns_renamed]

# Per-location trend: least-squares slope over the observed years plus a small
# uniform jitter in [-0.05, 0.05). The jitter comes from a fixed-seed generator
# so that "Restart Kernel -> Run All" reproduces the same synthetic values.
year_nums = np.arange(2015, 2022)
jitter_rng = np.random.default_rng(42)
slopes = []
for _, row in years_2015_to_2021.iterrows():
    # np.polyfit returns coefficients highest degree first, so [0] is the slope;
    # the intercept is not needed because the back-cast is anchored on y2015.
    fitted_slope = np.polyfit(year_nums, row.to_numpy(dtype=float), 1)[0]
    slopes.append(fitted_slope + jitter_rng.uniform(-0.05, 0.05))

# Back-cast 2012-2014 for each location from ITS OWN 2015 observation:
#     value(year) = y2015 + slope * (year - 2015)
# (Previously every location was anchored on the first row's 2015 value and the
# offset was measured from 2016, which shifted all generated values.)
base_2015 = years_2015_to_2021['y2015'].to_numpy(dtype=float)
slope_values = np.asarray(slopes, dtype=float)
generated_data = {
    f'y{year}': (base_2015 + slope_values * (year - 2015)).tolist()
    for year in range(2012, 2015)
}

# Reuse the source index so the column-wise concat below aligns row-for-row
# even when the input frame does not carry a default RangeIndex.
generated_df = pd.DataFrame(generated_data, index=years_2015_to_2021.index).round(2)

# Generated 2012-2014 columns followed by the observed 2015-2021 columns.
full_df = pd.concat([generated_df, years_2015_to_2021], axis=1)

# Put the location name back as the first column.
full_df.insert(0, 'diaDanh', df_sheet4_renamed['diaDanh'])

# Round every numeric column to 2 decimals; DataFrame.round leaves the
# non-numeric 'diaDanh' column untouched.
full_df_rounded = full_df.round(2)


In [None]:
# Completeness check: number of non-null entries in each column.
full_df_rounded.notna().sum()

In [None]:
# Population table in wide form; the cells below expect a 'diaDanh' column and
# treat every other column label as a year.
pop = pd.read_csv(filepath_or_buffer='data/pop.csv')

In [None]:
# Wide -> long: one row per (diaDanh, YEAR) pair with the population in 'POP'.
melted_df = pop.melt(
    id_vars=['diaDanh'],
    var_name='YEAR',
    value_name='POP',
)

In [None]:
# Drop every space from the location names so they can serve as a join key.
melted_df = melted_df.assign(
    diaDanh=melted_df['diaDanh'].str.replace(' ', '')
)

In [None]:
# Year labels came from column headers; convert them to integers.
melted_df = melted_df.astype({'YEAR': int})

In [None]:
# Wide -> long for the poverty table, then turn the 'y2012'-style labels into
# integer years by dropping the 'y' and casting.
# NOTE(review): 'POOOR' looks like a typo for 'POOR'; it is kept as-is because
# other code may reference the column by this name.
melted_poor = (
    full_df_rounded
    .melt(id_vars=['diaDanh'], var_name='YEAR', value_name='POOOR')
    .assign(YEAR=lambda long_df: long_df['YEAR'].str.replace('y', '').astype(int))
)

melted_poor.head()

In [None]:
# Drop every space from the location names so they can serve as a join key.
melted_poor = melted_poor.assign(
    diaDanh=melted_poor['diaDanh'].str.replace(' ', '')
)

In [None]:
# Completeness check: number of non-null entries in each column.
melted_poor.notna().sum()

In [None]:
# Join population and poverty figures on location and year. This is an inner
# join (the pandas default), so only keys present in both tables are kept.
df_merged = melted_df.merge(
    melted_poor,
    how='inner',
    on=['diaDanh', 'YEAR'],
)