In [2]:
import pandas as pd
import numpy as np
import geopandas as gpd

In [3]:
# Build a 2012-2022 table per location from data/poor.csv: observed 2015-2021 values,
# a linear back-extrapolation for 2012-2014 and a one-step linear projection for 2022.
# NOTE(review): the values are presumably poverty rates per commune — confirm units.
# NOTE(review): linear extrapolation can produce negative values (the recorded output
# shows a negative y2022); they are left unclipped here — confirm whether a floor at 0
# is wanted.

# Load the data from the CSV file
df_sheet4 = pd.read_csv('data/poor.csv')

# Prefix purely numeric column labels with 'y' (e.g. 2015 -> 'y2015'); other labels are kept
df_sheet4_renamed = df_sheet4.rename(columns=lambda x: 'y' + str(x) if str(x).isdigit() else x)

# Observed years used for the trend fit
years_columns_renamed = ['y' + str(year) for year in range(2015, 2022)]
years_2015_to_2021 = df_sheet4_renamed[years_columns_renamed]
year_nums = np.arange(2015, 2022)

# Least-squares slope for every location in one call: np.polyfit accepts a 2-D y with one
# data set per column, so the observations are transposed to shape (years, locations).
# Coefficients come back highest power first, hence row 0 holds the slopes.
observed_values = years_2015_to_2021.to_numpy(dtype=float)
fitted_slopes = np.polyfit(year_nums, observed_values.T, 1)[0]

# Small random variation on each slope (set SLOPE_JITTER = 0 to disable it).
# The generator is seeded so that Restart & Run All reproduces the same table.
SLOPE_JITTER = 0.05
JITTER_SEED = 42
jitter_rng = np.random.default_rng(JITTER_SEED)
slope_jitter = jitter_rng.uniform(-SLOPE_JITTER, SLOPE_JITTER, size=fitted_slopes.shape)
slopes = (fitted_slopes + slope_jitter).tolist()

# Back-extrapolate 2012-2014 from the observed 2015 value along each location's slope
slope_arr = np.asarray(slopes, dtype=float)
base_2015 = years_2015_to_2021['y2015'].to_numpy(dtype=float)
generated_data = {
    f'y{year}': (base_2015 + slope_arr * (year - 2015)).tolist()
    for year in range(2012, 2015)
}

# Share the observed frame's index so the column-wise concat below aligns row for row
generated_df = pd.DataFrame(generated_data, index=years_2015_to_2021.index).round(2)

# Combine the generated 2012-2014 columns with the observed 2015-2021 columns
full_df = pd.concat([generated_df, years_2015_to_2021], axis=1)

# Put the location name back as the first column
full_df.insert(0, 'Addr_Comm', df_sheet4_renamed['Addr_Comm'])

# Project 2022 as the last observed value plus one year of slope (positional, row for row)
last_known_year = 'y2021'
full_df['y2022'] = full_df[last_known_year].to_numpy(dtype=float) + slope_arr

# Round every year column (2012-2022) to 2 decimal places in a single pass
year_columns = [f'y{year}' for year in range(2012, 2023)]  # Including years 2012 to 2022
full_df[year_columns] = full_df[year_columns].round(2)

# Optionally display the DataFrame to check the results
print(full_df.head())


         Addr_Comm  y2012  y2013  y2014  y2015  y2016  y2017  y2018  y2019  \
0  Phường  Hạ Long   2.01   1.87   1.73   1.59   1.58   1.33   1.25   1.03   
1  Phường Bà Triệu   2.91   2.82   2.74   2.66   2.39   2.22   2.52   2.57   
2   Phường Cửa Bắc   2.19   2.09   1.99   1.89   1.74   1.68   1.74   1.57   
3   Phường Cửa Nam   2.63   2.51   2.39   2.26   1.70   1.42   1.56   1.49   
4    Phường Lộc Hạ   3.76   3.54   3.31   3.09   2.73   2.73   2.90   2.37   

   y2020  y2021  y2022  
0   0.83   0.59   0.45  
1   1.91   1.92   1.84  
2   1.42   1.33   1.23  
3   1.43   1.39   1.26  
4   1.81   1.61   1.39  


In [5]:
# Load the per-commune population table; the frame displayed below shows an Addr_Comm
# column, yearly columns 2012-2021 and an OBJECTID column.
pop = pd.read_csv('data/poptemp.csv')

In [8]:
# Show the loaded population table for a visual check
pop

Unnamed: 0,Addr_Comm,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,OBJECTID
0,PHƯỜNG HẠ LONG,15568,15761,15948,14830,14760,14689,14615,14598,14653,15010,219
1,PHƯỜNG BÀ TRIỆU,6597,6755,6899,5009,4939,4857,4791,4779,4761,4987,226
2,PHƯỜNG CỬA BẮC,14205,14360,14501,12970,12900,12829,12733,12727,12717,12925,224
3,PHƯỜNG CỬA NAM,6123,6158,6188,6010,5940,5735,5739,5719,5722,5970,238
4,PHƯỜNG LỘC HẠ,14138,13940,13742,12949,12849,12865,12827,12782,12845,13244,236
...,...,...,...,...,...,...,...,...,...,...,...,...
221,XÃ YÊN THÀNH,3255,3260,3266,3292,3301,3356,3362,3366,3364,3447,37
222,XÃ YÊN THỌ,6693,6696,6708,6751,6783,6754,6885,6978,6975,7164,40
223,XÃ YÊN TIẾN,11803,11811,11831,11925,11986,12085,12203,12284,12281,12616,57
224,XÃ YÊN TRỊ,11552,11624,11641,11852,11805,11746,11781,11802,11800,12106,66


In [9]:
# Project the 2022 population of every row from a straight-line fit over 2012-2021.

# Labels of the observed year columns and the matching slice of the table
years_columns = [str(label) for label in range(2012, 2022)]
year_data = pop[years_columns]

# Numeric x-axis for the regression (the column labels as integers)
years = np.array([int(label) for label in years_columns])

# One first-degree least-squares fit per row; only the slope (first coefficient) is kept
slopes = [np.polyfit(years, observed, 1)[0] for _, observed in year_data.iterrows()]

# 2022 = last observed year (2021) plus one year of trend, rounded to a whole number.
# The row label is used to look up the slope, exactly as the row order was produced above.
projected_2022 = [record['2021'] + slopes[label] for label, record in pop.iterrows()]
pop['2022'] = projected_2022
pop['2022'] = pop['2022'].round()

# Print the table including the new 2022 column
print(pop)

           Addr_Comm   2012   2013   2014   2015   2016   2017   2018   2019  \
0     PHƯỜNG HẠ LONG  15568  15761  15948  14830  14760  14689  14615  14598   
1    PHƯỜNG BÀ TRIỆU   6597   6755   6899   5009   4939   4857   4791   4779   
2     PHƯỜNG CỬA BẮC  14205  14360  14501  12970  12900  12829  12733  12727   
3     PHƯỜNG CỬA NAM   6123   6158   6188   6010   5940   5735   5739   5719   
4      PHƯỜNG LỘC HẠ  14138  13940  13742  12949  12849  12865  12827  12782   
..               ...    ...    ...    ...    ...    ...    ...    ...    ...   
221     XÃ YÊN THÀNH   3255   3260   3266   3292   3301   3356   3362   3366   
222       XÃ YÊN THỌ   6693   6696   6708   6751   6783   6754   6885   6978   
223      XÃ YÊN TIẾN  11803  11811  11831  11925  11986  12085  12203  12284   
224       XÃ YÊN TRỊ  11552  11624  11641  11852  11805  11746  11781  11802   
225     XÃ YÊN TRUNG   5719   5721   5731   5552   5503   5563   5524   5537   

      2020   2021  OBJECTID     2022  


In [10]:
# Write the population table (with whatever columns `pop` holds at this point) to disk
# without the index column. 'utf-8-sig' is UTF-8 with a leading BOM — presumably chosen
# so spreadsheet tools detect the encoding of the accented names; confirm.
pop.to_csv('data/pop_final.csv', encoding='utf-8-sig', index=False)

In [12]:
# Load the commune layer with geopandas; later cells read its OBJECTID, danhTuChun
# and diaDanh columns.
cm = gpd.read_file('data/map/commune.json')

In [16]:
# Build the join key "<danhTuChun> <diaDanh>" in upper case (e.g. 'XÃ HẢI HÒA')
# and store it on the commune layer.
cm['Addr_Comm'] = (cm['danhTuChun'] + ' ' + cm['diaDanh']).str.upper()

# Two-column lookup used to attach OBJECTID to tables keyed by commune name
comm_name = cm[['OBJECTID', 'Addr_Comm']]

In [72]:
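# Disabled step: this appears to be how OBJECTID was attached to the population table
# before it was saved as poptemp.csv.
# NOTE(review): it writes 'poptemp.csv' to the working directory, whereas the load cell
# reads 'data/poptemp.csv' — confirm the intended path before re-enabling.
# opop = pd.merge(pop, comm_name, on='Addr_Comm', how='left')
# opop.to_csv('poptemp.csv', encoding='utf-8-sig', index = False)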

In [21]:
# melted_df = pd.melt(pop, id_vars=[''], var_name='YEAR', value_name='POP')
# melted_df['YEAR'] = melted_df['YEAR'].astype(int)
# melted_poor = pd.melt(full_df, id_vars=['diaDanh'], var_name='YEAR', value_name='POOR')
# # Removing 'y' from 'YEAR' and converting to int
# melted_poor['YEAR'] = melted_poor['YEAR'].str.replace('y', '').astype(int)
# melted_poor.head()
# melted_poor['diaDanh'] = melted_poor['diaDanh'].str.replace(' ','')

In [13]:
# Normalise the commune names so they can be matched against upper-cased keys:
# upper-case them and collapse every run of two or more spaces into a single space
# (e.g. 'Phường  Hạ Long' -> 'PHƯỜNG HẠ LONG'). A literal two-space replace would leave
# a double space behind whenever three or more spaces occur, so an explicit regex is used.
full_df['Addr_Comm'] = (
    full_df['Addr_Comm']
    .str.upper()
    .str.replace(r' {2,}', ' ', regex=True)
)

In [14]:
# Show the table after the Addr_Comm normalisation for a visual check
full_df

Unnamed: 0,Addr_Comm,y2012,y2013,y2014,y2015,y2016,y2017,y2018,y2019,y2020,y2021,y2022
0,PHƯỜNG HẠ LONG,2.01,1.87,1.73,1.59,1.58,1.33,1.25,1.03,0.83,0.59,0.45
1,PHƯỜNG BÀ TRIỆU,2.91,2.82,2.74,2.66,2.39,2.22,2.52,2.57,1.91,1.92,1.84
2,PHƯỜNG CỬA BẮC,2.19,2.09,1.99,1.89,1.74,1.68,1.74,1.57,1.42,1.33,1.23
3,PHƯỜNG CỬA NAM,2.63,2.51,2.39,2.26,1.70,1.42,1.56,1.49,1.43,1.39,1.26
4,PHƯỜNG LỘC HẠ,3.76,3.54,3.31,3.09,2.73,2.73,2.90,2.37,1.81,1.61,1.39
...,...,...,...,...,...,...,...,...,...,...,...,...
221,XÃ YÊN THÀNH,7.00,6.13,5.26,4.40,4.01,3.27,1.41,0.76,0.25,0.08,-0.78
222,XÃ YÊN THỌ,5.90,5.39,4.88,4.37,4.26,4.00,3.27,2.88,1.37,1.80,1.29
223,XÃ YÊN TIẾN,7.89,7.09,6.29,5.48,4.95,4.26,2.02,1.28,0.66,1.71,0.91
224,XÃ YÊN TRỊ,12.19,11.01,9.83,8.65,7.34,3.93,2.80,2.22,1.27,2.67,1.49


In [17]:
# Attach OBJECTID to every row of full_df by commune name (left join keeps all full_df rows)
opoor = pd.merge(full_df, comm_name, on='Addr_Comm', how='left')

# A name-only left join fails silently: names with no counterpart get a missing OBJECTID,
# and names that occur more than once in comm_name duplicate rows. Report both cases
# (without raising) so they do not slip unnoticed into downstream files.
_unmatched_names = opoor.loc[opoor['OBJECTID'].isna(), 'Addr_Comm'].unique()
if len(_unmatched_names):
    print(f'WARNING: {len(_unmatched_names)} commune name(s) without OBJECTID: {list(_unmatched_names)}')
if len(opoor) != len(full_df):
    print(
        f'WARNING: merge changed the row count from {len(full_df)} to {len(opoor)}; '
        'Addr_Comm is not unique in comm_name'
    )

In [20]:
# opoor.to_csv('data/poor_final.csv', index=False, encoding='utf-8-sig')

In [23]:
# cm[cm['diaDanh']=="Hải Hòa"]

Unnamed: 0,OBJECTID,maDonViHan,danhTuChun,diaDanh,Shape_Leng,Shape_Area,MaTinh,TenTinh,MaHuyen,TenHuyen,geometry,Addr_Comm
217,209,14317,Xã,Hải Hòa,15575.023856,8464467.0,36,Tỉnh Nam Định,366,Huyện Hải Hậu,"POLYGON ((106.25649 20.09625, 106.25641 20.096...",XÃ HẢI HÒA
