In [6]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression

# Load dataset
df_pop = pd.read_csv("datasets/PopulationData.csv")
print(df_pop.info())
print(df_pop.head())
# Convert 'Year' to numeric and sort values
df_pop['Year'] = pd.to_numeric(df_pop['Year'], errors='coerce')
df_pop = df_pop.sort_values(by=['Country', 'Year']).reset_index(drop=True)

# Step 1: Create a complete year range for each country
all_years = range(df_pop['Year'].min(), 2024)  # Up to 2023

def fill_missing_years(group):
    country_name = group['Country'].iloc[0]
    full_range = pd.DataFrame({'Year': all_years, 'Country': country_name})
    return pd.merge(full_range, group, on=['Year', 'Country'], how='left')

df_pop = df_pop.groupby('Country', group_keys=False).apply(fill_missing_years)

# Step 2: Interpolate missing population values
df_pop['Population'] = df_pop.groupby('Country')['Population'].transform(lambda x: x.interpolate())

# Step 3: Extrapolate population for the last few years (if needed)
def extrapolate_population(group):
    X = group.dropna()['Year'].values.reshape(-1, 1)
    y = group.dropna()['Population'].values

    model = LinearRegression()
    model.fit(X, y)

    missing_years = group[group['Population'].isna()]['Year'].values.reshape(-1, 1)
    if len(missing_years) > 0:
        group.loc[group['Population'].isna(), 'Population'] = model.predict(missing_years)

    return group

df_pop = df_pop.groupby('Country', group_keys=False).apply(extrapolate_population)

# Save the updated dataset
df_pop.to_csv("Processed_PopulationData.csv", index=False)

print(df_pop.info())
print(df_pop.head())# Verify the final results


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2500 entries, 0 to 2499
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Country     2500 non-null   object 
 1   Year        2500 non-null   int64  
 2   Population  2500 non-null   float64
dtypes: float64(1), int64(1), object(1)
memory usage: 58.7+ KB
None
       Country  Year  Population
0  Afghanistan  1975  12185168.7
1  Afghanistan  1980  12516846.7
2  Afghanistan  1985  10548339.5
3  Afghanistan  1990  10733987.7
4  Afghanistan  1995  16453396.5


  df_pop = df_pop.groupby('Country', group_keys=False).apply(fill_missing_years)


<class 'pandas.core.frame.DataFrame'>
Index: 12250 entries, 0 to 48
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Year        12250 non-null  int64  
 1   Country     12250 non-null  object 
 2   Population  12250 non-null  float64
dtypes: float64(1), int64(1), object(1)
memory usage: 382.8+ KB
None
   Year      Country  Population
0  1975  Afghanistan  12185168.7
1  1976  Afghanistan  12251504.3
2  1977  Afghanistan  12317839.9
3  1978  Afghanistan  12384175.5
4  1979  Afghanistan  12450511.1


  df_pop = df_pop.groupby('Country', group_keys=False).apply(extrapolate_population)
