# Libraries

In [23]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder , PolynomialFeatures
from sklearn.model_selection import train_test_split , cross_val_score ,  GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score , mean_absolute_error
from sklearn.feature_selection import SelectKBest, f_regression

# Load Data

In [24]:
file_path = '../web_download.xlsx'
xls = pd.ExcelFile(file_path)
print(xls.sheet_names)

['readme', 'data']


In [25]:
data = pd.read_excel(file_path, sheet_name='data')
print(data.head())

                                            IND_NAME DIM_GEO_NAME  \
0             Adolescent birth rate (per 1000 women)  Afghanistan   
1             Adolescent birth rate (per 1000 women)  Afghanistan   
2  Age-standardized mortality rate attributed to ...  Afghanistan   
3  Age-standardized prevalence of hypertension am...  Afghanistan   
4  Age-standardized prevalence of obesity among a...  Afghanistan   

               IND_CODE DIM_GEO_CODE  DIM_TIME_YEAR           DIM_1_CODE  \
0        MDG_0000000003          AFG           2021  AGEGROUP_YEARS15-19   
1        MDG_0000000003          AFG           2021  AGEGROUP_YEARS10-14   
2            SDGAIRBODA          AFG           2019             SEX_BTSX   
3  NCD_HYP_PREVALENCE_A          AFG           2019             SEX_BTSX   
4           NCD_BMI_30A          AFG           2022             SEX_BTSX   

   VALUE_NUMERIC VALUE_STRING  \
0      62.000000         62.0   
1      18.000000         18.0   
2     265.664520        265.7

In [26]:
print(data.describe())

       DIM_TIME_YEAR  VALUE_NUMERIC
count   10503.000000   1.050300e+04
mean     2020.753499   4.625948e+05
std         1.641991   1.985762e+07
min      2014.000000   0.000000e+00
25%      2020.000000   5.433967e+00
50%      2021.000000   2.204675e+01
75%      2022.000000   6.490374e+01
max      2023.000000   1.619405e+09


In [27]:
print(data.dtypes)

IND_NAME           object
DIM_GEO_NAME       object
IND_CODE           object
DIM_GEO_CODE       object
DIM_TIME_YEAR       int64
DIM_1_CODE         object
VALUE_NUMERIC     float64
VALUE_STRING       object
VALUE_COMMENTS     object
dtype: object


In [28]:
print(data.isnull().sum())

IND_NAME             0
DIM_GEO_NAME         0
IND_CODE             0
DIM_GEO_CODE         0
DIM_TIME_YEAR        0
DIM_1_CODE        6640
VALUE_NUMERIC        0
VALUE_STRING         0
VALUE_COMMENTS    8656
dtype: int64


In [29]:
# fill missing values
data.ffill(inplace=True)

# Data Preprocessing

In [30]:
#encoding the categoricxal variables 
label_encoder = LabelEncoder()
data['DIM_GEO_NAME'] = label_encoder.fit_transform(data['DIM_GEO_NAME'])
data['IND_NAME'] = label_encoder.fit_transform(data['IND_NAME'])

In [31]:
#sacling the numerrical features
scaler = StandardScaler()
data['VALUE_NUMERIC'] = scaler.fit_transform(data[['VALUE_NUMERIC']])

In [32]:
# feature and target
X = data[['DIM_GEO_NAME', 'IND_NAME', 'DIM_TIME_YEAR']]
y = data['VALUE_NUMERIC']

# Splitting the Data

In [33]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Building the Model

In [34]:
model = LinearRegression()

In [35]:
model.fit(X_train, y_train)

# Model Evaluation

In [36]:
y_pred = model.predict(X_test)

In [37]:
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f'Mean Squared Error: {mse}')
print(f'R^2 Score: {r2}')

Mean Squared Error: 0.5294629524564218
R^2 Score: 0.001914941783326185


In [38]:
mae = mean_absolute_error(y_test, y_pred)
print(f'Mean Absolute Error: {mae}')

Mean Absolute Error: 0.053053479419965084


Before cross validation or learning curve we do the polynomial regression to extend the linear regression to capture non-linear relationships 

In [39]:
poly = PolynomialFeatures(degree=2)
X_poly = poly.fit_transform(X)
X_train_poly, X_test_poly, y_train, y_test = train_test_split(X_poly, y, test_size=0.2, random_state=42)
model_poly = LinearRegression()
model_poly.fit(X_train_poly, y_train)
y_pred_poly = model_poly.predict(X_test_poly)
mse_poly = mean_squared_error(y_test, y_pred_poly)
r2_poly = r2_score(y_test, y_pred_poly)
print(f'Polynomial Regression Mean Squared Error: {mse_poly}')
print(f'Polynomial Regression R^2 Score: {r2_poly}')

Polynomial Regression Mean Squared Error: 0.5289876625881158
Polynomial Regression R^2 Score: 0.0028109057288245864


# Feature Selection 

In [40]:
selector = SelectKBest(score_func=f_regression, k=3)
X_new = selector.fit_transform(X, y)
X_train_new, X_test_new, y_train, y_test = train_test_split(X_new, y, test_size=0.2, random_state=42)
model_new = LinearRegression()
model_new.fit(X_train_new, y_train)
y_pred_new = model_new.predict(X_test_new)
mse_new = mean_squared_error(y_test, y_pred_new)
r2_new = r2_score(y_test, y_pred_new)
print(f'Feature Selection Mean Squared Error: {mse_new}')
print(f'Feature Selection R^2 Score: {r2_new}')

Feature Selection Mean Squared Error: 0.5294629524564218
Feature Selection R^2 Score: 0.001914941783326185


# Cross validation 

In [41]:
cv_scores = cross_val_score(model, X, y, cv=5, scoring='neg_mean_squared_error')
cv_mse = -cv_scores.mean()
print(f'Cross-Validated MSE: {cv_mse}')

Cross-Validated MSE: 0.9990636611066815


# Hyperparameter Tuning