<h1 align=center style="line-height:200%;font-family:vazir;color:#0099cc">
<font face="vazirmatn" color="#0099cc">
regression</font>
</h1>

In [180]:
import numpy as np
import pandas as pd 

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
import math

In [181]:
# Read the labelled training split and preview its first five rows.
train_df = pd.read_csv(filepath_or_buffer="../data/train.csv")
train_df.head(n=5)

Unnamed: 0,Country,Year,Status,Population,Hepatitis B,Measles,Polio,Diphtheria,HIV/AIDS,infant deaths,under-five deaths,Total expenditure,GDP,BMI,thinness 1-19 years,Alcohol,Schooling,Life expectancy
0,Afghanistan,2015,Developing,33736494.0,65.0,1154,6.0,65.0,0.1,62,83,8.16,584.25921,19.1,17.2,0.01,10.1,65.0
1,Afghanistan,2014,Developing,327582.0,62.0,492,58.0,62.0,0.1,64,86,8.18,612.696514,18.6,17.5,0.01,10.0,59.9
2,Afghanistan,2013,Developing,31731688.0,64.0,430,62.0,64.0,0.1,66,89,8.13,631.744976,18.1,17.7,0.01,9.9,59.9
3,Afghanistan,2012,Developing,3696958.0,67.0,2787,67.0,67.0,0.1,69,93,8.52,669.959,17.6,17.9,0.01,9.8,59.5
4,Afghanistan,2011,Developing,2978599.0,68.0,3013,68.0,68.0,0.1,71,97,7.87,63.537231,17.2,18.2,0.01,9.5,59.2


In [182]:
# Per-column count of missing values; shows which features need imputation.
train_df.isna().sum()

Country                   0
Year                      0
Status                    0
Population              644
Hepatitis B             542
Measles                   0
Polio                    19
Diphtheria               19
HIV/AIDS                  0
infant deaths             0
under-five deaths         0
Total expenditure       221
GDP                     442
BMI                      32
thinness  1-19 years     32
Alcohol                 188
Schooling               160
Life expectancy           0
dtype: int64

In [183]:
# Read the unlabelled test split and preview its first five rows.
test_df = pd.read_csv(filepath_or_buffer="../data/test.csv")
test_df.head(n=5)

Unnamed: 0,Country,Year,Status,Population,Hepatitis B,Measles,Polio,Diphtheria,HIV/AIDS,infant deaths,under-five deaths,Total expenditure,GDP,BMI,thinness 1-19 years,Alcohol,Schooling
0,Burundi,2015,Developing,119927.0,94.0,9,94.0,94.0,0.7,21,31,,33.681223,18.7,7.3,,10.6
1,Burundi,2014,Developing,989179.0,95.0,0,95.0,95.0,0.7,22,32,7.54,312.748979,18.2,7.4,0.01,10.6
2,Burundi,2013,Developing,96186.0,96.0,0,96.0,96.0,1.0,22,32,8.3,282.755525,17.6,7.4,0.01,10.5
3,Burundi,2012,Developing,931971.0,96.0,49,96.0,96.0,1.2,22,33,8.21,265.285651,17.1,7.5,0.01,10.3
4,Burundi,2011,Developing,94358.0,96.0,129,95.0,96.0,1.5,22,33,8.58,26.479973,16.6,7.6,4.16,9.9


In [184]:
# Everything except the target column is treated as a feature.
target_col = "Life expectancy"
feature_cols = [name for name in train_df.columns if name != target_col]

X, y = train_df[feature_cols], train_df[target_col]

# Hold out 20% of the labelled rows for validation; the seed keeps the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Independent copy of the test features, restricted to the training feature columns.
X_test = test_df[feature_cols].copy()

In [185]:
# Column used as the grouping key for imputation.
country_col = "Country"

# Categorical columns are listed by hand; every remaining feature is treated as numeric.
cat_cols = [country_col, "Status"]
num_cols = [name for name in X_train.columns if name not in cat_cols]

In [186]:
from sklearn.preprocessing import OneHotEncoder

# Fit the encoder on the training split only; categories unseen at fit time are
# encoded as all-zero rows (handle_unknown="ignore").
ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
ohe.fit(X_train[cat_cols])

ohe_cols = ohe.get_feature_names_out(cat_cols)


def _one_hot_frame(frame):
    """Encode `frame[cat_cols]` and return a DataFrame that keeps `frame`'s index."""
    encoded = ohe.transform(frame[cat_cols])
    return pd.DataFrame(encoded, columns=ohe_cols, index=frame.index)


X_train_cat = _one_hot_frame(X_train)
X_val_cat = _one_hot_frame(X_val)
X_test_cat = _one_hot_frame(X_test)


In [187]:
# Numeric-only view of each split, in train / validation / test order.
X_train_num, X_val_num, X_test_num = (
    split[num_cols] for split in (X_train, X_val, X_test)
)


In [188]:
# Imputation statistics are computed from the training split only.
# Fallback value per numeric column: the median over all training rows.
global_medians = X_train.loc[:, num_cols].median()

# Preferred value per numeric column: the mean within each country.
group_means = X_train.groupby(by=country_col)[num_cols].mean()

In [189]:
def fill_missing(df, country_series, columns=None, country_means=None, fallback_values=None):
    """Impute missing values column by column and return a new DataFrame.

    Each missing cell is filled first with the per-country mean of its column
    and, if that is unavailable, with the column's fallback value. The input
    frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to impute.
    country_series : pandas.Series
        Country label for every row of ``df``; it must share ``df``'s index.
    columns : list of str, optional
        Columns to impute. Defaults to the notebook-level ``num_cols``.
    country_means : pandas.DataFrame, optional
        Per-country means, indexed by country label, one column per feature.
        Defaults to the notebook-level ``group_means``.
    fallback_values : pandas.Series or mapping, optional
        Last-resort fill value for each column. Defaults to the notebook-level
        ``global_medians``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the selected columns imputed.
    """
    # Resolve defaults at call time so the notebook globals are only needed
    # when the caller does not pass the statistics explicitly.
    columns = num_cols if columns is None else columns
    country_means = group_means if country_means is None else country_means
    fallback_values = global_medians if fallback_values is None else fallback_values

    filled = df.copy()
    for col in columns:
        # Countries missing from `country_means` map to NaN here and are
        # handled by the fallback fill below.
        per_country = country_series.map(country_means[col])
        filled[col] = filled[col].fillna(per_country).fillna(fallback_values[col])
    return filled


In [190]:
# Impute every split, pairing each numeric frame with the country labels of
# the split it was taken from.
X_train_num_filled, X_val_num_filled, X_test_num_filled = (
    fill_missing(numeric_part, source[country_col])
    for numeric_part, source in (
        (X_train_num, X_train),
        (X_val_num, X_val),
        (X_test_num, X_test),
    )
)


In [191]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the imputed training split only, then
# apply the same transformation to every split.
scaler = StandardScaler().fit(X_train_num_filled)

X_train_num_scaled = scaler.transform(X_train_num_filled)
X_val_num_scaled = scaler.transform(X_val_num_filled)
X_test_num_scaled = scaler.transform(X_test_num_filled)


In [192]:
def _label_scaled(values, source):
    """Wrap scaled values in a DataFrame using `num_cols` and `source`'s index."""
    return pd.DataFrame(values, columns=num_cols, index=source.index)


# Restore column names and row indexes on the scaler output.
X_train_num_scaled = _label_scaled(X_train_num_scaled, X_train)
X_val_num_scaled = _label_scaled(X_val_num_scaled, X_val)
X_test_num_scaled = _label_scaled(X_test_num_scaled, X_test)


In [193]:
# Place the scaled numeric block and the one-hot block side by side for each split.
X_train_final = pd.concat(objs=[X_train_num_scaled, X_train_cat], axis="columns")
X_val_final = pd.concat(objs=[X_val_num_scaled, X_val_cat], axis="columns")
X_test_final = pd.concat(objs=[X_test_num_scaled, X_test_cat], axis="columns")


In [194]:
# train_data.loc[:, 'Population'] = train_data['Population'].fillna(Population_mode)
# test_data.loc[:, 'Population'] = test_data['Population'].fillna(Population_mode)

# train_data.loc[:, 'Hepatitis B'] = train_data['Hepatitis B'].fillna(Hepatitis_B_mode)
# test_data.loc[:, 'Hepatitis B'] = test_data['Hepatitis B'].fillna(Hepatitis_B_mode)

# train_data.loc[:, 'Total expenditure'] = train_data['Total expenditure'].fillna(Total_expenditure_mode)
# test_data.loc[:, 'Total expenditure'] = test_data['Total expenditure'].fillna(Total_expenditure_mode)

# train_data.loc[:, 'GDP'] = train_data['GDP'].fillna(GDP_mode)
# test_data.loc[:, 'GDP'] = test_data['GDP'].fillna(GDP_mode)

# train_data.loc[:, 'BMI'] = train_data['BMI'].fillna(BMI_mode)
# test_data.loc[:, 'BMI'] = test_data['BMI'].fillna(BMI_mode)

# train_data.loc[:, 'thinness  1-19 years'] = train_data['thinness  1-19 years'].fillna(thinness_years_mode)
# test_data.loc[:, 'thinness  1-19 years'] = test_data['thinness  1-19 years'].fillna(thinness_years_mode)

# train_data.loc[:, 'Alcohol'] = train_data['Alcohol'].fillna(Alcohol_mode)
# test_data.loc[:, 'Alcohol'] = test_data['Alcohol'].fillna(Alcohol_mode)

# train_data.loc[:, 'Schooling'] = train_data['Schooling'].fillna(Schooling_mode)
# test_data.loc[:, 'Schooling'] = test_data['Schooling'].fillna(Schooling_mode)

In [195]:
# 

In [196]:
from sklearn.preprocessing import PolynomialFeatures

# With degree=1 and include_bias=False the transformer emits the input columns
# unchanged; raise `degree` to add power and interaction terms.
poly_transformer = PolynomialFeatures(degree=1, include_bias=False)

# Select the numeric features via `num_cols`. The name `numeric_columns` used
# here before is never assigned in this notebook, so the cell raised NameError
# on a fresh kernel.
X_train_num_poly = poly_transformer.fit_transform(X_train_num_filled[num_cols])

# Validation and test splits reuse the transformer fitted on the training split.
X_val_num_poly = poly_transformer.transform(X_val_num_filled[num_cols])
X_test_num_poly = poly_transformer.transform(X_test_num_filled[num_cols])


In [197]:
# Take the output feature names from the fitted transformer itself. The
# `numeric_columns` list passed here before is never assigned in this notebook,
# so the cell raised NameError on a fresh kernel.
poly_cols = poly_transformer.get_feature_names_out()

# Re-attach column names and the original row indexes to the transformed arrays.
X_train_num_poly = pd.DataFrame(X_train_num_poly, columns=poly_cols, index=X_train_num_filled.index)
X_val_num_poly = pd.DataFrame(X_val_num_poly, columns=poly_cols, index=X_val_num_filled.index)
X_test_num_poly = pd.DataFrame(X_test_num_poly, columns=poly_cols, index=X_test_num_filled.index)


In [198]:
from sklearn.linear_model import LinearRegression

# Ordinary least squares on the imputed numeric features; `fit` returns the
# estimator, so the trailing expression still displays it as the cell output.
model = LinearRegression().fit(X_train_num_poly, y_train)
model


0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [199]:
# Fitted weights (one per input feature) followed by the intercept on its own line.
print(model.coef_, model.intercept_, sep="\n")

[-2.79776201e-02  1.33982277e-09 -2.89298530e-03 -1.57352052e-05
  3.26937777e-02  4.19970595e-02 -7.22044962e-01  1.17423202e-01
 -8.79264957e-02  7.73889919e-02  8.06470842e-05  6.33851501e-02
 -1.06603583e-01  1.36232831e-01  1.04416200e+00]
104.84835780834936


In [200]:
from sklearn.metrics import r2_score

# Score the model on the held-out validation split.
y_pred = model.predict(X_val_num_poly)

# Keep the score under its own name: assigning it to `r2_score` would replace
# the imported metric function with a float and break any later call to it.
val_r2 = r2_score(y_val, y_pred)
print(val_r2)

0.7776182559317764


In [201]:
# Predict on the test features and present the result as a single-column frame.
test_pred = model.predict(X_test_num_poly)
submission = pd.DataFrame({"Life expectancy": test_pred})
submission

Unnamed: 0,Life expectancy
0,66.900345
1,66.630741
2,66.426656
3,65.963088
4,65.854101
...,...
75,61.007083
76,63.008699
77,62.157493
78,61.576726
