<a href="https://colab.research.google.com/github/piserushikesh/Gen_AI_Training/blob/main/Life_Expectancy_Problem_solve_by_Linear_regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Life Expectancy Problem solve by Linear regression

In [35]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [36]:
# Step 1: Read the CSV file
data = pd.read_csv("Life Expectancy Data.csv")

In [37]:
data.head()

Unnamed: 0,Country,Year,Status,Life expectancy,Adult Mortality,infant deaths,Alcohol,percentage expenditure,Hepatitis B,Measles,...,Polio,Total expenditure,Diphtheria,HIV/AIDS,GDP,Population,thinness 1-19 years,thinness 5-9 years,Income composition of resources,Schooling
0,Afghanistan,2015,Developing,65.0,263.0,62,0.01,71.279624,65.0,1154,...,6.0,8.16,65.0,0.1,584.25921,33736494.0,17.2,17.3,0.479,10.1
1,Afghanistan,2014,Developing,59.9,271.0,64,0.01,73.523582,62.0,492,...,58.0,8.18,62.0,0.1,612.696514,327582.0,17.5,17.5,0.476,10.0
2,Afghanistan,2013,Developing,59.9,268.0,66,0.01,73.219243,64.0,430,...,62.0,8.13,64.0,0.1,631.744976,31731688.0,17.7,17.7,0.47,9.9
3,Afghanistan,2012,Developing,59.5,272.0,69,0.01,78.184215,67.0,2787,...,67.0,8.52,67.0,0.1,669.959,3696958.0,17.9,18.0,0.463,9.8
4,Afghanistan,2011,Developing,59.2,275.0,71,0.01,7.097109,68.0,3013,...,68.0,7.87,68.0,0.1,63.537231,2978599.0,18.2,18.2,0.454,9.5


In [38]:
print(data['Country'].dtype)
print(data['Status'].dtype)

object
object


In [39]:
# Step 2: Data Cleaning
# Convert variables to factor
data['Country'] = data['Country'].astype('category')
data['Status'] = data['Status'].astype('category')

In [40]:
print(data['Country'].dtype)
print(data['Status'].dtype)

category
category


In [41]:
data.isnull().sum()

Country                              0
Year                                 0
Status                               0
Life expectancy                     10
Adult Mortality                     10
infant deaths                        0
Alcohol                            194
percentage expenditure               0
Hepatitis B                        553
Measles                              0
 BMI                                34
under-five deaths                    0
Polio                               19
Total expenditure                  226
Diphtheria                          19
 HIV/AIDS                            0
GDP                                448
Population                         652
 thinness  1-19 years               34
 thinness 5-9 years                 34
Income composition of resources    167
Schooling                          163
dtype: int64

In [42]:
print("Number of rows in the dataset:", data.shape[0])

Number of rows in the dataset: 2938


In [43]:
# Drop missing rows
data.dropna(inplace=True)

In [44]:
data.isnull().sum()

Country                            0
Year                               0
Status                             0
Life expectancy                    0
Adult Mortality                    0
infant deaths                      0
Alcohol                            0
percentage expenditure             0
Hepatitis B                        0
Measles                            0
 BMI                               0
under-five deaths                  0
Polio                              0
Total expenditure                  0
Diphtheria                         0
 HIV/AIDS                          0
GDP                                0
Population                         0
 thinness  1-19 years              0
 thinness 5-9 years                0
Income composition of resources    0
Schooling                          0
dtype: int64

In [45]:
print("Number of rows in the dataset:", data.shape[0])

Number of rows in the dataset: 1649


In [46]:
# Step 3: Run Linear Regression
# Drop unnecessary variables
data.drop(columns=['Country', 'Year', 'Status'], inplace=True)

In [47]:
data.head()

Unnamed: 0,Life expectancy,Adult Mortality,infant deaths,Alcohol,percentage expenditure,Hepatitis B,Measles,BMI,under-five deaths,Polio,Total expenditure,Diphtheria,HIV/AIDS,GDP,Population,thinness 1-19 years,thinness 5-9 years,Income composition of resources,Schooling
0,65.0,263.0,62,0.01,71.279624,65.0,1154,19.1,83,6.0,8.16,65.0,0.1,584.25921,33736494.0,17.2,17.3,0.479,10.1
1,59.9,271.0,64,0.01,73.523582,62.0,492,18.6,86,58.0,8.18,62.0,0.1,612.696514,327582.0,17.5,17.5,0.476,10.0
2,59.9,268.0,66,0.01,73.219243,64.0,430,18.1,89,62.0,8.13,64.0,0.1,631.744976,31731688.0,17.7,17.7,0.47,9.9
3,59.5,272.0,69,0.01,78.184215,67.0,2787,17.6,93,67.0,8.52,67.0,0.1,669.959,3696958.0,17.9,18.0,0.463,9.8
4,59.2,275.0,71,0.01,7.097109,68.0,3013,17.2,97,68.0,7.87,68.0,0.1,63.537231,2978599.0,18.2,18.2,0.454,9.5


In [49]:
# Run initial linear regression
X = data.drop(columns=['Life expectancy ']).values
y = data['Life expectancy '].values
model = LinearRegression()
model.fit(X, y)

In [54]:
# Step 4: Outlier Detection
# Detect and remove outliers using IQR
Q1 = data['Life expectancy '].quantile(0.25)
Q3 = data['Life expectancy '].quantile(0.75)
IQR = Q3 - Q1
outliers = data[(data['Life expectancy '] < (Q1 - 1.5 * IQR)) | (data['Life expectancy '] > (Q3 + 1.5 * IQR))]
data = data[~data.index.isin(outliers.index)]

In [55]:
# Run linear regression again
X = data.drop(columns=['Life expectancy '])
y = data['Life expectancy ']
model = LinearRegression()
model.fit(X, y)

In [56]:
# Step 5: Multicollinearity
# Check for multicollinearity using VIF
vif_data = pd.DataFrame()
vif_data["feature"] = X.columns
vif_data["VIF"] = [variance_inflation_factor(X.values, i) for i in range(len(X.columns))]


In [57]:
# Identify variables with VIF > 5
high_vif = vif_data[vif_data['VIF'] > 5]['feature'].tolist()

In [58]:
# Drop variables with high VIF
data.drop(columns=high_vif, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.drop(columns=high_vif, inplace=True)


In [59]:
data.head()

Unnamed: 0,Life expectancy,Adult Mortality,Alcohol,Measles,HIV/AIDS,Population
0,65.0,263.0,0.01,1154,0.1,33736494.0
1,59.9,271.0,0.01,492,0.1,327582.0
2,59.9,268.0,0.01,430,0.1,31731688.0
3,59.5,272.0,0.01,2787,0.1,3696958.0
4,59.2,275.0,0.01,3013,0.1,2978599.0


In [62]:
# Run linear regression again
X = data.drop(columns=['Life expectancy '])
y = data['Life expectancy ']
model = LinearRegression()
model.fit(X, y)

In [63]:
# Step 6: Set seed and split data into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [64]:
# Step 7: Run linear regression on train data
model.fit(X_train, y_train)


In [65]:
# Step 8: Predict test data and calculate metrics
y_pred = model.predict(X_test)


In [66]:
# Calculate RMSE
rmse = mean_squared_error(y_test, y_pred, squared=False)


In [67]:
# Calculate MAE
mae = mean_absolute_error(y_test, y_pred)


In [68]:
# Output results
print("RMSE:", rmse)
print("MAE:", mae)

RMSE: 5.2949362890991045
MAE: 3.8797443918288885
