In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression , Ridge, Lasso,ElasticNet,RidgeCV, LassoCV,ElasticNetCV
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, mean_squared_error,r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')

#with simple train test

##Label encoding

In [None]:
df = pd.read_csv("https://raw.githubusercontent.com/mujahidashraf/data/refs/heads/main/insurance.csv")
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,expenses
0,19,female,27.9,0,yes,southwest,16884.92
1,18,male,33.8,1,no,southeast,1725.55
2,28,male,33.0,3,no,southeast,4449.46
3,33,male,22.7,0,no,northwest,21984.47
4,32,male,28.9,0,no,northwest,3866.86


In [None]:

df = pd.read_csv("https://raw.githubusercontent.com/mujahidashraf/data/refs/heads/main/insurance.csv")
le = LabelEncoder()
df['sex'] = le.fit_transform(df['sex'])
df['smoker'] = le.fit_transform(df['smoker'])
df['region'] = le.fit_transform(df['region'])



X = df[['age','sex','bmi','children','smoker','region']]
y = df['expenses']
scaler = StandardScaler()
X = scaler.fit_transform(X)
poly = PolynomialFeatures(degree=2)
co2_poly = poly.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(co2_poly,y, train_size=0.7, random_state=10)

regr = LinearRegression()
regr.fit(X_train, y_train)

predictedCO2 = regr.predict(X_test)
#train_score_lr = regr.score(X_train, y_train)
#test_score_lr = regr.score(X_test, y_test)

#print("The train score for lr model is {}".format(train_score_lr))
#print("The test score for lr model is {}".format(test_score_lr))

mae = mean_absolute_error(y_test , predictedCO2)
mse = mean_squared_error(y_test , predictedCO2)
rmse = np.sqrt(mse)
r2=r2_score(y_test , predictedCO2)

print(f"MAE: {mae}")
print(f"MSE: {mse}")
print(f"RMSE: {rmse}")
print(f"R2: {r2}")
print(100*'_')

MAE: 2917.6876797080376
MSE: 22290096.951296516
RMSE: 4721.238921225711
R2: 0.8283500631159754
____________________________________________________________________________________________________


##one hot encoding

In [None]:
df = pd.read_csv("https://raw.githubusercontent.com/mujahidashraf/data/refs/heads/main/insurance.csv")
encoder = OneHotEncoder(sparse_output=False)

# Fitting the encoder and transforming the data
one_hot_encoded_array = encoder.fit_transform(df[['sex', 'smoker','region']])

# The transformed data is an array, so we need to convert it back to a DataFrame
one_hot_encoded_df = pd.DataFrame(one_hot_encoded_array, columns=encoder.get_feature_names_out(['sex', 'smoker','region']))

# Concatenating the one-hot encoded columns to the original DataFrame
df = pd.concat([df, one_hot_encoded_df], axis=1).drop(['sex', 'smoker','region'], axis=1)
#print(df.columns)
X = df.drop(columns=['expenses'])
y = df['expenses']
poly = PolynomialFeatures(degree=2)
co2_poly = poly.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(co2_poly,y, train_size=0.8, random_state=10)

regr = LinearRegression()
regr.fit(X_train, y_train)

predictedCO2 = regr.predict(X_test)

mae = mean_absolute_error(y_test , predictedCO2)
mse = mean_squared_error(y_test , predictedCO2)
rmse = np.sqrt(mse)
r2=r2_score(y_test , predictedCO2)

print(f"MAE: {mae}")
print(f"MSE: {mse}")
print(f"RMSE: {rmse}")
print(f"R2: {r2}")
print(100*'_')

MAE: 3158.145385579588
MSE: 29126386.793914936
RMSE: 5396.886768676451
R2: 0.792326258736392
____________________________________________________________________________________________________


#LASSO

##label encoding

In [None]:
df = pd.read_csv("https://raw.githubusercontent.com/mujahidashraf/data/refs/heads/main/insurance.csv")
le = LabelEncoder()
df['sex'] = le.fit_transform(df['sex'])
df['smoker'] = le.fit_transform(df['smoker'])
df['region'] = le.fit_transform(df['region'])


X = df[['age','sex','bmi','children','smoker','region']]
y = df['expenses']
scaler = StandardScaler()
X = scaler.fit_transform(X)

poly = PolynomialFeatures(degree=2)
co2_poly = poly.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(co2_poly,y, train_size=0.8, random_state=10)


l1 = LassoCV(alphas=[0.001,0.01,.1,10,80,85,90,95,96,97,100,110,1000])  # alpha is the regularization parameter
l1.fit(X_train, y_train)
#regr = LinearRegression()
#regr.fit(X_train, y_train)

predictedCO2 = l1.predict(X_test)

mae = mean_absolute_error(y_test  , predictedCO2)
mse = mean_squared_error(y_test  , predictedCO2)
rmse = np.sqrt(mse)
r2=r2_score(y_test  , predictedCO2)

print(f"MAE: {mae}")
print(f"MSE: {mse}")
print(f"RMSE: {rmse}")
print(f"R2: {r2}")
print(f"BEST ALPHA: {l1.alpha_}")
print(100*'_')

MAE: 3156.7439009249756
MSE: 27849147.02965086
RMSE: 5277.229105283459
R2: 0.8014330924199571
BEST ALPHA: 97.0
____________________________________________________________________________________________________


##One hot encoding

In [None]:
df = pd.read_csv("https://raw.githubusercontent.com/mujahidashraf/data/refs/heads/main/insurance.csv")
encoder = OneHotEncoder(sparse_output=False)

# Fitting the encoder and transforming the data
one_hot_encoded_array = encoder.fit_transform(df[['sex', 'smoker','region']])

# The transformed data is an array, so we need to convert it back to a DataFrame
one_hot_encoded_df = pd.DataFrame(one_hot_encoded_array, columns=encoder.get_feature_names_out(['sex', 'smoker','region']))

# Concatenating the one-hot encoded columns to the original DataFrame
df = pd.concat([df, one_hot_encoded_df], axis=1).drop(['sex', 'smoker','region'], axis=1)
#print(df.columns)
X = df.drop(columns=['expenses'])
y = df['expenses']
scaler = StandardScaler()
X = scaler.fit_transform(X)

poly = PolynomialFeatures(degree=2)
co2_poly = poly.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(co2_poly,y, train_size=0.8, random_state=10)


l1 = LassoCV(alphas=[0.001,0.01,.1,10,80,85,90,95,96,97,100,110,1000])  # alpha is the regularization parameter
l1.fit(X_train, y_train)
#regr = LinearRegression()
#regr.fit(X_train, y_train)

predictedCO2 = l1.predict(X_test)

mae = mean_absolute_error(y_test  , predictedCO2)
mse = mean_squared_error(y_test  , predictedCO2)
rmse = np.sqrt(mse)
r2=r2_score(y_test  , predictedCO2)

print(f"MAE: {mae}")
print(f"MSE: {mse}")
print(f"RMSE: {rmse}")
print(f"R2: {r2}")
print(100*'_')

MAE: 3172.6936432685497
MSE: 28586391.679572184
RMSE: 5346.624325644377
R2: 0.7961764721683944
____________________________________________________________________________________________________


#Ridge

##label encoder

In [None]:
df = pd.read_csv("https://raw.githubusercontent.com/mujahidashraf/data/refs/heads/main/insurance.csv")
le = LabelEncoder()
df['sex'] = le.fit_transform(df['sex'])
df['smoker'] = le.fit_transform(df['smoker'])
df['region'] = le.fit_transform(df['region'])


X = df[['age','sex','bmi','children','smoker','region']]
y = df['expenses']
scaler = StandardScaler()
X = scaler.fit_transform(X)
poly = PolynomialFeatures(degree=2)
co2_poly = poly.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(co2_poly,y, train_size=0.8, random_state=10)


l2 = RidgeCV(alphas=[0.001,0.01,.1,9,10,13,15,20,21,25,30,100])  # alpha is the regularization parameter
l2.fit(X_train, y_train)
#regr = LinearRegression()
#regr.fit(X_train, y_train)

predictedCO2 = l2.predict(X_test)

mae = mean_absolute_error(y_test  , predictedCO2)
mse = mean_squared_error(y_test  , predictedCO2)
rmse = np.sqrt(mse)
r2=r2_score(y_test  , predictedCO2)

print(f"MAE: {mae}")
print(f"MSE: {mse}")
print(f"RMSE: {rmse}")
print(f"R2: {r2}")
print(f"BEST ALPHA: {l2.alpha_}")
print(100*'_')

MAE: 3129.8293514290554
MSE: 27927791.51905898
RMSE: 5284.675157382805
R2: 0.800872350180945
BEST ALPHA: 15.0
____________________________________________________________________________________________________


##one hot encoder

In [None]:
df = pd.read_csv("https://raw.githubusercontent.com/mujahidashraf/data/refs/heads/main/insurance.csv")
encoder = OneHotEncoder(sparse_output=False)

# Fitting the encoder and transforming the data
one_hot_encoded_array = encoder.fit_transform(df[['sex', 'smoker','region']])

# The transformed data is an array, so we need to convert it back to a DataFrame
one_hot_encoded_df = pd.DataFrame(one_hot_encoded_array, columns=encoder.get_feature_names_out(['sex', 'smoker','region']))

# Concatenating the one-hot encoded columns to the original DataFrame
df = pd.concat([df, one_hot_encoded_df], axis=1).drop(['sex', 'smoker','region'], axis=1)
#print(df.columns)
X = df.drop(columns=['expenses'])
y = df['expenses']
scaler = StandardScaler()
X = scaler.fit_transform(X)

poly = PolynomialFeatures(degree=2)
co2_poly = poly.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(co2_poly,y, train_size=0.8, random_state=10)


l2 = RidgeCV(alphas=[0.001,0.01,.1,9,10,13,15,20,21,25,30,100])  # alpha is the regularization parameter
l2.fit(X_train, y_train)
#regr = LinearRegression()
#regr.fit(X_train, y_train)

predictedCO2 = l2.predict(X_test)

mae = mean_absolute_error(y_test  , predictedCO2)
mse = mean_squared_error(y_test  , predictedCO2)
rmse = np.sqrt(mse)
r2=r2_score(y_test  , predictedCO2)

print(f"MAE: {mae}")
print(f"MSE: {mse}")
print(f"RMSE: {rmse}")
print(f"R2: {r2}")
print(f"BEST ALPHA: {l2.alpha_}")
print(100*'_')

MAE: 3157.786140077261
MSE: 28925463.395054642
RMSE: 5378.239804532208
R2: 0.7937588605295343
BEST ALPHA: 30.0
____________________________________________________________________________________________________


#Elastic net

##label encoder

In [None]:
df = pd.read_csv("https://raw.githubusercontent.com/mujahidashraf/data/refs/heads/main/insurance.csv")
le = LabelEncoder()
df['sex'] = le.fit_transform(df['sex'])
df['smoker'] = le.fit_transform(df['smoker'])
df['region'] = le.fit_transform(df['region'])


X = df[['age','sex','bmi','children','smoker','region']]
y = df['expenses']
scaler = StandardScaler()
X = scaler.fit_transform(X)

poly = PolynomialFeatures(degree=2)
co2_poly = poly.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(co2_poly,y, train_size=0.8, random_state=10)


ElasticNett = ElasticNetCV(alphas = [0.0001, 0.001,0.01, 0.1,1,2],l1_ratio =[0.01,.1, .5, .7, .9, .95, .99, 1])  # alpha is the regularization parameter
ElasticNett.fit(X_train, y_train)
#regr = LinearRegression()
#regr.fit(X_train, y_train)

predictedCO2 = ElasticNett.predict(X_test)

mae = mean_absolute_error(y_test  , predictedCO2)
mse = mean_squared_error(y_test  , predictedCO2)
rmse = np.sqrt(mse)
r2=r2_score(y_test  , predictedCO2)

print(f"MAE: {mae}")
print(f"MSE: {mse}")
print(f"RMSE: {rmse}")
print(f"R2: {r2}")
print(ElasticNett.alpha_ )
print(ElasticNett.l1_ratio_)
print(100*'_')

MAE: 3134.2965780439135
MSE: 27899097.828929428
RMSE: 5281.959658017981
R2: 0.8010769387563157
2.0
0.99
____________________________________________________________________________________________________


##one hot encoding

In [None]:

df = pd.read_csv("https://raw.githubusercontent.com/mujahidashraf/data/refs/heads/main/insurance.csv")
encoder = OneHotEncoder(sparse_output=False)

# Fitting the encoder and transforming the data
one_hot_encoded_array = encoder.fit_transform(df[['sex', 'smoker','region']])

# The transformed data is an array, so we need to convert it back to a DataFrame
one_hot_encoded_df = pd.DataFrame(one_hot_encoded_array, columns=encoder.get_feature_names_out(['sex', 'smoker','region']))

# Concatenating the one-hot encoded columns to the original DataFrame
df = pd.concat([df, one_hot_encoded_df], axis=1).drop(['sex', 'smoker','region'], axis=1)
#print(df.columns)
X = df.drop(columns=['expenses'])
y = df['expenses']
scaler = StandardScaler()
X = scaler.fit_transform(X)

poly = PolynomialFeatures(degree=2)
co2_poly = poly.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(co2_poly,y, train_size=0.8, random_state=10)


ElasticNett = ElasticNetCV(alphas = [0.0001, 0.001,0.01, 0.1,1,2],l1_ratio =[0.01,.1, .5, .7, .9, .95, .99, 1])  # alpha is the regularization parameter
ElasticNett.fit(X_train, y_train)
#regr = LinearRegression()
#regr.fit(X_train, y_train)

predictedCO2 = ElasticNett.predict(X_test)

mae = mean_absolute_error(y_test  , predictedCO2)
mse = mean_squared_error(y_test  , predictedCO2)
rmse = np.sqrt(mse)
r2=r2_score(y_test  , predictedCO2)

print(f"MAE: {mae}")
print(f"MSE: {mse}")
print(f"RMSE: {rmse}")
print(f"R2: {r2}")
print(ElasticNett.alpha_ )
print(ElasticNett.l1_ratio_)
print(100*'_')

MAE: 3156.2713956210646
MSE: 28961615.555003032
RMSE: 5381.599720808213
R2: 0.79350109239769
2.0
0.99
____________________________________________________________________________________________________
