In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder

# Load the raw insurance table.
df = pd.read_csv("insurance.csv")

# Integer-encode the categorical columns with ONE LabelEncoder PER COLUMN and
# keep every fitted encoder. Re-fitting a single shared encoder (as was done
# before) only retains the mapping of the last column, so the codes of the
# other columns could no longer be translated back to their labels.
# Use encoders[col].classes_ or encoders[col].inverse_transform(...) to decode.
encoded_columns = ['sex', 'smoker', 'region']
encoders = {}
for column in encoded_columns:
    encoders[column] = LabelEncoder()
    df[column] = encoders[column].fit_transform(df[column])

# Backward compatibility: `le` still ends up as the encoder fitted on 'region',
# exactly as when one encoder object was re-fitted three times.
le = encoders['region']
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,expenses
0,19,0,27.9,0,1,3,16884.92
1,18,1,33.8,1,0,2,1725.55
2,28,1,33.0,3,0,2,4449.46
3,33,1,22.7,0,0,1,21984.47
4,32,1,28.9,0,0,1,3866.86


In [None]:
# Feature matrix: the predictor columns, in the order the model receives them.
feature_columns = ['age', 'sex', 'bmi', 'children', 'smoker', 'region']
X = df[feature_columns]

# NOTE(review): 'region' is passed as a single integer-coded column, so a
# linear model treats the codes as an ordered numeric quantity. The disabled
# one-hot alternative below would avoid that -- confirm which is intended.
#X = pd.get_dummies(X, columns=['sex', 'smoker', 'region'])
X.head()

Unnamed: 0,age,sex,bmi,children,smoker,region
0,19,0,27.9,0,1,3
1,18,1,33.8,1,0,2
2,28,1,33.0,3,0,2
3,33,1,22.7,0,0,1
4,32,1,28.9,0,0,1


In [None]:
# Regression target: the 'expenses' column of the loaded table.
target_column = 'expenses'
y = df[target_column]
y.head()

Unnamed: 0,expenses
0,16884.92
1,1725.55
2,4449.46
3,21984.47
4,3866.86


In [None]:
# Baseline: fit on the complete data set and predict those very same rows,
# so these predictions are in-sample.
regr = LinearRegression().fit(X, y)

y_pred = regr.predict(X)
y_pred

array([25109.80994164,  3697.72420138,  6963.4622875 , ...,
        4380.50432659,  1106.47891796, 37021.39871475])

In [None]:
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, mean_squared_error,r2_score
# Error metrics of the predictions against the full target vector `y`.
mae = mean_absolute_error(y_true=y, y_pred=y_pred)
mse = mean_squared_error(y_true=y, y_pred=y_pred)
rmse = np.sqrt(mse)  # RMSE is the square root of the MSE computed above
r2 = r2_score(y_true=y, y_pred=y_pred)

# One print call, one metric per line.
print(
    f"MAE: {mae}",
    f"MSE: {mse}",
    f"RMSE: {rmse}",
    f"R2: {r2}",
    sep="\n",
)

MAE: 4172.254300120063
MSE: 36525536.8689183
RMSE: 6043.636063572847
R2: 0.7507516902763371


In [None]:
from sklearn.model_selection import train_test_split
# Use 80% of the rows for training and the remainder for testing;
# the fixed random_state makes the shuffle reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=10,
)

In [None]:
# Fresh model fitted on the training split only; predictions are made for
# the held-out test rows.
regr = LinearRegression()
y_pred = regr.fit(X_train, y_train).predict(X_test)
y_pred

array([ 8025.02717443,  5856.01717022, 15121.19175386, 11475.88628275,
        3727.6258385 ,  9572.51820541,  9505.4901631 , 26036.34978064,
        7770.1274273 , 30719.02537733,  1719.2082066 ,  4166.85756021,
        7356.37530515, 11212.93352665, 12872.5557682 , 11090.67076628,
        5736.37042178, 18179.6689095 , 10647.20390419, 30851.7178098 ,
       34101.3636778 ,  4300.46315288,  1227.3533416 ,  7311.48046109,
        4475.63273511, 13380.97437341, 11769.4686291 , 10304.24504037,
        6355.97048334, 30973.5588612 , -1774.94433045, 33930.29788436,
        9628.71928776,  7361.23050133,  8202.57286825, 12083.19320625,
       13416.43531945, 12620.57514399,  9918.41390527,  8426.83085222,
       16061.68781747,  9803.31982275,  5476.95515429, 11495.79085485,
        5558.92120485,  7464.22535545,  6596.60885596, 34204.40510332,
       11064.39219654, 10637.49406335, 32836.42187534,  5660.95244175,
       10313.04615816, 14601.71399491, 11307.53183955, 13454.45184302,
      

In [None]:
# Intercept (constant term) of the most recently fitted `regr` model.
regr.intercept_

-12511.174040663609

In [None]:
# Fitted coefficients, one per feature, in the column order of X:
# age, sex, bmi, children, smoker, region.
regr.coef_

array([  259.74995462,  -261.34635207,   344.60183267,   544.52619449,
       24079.86232715,  -253.59645408])

In [None]:
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, mean_squared_error,r2_score
# Error metrics of the test-split predictions against `y_test`.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # same units as the target, unlike the squared MSE
r2 = r2_score(y_test, y_pred)

report_lines = [
    f"MAE: {mae}",
    f"MSE: {mse}",
    f"RMSE: {rmse}",
    f"R2: {r2}",
]
print("\n".join(report_lines))

MAE: 4540.009512256522
MSE: 42660609.7279342
RMSE: 6531.508993175635
R2: 0.6958260394784812


In [None]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

# Toy employee table used to compare two ways of one-hot encoding.
data = {
    'Employee id': [10, 20, 15, 25, 30],
    'Gender': ['M', 'F', 'F', 'M', 'F'],
    'Remarks': ['Good', 'Nice', 'Good', 'Great', 'Nice'],
}
categorical_columns = ['Gender', 'Remarks']

# NOTE(review): this rebinds `df`, which earlier cells use for the insurance
# data; re-running those cells afterwards requires reloading the CSV first.
df = pd.DataFrame(data)
print(f"Original Employee Data:\n{df}\n")

# Approach 1: pandas expands each listed column into indicator columns.
df_pandas_encoded = pd.get_dummies(df, columns=categorical_columns)
print(f"One-Hot Encoded Data using Pandas:\n{df_pandas_encoded}\n")

# Approach 2: scikit-learn's OneHotEncoder; sparse_output=False requests a
# dense array instead of a sparse matrix.
encoder = OneHotEncoder(sparse_output=False)
one_hot_encoded = encoder.fit_transform(df[categorical_columns])

# Wrap the encoded array in a DataFrame labelled with the generated names.
encoded_names = encoder.get_feature_names_out(categorical_columns)
one_hot_df = pd.DataFrame(one_hot_encoded, columns=encoded_names)

# Swap the original categorical columns for their encoded counterparts.
df_sklearn_encoded = pd.concat(
    [df.drop(columns=categorical_columns), one_hot_df],
    axis=1,
)

print(f"One-Hot Encoded Data using Scikit-Learn:\n{df_sklearn_encoded}\n")

Original Employee Data:
   Employee id Gender Remarks
0           10      M    Good
1           20      F    Nice
2           15      F    Good
3           25      M   Great
4           30      F    Nice

One-Hot Encoded Data using Pandas:
   Employee id  Gender_F  Gender_M  Remarks_Good  Remarks_Great  Remarks_Nice
0           10     False      True          True          False         False
1           20      True     False         False          False          True
2           15      True     False          True          False         False
3           25     False      True         False           True         False
4           30      True     False         False          False          True

One-Hot Encoded Data using Scikit-Learn:
   Employee id  Gender_F  Gender_M  Remarks_Good  Remarks_Great  Remarks_Nice
0           10       0.0       1.0           1.0            0.0           0.0
1           20       1.0       0.0           0.0            0.0           1.0
2           15  