## Packages Used

In [718]:
# Import the numpy library to use its numerical and mathematical functions.
import numpy as np

# Import the train_test_split function from scikit-learn to split the dataset into training and testing sets.
from sklearn.model_selection import train_test_split

# Import the LinearRegression class from scikit-learn to create and train a linear regression model(to compare it with the implemented one).
from sklearn.linear_model import LinearRegression as skLinearRegression

# Import the mean_squared_error function from scikit-learn to calculate the mean squared error as a model evaluation metric(to compare it with the MSE of the implemented one).
from sklearn.metrics import mean_squared_error

# Import the pandas library to work with data in a structured, tabular form using DataFrames.
import pandas as pd


## Implementing Linear Regression model from scratch

In [719]:
#Implementation for a Linear Regression class from scratch.

class LinearRegression:
    """Linear regression fitted with batch gradient descent.

    The model is ``y_pred = X @ weights + bias``. ``fit`` starts from zero
    weights and bias and repeatedly steps both against the gradient of the
    squared-error cost.

    Parameters
    ----------
    alpha : float, default 0.001
        Learning rate (step size) of each gradient-descent update.
    n_iters : int, default 1000
        Number of gradient-descent iterations run by ``fit``.

    Attributes
    ----------
    weights : numpy.ndarray or None
        One coefficient per feature; ``None`` until ``fit`` is called.
    bias : float or None
        Intercept term; ``None`` until ``fit`` is called.
    """

    def __init__(self, alpha=0.001, n_iters=1000):
        self.alpha = alpha
        self.n_iters = n_iters
        self.weights = None
        self.bias = None

    @staticmethod
    def _paired_targets(y_test, y_pred):
        """Return both inputs as flat float arrays, checking they pair up one-to-one."""
        y_test = np.asarray(y_test, dtype=float).ravel()
        y_pred = np.asarray(y_pred, dtype=float).ravel()
        if y_test.shape != y_pred.shape:
            raise ValueError(
                f"y_test and y_pred must have the same number of values, "
                f"got {y_test.shape[0]} and {y_pred.shape[0]}"
            )
        return y_test, y_pred

    def fit(self, X, y):
        """Train the model on ``X`` (m samples x n features) and targets ``y``.

        Parameters
        ----------
        X : array-like of shape (m, n)
            Feature matrix (NumPy array, DataFrame, nested list, ...).
        y : array-like of m values
            Targets; a column vector of shape (m, 1) is flattened.

        Raises
        ------
        ValueError
            If ``X`` is not 2-D, is empty, or its row count differs from the
            number of targets.
        """
        X = np.asarray(X, dtype=float)
        # Flatten so an (m, 1) target cannot broadcast against the (m,)
        # predictions into an (m, m) residual matrix.
        y = np.asarray(y, dtype=float).ravel()

        if X.ndim != 2:
            raise ValueError(f"X must be 2-D (samples x features), got {X.ndim}-D")
        m, n = X.shape
        if m == 0:
            raise ValueError("X must contain at least one sample")
        if y.shape[0] != m:
            raise ValueError(f"X has {m} samples but y has {y.shape[0]} values")

        # Start from the zero model.
        self.weights = np.zeros(n)
        self.bias = 0.0

        for _ in range(self.n_iters):
            # Residuals of the current model on the training data.
            errors = np.dot(X, self.weights) + self.bias - y

            # Gradients of the cost with respect to the weights and the bias.
            dj_dw = (1 / m) * np.dot(errors, X)
            dj_db = (1 / m) * np.sum(errors)

            self.weights -= self.alpha * dj_dw
            self.bias -= self.alpha * dj_db

    def predict(self, X):
        """Return the model's predictions for each row of ``X``.

        Raises
        ------
        RuntimeError
            If called before ``fit``.
        """
        if self.weights is None or self.bias is None:
            raise RuntimeError("This LinearRegression instance is not fitted yet; call fit() first")
        return np.dot(np.asarray(X, dtype=float), self.weights) + self.bias

    def mse(self, y_test, y_pred):
        """Return the mean squared error between actual and predicted targets.

        Raises
        ------
        ValueError
            If the two inputs do not hold the same number of values.
        """
        y_test, y_pred = self._paired_targets(y_test, y_pred)
        return np.mean((y_test - y_pred) ** 2)

    def score(self, y_test, y_pred):
        """Return the R^2 score (coefficient of determination).

        Computed as ``1 - ss_residual / ss_total``. When the actual targets
        are constant (``ss_total == 0``) the ratio is undefined, so 1.0 is
        returned for exact predictions and 0.0 otherwise.

        Raises
        ------
        ValueError
            If the two inputs do not hold the same number of values.
        """
        y_test, y_pred = self._paired_targets(y_test, y_pred)
        ss_total = np.sum((y_test - np.mean(y_test)) ** 2)
        ss_residual = np.sum((y_test - y_pred) ** 2)
        if ss_total == 0:
            # Avoid 0/0 (nan) or x/0 (-inf) on a constant target.
            return 1.0 if ss_residual == 0 else 0.0
        return 1 - (ss_residual / ss_total)



## Testing the model

In [720]:
# Load the insurance dataset into a DataFrame and display it.
# NOTE(review): this is an absolute path on the author's machine; point it at
# your own copy of insurance.csv to run the notebook elsewhere.
insurance_csv = r"E:\FOURTH YEAR\FIRST SEMESTER\AI in robotics\Task1\insurance.csv"
df = pd.read_csv(insurance_csv)
df

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


### Data exploration and cleaning

In [721]:
# Summarize column dtypes and non-null counts to check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


## Data Preprocessing
### The data has no null values, but its 3 categorical features (`sex`, `smoker`, `region`) must be converted to numbers

In [722]:
# List the distinct values of 'sex' to decide how to encode the column.
df['sex'].unique()


array(['female', 'male'], dtype=object)

In [723]:
# Encode 'sex' as integers: male -> 0, female -> 1.
# Gotcha: this overwrites the column, so re-running the cell on the already
# encoded values would map every entry to NaN.
sex_codes = {'male': 0, 'female': 1}
df['sex'] = df['sex'].map(sex_codes)
df['sex'].dtype

dtype('int64')

In [724]:
# List the distinct values of 'smoker' to decide how to encode the column.
df['smoker'].unique()


array(['yes', 'no'], dtype=object)

In [725]:
# Encode 'smoker' as integers: yes -> 1, no -> 0.
# Gotcha: this overwrites the column, so re-running the cell on the already
# encoded values would map every entry to NaN.
smoker_codes = {'yes': 1, 'no': 0}
df['smoker'] = df['smoker'].map(smoker_codes)
df['smoker'].dtype

dtype('int64')

In [726]:
# List the distinct values of 'region' to decide how to encode the column.
df['region'].unique()


array(['southwest', 'southeast', 'northwest', 'northeast'], dtype=object)

In [727]:
# Replace each region name with an integer label. pd.factorize() numbers the
# values in order of first appearance, so the codes carry no real ordering.
# NOTE(review): a linear model will still treat these codes as ordered numbers;
# one-hot encoding may be a better fit for a nominal feature — confirm.
region_codes, region_names = pd.factorize(df['region'])
df['region'] = region_codes
df['region'].dtype

dtype('int64')

In [728]:
# Display the frame again to inspect the encoded columns.
df

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,1,27.900,0,1,0,16884.92400
1,18,0,33.770,1,0,1,1725.55230
2,28,0,33.000,3,0,1,4449.46200
3,33,0,22.705,0,0,2,21984.47061
4,32,0,28.880,0,0,2,3866.85520
...,...,...,...,...,...,...,...
1333,50,0,30.970,3,0,2,10600.54830
1334,18,1,31.920,0,0,3,2205.98080
1335,18,1,36.850,0,0,1,1629.83350
1336,21,1,25.800,0,0,0,2007.94500


In [729]:
# Re-check the dtypes after encoding the categorical columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   int64  
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   int64  
 5   region    1338 non-null   int64  
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(5)
memory usage: 73.3 KB


In [730]:
# Separate the target column from the input features.
target_column = 'charges'
X = df.drop(columns=[target_column])
y = df[target_column]


## Using sklearn model


In [731]:
# Split here so this cell runs on a fresh kernel: the train/test variables were
# previously only created by a later cell, which made Restart & Run All fail
# with a NameError. Same ratio and seed as the later experiments, so the
# comparison stays like-for-like.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Reference model: scikit-learn's linear regression, fitted on the training rows.
model = skLinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Evaluate on the held-out rows.
mse = mean_squared_error(y_test, y_pred)

print(f"mse = {mse}")
print('R^2 Score sk: ', model.score(X_test, y_test))

mse = 33635210.43117845
R^2 Score sk:  0.7833463107364536


In [732]:
# Show the sklearn model's predictions for the test rows.
y_pred

array([ 8924.40724442,  7116.29501758, 36909.01352144,  9507.87469118,
       27013.3500079 , 10790.77956153,   226.29844571, 16942.71599941,
        1056.63079407, 11267.91997309, 28048.59793155,  9424.36324087,
        5326.32232088, 38460.06017922, 40303.40597026, 37147.01010262,
       15287.91876684, 35965.05485917,  9179.1753067 , 31510.8319849 ,
        3797.79068365, 10070.82803304,  2312.57551348,  7074.41348194,
       11352.37224357, 12907.77079523, 14448.84678727,  6205.65997921,
        9917.00839638,  2239.50032819,  9060.55469043, 13120.56214535,
        4617.70702822,  3467.91218926,  4402.74821855, 12967.91608907,
        1927.44498944,  8757.9180081 , 33324.35180597, 32638.47697026,
        3852.41756615,  4370.39670883, 14080.76023234, 11478.63402576,
        8829.26135924, 12046.15119133,  5322.80515731,  3100.71182484,
       35546.60547574,  9201.61196817, 15894.23763341,  2406.04003607,
       12397.52052544,  1433.90617387, 13448.14094304, 12519.54174599,
      

## Using the implemented model

In [733]:
# Hold out 20% of the rows for testing; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# From-scratch model trained on the raw (unscaled) features.
reg = LinearRegression(alpha=0.0001, n_iters=1000)
reg.fit(X_train, y_train)

# Predict the held-out rows and measure the error against the actual targets.
pred = reg.predict(X_test)
mse = reg.mse(y_test, pred)

# Report MSE and R^2 on the test data.
print(f"mse = {mse}")
print('R^2 Score: ', reg.score(y_test, pred))

mse = 130892113.41490564
R^2 Score:  0.1568877107260519


In [734]:
# Show the from-scratch model's predictions for the test rows.
pred

array([13592.59898907, 12604.50098066, 17859.94181837, 13992.91234169,
       10061.87758969, 14716.31142089,  7883.01861112, 19582.96650857,
        8668.1725028 , 15129.41368459, 10984.39698451, 13858.19985697,
       11217.13070497, 17472.6113417 , 19738.80485367, 16963.61693051,
       17635.11226581, 16425.74179526, 14245.81071905, 13762.59942283,
        9668.04490885, 14610.54836816,  9384.30806965, 11711.26168068,
       15178.20989506, 16621.36318654, 18223.86431796, 11117.35412904,
       14249.51564612,  9147.5630849 , 13499.79550151, 17039.65523376,
       10578.49531422,  9746.93589743, 10741.14043015, 16238.46506498,
        9052.19263218, 13999.68341328, 15068.90865003, 14051.52339096,
        9337.35865285, 10227.40420286, 17640.33843143, 16278.28493534,
       13255.99253799, 16711.93787821, 10485.27738703,  9571.44252794,
       15693.23812583, 13359.85438564, 19040.80944398,  9069.36789492,
       15584.40864075,  8410.19512471, 17364.68972143, 15905.70332164,
      

#### The from-scratch model is not very accurate on the raw features: R² ≈ 0.16, versus ≈ 0.78 for the sklearn model

## Enhancing the results

In [735]:
def normalize_features(X):
    """Standardize each feature (column) of ``X`` to zero mean and unit spread.

    Every column has its mean subtracted and is divided by its standard
    deviation, both computed along axis 0.

    Parameters
    ----------
    X : numpy.ndarray or pandas.DataFrame
        Input features, one column per feature.

    Returns
    -------
    Same container type as ``X``
        The standardized features. A constant column (zero standard
        deviation) comes back as all zeros instead of NaN/inf.
    """
    means = np.mean(X, axis=0)
    stds = np.std(X, axis=0)
    # A constant column has zero spread; dividing by it would fill the column
    # with NaN/inf. Use a divisor of 1 there so the centered zeros are kept.
    safe_stds = np.where(stds == 0, 1.0, stds)
    return (X - means) / safe_stds


In [736]:
# Standardize the features, then hold out 20% of the rows for testing.
# NOTE(review): the means/stds are computed over all rows before the split, so
# the test rows contribute to the scaling statistics.
X_scaled = normalize_features(X)
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42
)

# From-scratch model trained on the standardized features with a larger learning rate.
reg = LinearRegression(alpha=0.05, n_iters=1000)
reg.fit(X_train, y_train)

# Predict the held-out rows and measure the error against the actual targets.
pred = reg.predict(X_test)
mse = reg.mse(y_test, pred)

# Report MSE and R^2 on the test data.
print(f"mse = {mse}")
print('R^2 Score: ', reg.score(y_test, pred))

mse = 33635210.43117842
R^2 Score:  0.7833463107364538


## Conclusion

### By normalizing the input variables and fine-tuning the learning rate, the implemented model reaches the same accuracy as the sklearn model