In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler

In [41]:
class linearRegression:
  """
  A class for performing linear regression.
  Args:
    learning_rate: The learning rate to be used during training.
    n_iterations: The number of iterations to be performed during training.
  """
  def __init__(self, learning_rate=0.0001, n_iterations=100000):
    """
    Initializes the LinearRegression class.
    Args:
      learning_rate: The learning rate to be used during training.
      n_iterations: The number of iterations to be performed during training.
    """
    self.learning_rate = learning_rate
    self.n_iterations = n_iterations
    self.wights = None
    self.bias = None
  def fit_model(self, X, y):
    """
    Fits the linear regression model to the given data.
    Args:
      X: The training data.
      y: The target values.
    """
    n_samples, n_features = X.shape
    self.wights = np.zeros(n_features)
    self.bias = 0
    for _ in range(self.n_iterations):
      y_pred = np.dot(X, self.wights) + self.bias
      dw = (1 / n_samples) * np.dot(X.T, (y_pred - y))
      db = (1 / n_samples) * np.sum(y_pred - y)
      self.wights = self.wights - self.learning_rate * dw
      self.bias = self.bias - self.learning_rate * db
  def predict_model(self, X):
    """
    Predicts the target values for the given data.

    Args:
      X: The data to be used for prediction.

    Returns:
      The predicted target values.
    """
    y_pred = np.dot(X, self.wights) + self.bias
    return y_pred

  def mean_square_error(self, y_true, y_pred):
    """
    Calculates the mean squared error between the true and predicted values.
    Args:
      y_true: The true values.
      y_pred: The predicted values.

    Returns:
      The mean squared error.
    """
    mse = np.mean((y_true - y_pred) ** 2)
    return mse

In [None]:
insurance_df = pd.read_csv('/content/insurance.csv')
insurance_df.head(20)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552
5,31,female,25.74,0,no,southeast,3756.6216
6,46,female,33.44,1,no,southeast,8240.5896
7,37,female,27.74,3,no,northwest,7281.5056
8,37,male,29.83,2,no,northeast,6406.4107
9,60,female,25.84,0,no,northwest,28923.13692


In [None]:
# Check for outliers
insurance_df.describe()

Unnamed: 0,age,bmi,children,charges
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.663397,1.094918,13270.422265
std,14.04996,6.098187,1.205493,12110.011237
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29625,0.0,4740.28715
50%,39.0,30.4,1.0,9382.033
75%,51.0,34.69375,2.0,16639.912515
max,64.0,53.13,5.0,63770.42801


In [None]:
#since the min value is much smaller than the 25% value and the max value is much higher than the 75% value there is outliers
#we will test the model with and without the outliers

In [None]:
# Check for missing values
insurance_df.isna().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [None]:
# Check for collinearity
insurance_df.corr()

  insurance_df.corr()


Unnamed: 0,age,bmi,children,charges
age,1.0,0.109272,0.042469,0.299008
bmi,0.109272,1.0,0.012759,0.198341
children,0.042469,0.012759,1.0,0.067998
charges,0.299008,0.198341,0.067998,1.0


In [None]:
#it appears that childern is weakly correlated with the charges so we try the model with childern and without it

In [None]:
pd.Series(insurance_df['region'].values).unique()

array(['southwest', 'southeast', 'northwest', 'northeast'], dtype=object)

In [None]:
#the regression model preforms better with intger valued columns
insurance_df['sex'] = insurance_df['sex'].map({'male': 0, 'female': 1})
insurance_df['smoker'] = insurance_df['smoker'].map({'yes': 1, 'no': 0})
insurance_df['region'] = insurance_df['region'].map({'southwest': 0, 'southeast': 1, 'northwest': 2, 'northeast':3})

In [None]:
charges_region_corr = insurance_df['charges'].corr(insurance_df['region'])
charges_region_corr

0.006208234909444513

In [None]:
charges_sex_corr = insurance_df['charges'].corr(insurance_df['sex'])
charges_sex_corr

-0.057292062202025484

In [None]:
charges_smoker_corr = insurance_df['charges'].corr(insurance_df['smoker'])
charges_smoker_corr

0.787251430498478

In [None]:
insurance_df

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,1,27.900,0,1,0,16884.92400
1,18,0,33.770,1,0,1,1725.55230
2,28,0,33.000,3,0,1,4449.46200
3,33,0,22.705,0,0,2,21984.47061
4,32,0,28.880,0,0,2,3866.85520
...,...,...,...,...,...,...,...
1333,50,0,30.970,3,0,2,10600.54830
1334,18,1,31.920,0,0,3,2205.98080
1335,18,1,36.850,0,0,1,1629.83350
1336,21,1,25.800,0,0,0,2007.94500


In [None]:
insurance_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   int64  
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   int64  
 5   region    1338 non-null   int64  
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(5)
memory usage: 73.3 KB


In [None]:
#summary to what happend we found outliers, no missing values, the correlation values show a weakly
#negative correlation between sex and our target value an large posetive correlation between smoknig
#and the target value a small posetive between the region and the target

In [None]:
X = insurance_df.drop('charges', axis=1)
y = insurance_df['charges']

In [None]:
X = X.to_numpy()
y = y.to_numpy()

In [None]:
#splitting the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=42)

In [None]:
X_train.shape

(1070, 6)

In [None]:
y_train.shape

(1070,)

In [None]:
X_test.shape

(268, 6)

In [None]:
y_test = y_test.reshape(-1, 1)
y_test.shape

(268, 1)

In [42]:
implemented_model = linearRegression()
implemented_model.fit_model(X_train, y_train)
implemented_model_y_pred = implemented_model.predict_model(X_test)
implemented_model_accuracy = r2_score(y_test, implemented_model_y_pred)
implemented_model_accuracy

0.7294607670384561

In [None]:
# adjust hyperparameters learning rate and number of iterations
learning_rate = [0.01, 0.03, 0.05, 0.07, 0.09, 0.1, 0.3, 0.5, 0.7, 0.9]
number_of_iterations = [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000]
accuracies = []
for lr in learning_rate:
  for n_iters in number_of_iterations:
    hyperparameter_tuned_model = linearRegression(lr, n_iters)
    hyperparameter_tuned_model.fit_model(X_train, y_train)
    y_prediciton = hyperparameter_tuned_model.predict_model(X_test)
    accuracies.append(r2_score(y_test, y_prediciton))
print(np.max(accuracies))

In [None]:
sklearn_model = LinearRegression()
sklearn_model.fit(X_train, y_train)
sklearn_y_pred = sklearn_model.predict(X_test)
sklearn_accuracy = r2_score(y_test, sklearn_y_pred)
sklearn_accuracy

0.7833463107364538