<a href="https://colab.research.google.com/github/nars95/super-project/blob/main/task01.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Importing librarys or packges that we will use in this task

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from joblib import dump, load

### Reading the data and selecting the the features that are our interest in this job.

In [None]:
df_hospital = pd.read_excel('/content/hospital.xls')

In [None]:
df_hospital.head(1)

Unnamed: 0,ID,LastName,Sex,Age,Weight,Smoker,BloodPressure_1,BloodPressure_2,Trials_1,Trials_2,Trials_3,Trials_4
0,YPL-320,SMITH,Male,38,176,True,124,93,18.0,,,


In [None]:
df_hospital.columns

Index(['ID', 'LastName', 'Sex', 'Age', 'Weight', 'Smoker', 'BloodPressure_1',
       'BloodPressure_2', 'Trials_1', 'Trials_2', 'Trials_3', 'Trials_4'],
      dtype='object')

In [None]:
features = ['Sex', 'Age', 'Weight', 'Smoker']
features

['Sex', 'Age', 'Weight', 'Smoker']

In [None]:
df = df_hospital[features]

In [None]:
df

Unnamed: 0,Sex,Age,Weight,Smoker
0,Male,38,176,True
1,Male,43,163,False
2,Female,38,131,False
3,Female,40,133,False
4,Female,49,119,False
...,...,...,...,...
95,Male,25,171,True
96,Male,44,188,True
97,Male,49,186,False
98,Male,45,172,True


In [None]:
df.info

<bound method DataFrame.info of        Sex  Age  Weight  Smoker
0     Male   38     176    True
1     Male   43     163   False
2   Female   38     131   False
3   Female   40     133   False
4   Female   49     119   False
..     ...  ...     ...     ...
95    Male   25     171    True
96    Male   44     188    True
97    Male   49     186   False
98    Male   45     172    True
99    Male   48     177   False

[100 rows x 4 columns]>

### Changing the column sex from string or object to int. Moreover, we'll do similiar thing with the column smoker

In [None]:
sex_map = {'Male':1, 'Female':0}
df.loc[:, 'Sex'] = df.loc[:, 'Sex'].replace(sex_map)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'Sex'] = df.loc[:, 'Sex'].replace(sex_map)
  df.loc[:, 'Sex'] = df.loc[:, 'Sex'].replace(sex_map)


In [None]:
smoker_map ={True:1, False:0}
df.loc[:,'Smoker'] = df.loc[:,'Smoker'].replace(smoker_map)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:,'Smoker'] = df.loc[:,'Smoker'].replace(smoker_map)


In [None]:
df.head()

Unnamed: 0,Sex,Age,Weight,Smoker
0,1,38,176,1
1,1,43,163,0
2,0,38,131,0
3,0,40,133,0
4,0,49,119,0


### Separating the data and the target

In [None]:
data = df.values
target = df_hospital['BloodPressure_1'].values


In [None]:
target

array([124, 109, 125, 117, 122, 121, 130, 115, 115, 118, 114, 115, 127,
       130, 114, 130, 124, 123, 119, 125, 121, 123, 114, 128, 129, 114,
       113, 125, 120, 127, 134, 121, 115, 127, 121, 127, 136, 117, 124,
       120, 128, 116, 132, 137, 117, 116, 119, 123, 116, 124, 129, 130,
       132, 117, 129, 118, 120, 138, 117, 113, 122, 115, 120, 117, 123,
       123, 119, 110, 121, 138, 125, 122, 120, 117, 125, 124, 121, 118,
       120, 118, 118, 122, 134, 131, 113, 125, 135, 128, 123, 122, 138,
       124, 130, 123, 129, 128, 124, 119, 136, 114])

In [None]:
print(data.shape, target.shape)

(100, 4) (100,)


### Normalize our data using StandardScaler from skleran.preprocessing

In [None]:
scaler = StandardScaler()
data_scaler = scaler.fit_transform(data)
data_scaler.shape

(100, 4)

### Let's take a litle bit look how our data after transform it using DataFrame

In [None]:
df_scaler = pd.DataFrame(data_scaler, columns = features)
df_scaler

Unnamed: 0,Sex,Age,Weight,Smoker
0,1.061913,-0.039001,0.832128,1.393261
1,1.061913,0.657450,0.340416,-0.717741
2,-0.941697,-0.039001,-0.869952,-0.717741
3,-0.941697,0.239579,-0.794304,-0.717741
4,-0.941697,1.493193,-1.323841,-0.717741
...,...,...,...,...
95,1.061913,-1.849776,0.643008,1.393261
96,1.061913,0.796741,1.286017,1.393261
97,1.061913,1.493193,1.210368,-0.717741
98,1.061913,0.936031,0.680832,1.393261


### Separating the data into train and test

In [None]:
X, X_test, y, y_test = train_test_split(data_scaler, target, train_size=0.8)

In [None]:
print(X.shape, X_test.shape)

(80, 4) (20, 4)


## Build the linear model using the Linear class from package linear_model from sklear

In [None]:
model = LinearRegression()

### Using cross validation with 5 folds

In [None]:
folds = KFold( n_splits=5, shuffle=True)

In [None]:
def cross_validation(model, kfold):
  """
    Perform cross-validation using the provided model and k-fold object.

    Parameters:
        model (object): The machine learning model to be evaluated.
        kfold (object): The k-fold cross-validation object that splits the data into training and validation sets.

    Returns:
        list: A list containing two lists - mses and persons.
              - mses: Mean squared error (MSE) values for each fold.
              - persons: Pearson correlation coefficients for each fold.

    """
  mses = []
  persons = []
  metrics = [mses, persons]
  for train_index, val_index in kfold.split(X):
    X_train, X_val = X[train_index], X[val_index]
    y_train, y_val = y[train_index], y[val_index]
    model.fit(X_train, y_train)
    y_predict = model.predict(X_val)
    mse = mean_squared_error(y_val, y_predict)
    person = np.corrcoef(y_val, y_predict)
    metrics[0].append(mse)
    metrics[1].append(person)

  return metrics



In [None]:
results = cross_validation(model, folds)

### Average mse and person coeficient for the folds

In [None]:
results[0]

[39.78676010080535,
 12.785503664585624,
 23.511303933098425,
 17.916836018885462,
 29.746120898000605]

In [None]:
results[1]

[array([[1.        , 0.43733649],
        [0.43733649, 1.        ]]),
 array([[1.        , 0.85915575],
        [0.85915575, 1.        ]]),
 array([[1.       , 0.7255693],
        [0.7255693, 1.       ]]),
 array([[1.       , 0.7626764],
        [0.7626764, 1.       ]]),
 array([[1.        , 0.68568074],
        [0.68568074, 1.        ]])]

In [None]:
avg_mse = np.average(results[0])

In [None]:
avg_mse

24.749304923075094

In [None]:
np.average(results[1], axis = 0)

array([[1.        , 0.69408374],
       [0.69408374, 1.        ]])

### MSE and person coeficient int the test data

In [None]:
predict = model.predict(X_test)

In [None]:
mse_test = mean_squared_error(y_test, predict)
mse_test

22.212967412596832

In [None]:
person_test = np.corrcoef(y_test, predict)
person_test

array([[1.        , 0.65714336],
       [0.65714336, 1.        ]])

### Save the model

In [None]:
dump(model, 'linear_model_hospital.joblib')

['linear_model_hospital.joblib']

### load the model to check if really is the same linear model.

In [None]:
linear_model = load('linear_model_hospital.joblib')

In [None]:
np.corrcoef(y_test, linear_model.predict(X_test))

array([[1.        , 0.65714336],
       [0.65714336, 1.        ]])

In [None]:
mean_squared_error(y_test,linear_model.predict(X_test))

22.212967412596832