# Data Description 

This data set contains 416 liver patient records and 167 non liver patient records collected from North East of Andhra Pradesh, India. The "Dataset" column is a class label used to divide groups into liver patient (liver disease) or not (no disease). This data set contains 441 male patient records and 142 female patient records.

Any patient whose age exceeded 89 is listed as being of age "90".

Columns:

* Age of the patient
* Gender of the patient
* Total Bilirubin
* Direct Bilirubin
* Alkaline Phosphotase
* Alamine Aminotransferase
* Aspartate Aminotransferase
* Total Protiens
* Albumin
* Albumin and Globulin Ratio
* Dataset: field used to split the data into two sets (patient with liver disease, or no disease) (liver disease = 1, not liver diseas = 2)

Dataset link in Kaggle :
https://www.kaggle.com/uciml/indian-liver-patient-records/discussion

original notebook workflow :
https://www.kaggle.com/sanjames/liver-patients-analysis-prediction-accuracy


In [1]:
import pandas as pd

liver_df = pd.read_csv('Data/indian_liver_patient.csv')
liver_df.head()

Unnamed: 0,Age,Gender,Total_Bilirubin,Direct_Bilirubin,Alkaline_Phosphotase,Alamine_Aminotransferase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Albumin_and_Globulin_Ratio,Dataset
0,65,Female,0.7,0.1,187,16,18,6.8,3.3,0.9,1
1,62,Male,10.9,5.5,699,64,100,7.5,3.2,0.74,1
2,62,Male,7.3,4.1,490,60,68,7.0,3.3,0.89,1
3,58,Male,1.0,0.4,182,14,20,6.8,3.4,1.0,1
4,72,Male,3.9,2.0,195,27,59,7.3,2.4,0.4,1


In [2]:
#  fast iinformation about the dataFame

liver_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 583 entries, 0 to 582
Data columns (total 11 columns):
Age                           583 non-null int64
Gender                        583 non-null object
Total_Bilirubin               583 non-null float64
Direct_Bilirubin              583 non-null float64
Alkaline_Phosphotase          583 non-null int64
Alamine_Aminotransferase      583 non-null int64
Aspartate_Aminotransferase    583 non-null int64
Total_Protiens                583 non-null float64
Albumin                       583 non-null float64
Albumin_and_Globulin_Ratio    579 non-null float64
Dataset                       583 non-null int64
dtypes: float64(5), int64(5), object(1)
memory usage: 50.2+ KB


In [3]:
liver_df.columns

Index(['Age', 'Gender', 'Total_Bilirubin', 'Direct_Bilirubin',
       'Alkaline_Phosphotase', 'Alamine_Aminotransferase',
       'Aspartate_Aminotransferase', 'Total_Protiens', 'Albumin',
       'Albumin_and_Globulin_Ratio', 'Dataset'],
      dtype='object')

In [4]:
# is there nay missing values inside the data

liver_df.isna().sum()

Age                           0
Gender                        0
Total_Bilirubin               0
Direct_Bilirubin              0
Alkaline_Phosphotase          0
Alamine_Aminotransferase      0
Aspartate_Aminotransferase    0
Total_Protiens                0
Albumin                       0
Albumin_and_Globulin_Ratio    4
Dataset                       0
dtype: int64

In [5]:
# fiiling the missing values inside the Albumin_and_Globulin_Ratio Feature

# we will use the manual func to fill the missed data rather than the ready func OneHotEncoder

liver_df['Albumin_and_Globulin_Ratio'].fillna(value = liver_df['Albumin_and_Globulin_Ratio'].mean() , inplace = True)


#### The data has one categorical feature and we have to convert it to numerical 

In [6]:
from sklearn.preprocessing import OneHotEncoder 
from sklearn.compose import ColumnTransformer
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

categ_features = ['Gender']
one_hot = OneHotEncoder()

transformer = ColumnTransformer([('one_hot',
                                  one_hot,
                                  categ_features)],
                                  remainder = 'passthrough' )

liver_df_transformed = pd.DataFrame(transformer.fit_transform(liver_df) )

"""
 * Gender coverted to ----> columns 0 & 1
 * we just rename the columns with each name after the Gender coverted to numerical 
"""

liver_df_transformed.columns = ['0', '1', 'Age','Total_Bilirubin', 'Direct_Bilirubin',
       'Alkaline_Phosphotase', 'Alamine_Aminotransferase',
       'Aspartate_Aminotransferase', 'Total_Protiens', 'Albumin',
       'Albumin_and_Globulin_Ratio', 'Dataset']
liver_df_transformed.head()

Unnamed: 0,0,1,Age,Total_Bilirubin,Direct_Bilirubin,Alkaline_Phosphotase,Alamine_Aminotransferase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Albumin_and_Globulin_Ratio,Dataset
0,1.0,0.0,65.0,0.7,0.1,187.0,16.0,18.0,6.8,3.3,0.9,1.0
1,0.0,1.0,62.0,10.9,5.5,699.0,64.0,100.0,7.5,3.2,0.74,1.0
2,0.0,1.0,62.0,7.3,4.1,490.0,60.0,68.0,7.0,3.3,0.89,1.0
3,0.0,1.0,58.0,1.0,0.4,182.0,14.0,20.0,6.8,3.4,1.0,1.0
4,0.0,1.0,72.0,3.9,2.0,195.0,27.0,59.0,7.3,2.4,0.4,1.0


#### OK now the data is clean and ready for any computation 

In [7]:

liver_df_transformed.isna().sum()

0                             0
1                             0
Age                           0
Total_Bilirubin               0
Direct_Bilirubin              0
Alkaline_Phosphotase          0
Alamine_Aminotransferase      0
Aspartate_Aminotransferase    0
Total_Protiens                0
Albumin                       0
Albumin_and_Globulin_Ratio    0
Dataset                       0
dtype: int64

#### split the data into train & test 

In [8]:
from sklearn.model_selection import train_test_split
import numpy  as np

np.random.seed(50)
               
"""
 np.random.seed() == train_test_split (...... random_stat = 50) 
"""
x = liver_df_transformed.drop('Dataset' , axis = 1)
y = liver_df_transformed['Dataset']

#  casting float into integer
y = liver_df_transformed['Dataset'].astype(int)

x_train ,  x_test , y_train  , y_test = train_test_split(x , y , test_size = .2)

## ther are many classification models/estimators , so we will try all of them one by one to find the best for our data set 

1. from sklearn.linear_model import LinearRegression
2. from sklearn.linear_model import LogisticRegression
3. from sklearn.svm import SVC, LinearSVC
4. from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier
5. from sklearn.neighbors import KNeighborsClassifier
6. from sklearn.naive_bayes import GaussianNB
7. from sklearn.linear_model import Perceptron
8. from sklearn.linear_model import SGDClassifier
9. from sklearn.tree import DecisionTreeClassifier
10. from sklearn.neural_network import MLPClassifier

## We also will train and evaluat every single model using all evaluating metrics 
 
* accuracy_score . 
* precision_score .
* recall_score .
* f1_score . 
* classification_report .
* confusion_matrix

## 1. linear_model

 *Training the model*

In [9]:
from sklearn.linear_model import LinearRegression 
from sklearn.metrics import  confusion_matrix , classification_report , recall_score , precision_score


lin_mode = LinearRegression(n_jobs= 1 , normalize = True)
lin_mode.fit(x_train , y_train);


*Evaluating the model*

In [10]:
lin_mode.score(x_train , y_train)


0.12197306007044062

In [11]:
lin_score = lin_mode.score(x_test , y_test)
lin_score

-0.06176600425029765

In [17]:
y_predict_1 = lin_mode.predict(x_test)
y_predict_1[:10]

# y_predict_1 = y_predict_1.astype(int)
# lin_evaluation = classification_report(y_test, y_predict_1 , output_dict= True)

# pd.DataFrame(lin_evaluation)  # ??? Why not working ???

array([1.32464234, 0.69865513, 1.43592741, 1.47280503, 1.22637158,
       1.43000794, 1.42395976, 0.92825134, 1.46410501, 1.29802096])

In [18]:
recall_score(y_test , y_predict_1 ,  average = None)

ValueError: Classification metrics can't handle a mix of binary and continuous targets

*Tuning the model Hyperparameters using the RandomizedSearchCV*

In [19]:
from sklearn.model_selection import RandomizedSearchCV

 # dictionary of hyperparameter combination
dict= {
        "n_estimators" : [10 , 100 , 200, 300 ,400 ,1000],
        "max_depth" : [None , 10 , 15 , 40 , 50],
        "max_features" : ['auto', 'sqrt'],
        "min_samples_split" : [3 , 6 , 10 , 16],
        "min_samples_leaf" : [2, 4 , 7]
        
    }   
lin_mode_imporved =  RandomizedSearchCV(estimator = lin_mode,
                                        param_distributions= dict,
                                       n_iter = 10,
                                       cv = 5, 
                                       verbose = 2)
lin_mode_imporved.fit(x_train , y_train)


Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] n_estimators=200, min_samples_split=6, min_samples_leaf=4, max_features=auto, max_depth=50 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


ValueError: Invalid parameter n_estimators for estimator LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=True). Check the list of available parameters with `estimator.get_params().keys()`.

In [20]:
lin_mode.get_params().keys()

dict_keys(['copy_X', 'fit_intercept', 'n_jobs', 'normalize'])

In [21]:
from sklearn.linear_model import LogisticRegression

logistic_model = LogisticRegression()

logistic_model.fit(x_train , y_train);




In [22]:
print(f'logistic_score_train : {logistic_model.score(x_train , y_train) * 100:.2f} %')

logistic_score_train : 73.82 %


In [23]:
print(f'logistic_score_test: {logistic_model.score(x_test , y_test) * 100:.2f} %')

logistic_score_test: 64.96 %


In [24]:
y_prediction_logistic = logistic_model.predict(x_test)

class_report = classification_report(y_test , y_prediction_logistic , output_dict = True)

pd.DataFrame(class_report)

Unnamed: 0,1,2,accuracy,macro avg,weighted avg
f1-score,0.768362,0.280702,0.649573,0.524532,0.597472
precision,0.673267,0.5,0.649573,0.586634,0.61255
recall,0.894737,0.195122,0.649573,0.544929,0.649573
support,76.0,41.0,0.649573,117.0,117.0


In [25]:
confusion_matrix(y_test , y_prediction_logistic )

array([[68,  8],
       [33,  8]], dtype=int64)

 *tuning the hyperparameter to imporve the model score*

In [26]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform

dictionary = {"n_estimators" : [10 , 50 , 100, 200 ,500 , 900],
              "max_depth" : [None , 5 ,10, 15 ,20],
              "max_featurs" : ['auto' , 'sqrt'],
              "min_samples_leaf" : [4 , 2, 5 , 6],
              "min_samples_split" : [3, 4, 5 ,6 ]}




logistic_model_improved = RandomizedSearchCV( logistic_model,
                                             param_distributions = dictionary,
                                             n_iter = 10,
                                             cv = 5,
                                             verbose = 0,
                                             n_jobs= -1
                                             )
logistic_model_improved.fit(x_train , y_train)

ValueError: Invalid parameter n_estimators for estimator LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False). Check the list of available parameters with `estimator.get_params().keys()`.

In [27]:
y_predict_2 = logistic_model_improved.predict(x_test)
classification_report(y_test , y_predict_2)

NotFittedError: This RandomizedSearchCV instance is not fitted yet. Call 'fit' with appropriate arguments before using this method.

 ## 3. SVC

In [33]:

from sklearn.svm import SVC
model_svc = SVC ()

model_svc.fit(x_train , y_train);




In [34]:
model_svc.score(x_train , y_train)

0.9957081545064378

In [36]:
model_svc.score(x_test , y_test)

0.6581196581196581

In [40]:
y_prediction_svc = model_svc.predict(x_test)

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [42]:
confusion_matrix(y_test , y_prediction_svc)

array([[76,  0],
       [40,  1]], dtype=int64)

In [57]:
# another way to display the classification report
print(classification_report(y_test , y_prediction_svc))

              precision    recall  f1-score   support

           1       0.66      1.00      0.79        76
           2       1.00      0.02      0.05        41

    accuracy                           0.66       117
   macro avg       0.83      0.51      0.42       117
weighted avg       0.78      0.66      0.53       117



In [56]:
class_report_svc = classification_report(y_test , y_prediction_svc , output_dict= True)
pd.DataFrame(class_report_svc)

Unnamed: 0,1,2,accuracy,macro avg,weighted avg
f1-score,0.791667,0.047619,0.65812,0.419643,0.530932
precision,0.655172,1.0,0.65812,0.827586,0.776009
recall,1.0,0.02439,0.65812,0.512195,0.65812
support,76.0,41.0,0.65812,117.0,117.0


*svc model hyperparameter tuning  using GridSearchCV* 

   * note : every model like SVC or LinearRegression or Randomforest... or any algorithm
            has it's own hyperparameter which differ from algorithm to another

In [61]:
from sklearn.model_selection import GridSearchCV 
  
# defining parameter range

#  the SVC hyperparameter for example C , gamma , kernel ,which when you need to tuning svc hyperparameter
#   just change this variables manually or using built-in func GridSearchCV or RandomizedSearchCV

hyper_p_dict= {'C': [0.1, 1, 10, 100, 1000],  
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001], 
              'kernel': ['rbf']}  
  
model_svc_improved = GridSearchCV( model_svc,
                                  param_grid = hyper_p_dict,
                                  refit = True,
                                  verbose = 3,
                                  n_jobs= -1) 
  
# fitting the model for grid search 
model_svc_imporved.fit(x_train, y_train) 

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


ValueError: Invalid parameter n_estimators for estimator SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False). Check the list of available parameters with `estimator.get_params().keys()`.

In [62]:
model_svc.get_params()

{'C': 1.0,
 'cache_size': 200,
 'class_weight': None,
 'coef0': 0.0,
 'decision_function_shape': 'ovr',
 'degree': 3,
 'gamma': 'auto_deprecated',
 'kernel': 'rbf',
 'max_iter': -1,
 'probability': False,
 'random_state': None,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

In [67]:
logistic_model.get_params()

{'C': 1.0,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 100,
 'multi_class': 'warn',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': None,
 'solver': 'warn',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}