# Introduction to Scikit-learn (sklearn)

<mark>This notebook demonstrates some of the most useful 
function of beautiful Scikit-Learn Library</mark>

What we're going to cover:

0. An end-to-end Scikit-Learn workflow
1. Getting the data ready
2. Choose the right estimator/algorithm for our problems
3. Fit the model/alogrithm and use it to make predictions on our data
4. Evaulating a model
5. Improce a model
6. Save and load a trained model
7. Putting it all together!

In [1]:
# Standart imports
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt

0. and end-to-end Scikit-learn workflow

In [2]:
# 1. Get the data ready 
! gdown --id 18i_UCl4ejQT4lA-s-IPZ7xgWJRtDC6Nq
heart_disease = pd.read_csv("heart-disease.csv")
heart_disease

Downloading...
From: https://drive.google.com/uc?id=18i_UCl4ejQT4lA-s-IPZ7xgWJRtDC6Nq
To: /content/heart-disease.csv
100% 11.3k/11.3k [00:00<00:00, 13.6MB/s]


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0


In [3]:
# We want to classify that some one has heart disease or not.
# Create x (features matrix)
x = heart_disease.drop("target",axis=1)

# Create y (labels)
y = heart_disease.target

In [4]:
# import warnings
# warnings.filterwarnings("ignore") it is generally not a good practice to ignore all warnings, as they can indicate potential issues with your code that you should addres

In [5]:
# 2. Choose the wright model and hyperparameters 
# RandomForestClassifier is used when it has to made decision on True or false, or Male or Female
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier()

# we'll keep the default hyperparameters
clf.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [6]:
# Fit the model to the training data means training the model to learn the relationships and patterns in the data
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x,y,test_size = 0.2)

In [7]:
# hey classification model randomforest find patterns in training data
clf.fit(x_train, y_train);

In [8]:
#3. make a prediction # Error due to row and colum got different dimension
# y_label = clf.predict(np.array([0,2,3,4]))

In [9]:
y_preds = clf.predict(x_test) # y_predict is convensional name for prection on test data
y_preds

array([0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1,
       1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0,
       0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1])

In [10]:
y_test

288    0
124    1
274    0
154    1
212    0
      ..
55     1
37     1
183    0
29     1
56     1
Name: target, Length: 61, dtype: int64

In [11]:
# 4. Evaluate the model on the training data and test data.
#   here model as found trainig data so well that it got 100 percent.
clf.score(x_train, y_train)

1.0

In [12]:
# it pridict 3/4 data ,others it does not seen the data so accuary is low 
clf.score(x_test, y_test)

0.8688524590163934

In [13]:
# y_test data are actual output and y_predict are the predicted data
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
print(classification_report(y_test,y_preds))

              precision    recall  f1-score   support

           0       0.84      0.90      0.87        29
           1       0.90      0.84      0.87        32

    accuracy                           0.87        61
   macro avg       0.87      0.87      0.87        61
weighted avg       0.87      0.87      0.87        61



In [14]:
accuracy_score(y_test, y_preds)

0.8688524590163934

In [15]:
confusion_matrix(y_test, y_preds)

array([[26,  3],
       [ 5, 27]])

In [16]:
from random import Random
# 5. improve a model
# Try different amount of n_estimators
np.random.seed(42)
for i in range(10,100,10):
  print(f"Trying model with {i} estimators...")
  clf = RandomForestClassifier(n_estimators = i).fit(x_train,y_train)
  print(f"Model accuracy on test set: {clf.score(x_test,y_test) *100:.2f}%")
  print(" ")

Trying model with 10 estimators...
Model accuracy on test set: 78.69%
 
Trying model with 20 estimators...
Model accuracy on test set: 85.25%
 
Trying model with 30 estimators...
Model accuracy on test set: 86.89%
 
Trying model with 40 estimators...
Model accuracy on test set: 90.16%
 
Trying model with 50 estimators...
Model accuracy on test set: 86.89%
 
Trying model with 60 estimators...
Model accuracy on test set: 81.97%
 
Trying model with 70 estimators...
Model accuracy on test set: 85.25%
 
Trying model with 80 estimators...
Model accuracy on test set: 81.97%
 
Trying model with 90 estimators...
Model accuracy on test set: 83.61%
 


In [17]:
#6. save a model and load it
import pickle

pickle.dump(clf, open("random_forest_model_1.pkl","wb"))

In [18]:
loaded_model = pickle.load(open("random_forest_model_1.pkl","rb"))

In [19]:
loaded_model.score(x_test,y_test)

0.8360655737704918

In [20]:
import sklearn
sklearn.show_versions()


System:
    python: 3.8.10 (default, Nov 14 2022, 12:59:47)  [GCC 9.4.0]
executable: /usr/bin/python3
   machine: Linux-5.10.147+-x86_64-with-glibc2.29

Python dependencies:
          pip: 22.0.4
   setuptools: 57.4.0
      sklearn: 1.0.2
        numpy: 1.21.6
        scipy: 1.7.3
       Cython: 0.29.33
       pandas: 1.3.5
   matplotlib: 3.2.2
       joblib: 1.2.0
threadpoolctl: 3.1.0

Built with OpenMP: True


## 1. Getting our data ready to be used with machine learning

Three main things we have to do:
  1. Split the data into features and labels(usually 'x','y') or input data and output data
  2. Filling (also called imputing) or disrefarding missing values
  3. Converting non-numerical values to numerical values (also called feature encoding)

In [21]:
heart_disease.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [22]:
# input data or features data
x = heart_disease.drop("target", axis =1)

# Actual Output data or Label data
y = heart_disease["target"]

In [23]:
# Split the data into training and test sets
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2)

In [24]:
# Fit model to the training data # 80% of data are given for training data in the model
x_train

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
70,54,1,2,120,258,0,0,147,0,0.4,1,0,3
177,64,1,2,140,335,0,1,158,0,0.0,2,0,2
284,61,1,0,140,207,0,0,138,1,1.9,2,1,3
29,53,1,2,130,197,1,0,152,0,1.2,0,0,2
71,51,1,2,94,227,0,1,154,1,0.0,2,1,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...
50,51,0,2,130,256,0,0,149,0,0.5,2,0,2
229,64,1,2,125,309,0,1,131,1,1.8,1,0,3
37,54,1,2,150,232,0,0,165,0,1.6,2,0,3
157,35,1,1,122,192,0,1,174,0,0.0,2,0,2


In [25]:
x_test # 20% of data are given for testing

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
186,60,1,0,130,253,0,1,144,1,1.4,2,1,3
101,59,1,3,178,270,0,0,145,0,4.2,0,0,3
131,49,0,1,134,271,0,1,162,0,0.0,1,0,2
129,74,0,1,120,269,0,0,121,1,0.2,2,1,2
106,69,1,3,160,234,1,0,131,0,0.1,1,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
198,62,1,0,120,267,0,1,99,1,1.8,1,2,3
242,64,1,0,145,212,0,0,132,0,2.0,1,2,1
24,40,1,3,140,199,0,1,178,1,1.4,2,0,3
275,52,1,0,125,212,0,1,168,0,1.0,2,2,3


In [26]:
y_train # 80% of data are given for training

70     1
177    0
284    0
29     1
71     1
      ..
50     1
229    0
37     1
157    1
237    0
Name: target, Length: 242, dtype: int64

In [27]:
y_test # 20% of data are given for testing

186    0
101    1
131    1
129    1
106    1
      ..
198    0
242    0
24     1
275    0
235    0
Name: target, Length: 61, dtype: int64

In [29]:
x_train.shape, x_test.shape, y_train.shape, y_test.shape 

((242, 13), (61, 13), (242,), (61,))

1.1 Make Sure it's all numerical

In [30]:
# downloading required dataset from drive
! gdown --id 1aW8w9km0zNu4dxa5_PdhTpMwnFcwfUoU

Downloading...
From: https://drive.google.com/uc?id=1aW8w9km0zNu4dxa5_PdhTpMwnFcwfUoU
To: /content/car-sales-extended.csv
100% 26.3k/26.3k [00:00<00:00, 36.7MB/s]


In [31]:
car_sales = pd.read_csv("car-sales-extended.csv")
car_sales.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431,4,15323
1,BMW,Blue,192714,5,19943
2,Honda,White,84714,4,28343
3,Toyota,White,154365,4,13434
4,Nissan,Blue,181577,3,14043


In [32]:
len(car_sales)

1000

In [55]:
car_sales.Doors.value_counts()# we have 3 differnt category here that is 4,5,6

4    856
5     79
3     65
Name: Doors, dtype: int64

In [64]:
car_sales["Odometer (KM)"].value_counts

<bound method IndexOpsMixin.value_counts of 0       35431
1      192714
2       84714
3      154365
4      181577
        ...  
995     35820
996    155144
997     66604
998    215883
999    248360
Name: Odometer (KM), Length: 1000, dtype: int64>

In [33]:
car_sales.dtypes

Make             object
Colour           object
Odometer (KM)     int64
Doors             int64
Price             int64
dtype: object

In [46]:
# Split into x and y
x = car_sales.drop("Price",axis = 1)
y = car_sales.Price

# Split into training and testing
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2)

In [50]:
# Build machine learning model
# RandomForestRegressor is used to predict the number based on input x dataset
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor()
model.fit(x_train,y_train)
model.score(x_test,y_test)

ValueError: ignored

In [57]:
# Turn the categories into numbers
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categorial_features = ["Make","Colour","Doors"]
# instance of class or objects of class
one_hot = OneHotEncoder()
transformer = ColumnTransformer([("one_hot",
                                  one_hot,
                                  categorial_features)],
                                  remainder = "passthrough")
# converting x into numbers
transformed_x = transformer.fit_transform(x)
transformed_x

array([[0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 3.54310e+04],
       [1.00000e+00, 0.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        1.00000e+00, 1.92714e+05],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 8.47140e+04],
       ...,
       [0.00000e+00, 0.00000e+00, 1.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 6.66040e+04],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 2.15883e+05],
       [0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 2.48360e+05]])

In [58]:
pd.DataFrame(transformed_x)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,35431.0
1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,192714.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,84714.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,154365.0
4,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,181577.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,35820.0
996,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,155144.0
997,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,66604.0
998,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,215883.0


In [60]:
#another way of doing
# list of columns
dummies = pd.get_dummies(car_sales[["Make","Colour","Doors"]])
dummies

Unnamed: 0,Doors,Make_BMW,Make_Honda,Make_Nissan,Make_Toyota,Colour_Black,Colour_Blue,Colour_Green,Colour_Red,Colour_White
0,4,0,1,0,0,0,0,0,0,1
1,5,1,0,0,0,0,1,0,0,0
2,4,0,1,0,0,0,0,0,0,1
3,4,0,0,0,1,0,0,0,0,1
4,3,0,0,1,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...
995,4,0,0,0,1,1,0,0,0,0
996,3,0,0,1,0,0,0,0,0,1
997,4,0,0,1,0,0,1,0,0,0
998,4,0,1,0,0,0,0,0,0,1


In [68]:
#le'ts refit the model 
np.random.seed(42)
x_train, x_test, y_train, y_test = train_test_split(transformed_x,y,test_size=0.2)

In [69]:
model.fit(x_train,y_train)
model.score(x_test,y_test)

0.3235867221569877