## Intro to scikit learn
this notebook covers  some of the most useful functions of the scikit library

whats going to be covered:


0. workflow of scikit learn
1. getting the data ready
2. choose right estimator/algorithm for our problems
3. fit the model/algo and use it to make predictions on the data
4. evaluating a model
5. improve a model
6. Save and load a trained model
7. Putting it all together

## 0 an end end to scikit workflow

In [6]:
#1 Getting the Data Ready
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
hd=pd.read_csv("heart-disease (1).csv")
hd

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0


In [3]:
## Create X which is called feature matrix,data
X=hd.drop("target",axis=1)
## create Y which is called the labels 
Y=hd["target"]

In [4]:
##2 Choose the right the model and hyper parameters
from sklearn.ensemble import RandomForestClassifier
clf=RandomForestClassifier()
clf.get_params()


{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'monotonic_cst': None,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

## Getting Data Ready

## 3 main things 
1. Split into feature,label column usually X and y
2. Filling aka imputing or disregarding missing values
3. Convert non numerical values to numerical values

In [7]:
hd.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [8]:
X=hd.drop("target",axis=1)
X.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2


In [9]:
y=hd["target"]

In [10]:
y

0      1
1      1
2      1
3      1
4      1
      ..
298    0
299    0
300    0
301    0
302    0
Name: target, Length: 303, dtype: int64

In [11]:
##splitting into training and test sets
from sklearn.model_selection import train_test_split
## returns 4 values 
Xtrain,Xtest,ytrain,ytest=train_test_split(X,y,test_size=0.2)

In [12]:
Xtrain

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
188,50,1,2,140,233,0,1,163,0,0.6,1,1,3
157,35,1,1,122,192,0,1,174,0,0.0,2,0,2
213,61,0,0,145,307,0,0,146,1,1.0,1,0,3
274,47,1,0,110,275,0,0,118,1,1.0,1,1,2
59,57,0,0,128,303,0,0,159,0,0.0,2,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
212,39,1,0,118,219,0,1,140,0,1.2,1,0,3
209,59,1,0,140,177,0,1,162,1,0.0,2,1,3
48,53,0,2,128,216,0,0,115,0,0.0,2,0,0
9,57,1,2,150,168,0,1,174,0,1.6,2,0,2


In [13]:
Xtest

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
199,65,1,0,110,248,0,0,158,0,0.6,2,2,1
257,50,1,0,144,200,0,0,126,1,0.9,1,0,3
261,52,1,0,112,230,0,1,160,0,0.0,2,1,2
87,46,1,1,101,197,1,1,156,0,0.0,2,0,3
277,57,1,1,124,261,0,1,141,0,0.3,2,0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...
57,45,1,0,115,260,0,0,185,0,0.0,2,0,2
302,57,0,1,130,236,0,0,174,0,0.0,1,1,2
37,54,1,2,150,232,0,0,165,0,1.6,2,0,3
231,57,1,0,165,289,1,0,124,0,1.0,1,3,3


In [15]:
#1.1 Making sure all data is all numerical
carsales=pd.read_csv("car-sales-extended.csv")
carsales

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431,4,15323
1,BMW,Blue,192714,5,19943
2,Honda,White,84714,4,28343
3,Toyota,White,154365,4,13434
4,Nissan,Blue,181577,3,14043
...,...,...,...,...,...
995,Toyota,Black,35820,4,32042
996,Nissan,White,155144,3,5716
997,Nissan,Blue,66604,4,31570
998,Honda,White,215883,4,4001


In [17]:
carsales.dtypes

Make             object
Colour           object
Odometer (KM)     int64
Doors             int64
Price             int64
dtype: object

In [19]:
## Split into x and y 
X=carsales.drop("Price",axis=1)
y=carsales["Price"]
Xtest,Xtrain,ytrain,ytest=train_test_split(X,y,test_size=0.2)

In [20]:
## build a machine learning model
from sklearn.ensemble import RandomForestRegressor
model=RandomForestRegressor()
model.fit(Xtrain,ytrain)
model.score(Xtest,ytest)

ValueError: could not convert string to float: 'Toyota'

In [27]:
## convert strings into number int
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
catFeatures=["Make","Colour","Doors"]
one_hot=OneHotEncoder()
transformer=ColumnTransformer([("one_hot",one_hot,catFeatures)],remainder="passthrough")
transformedX=transformer.fit_transform(X)
transformedX
pd.DataFrame(transformedX)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,35431.0
1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,192714.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,84714.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,154365.0
4,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,181577.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,35820.0
996,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,155144.0
997,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,66604.0
998,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,215883.0


In [26]:
dummies=pd.get_dummies(carsales[["Make","Doors","Colour"]],dtype="int")
dummies

Unnamed: 0,Doors,Make_BMW,Make_Honda,Make_Nissan,Make_Toyota,Colour_Black,Colour_Blue,Colour_Green,Colour_Red,Colour_White
0,4,0,1,0,0,0,0,0,0,1
1,5,1,0,0,0,0,1,0,0,0
2,4,0,1,0,0,0,0,0,0,1
3,4,0,0,0,1,0,0,0,0,1
4,3,0,0,1,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...
995,4,0,0,0,1,1,0,0,0,0
996,3,0,0,1,0,0,0,0,0,1
997,4,0,0,1,0,0,1,0,0,0
998,4,0,1,0,0,0,0,0,0,1


In [31]:
np.random.seed(42)
Xtrain,Xtest,ytrain,ytest=train_test_split(transformedX,y,test_size=0.2)
model.fit(Xtrain,ytrain)
model.score(Xtest,ytest)


0.3235867221569877

In [33]:
##gives lower result as it doesnt feed the remaining columns only the 3 we selected i.e Make ,Door,Colour
np.random.seed(40)
Xtrainn,Xtestt,ytrainn,ytestt=train_test_split(dummies,y,test_size=0.2)
model.fit(Xtrainn,ytrainn)
model.score(Xtestt,ytestt)

0.13627023446779662

In [35]:
carsalesmissing=pd.read_csv("car-sales-extended-missing-data.csv")
carsalesmissing.head(20)

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0
5,Honda,Red,42652.0,4.0,23883.0
6,Toyota,Blue,163453.0,4.0,8473.0
7,Honda,White,,4.0,20306.0
8,,White,130538.0,4.0,9374.0
9,Honda,Blue,51029.0,4.0,26683.0


In [37]:
carsalesmissing.isna().sum()

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

In [38]:
X=carsalesmissing.drop("Price",axis=1)
y=carsalesmissing["Price"]
## convert strings into number int
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
catFeatures=["Make","Colour","Doors"]
one_hot=OneHotEncoder()
transformer=ColumnTransformer([("one_hot",one_hot,catFeatures)],remainder="passthrough")
transformedX=transformer.fit_transform(X)
transformedX
pd.DataFrame(transformedX)

Unnamed: 0,0
0,<Compressed Sparse Row sparse matrix of dtype ...
1,<Compressed Sparse Row sparse matrix of dtype ...
2,<Compressed Sparse Row sparse matrix of dtype ...
3,<Compressed Sparse Row sparse matrix of dtype ...
4,<Compressed Sparse Row sparse matrix of dtype ...
...,...
995,<Compressed Sparse Row sparse matrix of dtype ...
996,<Compressed Sparse Row sparse matrix of dtype ...
997,<Compressed Sparse Row sparse matrix of dtype ...
998,<Compressed Sparse Row sparse matrix of dtype ...


In [39]:
np.random.seed(42)
Xtrain,Xtest,ytrain,ytest=train_test_split(transformedX,y,test_size=0.2)
model.fit(Xtrain,ytrain)
model.score(Xtest,ytest)

ValueError: Input y contains NaN.

## MISSING VALUES
We can either fill the missing values or just remove them

In [41]:

carsalesmissing.fillna({"Make":"Missing"},inplace=True)

In [42]:
carsalesmissing.isna().sum()

Make              0
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

In [43]:
carsalesmissing.fillna({"Colour":"Missing"},inplace=True)

In [45]:
carsalesmissing.fillna({"Odometer (KM)":carsalesmissing["Odometer (KM)"].mean()},inplace=True)

In [46]:
carsalesmissing.isna().sum()

Make              0
Colour            0
Odometer (KM)     0
Doors            50
Price            50
dtype: int64

In [47]:
carsalesmissing.fillna({"Doors":4},inplace=True)

In [50]:
carsalesmissing.isna().sum()
##Remove rows with missing price value

carsalesmissing.dropna(inplace=True)

In [51]:
carsalesmissing.isna().sum()

Make             0
Colour           0
Odometer (KM)    0
Doors            0
Price            0
dtype: int64

In [53]:
X=carsalesmissing.drop("Price",axis=1)
y=carsalesmissing["Price"]

In [61]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
catFeatures=["Make","Colour","Doors"]
one_hot=OneHotEncoder(sparse_output=False)
transformer=ColumnTransformer([("one_hot",one_hot,catFeatures)],remainder="passthrough")
transformedX=transformer.fit_transform(X)
transformedX
pd.DataFrame(transformedX)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,35431.0
1,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,192714.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,84714.0
3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,154365.0
4,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,181577.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
945,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,35820.0
946,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,155144.0
947,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,66604.0
948,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,215883.0


In [59]:
np.random.seed(42)
Xtrain,Xtest,ytrain,ytest=train_test_split(transformedX,y,test_size=0.2)
model.fit(Xtrain,ytrain)
model.score(Xtest,ytest)

0.22233003292603104