# Introduction to sklearn 

Useful functions :

0. An end to end Scikit-learn workflow
1. Getting the data ready
2. Choose the right estimator/algo for our problem
3. Fit the model/algo and use it to make predictions
4. Evaluating the model
5. Improve the model
6. Save and load a trained model
7. Putting it all together

## 0. An end to end Scikit-learn workflow

In [None]:
import numpy as np

In [None]:
# 1. Get the data ready
import pandas as pd
import matplotlib.pyplot as plt
# IPython magic: render matplotlib figures inside the notebook output
%matplotlib inline
# Load the heart-disease CSV; the path is relative to the notebook's working directory
heart = pd.read_csv("data/heart-disease.csv")
# Last expression of the cell: display the DataFrame
heart

In [None]:
import warnings 
# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides deprecation warnings from pandas and
# scikit-learn — consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

In [None]:
# Labels (y): the "target" column the model learns to predict
y = heart["target"]

# Features matrix (X): every column except the label
X = heart.drop(columns=["target"])

In [None]:
# 2. Choose the right model and hyperparameters
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier()

# No arguments were passed, so the estimator keeps its default hyperparameters;
# get_params() returns them as a dict for inspection.
clf.get_params()

In [None]:
# 3. Fit the model to the data
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. No random_state is passed and no seed
# has been set yet, so the split differs from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [None]:
# Train on the training split only; the trailing semicolon suppresses the cell's repr output
clf.fit(X_train, y_train);

In [None]:
# Make a prediction for a single sample.
# predict() expects 2D input of shape (n_samples, n_features) with the same
# feature columns the model was fitted on, so a bare 1D array such as
# np.array([0, 2, 3, 4]) raises a ValueError. Selecting one test row with a
# list indexer keeps it as a one-row DataFrame with the right columns.
y_label = clf.predict(X_test.iloc[[0]])

In [None]:
# Predict a label for every row of the held-out test features
y_preds = clf.predict(X_test)
# Display the predictions so they can be compared with y_test in the next cell
y_preds

In [None]:
# True labels of the test split, for comparison with y_preds
y_test

In [None]:
# 4. Evaluate the model
# Score on the data the model was trained on (for classifiers, score() is mean accuracy)
clf.score(X_train, y_train)

In [None]:
# Score on the held-out test split — the figure that reflects performance on unseen data
clf.score(X_test, y_test)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Text summary of per-class metrics, comparing true labels with predictions
print(classification_report(y_test, y_preds))

In [None]:
# Counts of true vs. predicted labels (rows: true class, columns: predicted class)
confusion_matrix(y_test, y_preds)

In [None]:
# Fraction of test predictions that match the true labels
accuracy_score(y_test, y_preds)

In [None]:
# 5. Improve the model
# Try different numbers of trees (10, 20, ..., 90) and report test accuracy for each.
# Note: clf is rebound on every pass, so after the loop it is the last model tried.

np.random.seed(42)
for i in range(10, 100, 10):
    print(f"Trying model with {i} estimators...")
    clf = RandomForestClassifier(n_estimators=i)
    clf.fit(X_train, y_train)
    print(f"Model accuracy on test set: {clf.score(X_test, y_test) * 100:.2f}%")
    print(" ")

In [None]:
# 6. Save the model and load it
import pickle

# Write through a context manager so the file is flushed and closed even if
# pickling fails; a bare open(...) passed to dump() leaves the handle open
# until it happens to be garbage-collected.
with open("randomForestModel_1.pk1", "wb") as model_file:
    pickle.dump(clf, model_file)

In [None]:
# Reload the pickled classifier, closing the file handle as soon as it is read.
# NOTE: only unpickle files you created yourself — pickle can run arbitrary
# code while loading.
with open("randomForestModel_1.pk1", "rb") as model_file:
    loadedModel = pickle.load(model_file)
# The reloaded model should score the same as the one that was saved
loadedModel.score(X_test, y_test)

## 1. Getting data ready

The main things to do:
 1. Split the data into features and labels (usually 'X' & 'y')
 2. Filling/Imputing missing values
 3. Convert non-numerical values to numerical values (encoding)

In [None]:
# Peek at the first three rows of the heart-disease data
heart.head(3)

In [None]:
# Features: every column except the "target" label
X = heart.drop(columns=["target"])
X.head(3)


In [None]:
# Labels: the "target" column as a Series
y = heart["target"]
y.head(3)

In [None]:
# Split the data into training and test sets (20% of rows held out for testing)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [None]:
# Sanity check: the row counts of each feature split and its label split should match
X_train.shape, X_test.shape, y_train.shape, y_test.shape

## Make it Numerical

In [None]:
# Load the extended car-sales dataset (path is relative to the notebook's working directory)
carSales = pd.read_csv("data/car-sales-extended.csv")
carSales.head(3)

In [None]:
# Per-column dtypes: shows which columns are not yet numeric
carSales.dtypes

In [None]:
# Separate the label ("Price") from the features
y = carSales["Price"]
X = carSales.drop(columns=["Price"])

# Hold out 20% of the rows as a test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [None]:
from sklearn.ensemble import RandomForestRegressor

# NOTE(review): X still contains the Make and Colour columns, which the
# encoding cell below treats as categorical. Presumably they hold strings, in
# which case this fit raises a ValueError — that failure is what motivates the
# one-hot encoding step. Verify against the CSV.
model =RandomForestRegressor()
model.fit(X_train, y_train)
model.score(X_test, y_test)

In [None]:
# Encode the categorical columns as one-hot vectors
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Columns to encode; all other columns pass through unchanged
categoricalFeatures = ["Make", "Colour", "Doors"]
OneHot = OneHotEncoder()
tranformer = ColumnTransformer(
    transformers=[("OneHot", OneHot, categoricalFeatures)],
    remainder="passthrough",
)
tranformedX = tranformer.fit_transform(X)
tranformedX

In [None]:
# Wrap the encoded matrix in a DataFrame purely to make it easier to read
pd.DataFrame(tranformedX)

In [None]:
# The un-encoded features, for comparison with the encoded matrix above
X.head(5)

In [None]:
# Alternative to OneHotEncoder: pandas' own one-hot encoding of the same columns.
# NOTE(review): presumably Doors is stored as a number in the CSV, in which case
# get_dummies leaves it as a single numeric column instead of encoding it — verify.
dummy = pd.get_dummies(carSales[["Make", "Colour", "Doors"]])
dummy

In [None]:
# Let's refit the model, this time on the encoded features
# Seed NumPy's global RNG so the split (and the forest) are reproducible
np.random.seed(42)
X_train, X_test, y_train, y_test = train_test_split(tranformedX, y, test_size=0.2)
model.fit(X_train, y_train);

In [None]:
# Score of the refitted regressor on the held-out encoded test split (R^2 for regressors)
model.score(X_test, y_test)

### Missing Values

1. Fill them with some values
2. Remove the samples with missing data altogether

In [None]:
# Load the variant of the car-sales data that contains missing values
carMiss = pd.read_csv("data/car-sales-extended-missing-data.csv")
# Count the missing values in each column
carMiss.isna().sum()

In [None]:
# Split into label (y) and features (X)
y = carMiss["Price"]
X = carMiss.drop(columns=["Price"])

In [None]:
# Let's convert our data to numbers:
# turn the categories into numbers
# NOTE(review): X still contains the missing values counted above. Depending on
# the installed scikit-learn version, OneHotEncoder may either raise on NaN or
# encode it as its own category — confirm which behaviour applies here.
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categoricalFeatures = ["Make", "Colour", "Doors"]
OneHot = OneHotEncoder()
tranformer = ColumnTransformer([("OneHot", 
                                OneHot, 
                                categoricalFeatures)],
                                remainder="passthrough")
tranformedX = tranformer.fit_transform(X)
tranformedX

In [None]:
# Display the raw frame, including its missing entries
carMiss

#### Fill missing data with pandas

In [None]:
# Frequency of each door count — used to pick a fill value for missing Doors
carMiss["Doors"].value_counts()

In [None]:
# Fill the text columns with an explicit "missing" category.
# The results are assigned back rather than calling fillna(..., inplace=True)
# on the selected column: that form is chained assignment, which emits a
# FutureWarning in pandas 2.x and leaves carMiss unchanged once copy-on-write
# is in effect.
carMiss["Make"] = carMiss["Make"].fillna("missing")
carMiss["Colour"] = carMiss["Colour"].fillna("missing")

# Numeric column: replace gaps with the column mean
carMiss["Odometer (KM)"] = carMiss["Odometer (KM)"].fillna(carMiss["Odometer (KM)"].mean())

# Doors: fill with 4 (compare with the value_counts() output of the previous cell)
carMiss["Doors"] = carMiss["Doors"].fillna(4)

In [None]:
# Re-count missing values per column after the fills
carMiss.isna().sum()

In [None]:
# Remove rows with missing Price values.
# dropna() without a subset drops rows with a NaN in any column; after the
# fills above, Price should be the only column that still has gaps.
carMiss.dropna(inplace=True)
carMiss.isna().sum()

In [None]:
# Rebuild label and features from the cleaned frame
y = carMiss["Price"]
X = carMiss.drop(columns=["Price"])

In [None]:
# Turn the categories into numbers
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categoricalFeatures = ["Make", "Colour", "Doors"]
OneHot = OneHotEncoder()
tranformer = ColumnTransformer([("OneHot",
                                OneHot,
                                categoricalFeatures)],
                                remainder="passthrough")
# Encode the features only. Passing the whole carMiss frame would let
# remainder="passthrough" copy the "Price" label into the feature matrix,
# leaking the target into any model trained on tranformedX.
tranformedX = tranformer.fit_transform(X)
tranformedX

### Option 2: Fill missing values with Scikit-Learn

In [None]:
# Reload the data from disk so this option starts again from the unfilled values
carMiss = pd.read_csv("data/car-sales-extended-missing-data.csv")
carMiss.isna().sum()

In [None]:
# Drop rows with no label: only rows whose Price is NaN are removed (subset=["Price"])
carMiss.dropna(subset=["Price"], inplace=True)
carMiss.isna().sum()

In [None]:
# Label first, then the features that still need imputing
y = carMiss["Price"]
X = carMiss.drop(columns=["Price"])

In [None]:
# Fill missing values with scikit-learn
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

# Columns grouped by the fill strategy they need
catFeature = ["Make", "Colour"]
doorFeature = ["Doors"]
numFeature = ["Odometer (KM)"]

# Text columns get the constant "missing", Doors gets the constant 4,
# and the numeric column gets its mean
catImputer = SimpleImputer(strategy="constant", fill_value="missing")
doorImputer = SimpleImputer(strategy="constant", fill_value=4)
numImputer = SimpleImputer(strategy="mean")

# One imputer (something that fills in missing data) per column group
imputer = ColumnTransformer(
    transformers=[
        ("catImputer", catImputer, catFeature),
        ("doorImputer", doorImputer, doorFeature),
        ("numImputer", numImputer, numFeature),
    ]
)

# Fit the imputers on X and return the filled array
filledX = imputer.fit_transform(X)
filledX

In [None]:
# Put the filled array back into a DataFrame. The column names follow the order
# in which the imputer lists its column groups: Make, Colour, Doors, Odometer (KM).
carSalesFilled = pd.DataFrame(filledX, 
                              columns=["Make", "Colour", "Doors", "Odometer (KM)"])
carSalesFilled

In [None]:
# Confirm that no missing values remain after imputation
carSalesFilled.isna().sum()

In [None]:
# One-hot encode the categorical columns of the imputed frame
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Columns to encode; Odometer (KM) passes through unchanged
categoricalFeatures = ["Make", "Colour", "Doors"]
OneHot = OneHotEncoder()
tranformer = ColumnTransformer(
    transformers=[("OneHot", OneHot, categoricalFeatures)],
    remainder="passthrough",
)
transformedX = tranformer.fit_transform(carSalesFilled)
transformedX

In [None]:
# Now we have our data as numbers and filled (no missing values).
# Let's fit a model.
# Seed NumPy's global RNG so the split and the forest are reproducible
np.random.seed(42)
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# y still comes from carMiss after the rows without a Price were dropped
X_train, X_test, y_train, y_test = train_test_split(transformedX, y, test_size=0.2)
model = RandomForestRegressor()
model.fit(X_train, y_train)
# Score on the held-out split (R^2 for regressors)
model.score(X_test, y_test)

In [None]:
# Compare row counts: the imputed frame (rows without a Price were dropped) vs. the car-sales-extended frame
len(carSalesFilled), len(carSales)

## Choosing the right estimator/algorithm for our problem
Some things to note:
* Sklearn refers to machine learning models, algorithms as estimators
* Classification problem - predicting a category (heart disease or not)
   * Sometimes you will see clf (short for classifier) used as a classification estimator
* Regression problem - predicting a number (selling price of a car)

### Picking a machine learning model for regression problem
Let's use the California Housing dataset.

In [None]:
# Get the California Housing dataset (fetch_california_housing downloads it on first use)
from sklearn.datasets import fetch_california_housing
housing = fetch_california_housing()
# Display the returned object; the following cells read its "data", "feature_names" and "target" entries
housing

In [None]:
# Build a DataFrame from the feature array, using the dataset's feature names as column labels
housingDf = pd.DataFrame(housing["data"], columns=housing["feature_names"])
housingDf

In [None]:
# Append the label as a "MedHouseVal" column (dropped again a few cells below in favour of "target")
housingDf["MedHouseVal"] = housing["target"]
housingDf

In [None]:
# Store the same label values under the name "target", which the modelling cells use
housingDf["target"] = housing["target"]
housingDf

In [None]:
# Drop the duplicate label column so the label is not left among the features.
# Re-running this cell raises a KeyError because the column is already gone.
housingDf = housingDf.drop("MedHouseVal", axis=1)

In [None]:
# Final frame: the feature columns plus a single "target" label column
housingDf

In [None]:
# Import the algorithm
from sklearn.linear_model import Ridge

# Seed NumPy's global RNG so the train/test split below is reproducible
np.random.seed(42)

# Create the data: features are every column except "target"
X = housingDf.drop("target", axis=1)
y = housingDf["target"]

# Split into train and test sets (20% of rows held out for testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Instantiate and fit the model (on the training set)
model = Ridge()
model.fit(X_train, y_train)

# Check the score of the model (on the test set); for regressors score() is R^2
model.score(X_test, y_test)

# Scores presumably noted from earlier runs with other estimators:
# Lasso = 0.2841671821008396
# ElasticNet = 0.41655189098028245
# SVR(kernel="linear") = -0.01648536010717372
# EnsembleRegressors = 0.9345833333333333
#   NOTE(review): presumably recorded from a HistGradientBoostingClassifier run
#   on make_hastie_10_2, i.e. an accuracy on synthetic data rather than an R^2
#   on the housing data — verify before comparing it with the other scores.
    # RandomForest = 0.8065734772187598
# SVR(kernel="rbf") = (not recorded)

In [None]:
# Using a support vector machine regressor
from sklearn import svm

# Seed NumPy's global RNG so the train/test split below is reproducible
np.random.seed(42)

# Create the data: features are every column except "target"
X = housingDf.drop("target", axis=1)
y = housingDf["target"]

# Split into train and test sets (20% of rows held out for testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Instantiate and fit the model (on the training set), using the RBF kernel
model = svm.SVR(kernel="rbf")
model.fit(X_train, y_train)

# Check the score of the model (on the test set); for regressors score() is R^2
model.score(X_test, y_test)

In [None]:
# Using an ensemble model (a combination of smaller models that together try
# to make better predictions than any single model).
# This section compares regressors on the California Housing data, so the
# regression variant of histogram gradient boosting is fitted on that same
# data. A classifier scored on make_hastie_10_2 reports accuracy on unrelated
# synthetic data, which cannot be compared with the R^2 scores of the other cells.

from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.datasets import make_hastie_10_2

# Seed NumPy's global RNG so the train/test split below is reproducible
np.random.seed(42)

# Create the data: features are every column except "target"
X = housingDf.drop("target", axis=1)
y = housingDf["target"]

# Split into train and test sets (20% of rows held out for testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Instantiate and fit the model (on the training set)
model = HistGradientBoostingRegressor().fit(X_train, y_train)

# Check the score of the model (on the test set); for regressors score() is R^2
model.score(X_test, y_test)


In [None]:
# Random forest regressor on the California Housing data
from sklearn.ensemble import RandomForestRegressor

# Seed NumPy's global RNG so the split and the forest are reproducible
np.random.seed(42)

# Label, then features (every column except the label)
y = housingDf["target"]
X = housingDf.drop(columns=["target"])

# Hold out 20% of the rows for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# fit() returns the estimator itself, so construction and training can be chained
model = RandomForestRegressor().fit(X_train, y_train)

# R^2 on the held-out test split
model.score(X_test, y_test)

## Picking Machine Learning Model for Classification