In [11]:
from sklearn.model_selection import cross_val_score
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

  return f(*args, **kwds)


# Machine Learning Pipeline

 ### iterate all 3 steps till satisfactory model performance:
     step 1: Data Processing -> Feature Extraction & Engineering -> Feature Selection & Scaling
     step 2: Modeling (with ML Algorithm)
     step 3: Model Evaluation & Tuning

     step 4: Deployment & Monitoring

# Types of Machine Learning


 1. Supervised
    the training data includes the solutions called **'labels'**
    
    in classification the label is the class that the sample belongs to (e.g. spam / not spam)
    
    in regression tasks the label is the target value (e.g. the car price)
    

2. Unsupervised
    involves models that describe **'unlabeled'** data
    
    one common case of unsupervised learning is clustering

# Feature Matrix


shape [n_samples, n_features]
often stored in variable X

samples - rows - refers to the individual objects

features - columns - refers to the distinct observations that describe each sample. 
                      can be real values, boolean or discrete-value

# Target Array


shape [n_samples] (or [n_samples,1] when kept as a single column)
usually, but not always, one dimensional with length n_samples

often stored in variable y - called labels or target

may have continuous numerical values or discrete classes/labels

# Data Representation sample

In [9]:
# Load the iris example dataset that ships with seaborn and preview
# its first five rows.
import seaborn as sns

iris = sns.load_dataset(name='iris')
iris.head(5)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [16]:
# To use ML we split the iris frame into the target array y (the
# 'species' column) and the feature matrix X (every other column).
y = iris['species']
X = iris.drop(columns='species')

# Show the container types of X, y and the array behind y, then preview both.
display(type(X), type(y), type(y.values))
X.head(), y.values

pandas.core.frame.DataFrame

pandas.core.series.Series

numpy.ndarray

(   sepal_length  sepal_width  petal_length  petal_width
 0           5.1          3.5           1.4          0.2
 1           4.9          3.0           1.4          0.2
 2           4.7          3.2           1.3          0.2
 3           4.6          3.1           1.5          0.2
 4           5.0          3.6           1.4          0.2,
 array(['setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa',
        'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa',
        'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa',
        'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa',
        'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa',
        'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa',
        'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa',
        'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa',
        'setosa', 'setosa', 'versicolor', 'versicolor', 'versicolor',
        'versicolor', 'versicolor', 'versic

# Hyper-parameters vs Parameters

Hyper-parameters - are those which we supply to the model, for example: the number of hidden nodes and layers, input features, learning rate, activation function etc. in a neural network.

Parameters are those which would be learned by the machine like Weights and Biases.


# Model Evaluation - Cross Validation

typically, we validate that our model is a good fit to the data by holding back some subset of the training data and using this **holdout set** to check the model performance

The problem is that we lose a portion of our data for model training
we can address this issue by using **cross validation**

we perform a sequence of fits, where each subset of the data is used both as a training set and as validation set
the validation results are combined (e.g averaged) over the rounds

The cross validation procedure has a parameter k that refers to the number of groups that a given data sample is to be split into
As such, the procedure is often called **k-fold cross validation**
common tactics for choosing a value for k:

1. k = 10 - the value for k is fixed to 10. a value that has been found through experimentation to generally result in a model skill estimate with low bias and modest variance
2. k = n - the value for k is fixed to n. where n is the size of the dataset to give each test sample an opportunity to be used in the holdout set. This approach is called **leave one out** cross validation
3. Representative - the value for k is chosen such that each train/test group of data samples is large enough to be statistically representative of the broader dataset

we can use **cross_val_score()** to perform cross validation

cross validation allows you to get not only an estimate of the performance of your model, but also a measure of how precise this estimate is (its standard deviation)

The **cv** keyword argument determines the cross validation splitting strategy. 
possible inputs for cv are:
1. None - use the default (3 folds in scikit-learn < 0.22, 5 folds from 0.22 on)
2. integer - to specify the number of folds in a (stratified) KFold 
3. an iterable yielding train,test splits. you need to write a function that returns the indices for each iteration (a generator function) 
4. an object to be used as a cross validation generator

for integer or none inputs , if the estimator is a classifier , **StratifiedKFold** is used
StratifiedKFold performs stratified sampling to produce folds that preserve the percentage of samples for each class
stratification is the process of dividing members of the population into homogeneous subgroups before sampling.

In addition you can specify the scoring strategy for the cross validation using the **scoring** argument

for regression, you will typically use RMSE as the scoring and not the default.
the cross validation features expect a utility function (greater is better) rather than a cost function (lower is better), so the scoring function is actually the opposite of MSE (i.e a negative value)
thus you should specify **scoring='neg_mean_squared_error'**
in addition you need to negate the result (i.e. use -scores) before calculating its square root, e.g. `np.sqrt(-scores)`

In [18]:
# Load the titanic example dataset that ships with seaborn and preview it
titanic = sns.load_dataset('titanic')
display(titanic.head())

# Fixed seed so the train/test split, the forest and therefore the
# cross-validation scores are the same on every Restart & Run All
RANDOM_STATE = 42

# Preprocessing for the numerical columns:
# fill missing values with the column median, then standardize
numeric_features = ['age', 'fare']
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Preprocessing for the categorical columns:
# replace missing values with the literal category 'missing', then one-hot encode.
# handle_unknown='ignore' is needed because a rare category (e.g. the few
# imputed 'missing' rows) may be absent from the rows a given fold is fitted on;
# without it the encoder raises when it meets that category at transform time.
category_features = ['embarked', 'sex', 'pclass']
category_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each group of columns through its own transformer
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', category_transformer, category_features),
])

# Full model: preprocessing followed by a random forest classifier
clf = Pipeline([
    ('pre', preprocessor),
    ('cls', RandomForestClassifier(n_estimators=20, random_state=RANDOM_STATE)),
])

# Feature matrix and target array
X = titanic.drop('survived', axis=1)
y = titanic['survived']

# Hold out 20% of the rows as a test set
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

# Fit the pipeline on the whole training set; cross_val_score below works on
# clones of clf, so this call is what leaves clf itself trained
clf.fit(X_train, y_train)

# 10-fold cross validation on the training set: per-fold scores, mean and std
scores = cross_val_score(clf, X_train, y_train, cv=10)
scores, scores.mean(), scores.std()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


(array([0.86111111, 0.76388889, 0.77464789, 0.85915493, 0.84507042,
        0.84507042, 0.73239437, 0.81690141, 0.77464789, 0.77464789]),
 0.8047535211267606,
 0.04377097264754835)