# ML Lab Cheatsheet
<table align="left">
  <td>
    <a href="https://colab.research.google.com/github/marcinsawinski/UEP_KIE_ML_LAB_PROG/blob/main/00_cheatsheet.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>
  </td>
</table>

# Libs

In [2]:
# get data from url
import urllib.request

#shift image
from scipy.ndimage import shift


# viz an data libs
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pandas.plotting import scatter_matrix

# preprocessing and impute 
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import FunctionTransformer
from sklearn.metrics.pairwise import rbf_kernel

# create and compose pipelines
from sklearn.compose import TransformedTargetRegressor
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn import set_config

# models for regression
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

# models for classification
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

# clustering
from sklearn.cluster import KMeans

# outlier detection
from sklearn.ensemble import IsolationForest

# split and evluation
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict


# metrics for regression
from sklearn.metrics import mean_squared_error

# metrics for classification 
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import ConfusionMatrixDisplay


# custom code
from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin
from sklearn.utils.validation import check_array
from sklearn.utils.validation import check_is_fitted

# sklearn datasets
from sklearn.datasets import fetch_openml


# Data

In [None]:
# Task 1.1
url_lifesat = 'https://github.com/marcinsawinski/UEP_KIE_ML_LAB_PROG/raw/main/datasets/lifesat/lifesat.csv'
# Task 1.3
url_gdb = 'https://github.com/marcinsawinski/UEP_KIE_ML_LAB_PROG/raw/main/datasets/lifesat/gdp-per-capita-worldbank.csv'
url_oecd = 'https://github.com/marcinsawinski/UEP_KIE_ML_LAB_PROG/raw/main/datasets/lifesat/oecd_bli_20221109.csv'

# Task 2
url_housing = 'https://github.com/marcinsawinski/UEP_KIE_ML_LAB_PROG/raw/main/datasets/housing/housing.csv'

# Basic operations

In [None]:
# Fetch file from url 
urllib.request.urlretrieve(url, filename)

# map list values from labels to bool
y_nn = (y == 'nn')

## Pandas

### Create, copy and merge dataframe

In [None]:
# Read remote or local csv into a pandas dataframe
df = pd.read_csv('file or url address')

# Deep copy dataframe df_A to dataframe df_B (full, independent copy)
df_B = df_A.copy()

# Alias, not a copy: df_B becomes another name for the very same object as df_A
df_B = df_A

# Select a subset of the columns of df_A into df_B (column1 and column2)
df_B = df_A[['column1','column2']]

### Merge dataframes df_a and df_b into df_c (like sql join)
# using matching indexes
df_c = pd.merge(left=df_a, right=df_b, left_index=True, right_index=True)
# using key column column1
df_c = pd.merge(left=df_a, right=df_b, on='column1')

### Create and drop columns

In [None]:
# New feature column3 calculated as column1/column2
df["column3"] = df["column1"] / df["column2"]

# Drop columns in a single dataframe (modifies df in place)
df.drop(columns=['column1','column2'], inplace=True)
# Drop columns in many dataframes (in place, so the change is visible outside the loop)
for set_ in (df1, df2):
    set_.drop(columns=['column1','column2'], inplace=True)

# Generate categories (manually specified bins 1-5 as ranges 0-1.5, 1.5-3, 3-4.5, etc)
df["category_col"] = pd.cut(df["numeric_col"],
                               bins=[0, 1.5, 3, 4.5, 6, np.inf],
                               labels=[1, 2, 3, 4, 5])

# Generate categories (automatically 1 bin for each percentile):
# 99 percentile edges plus -inf/+inf give 101 edges, i.e. 100 bins labelled 1..100
percentiles = [np.percentile(df["column1"], p) for p in range(1, 100)]
flattened_column1 = pd.cut(df["column1"],
                    bins=[-np.inf] + percentiles + [np.inf],
                    labels=range(1, 100 + 1))

### Select and convert data from dataframe

In [None]:
### Pick specific columns (column1 and column2)
cols = ['column1','column2']
df[cols]
# or one line
df[['column1','column2']]

### Pick rows that fulfil specific criteria, e.g. rows where column1 is equal to 100
df[df.column1 == 100]
# or
df[df['column1'] == 100]

### Convert pandas to numpy array
df.values 
# or
df.to_numpy()

### Pick numerical columns
df_num = df.select_dtypes(include=[np.number])

### Preview and visualize data in dataframe

In [None]:
# Show first lines
df.head()

# Columns type and count summary
df.info() 

# Categorical column - values and counts
df["category_col"].value_counts() 

# Categorical column - bins size share
df["category_col"].value_counts() / len(df)

# Numberical columns - basic statistics
df.describe() 

# Check for null values
df.isna().sum()

# Find null values
null_rows_idx = df.isnull().any(axis=1)
df.loc[null_rows_idx]
# or
df[df.isna().any(axis=1)]

# Histograms for all columns in dataframe
df.hist(bins=50, figsize=(12, 8)) 

# Visualize dataframe as basic scatter plot
df.plot(kind='scatter', grid=True,
             x="column1", y="column2")

# Visualize dataframe as scatter plot with extra options
df.plot(kind="scatter", x="longitude", y="latitude", grid=True, alpha=1, 
s=df["column1"] ,c="column2",cmap="jet", figsize=(10, 7),
legend=True, colorbar=True,label="column2")

# Correlation table
df[['column1','column2']].corr()

# Correlation list
corr_matrix = df[['column1','column2']].corr()
corr_matrix["column1"].sort_values(ascending=False)

# Correlation plot
df.plot(kind="scatter", x="column1", y="column2",
             alpha=0.1, grid=True)

# Scatter matrix
scatter_matrix(df[['column1','column2']], figsize=(12, 8))

# Median values
df.median().values

# Visualize category column 
df["category_col"].value_counts().sort_index().plot.bar(rot=0)

# Show plot (optional in Jupyter)
plt.show()

# Show 2 sublots side by side, original value and transfomred with log function
fig, axs = plt.subplots(1, 2, figsize=(12, 3), sharey=True)
df["column1"].hist(ax=axs[0], bins=50)
df["column1"].apply(np.log).hist(ax=axs[1], bins=50)
axs[0].set_xlabel("Feature 1")
axs[1].set_xlabel("Feature 1 log")
axs[0].set_ylabel("Label")
plt.show()

# Show the effect of the RBF gamma parameter: histogram of column1 overlaid with
# the RBF similarity to a fixed point (35) for two different gamma values
col1_range = np.linspace(df["column1"].min(),
                   df["column1"].max(),
                   500).reshape(-1, 1)
gamma1 = 0.1
gamma2 = 0.03
rbf1 = rbf_kernel(col1_range, [[35]], gamma=gamma1)
rbf2 = rbf_kernel(col1_range, [[35]], gamma=gamma2)

fig, ax1 = plt.subplots()

ax1.set_xlabel("Feature 1")
ax1.set_ylabel("Label")
# histogram of the same dataframe column the x-range above was computed from
ax1.hist(df["column1"], bins=50)

ax2 = ax1.twinx()  # create a twin axis that shares the same x-axis
color = "blue"
# legend labels are derived from the gamma variables so they cannot drift apart
ax2.plot(col1_range, rbf1, color=color, label=f"gamma = {gamma1:.2f}")
ax2.plot(col1_range, rbf2, color=color, label=f"gamma = {gamma2:.2f}", linestyle="--")
ax2.tick_params(axis='y', labelcolor=color)
ax2.set_ylabel("Feature 1 similarity", color=color)
plt.legend(loc="upper left")
plt.show()

## scikit-learn

### Datasets

In [None]:
#load dataset e.g. mnist_784
data = fetch_openml('mnist_784', as_frame=False)
# check content
data.keys() 
# assign X and y
X, y = data.data, data.target

### Models

In [None]:
### Create ML model
# Create Linear Regression Model
model = LinearRegression()
# Create Regression Model based on k-nearest neighbors with k=5
model = KNeighborsRegressor(n_neighbors=5)

# Create SGD classifier
model = SGDClassifier()
# Create RandomForestClassifier
model = RandomForestClassifier()
# Create KNeighborsClassifier
model = knn_clf = KNeighborsClassifier()

# Fit model with independant variables X and dependant variable y
model.fit(X, y)

# Score model
score = model.score(X, y)

# Predict using  model for one new values (one element matrix with value 100
x_n = [[100]]
model.predict(x_n)

### Train /test split

In [None]:
### Random train /test split 80/20
train_set, test_set = train_test_split(df, test_size=0.2)

### Stratified train /test split 80/20 using category_col
# single split (split the same dataframe that provides the stratification column)
strat_train_set, strat_test_set = train_test_split(
    df, test_size=0.2, stratify=df["category_col"])

# or multiple splits (n_splits controls how many train/test pairs are generated)
splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
strat_splits = []
for train_index, test_index in splitter.split(df, df["category_col"]):
    # split() yields positional indices, hence iloc
    strat_train_set_n = df.iloc[train_index]
    strat_test_set_n = df.iloc[test_index]
    strat_splits.append([strat_train_set_n, strat_test_set_n])
# pick sets from first split
strat_train_set, strat_test_set = strat_splits[0]



### Preprocessing of **numerical** features

In [None]:
# Impute missing data (median) on 2 columns
num_cols = ['num_col1', 'num_col2']
imputer = SimpleImputer(strategy="median")
# fit and transform on the same columns so the learned medians line up with the data
imputer.fit(df[num_cols])
X = imputer.transform(df[num_cols])
# Check values (not required)
imputer.statistics_
# Back from array to DF (not required); column labels must match the imputed columns
df_tr = pd.DataFrame(X, columns=num_cols,
                          index=df.index)

# Normalize numerical values
min_max_scaler = MinMaxScaler(feature_range=(-1, 1))
X_min_max_scaled = min_max_scaler.fit_transform(df_num)

# Standardize numerical values
std_scaler = StandardScaler()
X_std_scaled = std_scaler.fit_transform(df_num)
# Back from array to DF (not required); stored under a new name so the
# original df is not overwritten by its scaled numeric subset
df_std_scaled = pd.DataFrame(X_std_scaled, columns=df_num.columns, index=df_num.index)

# Feature logarithmic transformation
log_transformer = FunctionTransformer(np.log, inverse_func=np.exp)
log_data = log_transformer.transform(df[["column1", "column2"]])

# Create new feature as similarity to a fixed point (base = 35) in column 1 using RBF with gamma 0.1
col1_simil_35 = rbf_kernel(df[["column1"]], [[35]], gamma=0.1)
# or
rbf_transformer = FunctionTransformer(rbf_kernel,
                                      kw_args=dict(Y=[[35.]], gamma=0.1))
column1_simil_35 = rbf_transformer.transform(df[["column1"]])

# Create new feature as similarity to a fixed point (base = 37.7749, -122.41) in 2 columns using RBF with gamma 0.1
base = 37.7749, -122.41
base_transformer = FunctionTransformer(rbf_kernel,
                                     kw_args=dict(Y=[base], gamma=0.1))
base_simil = base_transformer.transform(df[["column1", "column2"]])

# Ratio transformer: first column divided by second column (keeps a 2-D result)
ratio_transformer = FunctionTransformer(lambda X: X[:, [0]] / X[:, [1]])
ratio_transformer.transform(np.array([[1., 2.], [3., 4.]]))

### Preprocessing of **categorical** features

In [None]:
# Encode ordinal categories
ordinal_encoder = OrdinalEncoder()
X_cat_encoded = ordinal_encoder.fit_transform(df_cat)
# Check values (not required)
ordinal_encoder.categories_
# Back to DF and count (not required)

pd.DataFrame(X_cat_encoded, columns=df_cat.columns,
                index=df_cat.index).value_counts()

# Encode non-ordinal categories
cat_encoder = OneHotEncoder()
X_cat_1hot = cat_encoder.fit_transform(df_cat)
# note:  By default, the `OneHotEncoder` class returns a sparse array, 
# but we can convert it to a dense array if needed by calling the `toarray()` method:
X_cat_1hot.toarray()
# or alternatively, you can set `sparse=False` when creating the `OneHotEncoder`:
# NOTE: scikit-learn 1.2 renamed this parameter to `sparse_output` and 1.4 removed
# `sparse`; on recent versions write OneHotEncoder(sparse_output=False)
cat_encoder = OneHotEncoder(sparse=False)

# Ignore new categories
cat_encoder.handle_unknown = "ignore"
# or 
cat_encoder = OneHotEncoder(handle_unknown="ignore")

# Check encoder (not required)
# NOTE(review): cat_encoder was re-created just above, so it has to be fitted
# again (e.g. on df_cat) before these attributes and transform() are available
cat_encoder.categories_
cat_encoder.feature_names_in_
cat_encoder.get_feature_names_out()
# Back to DF 
# NOTE(review): presumably `df` here holds the same categorical columns the
# encoder was fitted on — confirm; .toarray() applies only to sparse output
df_output = pd.DataFrame(cat_encoder.transform(df).toarray(),
                         columns=cat_encoder.get_feature_names_out(),
                         index=df.index)

### Preprocessing of **outliers**

In [None]:
# Remove outliers
# fit_predict labels every row of X: 1 for inliers, -1 for outliers
isolation_forest = IsolationForest(random_state=42)
outlier_pred = isolation_forest.fit_predict(X)
# keep inliers only (df_tr must have one row per row of X)
df_tr[outlier_pred==1]
# outliers
df_tr[outlier_pred==-1]

### Target transformation

In [None]:
# Standardize target (labels)
target_scaler = StandardScaler()
scaled_labels = target_scaler.fit_transform(labels.to_frame())

model = LinearRegression()
model.fit(df_num, scaled_labels)
new_data = df_num.iloc[:5]  # pretend this is new data

# manual inversion
scaled_predictions = model.predict(new_data)
predictions = target_scaler.inverse_transform(scaled_predictions)

# automatic inversion
model = TransformedTargetRegressor(LinearRegression(),
                                   transformer=StandardScaler())
model.fit(df_num, labels)
predictions = model.predict(new_data)

### Custom transformations

_Add custom columns_
- col1_2 = column1 / column2 (always)
- col3_4 = column3 / column4 (optional; added when hyperparameter add_col3_4 = True)

In [None]:
# Get positional index for each column used by CombinedAttributesAdder
# (names must match the dataframe's column labels exactly)
col_names = "column1", "column2", "column3", "column4"
col1_ix, col2_ix, col3_ix, col4_ix = [
    df.columns.get_loc(c) for c in col_names]

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Append ratio features as extra columns to a 2-D array.

    Always appends ``X[:, col1_ix] / X[:, col2_ix]``; additionally appends
    ``X[:, col3_ix] / X[:, col4_ix]`` when ``add_col3_4`` is true. The column
    positions ``col1_ix`` .. ``col4_ix`` are read from module-level variables.
    """

    def __init__(self, add_col3_4=True):  # explicit hyperparameters only, no *args / **kwargs
        self.add_col3_4 = add_col3_4

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns the transformer itself."""
        return self

    def transform(self, X):
        """Return X with the ratio column(s) appended on the right."""
        new_columns = [X[:, col1_ix] / X[:, col2_ix]]
        if self.add_col3_4:
            new_columns.append(X[:, col3_ix] / X[:, col4_ix])
        # np.c_ turns each 1-D ratio into a column and concatenates it to X
        return np.c_[(X, *new_columns)]

attr_adder = CombinedAttributesAdder(add_col3_4=False)
X_extra_attribs = attr_adder.transform(df.values)

# Back to dataframe: one name per appended column, so the label list
# follows the add_col3_4 flag instead of assuming both ratios were added
extra_cols = ["col1_2"] + (["col3_4"] if attr_adder.add_col3_4 else [])
df_extra_attribs = pd.DataFrame(
    X_extra_attribs,
    columns=list(df.columns) + extra_cols,
    index=df.index)

_Custom transformer - clone of StandardScaler_

In [None]:
class StandardScalerClone(BaseEstimator, TransformerMixin):
    """Minimal re-implementation of a standardizing scaler.

    ``fit`` stores the per-column mean and standard deviation; ``transform``
    optionally subtracts the mean (``with_mean``) and divides by the stored
    standard deviation.
    """

    def __init__(self, with_mean=True):  # explicit hyperparameters only, no *args / **kwargs
        self.with_mean = with_mean

    def fit(self, X, y=None):  # y is accepted but unused
        """Learn column-wise mean and standard deviation from X."""
        data = check_array(X)  # validate / convert the input to an array
        self.mean_ = data.mean(axis=0)
        self.scale_ = data.std(axis=0)
        self.n_features_in_ = data.shape[1]  # number of columns seen during fit
        return self

    def transform(self, X):
        """Scale X using the statistics stored by fit()."""
        check_is_fitted(self)  # requires the learned attributes (trailing underscore)
        data = check_array(X)
        assert self.n_features_in_ == data.shape[1]
        centered = data - self.mean_ if self.with_mean else data
        return centered / self.scale_

_Custom transformation for cluster similarity (define n clusters and add n features as similarity to each cluster using RBF with gamma)_

In [None]:
class ClusterSimilarity(BaseEstimator, TransformerMixin):
    """Turn samples into RBF similarities to k-means cluster centers.

    ``fit`` runs KMeans with ``n_clusters`` clusters; ``transform`` returns one
    column per cluster holding the RBF kernel (with ``gamma``) between each
    sample and that cluster's center.
    """

    def __init__(self, n_clusters=10, gamma=1.0, random_state=None):
        self.n_clusters = n_clusters
        self.gamma = gamma
        self.random_state = random_state

    def fit(self, X, y=None, sample_weight=None):
        """Fit the underlying KMeans model; y is accepted but unused."""
        self.kmeans_ = KMeans(n_clusters=self.n_clusters,
                              random_state=self.random_state)
        self.kmeans_.fit(X, sample_weight=sample_weight)
        return self

    def transform(self, X):
        """Return the RBF similarity of every row of X to every cluster center."""
        centers = self.kmeans_.cluster_centers_
        return rbf_kernel(X, centers, gamma=self.gamma)

    def get_feature_names_out(self, names=None):
        """Return one output feature name per cluster."""
        labels = []
        for cluster_idx in range(self.n_clusters):
            labels.append(f"Cluster {cluster_idx} similarity")
        return labels

### Create pipeline

_Set sklearn utils_

In [None]:
# set configuration parameters (display estimators as diagrams)
set_config(display='diagram')

# NOTE(review): the snippets below need an existing num_pipeline. The step name
# "simpleimputer" presumably is the name make_pipeline() generates automatically;
# for a Pipeline([...]) built with explicit names (e.g. 'imputer') use those
# names instead — confirm against how num_pipeline was created

# check steps for pipeline called num_pipeline 
num_pipeline.steps

# display 2nd step for pipeline called num_pipeline 
num_pipeline[1]

# display simpleimputer step for pipeline called num_pipeline 
num_pipeline.named_steps["simpleimputer"]

# set simpleimputer strategy parameter for pipeline called num_pipeline 
num_pipeline.set_params(simpleimputer__strategy="median")

_Numeric feature pipeline with median imputer and standardization scaler_

In [None]:
num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('std_scaler', StandardScaler()),
    ])

X_tr = num_pipeline.fit_transform(df_num)
# or
num_pipeline = make_pipeline(SimpleImputer(strategy="median"), StandardScaler())


_Numeric feature pipeline with median imputer, attributes adder and standardization scaler_

In [None]:
num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('attribs_adder', CombinedAttributesAdder()),
        ('std_scaler', StandardScaler()),
    ])

X_tr = num_pipeline.fit_transform(df_num)

_Compose pipeline (num_pipeline for numeric features and OneHotEncoder for cat features)_

In [None]:
# column names handled by each branch: all numeric columns vs. listed categorical ones
num_attribs = list(df_num)
cat_attribs = ["column1"]

# apply num_pipeline to the numeric columns and one-hot encode the categorical ones
full_pipeline = ColumnTransformer([
        ("num", num_pipeline, num_attribs),
        ("cat", OneHotEncoder(), cat_attribs),
    ])

X_prepared = full_pipeline.fit_transform(df)

_Compose 2 pipelines (num_pipline and cat_pipeline)_

In [None]:
num_pipeline = make_pipeline(
    SimpleImputer(strategy="median"), 
    StandardScaler())

cat_pipeline = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(handle_unknown="ignore"))

num_attribs = ["column1","column2"]
cat_attribs = ["column3","column4"]

preprocessing = ColumnTransformer([
    ("num", num_pipeline, num_attribs),
    ("cat", cat_pipeline, cat_attribs),
    ])
#or
preprocessing = make_column_transformer(
    (num_pipeline, make_column_selector(dtype_include=np.number)),
    (cat_pipeline, make_column_selector(dtype_include=object)),
)

#call pipline
X_prepared = preprocessing.fit_transform(df)

# Back to dataframe (not required)
df_prepared = pd.DataFrame(
    X_prepared,
    columns=preprocessing.get_feature_names_out(),
    index=df.index)

### Select and train the model

### Regression

_Process data, train model, calculate RMSE_

In [None]:
df = strat_train_set.drop(columns="label")
y_labels = strat_train_set["label"].copy()

num_pipeline = make_pipeline(
    SimpleImputer(strategy="median"), 
    StandardScaler())

cat_pipeline = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(handle_unknown="ignore"))

preprocessing = make_column_transformer(
    (num_pipeline, make_column_selector(dtype_include=np.number)),
    (cat_pipeline, make_column_selector(dtype_include=object)),
)
X_prepared = preprocessing.fit_transform(df)

lin_reg = LinearRegression()
lin_reg.fit(X_prepared, y_labels)
y_predictions = lin_reg.predict(X_prepared)

# Calculate RSME
lin_mse = mean_squared_error(y_labels, y_predictions)
lin_rmse = np.sqrt(lin_mse)
# or 
mean_squared_error(y_labels, y_predictions, squared=False)

_Perform train and validation for Decision Tree_

In [None]:
tree_reg = DecisionTreeRegressor()
tree_reg.fit(X_prepared, y_labels)

y_predictions = tree_reg.predict(X_prepared)
tree_rmse = mean_squared_error(y_labels, y_predictions, squared=False)

_Perform train and validation for Random forest_

In [None]:
forest_reg = RandomForestRegressor(n_estimators=100)
forest_reg.fit(X_prepared, y_labels)

y_predictions = forest_reg.predict(X_prepared)
# squared=False returns the root of the MSE, so the result is an RMSE
# (named like tree_rmse / lin_rmse in the neighbouring snippets).
# NOTE: `squared` was deprecated in scikit-learn 1.4 and later removed; on recent
# versions use sklearn.metrics.root_mean_squared_error instead
forest_rmse = mean_squared_error(y_labels, y_predictions, squared=False)

_Cross validation_

In [None]:
#cross validation for regression
scores = cross_val_score(tree_reg, X_prepared, y_labels,
                         scoring="neg_mean_squared_error", cv=10)
tree_rmse_scores = np.sqrt(-scores)
def display_scores(scores):
    """Print the given scores followed by their mean and standard deviation.

    ``scores`` must provide ``mean()`` and ``std()`` (e.g. a NumPy array).
    """
    # label -> value producer; evaluated one at a time, in printing order
    report = (
        ("Scores:", lambda s: s),
        ("Mean:", lambda s: s.mean()),
        ("Standard deviation:", lambda s: s.std()),
    )
    for label, compute in report:
        print(label, compute(scores))

display_scores(tree_rmse_scores)


#cross validation for classification
cross_val_score(model, X, y, cv=3, scoring="accuracy")

### Hyperparameter tuning

### Grid Search

In [None]:
# create model (e.g. KNeighborsClassifier)
model = knn_clf = KNeighborsClassifier()
# set grid of hyperparamters values
param_grid = [{'weights': ["uniform", "distance"], 'n_neighbors': [3, 4, 5, 6]}]

# fit all models ( one per param_grid combination) 
grid_search = GridSearchCV(model, param_grid, cv=5)
grid_search.fit(X_small, y_small)

# check which hyperparamters values are best
grid_search.best_params_

# check score with best hyperparamters values
grid_search.best_score_

# reuse model with best hyperparamters values
grid_search.best_estimator_.fit(X_all, y_all)

# check score on test data
score = grid_search.score(X_test, y_test)

### Classification

### metrics

In [None]:
# calculate confusion matrix
confusion_matrix(y, y_pred)

# calculateprecision
precision_score(y, y_pred)

# calculate recall
recall_score(y, y_pred)

# calculate f1
f1_score(y, y_pred)

### thresholds and curves

In [None]:
# get scores from decision_function (e.g. SVM)
y_scores = cross_val_predict(model, X, y, cv=3,
                             method="decision_function")

# get class probabilities from predict_proba (e.g. RandomForestClassifier)
y_probas = cross_val_predict(model, X, y, cv=3,
                             method="predict_proba")
# get scores from probas (second column of the probability matrix)
y_scores = y_probas[:, 1]

# calculate precisions, recalls for thresholds for precision-recall plot
precisions, recalls, thresholds = precision_recall_curve(y, y_scores)

# calculate fpr, tpr, thresholds for ROC curve (same labels y as above)
# NOTE: this rebinds `thresholds` to the ROC thresholds; re-run the
# precision_recall_curve line before using the precision/recall snippets
fpr, tpr, thresholds = roc_curve(y, y_scores)

In [None]:
# set a treshold to visualize
threshold = 1000
# get index of first pos ≥ threshold
idx = (thresholds >= threshold).argmax()
# plot precision, recall vs threshold 
plt.figure(figsize=(8, 4)) 
plt.plot(thresholds, precisions[:-1], "b--", label="Precision", linewidth=2)
plt.plot(thresholds, recalls[:-1], "g-", label="Recall", linewidth=2)
plt.vlines(threshold, 0, 1.0, "k", "dotted", label="threshold")

plt.plot(thresholds[idx], precisions[idx], "bo")
plt.plot(thresholds[idx], recalls[idx], "go")
plt.axis([-50000, 50000, 0, 1])
plt.grid()
plt.xlabel("Threshold")
plt.legend(loc="center right")
plt.show()

In [None]:
# make precision-recall plot; `idx` and `threshold` are the values chosen
# when picking the threshold to visualize
plt.figure(figsize=(6, 5))

plt.plot(recalls, precisions, linewidth=2, label="Precision/Recall curve")

# dotted guide lines from the axes to the selected operating point
plt.plot([recalls[idx], recalls[idx]], [0., precisions[idx]], "k:")
plt.plot([0.0, recalls[idx]], [precisions[idx], precisions[idx]], "k:")
# label is built from `threshold` so it always matches the plotted point
plt.plot([recalls[idx]], [precisions[idx]], "ko",
         label=f"Point at threshold {threshold:,}")
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.axis([0, 1, 0, 1])
plt.grid()
plt.legend(loc="lower left")
plt.show()

In [None]:
# plot ROC curve
idx_for_threshold_at_90 = (thresholds <= threshold_for_90_precision).argmax()
tpr_90, fpr_90 = tpr[idx_for_threshold_at_90], fpr[idx_for_threshold_at_90]

plt.figure(figsize=(6, 5))  # extra code – not needed, just formatting
plt.plot(fpr, tpr, linewidth=2, label="ROC curve")
plt.plot([0, 1], [0, 1], 'k:', label="Random classifier's ROC curve")
plt.plot([fpr_90], [tpr_90], "ko", label="Threshold for 90% precision")

plt.xlabel('False Positive Rate (Fall-Out)')
plt.ylabel('True Positive Rate (Recall)')
plt.grid()
plt.axis([0, 1, 0, 1])
plt.legend(loc="lower right", fontsize=13)
plt.show()

In [None]:
# calc AUC
roc_auc_score(y, y_scores)

In [None]:
# what would be treshold for 90% precision?
idx_for_90_precision = (precisions >= 0.90).argmax()
threshold_for_90_precision = thresholds[idx_for_90_precision]
threshold_for_90_precision

In [None]:
# transform scores to labels using the 90% precision threshold
y_train_pred_90 = (y_scores >= threshold_for_90_precision)
# check precision against the same labels y that produced y_scores
precision_score(y, y_train_pred_90)

# Others

In [None]:
# shift image
data = np.array([0,0,0,1,1,1,0,0,0])
image = data.reshape((3, 3))
shifted_image = shift(image, [1, 1], cval=0, mode="constant")
shifted_image
