https://scikit-learn.org/stable/getting_started.html

https://scikit-learn.org/stable/tutorial/index.html

https://scikit-learn.org/stable/user_guide.html

In [1]:
# Loading of relevant modules
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler

### Step 1

In [2]:
# Data
# Loading/Reading, Pre-processing, Preparation

# 1. Loading Data
X = [[ 1,  2,  3],  # 2 samples, 3 features -> Input
     [11, 12, 13]]

y = [0, 1]  # classes of each sample - Ground Truth

# 2. Pre-processing Step
# Bring all features in the range [-1, 1] or [0, 1]
# StandardScalar(): [-1, 1]
# MinMaxScalar(): [0, 1]
X = StandardScaler().fit(X).transform(X)



### Step 2

In [3]:
# Model Dev
clf = RandomForestClassifier(random_state=0) # 1. Model Definiton Step
clf.fit(X, y) # 2. Model Learning Step

### Step 3

In [4]:
# Evaluation
# Qual (Visualization) & Quant (Metric/Num)

print(clf.predict(X))  # predict classes of the training data

test_data = StandardScaler().fit([[4, 5, 6], [14, 15, 16]]).transform([[4, 5, 6], [14, 15, 16]])
print(clf.predict(test_data))  # predict classes of new data

[0 1]
[0 1]


### Pipelines, Dataset Splits

In [5]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


# Step 1 - Data
# load the iris dataset and split it into train and test sets
X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)


# Step 2a - create a pipeline object
# (scaling followed by the classifier, handled as a single estimator)
pipe = make_pipeline(
    StandardScaler(),
    LogisticRegression()
)

# Step 2b - fit the whole pipeline
pipe.fit(X_train, y_train)


# Step 3 - Evaluation
# accuracy_score takes (y_true, y_pred) in that order. Accuracy itself is
# symmetric, but keeping the documented order avoids wrong results when the
# same pattern is reused with asymmetric metrics (e.g. precision or recall).
y_pred = pipe.predict(X_test)
accuracy_score(y_test, y_pred)

0.9736842105263158

In [19]:
### V2: Repeating for Another Classifier & Making eval more robust
from sklearn import tree
from sklearn.model_selection import cross_validate

# Step 1 - Data
# load the full iris dataset; no manual train/test split is done here because
# cross_validate creates the train/test folds itself
X, y = load_iris(return_X_y=True)


# Step 2a - create a pipeline object
pipe = make_pipeline(
    StandardScaler(),
    tree.DecisionTreeClassifier(random_state=0)  # fixed seed -> reproducible scores
)

# Step 3 - Robust Evaluation (Cross Validation): Pre-proc + .fit() + .predict() + accuracy_score()
# The whole pipeline is passed in, so scaling is part of what gets fitted for
# each fold; 'test_score' holds one score per fold.
result = cross_validate(pipe, X, y)
print(result['test_score'])


[0.96666667 0.96666667 0.9        0.96666667 1.        ]


### Hyperparameter Tuning - RandomizedSearchCV

In [6]:
from sklearn.datasets import fetch_california_housing
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from scipy.stats import randint


### DATA
# fetch the California housing data and hold out a test set
X, y = fetch_california_housing(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

### MODEL DEV
# distributions from which the randomized search samples hyper-parameters
param_distributions = dict(
    n_estimators=randint(1, 5),
    max_depth=randint(5, 10),
)

# wrap a seeded random forest in a randomized search (5 sampled candidates)
# and fit it on the training split
search = RandomizedSearchCV(
    RandomForestRegressor(random_state=0),
    param_distributions,
    n_iter=5,
    random_state=0,
)
search.fit(X_train, y_train)
print(search.best_params_)

# once fitted, the search object can be used like a regular estimator
# (in the recorded run: max_depth=9 and n_estimators=4)
search.score(X_test, y_test)

{'max_depth': 9, 'n_estimators': 4}


0.735363411343253