# Getting started with scikit-learn

1. Fitting and predicting: estimator basics
2. Transformers and pre-processors
3. Pipelines: chaining pre-processors and estimators
4. Model evaluation
5. Automatic parameter searches

### 1. Fitting and predicting: estimator basics

In [6]:
# Importing necessary libraries
from sklearn.ensemble import RandomForestClassifier

In [7]:
# Toy training set: 2 samples, each described by 3 features
X = [
    [1, 2, 3],
    [11, 12, 13],
]

# Class label of each sample, in the same order as the rows of X
y = [0, 1]

In [8]:
# Build the classifier (seeded so results are repeatable), then train it on
# the samples so it can later predict target values
clf = RandomForestClassifier(random_state=0)
clf.fit(X=X, y=y)

In [10]:
# Predict the target values (classes) of the training data.
# Only the last expression of a cell is displayed, so this result is not shown.
clf.predict(X)

# The fitted estimator can be reused on unseen samples without retraining
clf.predict([[4, 5, 6], [14, 15, 16]])  # Predict classes of new data

array([0, 1])

### 2. Transformers and pre-processors

In [1]:
# Importing necessary libraries
from sklearn.preprocessing import StandardScaler

In [2]:
# Toy data: 2 samples, 2 features on very different scales
X = [
    [0, 15],
    [1, -10],
]


In [3]:
# Scale data according to the computed scaling values

In [4]:
# fit_transform() computes the scaling values from X and applies them to X
# in a single call (equivalent to .fit(X).transform(X))
StandardScaler().fit_transform(X)

array([[-1.,  1.],
       [ 1., -1.]])

### 3. Pipelines: chaining pre-processors and estimators

In [2]:
# Importing necessary libraries
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [3]:
# Create a pipeline object: the scaler runs first and feeds the logistic regression
pipe = make_pipeline(StandardScaler(), LogisticRegression())

In [4]:
# Load the iris features and labels, then hold out a test split
# (random_state is fixed so the split is the same on every run)
X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=0,
)

In [6]:
# Train every step of the pipeline on the training split in one call
pipe.fit(X=X_train, y=y_train)

In [8]:
# We can now use the fitted pipeline like any other estimator.
# accuracy_score expects (y_true, y_pred): ground-truth labels first,
# predictions second. Accuracy itself is symmetric, so the value is unchanged,
# but keeping the documented order avoids wrong results when this pattern is
# copied to metrics that are not symmetric (e.g. precision or recall).
accuracy_score(y_test, pipe.predict(X_test))

0.9736842105263158

### 4. Model evaluation

In [9]:
# Importing necessary libraries
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_validate

In [11]:
# Model to evaluate, plus a synthetic regression dataset to evaluate it on
# (1000 samples, seeded so the data is the same on every run)
lr = LinearRegression()
X, y = make_regression(n_samples=1000, random_state=0)

In [12]:
# Cross-validate the linear model; cv is left unset, which defaults to 5-fold CV
result = cross_validate(estimator=lr, X=X, y=y)

# Per-fold test score (R^2 for a regressor); it is high because this
# generated dataset is easy for a linear model
result['test_score']

array([1., 1., 1., 1., 1.])

### 5. Automatic parameter searches

In [14]:
# Import necessary libraries
from sklearn.datasets import fetch_california_housing
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from scipy.stats import randint

In [15]:
# Load the California housing features/targets and hold out a test split
# (random_state is fixed so the split is the same on every run)
X, y = fetch_california_housing(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=0,
)

In [16]:
# Parameter space sampled by the randomized search.
# scipy's randint(low, high) excludes `high`, so n_estimators is drawn from
# 1..4 and max_depth from 5..9.
param_distributions = dict(
    n_estimators=randint(1, 5),
    max_depth=randint(5, 10),
)

In [21]:
# Create the randomized search object (5 sampled candidates, seeded for
# repeatability) around a seeded random-forest regressor, then fit it on the
# training split
search = RandomizedSearchCV(
    RandomForestRegressor(random_state=0),
    param_distributions,
    n_iter=5,
    random_state=0,
)
search.fit(X_train, y_train)

In [20]:
# Parameter combination that scored best during the search
search.best_params_

{'max_depth': 9, 'n_estimators': 4}

In [22]:
# The fitted search object behaves like a regular estimator: score it on the
# held-out test split
search.score(X=X_test, y=y_test)

0.735363411343253