# Parameter & HyperParameter

In [None]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression

from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

from sklearn.compose import ColumnTransformer

In [None]:
# Load the UCI credit-card default data; the first CSV column becomes the index.
df = pd.read_csv(
    '../input/default-of-credit-card-clients-dataset/UCI_Credit_Card.csv',
    index_col=0,
)

# Separate the label column from the predictors.
features = df.drop(columns=['default.payment.next.month'])
target = df['default.payment.next.month']

# One-hot encode SEX, EDUCATION and MARRIAGE, dropping the first level of each.
X = pd.get_dummies(
    features,
    columns=['SEX', 'EDUCATION', 'MARRIAGE'],
    drop_first=True,
)
y = target

# Stratified 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    stratify=y,
    random_state=0,
)

### Coefficients / parameters of the fitted Logistic Regression

In [None]:
# Fit a logistic-regression classifier (liblinear solver) on the training split.
# The fit call is the last expression, so the notebook displays the fitted estimator.
log_reg_clf = LogisticRegression(solver='liblinear')
log_reg_clf.fit(X_train, y_train)

Top N features with the largest positive and the most negative coefficients.

In [None]:
# Pair every training column with its fitted logistic-regression coefficient.
features_name = X_train.columns
model_coef = np.ravel(log_reg_clf.coef_)
coef_df = pd.DataFrame({'Variable': features_name, 'Coef': model_coef})

# Show the three largest and the three smallest coefficients.
display(
    coef_df.nlargest(3, 'Coef'),
    coef_df.nsmallest(3, 'Coef'),
)

## Coef/Parameter of Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score

In [None]:
# Fit a random forest with default hyperparameters and a fixed seed on the training split.
rf_clf = RandomForestClassifier(random_state=0)
rf_clf.fit(X_train, y_train)

In [None]:
# Seventh tree of the fitted forest (zero-based index 6).
tree_no7 = rf_clf.estimators_[6]

# Inspect node 0 of that tree, i.e. its first split:
# which feature index it uses and at which threshold.
FIRST_NODE = 0
fitted_tree = tree_no7.tree_
split_column = fitted_tree.feature[FIRST_NODE]
split_value = fitted_tree.threshold[FIRST_NODE]
split_column_name = X_train.columns[split_column]

print(f'Col : {split_column_name}, splited at val : {split_value}')

In [None]:
# Predict the held-out split with the random forest and display the confusion matrix
# (true labels first, predicted labels second).
prediction = rf_clf.predict(X_test)
confusion_matrix(y_test, prediction)

In [None]:
# Print accuracy, precision and recall (in that order) for the forest's test predictions.
for metric in (accuracy_score, precision_score, recall_score):
    print(metric(y_test, prediction))

In [None]:
# Re-derive accuracy, precision and recall by hand from the confusion matrix.
# The counts are read from the matrix rather than typed in, so this cell stays
# correct whenever the model or the split changes.
# For binary labels, confusion_matrix(...).ravel() yields (tn, fp, fn, tp).
tn, fp, fn, tp = confusion_matrix(y_test, prediction).ravel()
print((tn + tp) / (tn + fp + fn + tp))  # accuracy
print(tp / (tp + fp))                   # precision
print(tp / (tp + fn))                   # recall

## Learning Curve

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

from matplotlib import pyplot as plt

In [None]:
# Sweep 10 evenly spaced learning rates in [0.01, 2] and record the test
# accuracy obtained with each one.
learn_rates = np.linspace(0.01, 2, num=10)
accuracies = []

for learn_rate in learn_rates:
    # Seed the model (same seed as the split and the random forest above) so
    # the curve is reproducible from run to run.
    model = GradientBoostingClassifier(learning_rate=learn_rate, random_state=0)
    predictions = model.fit(X_train, y_train).predict(X_test)
    accuracies.append(accuracy_score(y_test, predictions))

In [None]:
# Accuracy as a function of learning rate, drawn on an explicit Axes.
fig, ax = plt.subplots()
ax.plot(learn_rates, accuracies)
ax.set(
    xlabel='learning_rate',
    ylabel='Accuracy',
    title='Accuracy for different learning_rates',
)
plt.show()

# GridSearch

In [None]:
# Show how many logical CPUs the OS reports.
# NOTE(review): presumably used to pick the n_jobs values in the searches below — confirm.
import os
os.cpu_count()

In [None]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.compose import ColumnTransformer

In [None]:
# Reload the credit-card default CSV, using its first column as the index.
df = pd.read_csv(
    '../input/default-of-credit-card-clients-dataset/UCI_Credit_Card.csv',
    index_col=0,
)

# Label column vs. everything else.
target = df['default.payment.next.month']
features = df.drop(columns=['default.payment.next.month'])

# Dummy-encode SEX, EDUCATION and MARRIAGE (first level of each dropped).
y = target
X = pd.get_dummies(
    features,
    columns=['SEX', 'EDUCATION', 'MARRIAGE'],
    drop_first=True,
)

# 80 % train / 20 % test, stratified on the label, seeded.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    train_size=0.8,
    stratify=y,
    random_state=0,
)

In [None]:
# Base estimator for the grid search; seeded so repeated searches are comparable.
rf_class = RandomForestClassifier(criterion='entropy', random_state=0)

# Hyperparameter grid: 3 depths x 2 feature-subset rules = 6 candidates.
# 'auto' is not used: for classifiers it is an alias of 'sqrt' (so the grid
# would evaluate the same setting twice) and newer scikit-learn releases
# reject it. 'log2' gives a genuinely different second option.
param_grid = {'max_depth': [2, 4, 8], 'max_features': ['sqrt', 'log2']}

# 5-fold grid search scored by ROC-AUC; refit=True retrains the best candidate
# on the whole training split so best_estimator_ is ready to use.
grid_rf_class = GridSearchCV(
    estimator=rf_class,
    param_grid=param_grid,
    scoring='roc_auc',
    n_jobs=3,
    cv=5,
    refit=True, return_train_score=False)
print(grid_rf_class)

In [None]:
# Run the cross-validated grid search on the training split.
grid_rf_class.fit(X_train, y_train)

In [None]:
# Collect the per-candidate cross-validation results into a DataFrame.
cv_results_df = pd.DataFrame(grid_rf_class.cv_results_)

In [None]:
# Rich display of the grid-search results table.
cv_results_df

In [None]:
# Grid-search results as a DataFrame, printed in full.
cv_results_df = pd.DataFrame(grid_rf_class.cv_results_)
print(cv_results_df)

# Only the column holding each candidate's hyperparameter dictionary.
column = cv_results_df[['params']]
print(column)

# Row(s) ranked first by mean test score.
is_top_ranked = cv_results_df['rank_test_score'] == 1
best_row = cv_results_df.loc[is_top_ranked]
print(best_row)

In [None]:
# Mean cross-validated score (ROC-AUC, per the `scoring` setting) of the best candidate.
grid_rf_class.best_score_

In [None]:
# Row index in cv_results_ that corresponds to the best candidate.
grid_rf_class.best_index_

In [None]:
# Hyperparameter combination of the best candidate.
grid_rf_class.best_params_

In [None]:
# roc_auc_score is not among the metrics imported earlier in the notebook, so
# bring it into scope here; without it the last line raises NameError.
from sklearn.metrics import roc_auc_score

# See what type of object the best_estimator_ property is
print(type(grid_rf_class.best_estimator_))

# Create an array of predictions directly using the best_estimator_ property
predictions = grid_rf_class.best_estimator_.predict(X_test)

# Take a look to confirm it worked, this should be an array of 1's and 0's
print(predictions[0:5])

# Now create a confusion matrix
print("Confusion Matrix \n", confusion_matrix(y_test, predictions))

# ROC-AUC is computed from the predicted probability of the second class
# (column 1 of predict_proba), not from the hard 0/1 predictions.
predictions_proba = grid_rf_class.best_estimator_.predict_proba(X_test)[:,1]
print("ROC-AUC Score \n", roc_auc_score(y_test, predictions_proba))

# Random Search

In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, RandomizedSearchCV

In [None]:
# Read the credit-card default CSV (first column is the index).
df = pd.read_csv(
    '../input/default-of-credit-card-clients-dataset/UCI_Credit_Card.csv',
    index_col=0,
)

# Predictors are every column except the label.
features = df.drop(columns=['default.payment.next.month'])
target = df['default.payment.next.month']

# Encode SEX, EDUCATION and MARRIAGE as dummies without their first level.
X = pd.get_dummies(
    features,
    columns=['SEX', 'EDUCATION', 'MARRIAGE'],
    drop_first=True,
)
y = target

# Seeded, label-stratified 80/20 split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    stratify=y,
    random_state=0,
)

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Candidate values to sample from: 150 learning rates in [0.1, 2] and
# min_samples_leaf from 20 to 64.
param_grid = {'learning_rate': np.linspace(0.1,2,150), 'min_samples_leaf': list(range(20,65))}

# Randomly sample 10 combinations and score each with 5-fold CV accuracy.
# Both the sampler and the estimator are seeded (same seed as the data split)
# so the sampled candidates and their scores are reproducible.
random_GBM_class = RandomizedSearchCV(
    estimator = GradientBoostingClassifier(random_state=0),
    param_distributions = param_grid,
    n_iter = 10,
    scoring='accuracy', n_jobs=3, cv = 5, refit=True, return_train_score = True,
    random_state=0)

In [None]:
# Run the randomized search on the training split.
random_GBM_class.fit(X_train, y_train)

# Informed Search

## Coarse to Fine Search

In [1]:
def visualize_hyperparameter(name):
    """Scatter-plot accuracy against one hyperparameter.

    Reads the module-level ``results_df`` and plots its ``name`` column on
    the x-axis against its ``'accuracy'`` column on the y-axis.

    Parameters
    ----------
    name : str
        Column of ``results_df`` to put on the x-axis.

    Returns
    -------
    None
    """
    from matplotlib import pyplot as plt

    plt.clf()
    # A single colour string works for any number of rows; the previous
    # per-point list of exactly 500 colours failed for any other row count.
    plt.scatter(results_df[name], results_df['accuracy'], c='blue')
    plt.gca().set(xlabel='{}'.format(name), ylabel='accuracy', title='Accuracy for different {}s'.format(name))
    # NOTE(review): the fixed 0-100 range presumes accuracy is stored in percent — confirm.
    plt.gca().set_ylim([0,100])
    plt.show()
    return None

## Bayesian Hyperparameter Tuning

In [13]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, cross_val_score

In [11]:
# Load the credit-card default CSV with its first column as the index.
df = pd.read_csv(
    '../input/default-of-credit-card-clients-dataset/UCI_Credit_Card.csv',
    index_col=0,
)

# Label and predictors.
target = df['default.payment.next.month']
features = df.drop(columns=['default.payment.next.month'])

# Dummy-encode SEX, EDUCATION and MARRIAGE, dropping each first level.
y = target
X = pd.get_dummies(
    features,
    columns=['SEX', 'EDUCATION', 'MARRIAGE'],
    drop_first=True,
)

# Stratified 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    train_size=0.8,
    stratify=y,
    random_state=0,
)

In [21]:
from sklearn.ensemble import GradientBoostingClassifier
from hyperopt import hp, fmin, tpe

In [22]:
# Hyperopt search space:
#   max_depth     - quantised uniform over [2, 10] in steps of 2
#   learning_rate - continuous uniform over [0.001, 0.9]
space = {
    'max_depth': hp.quniform('max_depth', 2, 10, 2),
    'learning_rate': hp.uniform('learning_rate', 0.001, 0.9),
}

In [23]:
# Set up objective function
def objective(params):
    """Loss for hyperopt: one minus the mean 2-fold CV accuracy.

    Builds a 100-tree GradientBoostingClassifier from the sampled ``params``
    (keys ``'max_depth'`` and ``'learning_rate'``) and cross-validates it on
    the module-level ``X_train`` / ``y_train``.

    Returns the value to minimise (lower is better).
    """
    # The sampled max_depth can arrive as a float (e.g. 6.0), so cast it to int.
    depth = int(params['max_depth'])
    rate = params['learning_rate']

    estimator = GradientBoostingClassifier(
        n_estimators=100,
        max_depth=depth,
        learning_rate=rate,
    )
    fold_scores = cross_val_score(
        estimator, X_train, y_train, scoring='accuracy', cv=2, n_jobs=4
    )
    return 1 - fold_scores.mean()

In [24]:
# Minimise `objective` over `space` with the TPE algorithm for 20 evaluations.
# NOTE(review): newer hyperopt releases appear to expect a
# numpy.random.Generator (np.random.default_rng) for `rstate` — confirm against
# the installed hyperopt version before changing this.
best = fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=20,
    rstate=np.random.RandomState(42),
)
print(best)

100%|██████████| 20/20 [04:06<00:00, 12.31s/trial, best loss: 0.1822083333333333]
{'learning_rate': 0.0128515490384306, 'max_depth': 6.0}


## Genetic Hyperparameter optimizer

In [25]:
import tpot

In [27]:
# TPOT search settings: generations, population size and offspring size ...
number_generations, population_size, offspring_size = 3, 4, 3
# ... and the metric used to score candidate pipelines.
scoring_function = 'accuracy'

In [28]:
# Build the TPOT classifier from the settings chosen above.
tpot_kwargs = dict(
    generations=number_generations,
    population_size=population_size,
    offspring_size=offspring_size,
    scoring=scoring_function,
    verbosity=2,
    random_state=2,
    cv=2,
)
tpot_clf = tpot.TPOTClassifier(**tpot_kwargs)

# Run the pipeline search on the training split.
tpot_clf.fit(X_train, y_train)

# Report the score on the held-out split.
test_score = tpot_clf.score(X_test, y_test)
print(test_score)

Optimization Progress:   0%|          | 0/13 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: 0.8188333333333333

Generation 2 - Current best internal CV score: 0.8188333333333333

Generation 3 - Current best internal CV score: 0.8188333333333333

Best pipeline: DecisionTreeClassifier(input_matrix, criterion=gini, max_depth=1, min_samples_leaf=10, min_samples_split=9)
0.8226666666666667


_____

#### Encode Features with ColumnTransformer

In [None]:
# One-hot encode SEX, EDUCATION and MARRIAGE (first level dropped, integer
# output) and pass every other column through unchanged.
ohc_step = (
    'ohc',
    OneHotEncoder(dtype='int', drop='first'),
    ['SEX', 'EDUCATION', 'MARRIAGE'],
)
col_tr = ColumnTransformer([ohc_step], remainder='passthrough')

featrues_tran = col_tr.fit_transform(features)

In [None]:
# Print the names of the columns produced by the ColumnTransformer.
# get_feature_names() no longer exists in newer scikit-learn releases, so use
# get_feature_names_out() when the installed version has it and fall back to
# the legacy method otherwise.
col_tr_names = getattr(col_tr, 'get_feature_names_out', None) or col_tr.get_feature_names
print(col_tr_names())

`ColumnTransformer` has two related attributes:  
`.transformers` = the transformers as defined by the user  
`.transformers_` = the fitted transformers for every step, including the remainder (`'passthrough'`)  

In [None]:
# Pull the fitted OneHotEncoder back out of the ColumnTransformer by its step name.
ohc = col_tr.named_transformers_['ohc']
print(ohc)

# Names of the encoded columns. Use get_feature_names_out() when available
# (get_feature_names() no longer exists in newer scikit-learn releases) and
# fall back to the legacy method on older versions.
ohc_names = getattr(ohc, 'get_feature_names_out', None) or ohc.get_feature_names
ohc_names()

In [None]:
# With remainder='passthrough', the encoded columns come first and the
# passed-through columns follow. Take the first 10 columns of the first 5 rows
# and map them back to the original category values.
encoded_head = featrues_tran[:5, :10]
ohc.inverse_transform(encoded_head)

In [None]:
# Original (un-encoded) values of the three columns for index labels up to and
# including 5, for comparison with the inverse-transformed output.
features.loc[:5, ['SEX', 'EDUCATION', 'MARRIAGE']]

Transform with Pandas

In [None]:
# Same encoding done with pandas: dummy-encode the three columns, then show
# only the columns from position 20 onward.
encoded_with_pandas = pd.get_dummies(
    features,
    columns=['SEX', 'EDUCATION', 'MARRIAGE'],
    drop_first=True,
)
encoded_with_pandas.iloc[:, 20:]