<h2>Project Starter</h2>

<p>This notebook provides starter code for building machine learning experiments quickly.</p>

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import scale, minmax_scale, normalize, binarize
from sklearn.feature_selection import SelectKBest, SelectPercentile, VarianceThreshold
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, LogisticRegression
from sklearn.svm import LinearSVC, SVC, SVR, LinearSVR
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.model_selection import LeaveOneOut, KFold, train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, accuracy_score, f1_score, precision_score, recall_score, roc_auc_score
import matplotlib.pyplot as plt

<h3>Input data</h3>

In [None]:
def manual_input(var, val):
    """Build a DataFrame from parallel sequences of column names and values.

    var -- sequence of column names.
    val -- sequence of column contents; val[i] is the data for column var[i].

    Returns the pandas DataFrame produced by DataFrame.from_dict.
    """
    columns = {}
    for position, name in enumerate(var):
        columns[name] = val[position]
    return pd.DataFrame.from_dict(columns)

In [None]:
# Load the dataset into `data`: switch exactly one of the flags below to True.

# Option 1: read it from a CSV file (fill in the path)
if False:
    data = pd.read_csv('')

# Option 2: type it in by hand (column names in `var`, column values in `val`)
if False:
    var = []
    val = [[], []]
    data = manual_input(var, val)

# Preview the first 20 rows
data.head(20)

<h3>Missing data</h3>

In [None]:
# Count the null cells over the whole frame (per-column sums, then the grand total)
missing_total = data.isnull().sum().sum()
print("# Missing = %d" % missing_total)

In [None]:
# Fill missing data: switch at most one of the flags below to True.

# Backward fill (DataFrame.bfill replaces the deprecated fillna(method='bfill'))
if False:
    data.bfill(inplace=True)

# Forward fill (DataFrame.ffill replaces the deprecated fillna(method='ffill'))
if False:
    data.ffill(inplace=True)

# Fill with the mean of each column
# (numeric_only=True keeps non-numeric columns from raising; they are left unfilled)
if False:
    data.fillna(data.mean(numeric_only=True), inplace=True)

# Fill with the median of each column (same numeric-only restriction)
if False:
    data.fillna(data.median(numeric_only=True), inplace=True)

<h3>Standardization</h3>

In [None]:
# Standardization of datasets: switch one of the flags below to True.
# Each option overwrites every cell of `data` in place with the transformed values.

# Zero mean and unit variance
if False:
    data.iloc[:, :] = scale(data)

# Rescale features to the 0-1 range
if False:
    data.iloc[:, :] = minmax_scale(data)

# L2-normalize each row
if False:
    data.iloc[:, :] = normalize(data)

<h3>Thresholding</h3>

In [None]:
# Thresholding numerical features

# Binarize all values against the threshold passed below (currently 2).
# NOTE(review): this comment previously said the threshold was 0 while the code
# passes threshold=2 — confirm which value is intended.
if(False):
    data.iloc[:,:] = binarize(data, threshold=2)

<h3>Categorical features (NT)</h3>

In [None]:
# Encoding: list the columns to one-hot encode and the columns to integer-encode.
# By default every column is integer-encoded and none is one-hot encoded.
cat_to_onehot = []
cat_to_int = list(data.columns.values)
total = cat_to_int + cat_to_onehot

# Cast every listed column to the pandas 'category' dtype
if True:
    data = data.apply(lambda col: col.astype('category') if col.name in total else col)

# One-hot encode the columns listed in cat_to_onehot
if False:
    data = pd.get_dummies(data, columns=cat_to_onehot)

# Replace the columns listed in cat_to_int by their integer category codes
if True:
    data = data.apply(lambda col: col.cat.codes if col.name in cat_to_int else col)

<h3>Feature selection (NT)</h3>

<p>Functions available to perform feature selection:</p>
<ul>
    <li>regression problems: f_regression, mutual_info_regression</li>
    <li>classification problems: chi2, f_classif, mutual_info_classif</li>
</ul>

In [None]:
# Must specify features matrix X and target vector y before enabling a supervised selector!
# `function` is a placeholder: replace it with one of the scoring functions listed above.

# Select features with variance above a threshold (unsupervised: no target needed)
if False:
    data = VarianceThreshold(threshold=0.0).fit_transform(data)

# Select K best variables (fill function and k parameters!)
# The selector is scored against the target y, not against the features themselves.
if False:
    data = SelectKBest(function, k=10).fit_transform(X, y)

# Select X percent best features (fill function and percentile (how much to keep) parameters!)
# SelectPercentile is the selector that takes `percentile`; SelectKBest does not accept it.
if False:
    data = SelectPercentile(function, percentile=50).fit_transform(X, y)

# Perform PCA (principal component analysis) -- not implemented yet
if False:
    pass
    # PCA

<h3>Correlation coefficient</h3>

In [None]:
# Must specify features matrix X and target vector Y.
# Each list holds one coefficient per column of X, in column order.
# A list comprehension is used because map() is lazy in Python 3 and would compute nothing.

# Pearson's coefficient
if False:
    pearson = [X[column].corr(Y, method='pearson') for column in X.columns]

# Spearman's coefficient
if False:
    spearman = [X[column].corr(Y, method='spearman') for column in X.columns]

<h3>Validation</h3>

In [None]:
# Hold-out split: 33% of the samples go to the test set, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# Cross-validation splitter: K-fold by default; switch the flag to use leave-one-out
val = KFold()
if False:
    val = LeaveOneOut()

# Each iteration overwrites the hold-out split above with the current fold.
# NOTE(review): the indexing below assumes X and y are NumPy arrays; for pandas
# objects use X.iloc[train_index] / y.iloc[train_index] instead — confirm.
for train_index, test_index in val.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    print(X_train, X_test, y_train, y_test)

<h3>Regression</h3>

In [None]:
# Linear regression without regularization
# Fit on (Xtr, Ytr), predict on Xva, report metrics against Yva.

# The parallelism argument is spelled n_jobs; `jobs` raises a TypeError
model = LinearRegression(n_jobs=-1)
model.fit(Xtr, Ytr)
y_pred = model.predict(Xva)
regression_metrics(y_pred, Yva)

In [None]:
# Ridge: linear regression with L2 regularization
# Fit on (Xtr, Ytr), predict on Xva, report metrics against Yva.

model = Ridge(alpha=1.0, random_state=1).fit(Xtr, Ytr)
y_pred = model.predict(Xva)
regression_metrics(y_pred, Yva)

In [None]:
# Lasso: linear regression with L1 regularization
# Fit on (Xtr, Ytr), predict on Xva, report metrics against Yva.

model = Lasso(alpha=1.0, random_state=1).fit(Xtr, Ytr)
y_pred = model.predict(Xva)
regression_metrics(y_pred, Yva)

In [None]:
# Elastic-net: linear regression with l1_ratio set to 0.5
# Fit on (Xtr, Ytr), predict on Xva, report metrics against Yva.

model = ElasticNet(alpha=1.0, l1_ratio=0.5, random_state=1).fit(Xtr, Ytr)
y_pred = model.predict(Xva)
regression_metrics(y_pred, Yva)

In [None]:
# Linear-kernel support vector regressor
# Fit on (Xtr, Ytr), predict on Xva, report metrics against Yva.

model = LinearSVR(
    epsilon=0.0,
    loss='epsilon_insensitive',
    C=1.0,
    random_state=1,
).fit(Xtr, Ytr)
y_pred = model.predict(Xva)
regression_metrics(y_pred, Yva)

In [None]:
# Support vector regressor with other kernels
# Fit on (Xtr, Ytr), predict on Xva, report metrics against Yva.

# SVR takes no random_state argument (passing one raises a TypeError), so it is omitted
model = SVR(kernel='rbf', gamma='auto', epsilon=0.1, C=1.0, cache_size=1000)
model.fit(Xtr, Ytr)
y_pred = model.predict(Xva)
regression_metrics(y_pred, Yva)

In [None]:
# K-nearest-neighbors regressor
# Fit on (Xtr, Ytr), predict on Xva, report metrics against Yva.

model = KNeighborsRegressor(
    n_neighbors=5,
    weights='uniform',
    algorithm='auto',
    leaf_size=30,
    p=2,
    metric='minkowski',
    n_jobs=-1,
).fit(Xtr, Ytr)
y_pred = model.predict(Xva)
regression_metrics(y_pred, Yva)

In [None]:
# Tree regressor
# Fit on (Xtr, Ytr), predict on Xva, report metrics against Yva.

# criterion='mse' and min_impurity_split=None only restated the library defaults and
# are rejected by current scikit-learn releases (the criterion was renamed, the split
# threshold removed). Leaving them out keeps the default on every version.
model = DecisionTreeRegressor(splitter='best', max_depth=None, min_samples_split=2,
                              min_samples_leaf=1, min_weight_fraction_leaf=0.0,
                              random_state=1, max_leaf_nodes=None,
                              min_impurity_decrease=0.0)
model.fit(Xtr, Ytr)
y_pred = model.predict(Xva)
regression_metrics(y_pred, Yva)

In [None]:
# Multi-layer perceptron regressor with hidden_layer_sizes=(100,)
# Fit on (Xtr, Ytr), predict on Xva, report metrics against Yva.

model = MLPRegressor(
    hidden_layer_sizes=(100,),
    activation='relu',
    learning_rate='constant',
    learning_rate_init=0.001,
    random_state=1,
).fit(Xtr, Ytr)
y_pred = model.predict(Xva)
regression_metrics(y_pred, Yva)

In [None]:
# Random forest regressor
# Fit on (Xtr, Ytr), predict on Xva, report metrics against Yva.

# criterion='mse', max_features='auto' and min_impurity_split=None only restated the
# library defaults and are rejected by current scikit-learn releases. Leaving them
# out keeps the default behaviour on every version.
model = RandomForestRegressor(n_estimators=10, max_depth=None,
                              min_samples_split=2, min_samples_leaf=1,
                              min_weight_fraction_leaf=0.0, max_leaf_nodes=None,
                              min_impurity_decrease=0.0, bootstrap=True,
                              oob_score=False, n_jobs=-1, random_state=1)
model.fit(Xtr, Ytr)
y_pred = model.predict(Xva)
regression_metrics(y_pred, Yva)

<h3>Classification</h3>

In [None]:
# Logistic regression with an L2 penalty
# Fit on (Xtr, Ytr), predict on Xva, report metrics against Yva.

model = LogisticRegression(
    penalty='l2',
    C=1.0,
    random_state=1,
    n_jobs=-1,
).fit(Xtr, Ytr)
y_pred = model.predict(Xva)
classification_metrics(y_pred, Yva)

In [None]:
# Linear-kernel support vector classifier with an L2 penalty
# Fit on (Xtr, Ytr), predict on Xva, report metrics against Yva.

model = LinearSVC(penalty='l2', C=1.0, random_state=1).fit(Xtr, Ytr)
y_pred = model.predict(Xva)
classification_metrics(y_pred, Yva)

In [None]:
# Support vector classifier with a non-linear kernel (RBF here)
# Fit on (Xtr, Ytr), predict on Xva, report metrics against Yva.

model = SVC(
    kernel='rbf',
    gamma='auto',
    C=1.0,
    random_state=1,
    cache_size=1000,
).fit(Xtr, Ytr)
y_pred = model.predict(Xva)
classification_metrics(y_pred, Yva)

In [None]:
# K-nearest-neighbors classifier
# Fit on (Xtr, Ytr), predict on Xva, report metrics against Yva.

model = KNeighborsClassifier(
    n_neighbors=5,
    weights='uniform',
    algorithm='auto',
    leaf_size=30,
    p=2,
    metric='minkowski',
    n_jobs=-1,
).fit(Xtr, Ytr)
y_pred = model.predict(Xva)
classification_metrics(y_pred, Yva)

In [None]:
# Gaussian (normal distribution) naive Bayes classifier
# Fit on (Xtr, Ytr), predict on Xva, report metrics against Yva.

model = GaussianNB().fit(Xtr, Ytr)
y_pred = model.predict(Xva)
classification_metrics(y_pred, Yva)

In [None]:
# Multinomial naive Bayes classifier
# Trained on every column of `data` except 'classe', which is used as the label.

model = MultinomialNB(alpha=1e-6).fit(data.loc[:, data.columns != 'classe'], data['classe'])

# Class probabilities for a single hand-written sample, displayed as the cell output.
# classification_metrics is not called here: y_pred holds probabilities for that one sample.
y_pred = model.predict_proba([[1, 2, 1, 1]])
y_pred

In [None]:
# Bernoulli naive Bayes classifier (binarize=0.0 is passed to the estimator)
# Fit on (Xtr, Ytr), predict on Xva, report metrics against Yva.

model = BernoulliNB(alpha=1.0, binarize=0.0).fit(Xtr, Ytr)
y_pred = model.predict(Xva)
classification_metrics(y_pred, Yva)

In [None]:
# Tree classifier
# Trained on every column of `data` except 'espera', which is used as the label.

# min_impurity_split=None and presort=False only restated the library defaults and
# are rejected by current scikit-learn releases. Leaving them out keeps the default
# behaviour on every version.
model = DecisionTreeClassifier(criterion='entropy', splitter='best', max_depth=None,
                               min_samples_split=2, min_samples_leaf=1,
                               min_weight_fraction_leaf=0.0, max_features=None,
                               random_state=1, max_leaf_nodes=None,
                               min_impurity_decrease=0.0)
model.fit(data.loc[:, data.columns != 'espera'], data['espera'])

# Predicted label for a single hand-written sample.
# classification_metrics is not called here: only that one sample is predicted.
y_pred = model.predict([[1,1,1,0]])
print(y_pred)

In [None]:
# Neural network classifier
# Fit on (Xtr, Ytr), predict on Xva, report metrics against Yva.

model = MLPClassifier(hidden_layer_sizes=(100,), activation='relu',
                      learning_rate='constant', learning_rate_init=0.001, random_state=1)
model.fit(Xtr, Ytr)
y_pred = model.predict(Xva)
# A classifier is scored with the classification metrics, like the other classifier cells
classification_metrics(y_pred, Yva)

In [None]:
# Random forest classifier
# Fit on (Xtr, Ytr), predict on Xva, report metrics against Yva.

# max_features='auto' and min_impurity_split=None only restated the library defaults
# and are rejected by current scikit-learn releases. Leaving them out keeps the
# default behaviour on every version.
model = RandomForestClassifier(n_estimators=10, criterion='gini', max_depth=None,
                               min_samples_split=2, min_samples_leaf=1,
                               min_weight_fraction_leaf=0.0, max_leaf_nodes=None,
                               min_impurity_decrease=0.0, bootstrap=True,
                               oob_score=False, n_jobs=-1, random_state=1)
model.fit(Xtr, Ytr)
y_pred = model.predict(Xva)
# A classifier is scored with the classification metrics, like the other classifier cells
classification_metrics(y_pred, Yva)

<h3>Clustering</h3>

In [None]:
# K Means
# Fits two clusters on a small hard-coded 2-D sample and prints the learned centers.

# n_jobs and algorithm='auto' are rejected by current scikit-learn releases.
# Leaving them out keeps the library's default algorithm on every version.
model = KMeans(n_clusters=2, init='k-means++', max_iter=5, tol=0.0001,
               random_state=1, copy_x=True)
model.fit([[1,1],[1,2],[2,1],[2,2],[5,1],[6,1],[5,2]])
print(model.cluster_centers_)

In [None]:
# DBSCAN clustering (the estimator is only configured here, not fitted)
model = DBSCAN(
    eps=0.5,
    min_samples=5,
    metric='euclidean',
    metric_params=None,
    algorithm='auto',
    leaf_size=30,
    p=None,
    n_jobs=-1,
)

In [None]:
# Hierarchical (agglomerative) clustering (the estimator is only configured here, not fitted)

# pooling_func was a pasted function repr (a syntax error), and both it and
# affinity='euclidean' are rejected by current scikit-learn releases. Both only
# restated library defaults, so they are left out.
model = AgglomerativeClustering(n_clusters=2, memory=None,
                                connectivity=None, compute_full_tree='auto',
                                linkage='ward')

<h3>Results / metrics</h3>

In [None]:
def regression_metrics(y_pred, Yva):
    """Print the mean absolute error, mean squared error and R^2 score.

    y_pred -- predicted target values.
    Yva    -- true target values the predictions are compared against.
    """
    print('MAE = ' + str(mean_absolute_error(Yva, y_pred)))
    print('MSE = ' + str(mean_squared_error(Yva, y_pred)))
    # This line reports r2_score, so it is labelled R2 rather than MAE
    print('R2 = ' + str(r2_score(Yva, y_pred)))
    
def classification_metrics(y_pred, Yva):
    """Print accuracy, F1, precision, recall and ROC AUC for predicted labels.

    This helper was previously defined under the name regression_metrics, which
    shadowed the regression helper; the classification cells call it by this name.

    y_pred -- predicted labels.
    Yva    -- true labels the predictions are compared against.

    NOTE(review): the scores are computed with scikit-learn's default arguments,
    which presumably target binary labels — confirm before using on multiclass data.
    """
    print('Accuracy = ' + str(accuracy_score(Yva, y_pred)))
    print('F1 = ' + str(f1_score(Yva, y_pred)))
    print('Precision = ' + str(precision_score(Yva, y_pred)))
    print('Recall = ' + str(recall_score(Yva, y_pred)))
    print('ROC AUC = ' + str(roc_auc_score(Yva, y_pred)))