In [None]:
import sys

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import ExtraTreesClassifier 
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [None]:
from utilities import visualize_classifier

Load input data

In [None]:
# Read the comma-separated dataset; every column but the last is a feature
# and the last column holds the label.
input_file = 'data_imbalance.txt'
data = np.loadtxt(fname=input_file, delimiter=',')
X = data[:, :-1]
y = data[:, -1]

Separate input data into two classes based on labels

In [None]:
# Boolean-mask the feature rows by label value to get one array per class.
class_0, class_1 = (np.array(X[y == label]) for label in (0, 1))

Visualize input data

In [None]:
# Draw both classes on one figure: class 0 as black crosses, class 1 as
# white-filled circles. Options common to both calls are factored out.
plt.figure()
shared_style = dict(s=75, edgecolors='black', linewidth=1)
plt.scatter(class_0[:, 0], class_0[:, 1], facecolors='black', marker='x', **shared_style)
plt.scatter(class_1[:, 0], class_1[:, 1], facecolors='white', marker='o', **shared_style)
plt.title('Input data')

Split data into training and testing datasets 

In [None]:
# Hold out 25% of the samples for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=5)

Extremely Random Forests classifier

In [None]:
# Base hyperparameters for the Extra Trees ensemble.
params = {'n_estimators': 100, 'max_depth': 4, 'random_state': 0}

# Class re-weighting is opt-in: pass the literal token 'balance' as an argument.
# Other arguments are deliberately ignored rather than rejected, because under
# a Jupyter kernel sys.argv already carries the kernel's own launch arguments
# (e.g. '-f <connection file>'), and rejecting them would make this cell raise
# on every notebook run.
if 'balance' in sys.argv[1:]:
    params['class_weight'] = 'balanced'

In [None]:
# Build the ensemble from the chosen hyperparameters and fit it on the
# training split (fit() returns the fitted estimator), then draw the
# classifier against the training points.
classifier = ExtraTreesClassifier(**params).fit(X_train, y_train)
visualize_classifier(classifier, X_train, y_train, 'Training dataset')

In [None]:
# Predict labels for the held-out samples (kept for the report further down),
# then draw the classifier against the test points.
y_test_pred = classifier.predict(X=X_test)
visualize_classifier(classifier, X_test, y_test, 'Test dataset')

Evaluate classifier performance

In [None]:
# Print the per-class report for predictions on the training split,
# framed by separator lines.
class_names = ['Class-0', 'Class-1']
separator = "#" * 40
print("\n" + separator)
print("\nClassifier performance on training dataset\n")
y_train_pred = classifier.predict(X_train)
print(classification_report(y_train, y_train_pred, target_names=class_names))
print(separator + "\n")

In [None]:
# Print the per-class report for the held-out predictions computed earlier.
banner = "#" * 40
print(banner)
print("\nClassifier performance on test dataset\n")
test_report = classification_report(y_test, y_test_pred, target_names=class_names)
print(test_report)
print(banner + "\n")

In [None]:
# Display the figures created by the cells above.
plt.show()

########################################################################################

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

In [None]:
from utilities import visualize_classifier

Load input data

In [None]:
# Read the comma-separated dataset; the final column is the label and the
# remaining columns are the features.
input_file = 'data_decision_trees.txt'
data = np.loadtxt(fname=input_file, delimiter=',')
X = data[:, :-1]
y = data[:, -1]

Separate input data into two classes based on labels

In [None]:
# One array of feature rows per label value (0 and 1).
class_0, class_1 = (np.array(X[y == label]) for label in (0, 1))

Visualize input data

In [None]:
# Scatter the two classes with distinct markers: black crosses for class 0,
# white-filled circles for class 1.
plt.figure()
shared_style = dict(s=75, edgecolors='black', linewidth=1)
plt.scatter(class_0[:, 0], class_0[:, 1], facecolors='black', marker='x', **shared_style)
plt.scatter(class_1[:, 0], class_1[:, 1], facecolors='white', marker='o', **shared_style)
plt.title('Input data')

Split data into training and testing datasets 

In [None]:
# 75/25 train/test split with a fixed seed so the partition is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=5,
)

Decision Trees classifier 

In [None]:
# Depth-limited decision tree with a fixed seed; fit() returns the fitted
# estimator, which is then drawn against the training points.
params = dict(random_state=0, max_depth=4)
classifier = DecisionTreeClassifier(**params).fit(X_train, y_train)
visualize_classifier(classifier, X_train, y_train, 'Training dataset')

In [None]:
# Predict the held-out labels (used by the report below) and draw the
# classifier against the test points.
y_test_pred = classifier.predict(X=X_test)
visualize_classifier(classifier, X_test, y_test, 'Test dataset')

Evaluate classifier performance

In [None]:
# Per-class report for predictions on the training split, framed by separators.
class_names = ['Class-0', 'Class-1']
separator = "#" * 40
print("\n" + separator)
print("\nClassifier performance on training dataset\n")
y_train_pred = classifier.predict(X_train)
print(classification_report(y_train, y_train_pred, target_names=class_names))
print(separator + "\n")

In [None]:
# Per-class report for the held-out predictions computed earlier.
banner = "#" * 40
print(banner)
print("\nClassifier performance on test dataset\n")
test_report = classification_report(y_test, y_test_pred, target_names=class_names)
print(test_report)
print(banner + "\n")

In [None]:
# Display the figures created by the cells above.
plt.show()

########################################################################################

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn import datasets
from sklearn.metrics import mean_squared_error, explained_variance_score
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

Load housing data

In [None]:
# Load the housing dataset through scikit-learn's bundled loader.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2, so this cell fails on current releases — confirm the pinned
# scikit-learn version or choose a replacement dataset.
housing_data = datasets.load_boston() 

Shuffle the data

In [None]:
# Shuffle features and targets together with a fixed seed.
features, targets = housing_data.data, housing_data.target
X, y = shuffle(features, targets, random_state=7)

Split data into training and testing datasets 

In [None]:
# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=7,
)

AdaBoost Regressor model

In [None]:
# AdaBoost over depth-4 regression trees (n_estimators=400), seeded for
# repeatability and fitted on the training split.
base_tree = DecisionTreeRegressor(max_depth=4)
regressor = AdaBoostRegressor(base_tree, n_estimators=400, random_state=7)
regressor.fit(X_train, y_train)

Evaluate performance of AdaBoost regressor

In [None]:
# Score the regressor on the held-out split and print both metrics rounded
# to two decimals.
y_pred = regressor.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
evs = explained_variance_score(y_test, y_pred)
print("\nADABOOST REGRESSOR")
for caption, score in (("Mean squared error =", mse),
                       ("Explained variance score =", evs)):
    print(caption, round(score, 2))

Extract feature importances

In [None]:
# Importances as reported by the fitted regressor.
feature_importances = regressor.feature_importances_
# The names are indexed with an integer index array further down, which only
# works on an ndarray. np.asarray leaves an existing array as-is and converts
# a plain list of names, so either form of `feature_names` is accepted.
feature_names = np.asarray(housing_data.feature_names)

Normalize the importance values 

In [None]:
# Rescale so the largest importance becomes 100.
peak_importance = feature_importances.max()
feature_importances = 100.0 * (feature_importances / peak_importance)

Sort the feature indices by importance, from most to least important

In [None]:
# argsort is ascending; reversing it gives indices from most to least important.
index_sorted = np.argsort(feature_importances)[::-1]

Compute the X-axis tick positions, one per feature

In [None]:
# One bar position per feature, offset by 0.5.
pos = np.arange(len(index_sorted)) + 0.5

Plot the bar graph

In [None]:
# Bar chart of the rescaled importances, most important feature first,
# with the matching feature names as tick labels.
sorted_importances = feature_importances[index_sorted]
sorted_names = feature_names[index_sorted]
plt.figure()
plt.bar(pos, sorted_importances, align='center')
plt.xticks(pos, sorted_names)
plt.ylabel('Relative Importance')
plt.title('Feature importance using AdaBoost regressor')
plt.show()

########################################################################################

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

In [None]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import classification_report

Define the minimum and maximum values for X and Y<br>
that will be used in the mesh grid

In [None]:
def visualize_classifier(classifier, X, y):
    """Compute the padded bounds of the first two feature columns of ``X``.

    Each bound is the column's minimum or maximum moved outward by 1.0.
    As written, the bounds are neither used nor returned, ``classifier`` and
    ``y`` are not touched, and the function returns ``None``.
    """
    first_column, second_column = X[:, 0], X[:, 1]
    min_x = first_column.min() - 1.0
    max_x = first_column.max() + 1.0
    min_y = second_column.min() - 1.0
    max_y = second_column.max() + 1.0

In [None]:
# Progress message printed before the dataset is read.
print("\nLoad file")

Load input data

In [None]:
# Read the comma-separated dataset; the final column is the label and the
# remaining columns are the features.
input_file = 'data_random_forests.txt'
data = np.loadtxt(fname=input_file, delimiter=',')
X = data[:, :-1]
y = data[:, -1]

Separate input data into three classes based on labels

In [None]:
# One array of feature rows per label value (0, 1 and 2).
class_0, class_1, class_2 = (np.array(X[y == label]) for label in (0, 1, 2))

Split the data into training and testing datasets 

In [None]:
# Hold out 25% of the samples for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=5)

Define the parameter grid 

In [None]:
# Two independent sweeps: vary the tree depth at a fixed ensemble size, then
# vary the ensemble size at a fixed depth.
depth_sweep = {'n_estimators': [100], 'max_depth': [2, 4, 7, 12, 16]}
size_sweep = {'max_depth': [4], 'n_estimators': [25, 50, 100, 250]}
parameter_grid = [depth_sweep, size_sweep]

In [None]:
# Scoring names handed to the grid search, one search per entry.
metrics = [
    'precision_weighted',
    'recall_weighted',
]

In [None]:
# Run one cross-validated grid search per scoring metric and report the result.
for metric in metrics:
    print("\n##### Searching optimal parameters for", metric)
    classifier = GridSearchCV(
            ExtraTreesClassifier(random_state=0),
            parameter_grid, cv=5, scoring=metric)
    classifier.fit(X_train, y_train)

    # cv_results_ is a dict of parallel columns, so iterating it directly
    # yields only key names. Pair each candidate's parameters with its mean
    # cross-validated score instead.
    print("\nGrid scores for the parameter grid:")
    cv_results = classifier.cv_results_
    for params, avg_score in zip(cv_results['params'],
                                 cv_results['mean_test_score']):
        print(params, '-->', round(avg_score, 3))

    print("\nBest parameters:", classifier.best_params_)

    # Report the selected estimator's performance on the held-out split.
    y_pred = classifier.predict(X_test)
    print("\nPerformance report:\n")
    print(classification_report(y_test, y_pred))

########################################################################################

In [None]:
import argparse
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.metrics import classification_report
from utilities import visualize_classifier

Argument parser

In [None]:
def build_arg_parser():
    """Build the command-line parser for this script.

    The parser has one required option, ``--classifier-type`` (stored as
    ``classifier_type``), restricted to ``'rf'`` or ``'erf'``.

    Returns:
        argparse.ArgumentParser: the configured parser.
    """
    # The doubled spaces inside both strings are intentional: they reproduce
    # the text the parser has always carried.
    arg_parser = argparse.ArgumentParser(
        description='Classify data using  Ensemble Learning techniques')
    arg_parser.add_argument(
        '--classifier-type',
        dest='classifier_type',
        required=True,
        choices=['rf', 'erf'],
        help="Type of classifier  to use; can be either 'rf' or 'erf'")
    return arg_parser
if __name__=='__main__':
 # Parse the input arguments. parse_args() reads sys.argv, and
 # build_arg_parser() declares --classifier-type as required.
 # NOTE(review): under a Jupyter kernel sys.argv holds the kernel's own
 # launch arguments, so this call would presumably exit with a usage error —
 # confirm how this cell is meant to be run.
 args = build_arg_parser().parse_args()
 classifier_type = args.classifier_type
# Load input data: comma-separated values, features in every column but the
# last and the label in the last column.
 input_file = 'data_random_forests.txt'
 data = np.loadtxt(input_file, delimiter=',')
 X, y = data[:, :-1], data[:, -1]

Separate input data into three classes based on labels

In [None]:
 # One array of feature rows per label value (0, 1 and 2).
# The statements sit at column 0: a notebook cell is compiled on its own, and
# a cell that begins with indented code fails with "unexpected indent".
class_0 = np.array(X[y == 0])
class_1 = np.array(X[y == 1])
class_2 = np.array(X[y == 2])

In [None]:
 # Visualize input data: one marker shape per class (squares, circles, triangles).
# The statements sit at column 0: a notebook cell is compiled on its own, and
# a cell that begins with indented code fails with "unexpected indent".
plt.figure()
plt.scatter(class_0[:, 0], class_0[:, 1], s=75, facecolors='white',
            edgecolors='black', linewidth=1, marker='s')
plt.scatter(class_1[:, 0], class_1[:, 1], s=75, facecolors='white',
            edgecolors='black', linewidth=1, marker='o')
plt.scatter(class_2[:, 0], class_2[:, 1], s=75, facecolors='white',
            edgecolors='black', linewidth=1, marker='^')
plt.title('Input data')

# Split data into training and testing datasets (25% held out, fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=5)

In [None]:
 # Ensemble Learning classifier: 'rf' selects a random forest, anything else Extra Trees.
# The statements sit at column 0: a notebook cell is compiled on its own, and
# a cell that begins with indented code fails with "unexpected indent".
params = {'n_estimators': 100, 'max_depth': 4, 'random_state': 0}
if classifier_type == 'rf':
    classifier = RandomForestClassifier(**params)
else:
    classifier = ExtraTreesClassifier(**params)

# Fit whichever classifier was selected. The call must sit outside the
# if/else so that the random forest is trained too.
classifier.fit(X_train, y_train)
visualize_classifier(classifier, X_train, y_train, 'Training dataset')

In [None]:
# Predict the held-out labels (used by the report below) and draw the
# classifier against the test points.
y_test_pred = classifier.predict(X=X_test)
visualize_classifier(classifier, X_test, y_test, 'Test dataset')

Evaluate classifier performance

In [None]:
# Display names for the three labels, passed to the reports as target_names.
class_names = [
    'Class-0',
    'Class-1',
    'Class-2',
]

In [None]:
# Opening separator for the training report: a blank line, then 40 '#' characters.
print("\n" + "#" * 40)

In [None]:
# Heading for the training-set report.
print("\nClassifier performance on training dataset\n")

In [None]:
# Per-class report for predictions on the training split.
train_report = classification_report(
    y_train, classifier.predict(X_train), target_names=class_names)
print(train_report)

In [None]:
# Closing separator for the training report.
print("#" * 40 + "\n")

In [None]:
# Opening separator for the test report.
print("#" * 40)

In [None]:
# Heading for the test-set report.
print("\nClassifier performance on test dataset\n")

In [None]:
# Per-class report for the held-out predictions computed earlier.
test_report = classification_report(y_test, y_test_pred, target_names=class_names)
print(test_report)

In [None]:
# Closing separator for the test report.
print("#" * 40 + "\n")
##########################################################################################

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.metrics import classification_report

In [None]:
from utilities import visualize_classifier

In [None]:
if __name__=='__main__':
    # Classifier selection: 'rf' picks a random forest, any other value picks
    # Extra Trees. Switch the literal to 'rf' to try the random forest.
    classifier_type = 'erf'

    # Load input data: features in every column but the last, label in the last.
    input_file = 'data_random_forests.txt'
    data = np.loadtxt(input_file, delimiter=',')
    X = data[:, :-1]
    y = data[:, -1]

    # One array of feature rows per label value (0, 1 and 2).
    class_0, class_1, class_2 = (np.array(X[y == label]) for label in (0, 1, 2))

    # Visualize input data: same styling for every class, one marker shape each.
    plt.figure()
    for class_points, class_marker in ((class_0, 's'), (class_1, 'o'), (class_2, '^')):
        plt.scatter(class_points[:, 0], class_points[:, 1], s=75,
                    facecolors='white', edgecolors='black', linewidth=1,
                    marker=class_marker)
    plt.title('Input data')

    # Split data into training and testing datasets (25% held out, fixed seed).
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=5)

    # Ensemble Learning classifier: pick the estimator class, then build and fit it.
    params = {'n_estimators': 100, 'max_depth': 4, 'random_state': 0}
    ensemble_class = (RandomForestClassifier if classifier_type == 'rf'
                      else ExtraTreesClassifier)
    classifier = ensemble_class(**params)
    classifier.fit(X_train, y_train)
    visualize_classifier(classifier, X_train, y_train, 'Training dataset')
    y_test_pred = classifier.predict(X_test)
    visualize_classifier(classifier, X_test, y_test, 'Test dataset')

    # Evaluate classifier performance on both splits.
    class_names = ['Class-0', 'Class-1', 'Class-2']
    separator = "#" * 40
    print("\n" + separator)
    print("\nClassifier performance on training dataset\n")
    y_train_pred = classifier.predict(X_train)
    print(classification_report(y_train, y_train_pred, target_names=class_names))
    print(separator + "\n")
    print(separator)
    print("\nClassifier performance on test dataset\n")
    print(classification_report(y_test, y_test_pred, target_names=class_names))
    print(separator + "\n")

    # Compute confidence: the class name is built from the position of the
    # highest predicted probability for each hand-picked point.
    test_datapoints = np.array([[5, 5], [3, 6], [6, 4], [7, 2], [4, 4], [5, 2]])
    print("\nConfidence measure:")
    for datapoint in test_datapoints:
        probabilities = classifier.predict_proba([datapoint])[0]
        predicted_class = 'Class-{}'.format(np.argmax(probabilities))
        print('\nDatapoint:', datapoint)
        print('Predicted class:', predicted_class)

    # Visualize the datapoints; a list of zeros is passed as their labels.
    placeholder_labels = [0] * len(test_datapoints)
    visualize_classifier(classifier, test_datapoints, placeholder_labels,
                         'Test datapoints')
    plt.show()

########################################################################################

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
#from sklearn import grid_search
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [None]:
from utilities import visualize_classifier

Load input data

In [None]:
# Read the comma-separated dataset; every column but the last is a feature
# and the last column holds the label.
input_file = 'data_random_forests.txt'
data = np.loadtxt(fname=input_file, delimiter=',')
X = data[:, :-1]
y = data[:, -1]

Separate input data into three classes based on labels

In [None]:
# Boolean-mask the feature rows by label value to get one array per class.
class_0, class_1, class_2 = (np.array(X[y == label]) for label in (0, 1, 2))

Split the data into training and testing datasets

In [None]:
# Hold out 25% of the samples for testing with a fixed seed.
# train_test_split is imported as a function, so it is called directly;
# looking up a same-named attribute on it raises AttributeError.
X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=5)

Define the parameter grid

In [None]:
# Two independent sweeps: tree depth at a fixed ensemble size, then ensemble
# size at a fixed depth.
depth_sweep = {'n_estimators': [100], 'max_depth': [2, 4, 7, 12, 16]}
size_sweep = {'max_depth': [4], 'n_estimators': [25, 50, 100, 250]}
parameter_grid = [depth_sweep, size_sweep]

In [None]:
# Scoring names handed to the grid search, one search per entry.
metrics = [
    'precision_weighted',
    'recall_weighted',
]

In [None]:
# Run one cross-validated grid search per scoring metric and report the result.
for metric in metrics:
    print("\n##### Searching optimal parameters for", metric)
    # GridSearchCV comes from sklearn.model_selection; the `grid_search`
    # module name is never bound in this notebook.
    classifier = GridSearchCV(
            ExtraTreesClassifier(random_state=0),
            parameter_grid, cv=5, scoring=metric)
    classifier.fit(X_train, y_train)

    # Per-candidate scores are read from cv_results_, a dict of parallel
    # columns: pair each parameter set with its mean cross-validated score.
    print("\nGrid scores for the parameter grid:")
    cv_results = classifier.cv_results_
    for params, avg_score in zip(cv_results['params'],
                                 cv_results['mean_test_score']):
        print(params, '-->', round(avg_score, 3))
    print("\nBest parameters:", classifier.best_params_)

    # Report the selected estimator's performance on the held-out split.
    y_pred = classifier.predict(X_test)
    print("\nPerformance report:\n")
    print(classification_report(y_test, y_pred))

########################################################################################

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.metrics import classification_report

Load input data

In [None]:
# Read the file as rows of comma-separated string fields.
input_file = 'traffic_data.txt'
data = []
with open(input_file, 'r') as f:
    for line in f:
        # Strip only the line terminator. Slicing off the last character
        # would eat real data when the final line has no trailing newline.
        line = line.rstrip('\n')
        if not line:
            # A blank line would otherwise add a one-field row and make the
            # rows ragged.
            continue
        items = line.split(',')
        data.append(items)

In [None]:
# Convert the list of rows into a NumPy array so columns can be sliced.
# Note that `data` is rebound from a list to an array here.
data = np.array(data)

Convert string data to numerical data

In [None]:
# Encode column by column, using the first row to decide the treatment:
# columns whose first value is all digits are copied as numbers, every other
# column gets its own LabelEncoder. `label_encoder` therefore holds one
# encoder per non-numeric column, in column order.
label_encoder = []
X_encoded = np.empty(data.shape)
for i, item in enumerate(data[0]):
    if not item.isdigit():
        column_encoder = preprocessing.LabelEncoder()
        label_encoder.append(column_encoder)
        X_encoded[:, i] = column_encoder.fit_transform(data[:, i])
        continue
    X_encoded[:, i] = data[:, i]

In [None]:
# Features are every column but the last; the target is the last column.
# Both are cast to integers.
feature_columns = X_encoded[:, :-1]
target_column = X_encoded[:, -1]
X = feature_columns.astype(int)
y = target_column.astype(int)

Split data into training and testing datasets 

In [None]:
# Hold out 25% of the samples for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=5)

Extremely Random Forests regressor

In [None]:
# Extra Trees regressor with a fixed seed, fitted on the training split.
params = dict(n_estimators=100, max_depth=4, random_state=0)
regressor = ExtraTreesRegressor(**params)
regressor.fit(X_train, y_train)

Compute the regressor performance on test data

In [None]:
# Mean absolute error on the held-out split, rounded to two decimals.
y_pred = regressor.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
print("Mean absolute error:", round(mae, 2))

Test the encoding on a single data instance

In [None]:
# Raw feature values for one instance.
# NOTE(review): assumed to be in the same column order as the training
# features — confirm against the data file.
test_datapoint = ['Saturday', '10:20', 'Atlanta', 'no']
test_datapoint_encoded = [-1] * len(test_datapoint)

# Encode the instance the same way the training columns were encoded: digit
# strings become integers, every other value goes through the next fitted
# encoder in `label_encoder` (one per non-numeric column, in column order).
# Leaving the -1 placeholders in place would make the model predict on a
# meaningless input.
encoder_index = 0
for i, item in enumerate(test_datapoint):
    if item.isdigit():
        test_datapoint_encoded[i] = int(item)
    else:
        # transform() takes a sequence of labels, so wrap the single value.
        encoded_value = label_encoder[encoder_index].transform([item])[0]
        test_datapoint_encoded[i] = int(encoded_value)
        encoder_index += 1

Predict the output for the test datapoint

In [None]:
# Predict for the single encoded instance and print the result truncated to an int.
predicted_traffic = regressor.predict([test_datapoint_encoded])[0]
print("Predicted traffic:", int(predicted_traffic))

########################################################################################

In [None]:
import numpy as np
import matplotlib.pyplot as plt