In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.model_selection import cross_val_score, GridSearchCV, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
import matplotlib.pyplot as plt

# importing machine learning models for prediction
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis, LinearDiscriminantAnalysis
from xgboost import XGBClassifier
from sklearn.svm import SVC, LinearSVC

In [4]:
def split_data(X, y):
    """Partition features and labels into train, test and validation subsets.

    Two chained ``train_test_split`` calls are used: the first holds out 20%
    of the rows, the second divides that hold-out in half, which yields an
    80% / 10% / 10% train / test / validation partition. Fixed seeds (1 and 2)
    are passed to both calls so the partition is the same on every run.

    Args:
        X: Feature table; anything ``train_test_split`` accepts.
        y: Labels aligned row-for-row with ``X``.

    Returns:
        Tuple ``(X_train, X_test, X_val, y_train, y_test, y_val)``.
    """
    # Stage 1: keep 80% for training and set the remaining 20% aside.
    X_train, X_holdout, y_train, y_holdout = train_test_split(
        X, y, test_size=0.2, random_state=1
    )
    # Stage 2: halve the hold-out into the test and validation parts.
    X_test, X_val, y_test, y_val = train_test_split(
        X_holdout, y_holdout, test_size=0.5, random_state=2
    )

    return X_train, X_test, X_val, y_train, y_test, y_val

# Load the cleaned numeric training data and preview its first rows
df = pd.read_csv('cleaned_data/cleaned_numeric_training_data.csv')
print(df.head())

# Pull the identifier and the target out of the frame; every remaining
# column is used as a model feature.
building_ids = df['building_id']
labels = df['damage_grade']
training_data = df.drop(columns=['building_id', 'damage_grade'])

X_train, X_test, X_val, y_train, y_test, y_val = split_data(training_data, labels)

# Print each feature subset, then each label subset, under its own banner
for banner, subset in (
    ('--------------Train Set--------------\n', X_train),
    ('--------------Test Set--------------\n', X_test),
    ('--------------Validation Set--------------\n', X_val),
    ('--------------Train Set Labels--------------\n', y_train),
    ('--------------Test Set Labels--------------\n', y_test),
    ('--------------Validation Set Labels--------------\n', y_val),
):
    print(banner, subset)


   building_id damage_grade  geo_level_1_id  geo_level_2_id  geo_level_3_id  \
0       802906         High               6             487           12198   
1        28830       Medium               8             900            2812   
2        94947         High              21             363            8973   
3       590882       Medium              22             418           10694   
4       201944         High              11             131            1488   

   count_floors_pre_eq  age  area_percentage  height_percentage  \
0                    2   30                6                  5   
1                    2   10                8                  7   
2                    2   10                5                  5   
3                    2   10                6                  5   
4                    3   30                8                  9   

   has_superstructure_adobe_mud  ...  plan_configuration_m  \
0                             1  ...                     0  

In [5]:
# One pipeline per candidate classifier: standardise the features, then fit
# the estimator with its default settings. The order matters because the
# pipelines are paired positionally with the names in classifier_types.
pipelines = [
    make_pipeline(StandardScaler(), estimator_cls())
    for estimator_cls in (
        LogisticRegression,
        GaussianNB,
        DecisionTreeClassifier,
        RandomForestClassifier,
        AdaBoostClassifier,
        BaggingClassifier,
        LinearDiscriminantAnalysis,
        QuadraticDiscriminantAnalysis,
    )
]

# Currently disabled candidates:
# make_pipeline(StandardScaler(), SVC(gamma=2, C=1)),
# make_pipeline(StandardScaler(), MLPClassifier(max_iter=1000))

# Grid Search for different classifiers
# One parameter grid per classifier, in the same order as the first four
# entries of `pipelines` / `classifier_types`; the remaining classifiers
# have no grid yet.
# NOTE(review): keys are bare estimator parameter names. If these grids are
# searched over the make_pipeline objects they need the step prefix
# (e.g. 'logisticregression__C') - confirm the intended usage.
search_spaces = [
    # Logistic Regression
    # NOTE(review): not every solver/penalty pair below is supported by
    # LogisticRegression ('l1' needs liblinear or saga, 'elasticnet' needs
    # saga, liblinear cannot run unpenalised), and newer scikit-learn
    # releases spell the unpenalised option None instead of 'none'.
    # Left unchanged because the right spelling depends on the installed
    # version - TODO prune once the version is pinned.
    {
        "solver": ['newton-cg', 'lbfgs', 'liblinear'],
        "penalty": ['none', 'l1', 'l2', 'elasticnet'],
        "C": [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100]
    },
    # Naive Bayes
    {
        'var_smoothing': [1e-2, 1e-3, 1e-4, 1e-5, 1e-6, 1e-7, 1e-8, 1e-9,
                          1e-10, 1e-11, 1e-12, 1e-13, 1e-14, 1e-15]
    },
    # Decision Tree
    # The key is spelled max_depth (underscore) and holds one integer per
    # candidate depth. DecisionTreeClassifier has no n_components
    # parameter, so that entry was dropped.
    {
        "criterion": ['gini', 'entropy'],
        "max_depth": list(range(2, 12))
    },
    # Random Forest
    # 'auto' is omitted: for classifiers it was an alias of 'sqrt' and it
    # is rejected by newer scikit-learn releases.
    {
        'max_features': ['sqrt', 'log2'],
        'max_depth': list(range(2, 12)),
        'criterion': ['gini', 'entropy']
    }
]

# Display names for the classifiers, listed in the same order as `pipelines`
# so the two sequences can be zipped together.
classifier_types = [
    'Logistic Regression',
    'Naive Bayes',
    'Decision Tree',
    'Random Forest',
    'AdaBoost',
    'Bagging',
    'LDA',
    'QDA',
]

In [None]:
# Run random forest
# Baseline model: a random forest with default hyperparameters, fitted on the
# unscaled training features and evaluated on the test split.
print('Running random forest classifier')
# A fixed seed keeps the fitted forest, and therefore the confusion matrix
# and F1 score below, identical across re-runs of the notebook.
rf = RandomForestClassifier(random_state=1)
print('Fitting random forest classifier')
rf.fit(X_train, y_train)
print('Making predictions')
rf_pred = rf.predict(X_test)

print('Creating confusion matrix for random forest classifier')
rf_confusion_matrix = confusion_matrix(y_test, rf_pred)
print(rf_confusion_matrix)
print('Printing confusion matrix')
rf_confusion_display = ConfusionMatrixDisplay.from_predictions(y_test, rf_pred)
# Title the figure so it can be read on its own when skimming the notebook.
rf_confusion_display.ax_.set_title('Random forest - test set')
plt.show()
print('Calculating f1 score')
# Micro-averaged F1: computed from the pooled true/false positive and
# negative counts over all classes.
print(f1_score(y_test, rf_pred, average='micro'))


In [None]:
# Run all classifiers and plot results
# Scores and display names are collected under their own names so that the
# `labels` Series (the target column used when splitting the data) is not
# overwritten by this cell.
results, result_labels = list(), list()

# KFold only accepts a random_state together with shuffle=True; without it
# current scikit-learn releases raise a ValueError. Shuffling with a fixed
# seed gives randomised but reproducible folds.
k_fold = KFold(n_splits=10, shuffle=True, random_state=8)
for clf, label in zip(pipelines, classifier_types):
    print('Running 10 fold cross validation for', label)
    scores = cross_val_score(clf,
                             X_train,
                             y_train,
                             cv=k_fold,
                             scoring='accuracy',
                             verbose=1)
    results.append(scores)
    result_labels.append(label)
    print('Accuracy: %0.2f (+/- %0.2f) [%s]' % (scores.mean(), scores.std(), label))

# Creating boxplot of performance of classifiers
# One box per classifier, built from its per-fold accuracy scores.
plt.boxplot(results, labels=result_labels, showmeans=True)
plt.show()



In [None]:
# Tune classifiers
# Placeholder cell: hyperparameter tuning is not implemented yet, so this
# only prints a reminder.
print("TODO")