In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sqlite3
import datetime
from sklearn import metrics, preprocessing
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
# from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, ShuffleSplit, learning_curve, validation_curve, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
# from sklearn.feature_selection import SelectKBest, chi2
from joblib import dump, load
#from process_query import convert_causes

In [2]:
def convert_causes(df):
    """Collapse detailed fire causes into a handful of broader categories.

    When ``df`` has a ``STAT_CAUSE_DESCR`` column, every value listed in the
    mapping below is replaced by its broader category; values that are not in
    the mapping are left as they are. A frame without that column is returned
    untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to update. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    dict_cause = {
        'Lightning': 'Natural',
        'Structure': 'Infrastructure Accident',
        'Powerline': 'Infrastructure Accident',
        'Railroad': 'Infrastructure Accident',
        'Fireworks': 'Human Accident',
        'Smoking': 'Human Accident',
        'Children': 'Human Accident',
        'Campfire': 'Human Accident',
        'Equipment Use': 'Human Accident',
        'Debris Burning': 'Human Accident',
        'Arson': 'Arson',
        'Missing/Undefined': 'Other',
        'Miscellaneous': 'Other'
    }

    # replace values in cause column if present
    if 'STAT_CAUSE_DESCR' in df.columns:
        # Assign the result back to the column instead of calling
        # replace(..., inplace=True) on the selected Series: the chained
        # in-place form acts on a temporary object under pandas
        # Copy-on-Write and would leave ``df`` unchanged.
        df['STAT_CAUSE_DESCR'] = df['STAT_CAUSE_DESCR'].replace(dict_cause)

    return df

In [3]:
def fnc_get_data():
    """Load the fire records from the SQLite file and prepare them for modelling.

    Reads a fixed set of columns from the ``Fires`` table, maps the cause
    column to broader categories with ``convert_causes``, adds ``MONTH`` and
    ``DAY_OF_WEEK`` columns derived from the discovery date, and drops every
    row that has a missing value.

    Returns
    -------
    pandas.DataFrame
        The prepared records.

    Raises
    ------
    sqlite3.Error
        If the database cannot be opened.
    """
    sql_query = \
    """
    SELECT FOD_ID, FIRE_NAME, FIRE_SIZE, FIRE_SIZE_CLASS, LATITUDE, LONGITUDE, STATE, STAT_CAUSE_DESCR, date(DISCOVERY_DATE) AS DATE, FIRE_YEAR FROM Fires;
    """

    # Open the connection *before* the try block: if connect() itself fails
    # there is nothing to close, and the original error must not be masked by
    # an unbound ``conn`` in the finally clause.
    conn = sqlite3.connect(r'../FPA_FOD_20170508.sqlite')
    try:
        data = pd.read_sql(sql_query, conn)
    finally:
        conn.close()

    # convert causes
    data = convert_causes(data)

    # add columns
    data['DATE'] = pd.to_datetime(data['DATE'])
    data['MONTH'] = data['DATE'].dt.month
    data['DAY_OF_WEEK'] = data['DATE'].dt.dayofweek # Monday=0, Sunday=6

    # drop missing rows
    # NOTE(review): dropna() without ``subset`` discards a row when ANY selected
    # column is missing, including columns that are not used as features
    # (e.g. FIRE_NAME) -- confirm that this is intended.
    data = data.dropna()

    return data

In [4]:
# BASE MACHINE LEARNING MODEL
class MLBase:
    """Shared plumbing for the wildfire classifiers.

    Provides data loading, fitting (with or without a hold-out evaluation),
    persistence via joblib, and single-sample prediction. ``estimator`` and
    ``file_name`` start as ``None``; the subclasses in this file assign them.
    """

    def __init__(self):
        self.estimator = None   # model object exposing fit() / predict()
        self.file_name = None   # file name used by save() and load()
        self.feature_cols = ['LATITUDE', 'LONGITUDE', 'FIRE_YEAR', 'MONTH', 'DAY_OF_WEEK']
        self.data = None        # filled in by get_data()

    def get_data(self):
        """Load the fire records into ``self.data``.

        Reads a fixed set of columns from the ``Fires`` table, maps the cause
        column with ``convert_causes``, adds ``MONTH`` and ``DAY_OF_WEEK``
        derived from the discovery date, and drops rows with missing values.

        Raises
        ------
        sqlite3.Error
            If the database cannot be opened.
        """
        sql_query = \
        """
        SELECT FOD_ID, FIRE_NAME, FIRE_SIZE, FIRE_SIZE_CLASS, LATITUDE, LONGITUDE, STATE, STAT_CAUSE_DESCR, date(DISCOVERY_DATE) AS DATE, FIRE_YEAR FROM Fires;
        """

        # Connect before entering the try block: a failed connect() leaves
        # nothing to close and must surface as itself rather than as an
        # unbound ``conn`` error raised from the finally clause.
        conn = sqlite3.connect(r'../FPA_FOD_20170508.sqlite')
        try:
            data = pd.read_sql(sql_query, conn)
        finally:
            conn.close()

        # convert causes
        data = convert_causes(data)

        # add columns
        data['DATE'] = pd.to_datetime(data['DATE'])
        data['MONTH'] = data['DATE'].dt.month
        data['DAY_OF_WEEK'] = data['DATE'].dt.dayofweek # Monday=0, Sunday=6

        # drop missing rows (any column, not only the feature columns)
        self.data = data.dropna()

    def fit_model(self, X, y):
        """Fit the estimator on a 70% split and record accuracy on the other 30%.

        The accuracy is stored on the estimator object as ``accuracy_``.
        """
        # split dataset into training set and test set
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

        # fit whichever estimator the subclass configured
        self.estimator.fit(X_train, y_train)

        # predict response for test dataset
        y_pred = self.estimator.predict(X_test)

        # store accuracy
        self.estimator.accuracy_ = metrics.accuracy_score(y_test, y_pred)

    def fit_save(self, X, y):
        """Fit the estimator on all of ``X``/``y`` and write it to disk via ``save``."""
        self.estimator.fit(X, y)
        self.save()

    def save(self):
        """Write the estimator to ``./<file_name>`` with maximum joblib compression.

        Prints an error message instead of saving when no estimator is set.
        """
        # save model to lib folder; to be run within lib directory
        print('Compressing and saving model...\n')

        if self.estimator is not None:
            dump(self.estimator, './' + self.file_name, compress = 9)
            print('{} has been saved.\n'.format(self.file_name))
        else:
            print('Error: No estimator created yet.')

    def load(self):
        """Replace ``self.estimator`` with the model stored at ``./lib/<file_name>``."""
        # load model from lib folder; to be run within app.py
        print('Loading {}...\n'.format(self.file_name))
        self.estimator = load('./lib/' + self.file_name)    # access from app.py
        print('{} has been loaded.\n'.format(self.file_name))

    def predict(self, lat, long, year, month, day_of_week):
        """Predict the label for one sample.

        The arguments are passed to the estimator in the order LATITUDE,
        LONGITUDE, FIRE_YEAR, MONTH, DAY_OF_WEEK.

        Returns
        -------
        The predicted label, or ``None`` (after printing an error message)
        when no estimator is set.
        """
        if self.estimator is None:
            print('Error: No estimator created yet.')
            return None

        prediction = self.estimator.predict([[lat, long, year, month, day_of_week]])
        return prediction[0]


# RANDOM FOREST TO PREDICT WILDFIRE CAUSE
class MLRandomForestCause(MLBase):
    """Random forest with 25 trees that predicts the wildfire cause."""

    def __init__(self):
        super().__init__()
        # The stored file is named after this class.
        self.file_name = __class__.__name__ + '.joblib.z'
        self.estimator = RandomForestClassifier(n_estimators=25)

    def train(self, save=False):
        """Load the data and fit the forest; to be run within the lib directory.

        With ``save=True`` the model is fitted and written to disk through
        ``fit_save``. Otherwise it is fitted through ``fit_model`` and the
        resulting accuracy is printed.
        """
        print('Training random forest...\n')

        # load and clean data, then pick out features and label
        self.get_data()
        features = self.data[self.feature_cols]
        target = self.data['STAT_CAUSE_DESCR']

        if save:
            self.fit_save(features, target)
            return

        self.fit_model(features, target)
        print('Random forest trained with accuracy of {}.\n'.format(self.estimator.accuracy_))


# RANDOM FOREST TO PREDICT WILDFIRE SIZE CLASS
class MLRandomForestSizeClass(MLBase):
    """Random forest with 25 trees that predicts the wildfire size class."""

    def __init__(self):
        super().__init__()
        # The stored file is named after this class.
        self.file_name = __class__.__name__ + '.joblib.z'
        self.estimator = RandomForestClassifier(n_estimators=25)

    def train(self, save=False):
        """Load the data and fit the forest; to be run within the lib directory.

        With ``save=True`` the model is fitted and written to disk through
        ``fit_save``. Otherwise it is fitted through ``fit_model`` and the
        resulting accuracy is printed.
        """
        print('Training random forest...\n')

        # load and clean data, then pick out features and label
        self.get_data()
        features = self.data[self.feature_cols]
        target = self.data['FIRE_SIZE_CLASS']

        if save:
            self.fit_save(features, target)
            return

        self.fit_model(features, target)
        print('Random forest trained with accuracy of {}.\n'.format(self.estimator.accuracy_))


# KNN TO PREDICT WILDFIRE CAUSE
class MLKnnCause(MLBase):
    """k-nearest-neighbours classifier (100 neighbours) that predicts the wildfire cause."""

    def __init__(self):
        super().__init__()
        # The stored file is named after this class.
        self.file_name = __class__.__name__ + '.joblib.z'
        self.estimator = KNeighborsClassifier(n_neighbors=100)

    def train(self, save=False):
        """Load the data and fit the KNN model; to be run within the lib directory.

        With ``save=True`` the model is fitted and written to disk through
        ``fit_save``. Otherwise it is fitted through ``fit_model`` and the
        resulting accuracy is printed.
        """
        print('Training KNN...\n')

        # load and clean data, then pick out features and label
        self.get_data()
        features = self.data[self.feature_cols]
        target = self.data['STAT_CAUSE_DESCR']

        if save:
            self.fit_save(features, target)
            return

        self.fit_model(features, target)
        print('KNN trained with accuracy of {}.\n'.format(self.estimator.accuracy_))


# KNN TO PREDICT WILDFIRE SIZE CLASS
class MLKnnSizeClass(MLBase):
    """k-nearest-neighbours classifier (100 neighbours) that predicts the wildfire size class."""

    def __init__(self):
        super().__init__()
        # The stored file is named after this class.
        self.file_name = __class__.__name__ + '.joblib.z'
        self.estimator = KNeighborsClassifier(n_neighbors=100)

    def train(self, save=False):
        """Load the data and fit the KNN model; to be run within the lib directory.

        With ``save=True`` the model is fitted and written to disk through
        ``fit_save``. Otherwise it is fitted through ``fit_model`` and the
        resulting accuracy is printed.
        """
        print('Training KNN...\n')

        # load and clean data, then pick out features and label
        self.get_data()
        features = self.data[self.feature_cols]
        target = self.data['FIRE_SIZE_CLASS']

        if save:
            self.fit_save(features, target)
            return

        self.fit_model(features, target)
        print('KNN trained with accuracy of {}.\n'.format(self.estimator.accuracy_))


# AdaBoost TO PREDICT WILDFIRE CAUSE
class MLAdaBoostCause(MLBase):
    """AdaBoost classifier with 60 estimators that predicts the wildfire cause."""

    def __init__(self):
        super().__init__()
        # The stored file is named after this class.
        self.file_name = __class__.__name__ + '.joblib.z'
        self.estimator = AdaBoostClassifier(n_estimators=60)

    def train(self, save=False):
        """Load the data and fit AdaBoost; to be run within the lib directory.

        With ``save=True`` the model is fitted and written to disk through
        ``fit_save``. Otherwise it is fitted through ``fit_model`` and the
        resulting accuracy is printed.
        """
        print('Training AdaBoost...\n')

        # load and clean data, then pick out features and label
        self.get_data()
        features = self.data[self.feature_cols]
        target = self.data['STAT_CAUSE_DESCR']

        if save:
            self.fit_save(features, target)
            return

        self.fit_model(features, target)
        print('AdaBoost trained with accuracy of {}.\n'.format(self.estimator.accuracy_))


# AdaBoost TO PREDICT WILDFIRE SIZE CLASS
class MLAdaBoostSizeClass(MLBase):
    """AdaBoost classifier with 60 estimators that predicts the wildfire size class."""

    def __init__(self):
        super().__init__()
        # The stored file is named after this class.
        self.file_name = __class__.__name__ + '.joblib.z'
        self.estimator = AdaBoostClassifier(n_estimators=60)

    def train(self, save=False):
        """Load the data and fit AdaBoost; to be run within the lib directory.

        With ``save=True`` the model is fitted and written to disk through
        ``fit_save``. Otherwise it is fitted through ``fit_model`` and the
        resulting accuracy is printed.
        """
        print('Training AdaBoost...\n')

        # load and clean data, then pick out features and label
        self.get_data()
        features = self.data[self.feature_cols]
        target = self.data['FIRE_SIZE_CLASS']

        if save:
            self.fit_save(features, target)
            return

        self.fit_model(features, target)
        print('AdaBoost trained with accuracy of {}.\n'.format(self.estimator.accuracy_))


# Gradient Boosting TO PREDICT WILDFIRE CAUSE
class MLGradientBoostingCause(MLBase):
    """Gradient boosting (depth 8, 250 estimators) that predicts the wildfire cause."""

    def __init__(self):
        super().__init__()
        # The stored file is named after this class.
        self.file_name = __class__.__name__ + '.joblib.z'
        self.estimator = GradientBoostingClassifier(max_depth=8, n_estimators=250)

    def train(self, save=False):
        """Load the data and fit gradient boosting; to be run within the lib directory.

        With ``save=True`` the model is fitted and written to disk through
        ``fit_save``. Otherwise it is fitted through ``fit_model`` and the
        resulting accuracy is printed.
        """
        print('Training Gradient Boosting...\n')

        # load and clean data, then pick out features and label
        self.get_data()
        features = self.data[self.feature_cols]
        target = self.data['STAT_CAUSE_DESCR']

        if save:
            self.fit_save(features, target)
            return

        self.fit_model(features, target)
        print('Gradient Boosting trained with accuracy of {}.\n'.format(self.estimator.accuracy_))


# Gradient Boosting TO PREDICT WILDFIRE SIZE CLASS
class MLGradientBoostingClass(MLBase):
    """Gradient boosting (depth 8, 250 estimators) that predicts the wildfire size class."""

    def __init__(self):
        super().__init__()
        # The stored file is named after this class.
        self.file_name = __class__.__name__ + '.joblib.z'
        self.estimator = GradientBoostingClassifier(max_depth=8, n_estimators=250)

    def train(self, save=False):
        """Load the data and fit gradient boosting; to be run within the lib directory.

        With ``save=True`` the model is fitted and written to disk through
        ``fit_save``. Otherwise it is fitted through ``fit_model`` and the
        resulting accuracy is printed.
        """
        print('Training Gradient Boosting...\n')

        # load and clean data, then pick out features and label
        self.get_data()
        features = self.data[self.feature_cols]
        target = self.data['FIRE_SIZE_CLASS']

        if save:
            self.fit_save(features, target)
            return

        self.fit_model(features, target)
        print('Gradient Boosting trained with accuracy of {}.\n'.format(self.estimator.accuracy_))
          

# SVM TO PREDICT WILDFIRE CAUSE
class MLSVMCause(MLBase):
    """RBF-kernel support vector classifier (C=1.0) that predicts the wildfire cause."""

    def __init__(self):
        super().__init__()
        # The stored file is named after this class.
        self.file_name = __class__.__name__ + '.joblib.z'
        # NOTE(review): the SVM experiment cells in this notebook standardise the
        # features and pass gamma='scale'; this estimator is fitted on the raw
        # feature columns with the default gamma -- confirm this is intended.
        self.estimator = SVC(kernel='rbf', C=1.0)

    def train(self, save=False):
        """Load the data and fit the SVM; to be run within the lib directory.

        With ``save=True`` the model is fitted and written to disk through
        ``fit_save``. Otherwise it is fitted through ``fit_model`` and the
        resulting accuracy is printed.
        """
        print('Training SVM...\n')

        # load and clean data, then pick out features and label
        self.get_data()
        features = self.data[self.feature_cols]
        target = self.data['STAT_CAUSE_DESCR']

        if save:
            self.fit_save(features, target)
            return

        self.fit_model(features, target)
        print('SVM trained with accuracy of {}.\n'.format(self.estimator.accuracy_))


# SVM TO PREDICT WILDFIRE SIZE CLASS
class MLSVMSizeClass(MLBase):
    """RBF-kernel support vector classifier (C=1.0) that predicts the wildfire size class."""

    def __init__(self):
        super().__init__()
        # The stored file is named after this class.
        self.file_name = __class__.__name__ + '.joblib.z'
        # NOTE(review): the SVM experiment cells in this notebook standardise the
        # features and pass gamma='scale'; this estimator is fitted on the raw
        # feature columns with the default gamma -- confirm this is intended.
        self.estimator = SVC(kernel='rbf', C=1.0)

    def train(self, save=False):
        """Load the data and fit the SVM; to be run within the lib directory.

        With ``save=True`` the model is fitted and written to disk through
        ``fit_save``. Otherwise it is fitted through ``fit_model`` and the
        resulting accuracy is printed.
        """
        print('Training SVM...\n')

        # load and clean data, then pick out features and label
        self.get_data()
        features = self.data[self.feature_cols]
        target = self.data['FIRE_SIZE_CLASS']

        if save:
            self.fit_save(features, target)
            return

        self.fit_model(features, target)
        print('SVM trained with accuracy of {}.\n'.format(self.estimator.accuracy_))

In [5]:
## reference from https://scikit-learn.org/stable/auto_examples/model_selection/plot_learning_curve.html
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=-1, train_sizes=(0.001, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0), save=True):
    """Plot training and cross-validation score against training-set size.

    Scores are computed with ``learning_curve`` using weighted F1. The mean
    score per training size is drawn with a shaded band of one standard
    deviation, and one "best" point is marked and annotated.

    Parameters
    ----------
    estimator : estimator passed on to ``learning_curve``.
    title : str
        Figure title; also used as the stem of the saved file name.
    X, y : features and labels passed on to ``learning_curve``.
    ylim : pair of floats, optional
        Lower and upper limit of the y axis.
    cv, n_jobs, train_sizes : passed on to ``learning_curve``.
    save : bool
        When true the figure is written under ``../figures/``. The figure is
        closed afterwards either way, so nothing is displayed.

    Returns
    -------
    None
    """
    # Run the cross-validation before creating the figure, so that a failure
    # here does not leave an empty figure open.
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes, scoring='f1_weighted')
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)

    fig, ax = plt.subplots()
    try:
        ax.set_title(title)
        if ylim is not None:
            ax.set_ylim(*ylim)
        ax.set_xlabel("Training examples")
        ax.set_ylabel("Score")
        ax.grid()

        ax.fill_between(train_sizes, train_scores_mean - train_scores_std,
                        train_scores_mean + train_scores_std, alpha=0.1,
                        color="r")
        ax.fill_between(train_sizes, test_scores_mean - test_scores_std,
                        test_scores_mean + test_scores_std, alpha=0.1, color="g")
        ax.plot(train_sizes, train_scores_mean, 'o-', color="r",
                label="Training score")
        ax.plot(train_sizes, test_scores_mean, 'o-', color="g",
                label="Cross-validation score")

        # Leveraged from aschang3 ML project
        # "Best" = highest cross-validation score after subtracting the gap
        # between training and cross-validation score; the first such index
        # wins on ties.
        penalised = test_scores_mean - (train_scores_mean - test_scores_mean)
        best_index = 0
        for i in range(1, len(penalised)):
            if penalised[i] > penalised[best_index]:
                best_index = i

        best_score = test_scores_mean[best_index]

        # Plot a dotted vertical line at the best score for that scorer marked by x
        ax.plot([train_sizes[best_index], ] * 2, [0, best_score],
                linestyle='-.', color='k', marker='x', markeredgewidth=4, ms=8)

        # Annotate the best score for that scorer
        ax.annotate("%0.3f" % best_score, (train_sizes[best_index] - 0.1, best_score + 0.005))

        ax.legend(loc="best")

        fig.set_size_inches(10.5, 10.5)
        if save:
            save_as = '../figures/' + title + "_learning_curve_" + datetime.datetime.today().strftime('%d%H%M%S') + '.png'
            fig.savefig(save_as, dpi=100)
        # End leveraged from aschang3 ML project
    finally:
        # Always release the figure, also when plotting or saving fails.
        plt.close(fig)

    return
## reference from: https://scikit-learn.org/stable/auto_examples/model_selection/plot_learning_curve.html

## reference from: https://scikit-learn.org/stable/auto_examples/model_selection/plot_validation_curve.html#sphx-glr-auto-examples-model-selection-plot-validation-curve-py
def plot_model_complexity(estimator, title, X, y, ylim=None, cv=None, param_name=None, param_range=(), save=True):
    """Plot training and cross-validation score against one hyper-parameter.

    Scores are computed with ``validation_curve`` using weighted F1. The mean
    score per parameter value is drawn with a shaded band of one standard
    deviation, and one "best" point is marked and annotated.

    Parameters
    ----------
    estimator : estimator passed on to ``validation_curve``.
    title : str
        Figure title; also used as the stem of the saved file name.
    X, y : features and labels passed on to ``validation_curve``.
    ylim : pair of floats, optional
        Lower and upper limit of the y axis.
    cv : passed on to ``validation_curve``.
    param_name : str
        Name of the estimator parameter to vary.
    param_range : sequence
        Values of ``param_name`` to evaluate; must not be empty.
    save : bool
        When true the figure is written under ``../figures/``. The figure is
        closed afterwards either way, so nothing is displayed.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If ``param_name`` is missing or ``param_range`` is empty.
    """
    if param_name is None or len(param_range) == 0:
        raise ValueError("param_name and a non-empty param_range are required")

    train_scores, test_scores = validation_curve(estimator, X, y, param_name=param_name, param_range=param_range, cv=cv, scoring='f1_weighted', n_jobs=-1)

    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)

    # Draw on a figure created here rather than on whatever figure happens to
    # be current in pyplot.
    fig, ax = plt.subplots()
    try:
        ax.set_title(title)
        ax.set_xlabel(param_name)
        ax.set_ylabel("Score")
        if ylim is not None:
            ax.set_ylim(*ylim)
        lw = 2
        ax.plot(param_range, train_scores_mean, label="Training score",
                color="darkorange", lw=lw)
        ax.fill_between(param_range, train_scores_mean - train_scores_std,
                        train_scores_mean + train_scores_std, alpha=0.2,
                        color="darkorange", lw=lw)
        ax.plot(param_range, test_scores_mean, label="Cross-validation score",
                color="navy", lw=lw)
        ax.fill_between(param_range, test_scores_mean - test_scores_std,
                        test_scores_mean + test_scores_std, alpha=0.2,
                        color="navy", lw=lw)

        # Leveraged from aschang3 ML project
        # "Best" = highest cross-validation score after subtracting the gap
        # between training and cross-validation score; the first such index
        # wins on ties.
        penalised = test_scores_mean - (train_scores_mean - test_scores_mean)
        best_index = 0
        for i in range(1, len(penalised)):
            if penalised[i] > penalised[best_index]:
                best_index = i

        best_score = test_scores_mean[best_index]

        # Plot a dotted vertical line at the best score for that scorer marked by x
        ax.plot([param_range[best_index], ] * 2, [0, best_score],
                linestyle='-.', color='k', marker='x', markeredgewidth=4, ms=8)

        # Annotate the best score for that scorer
        ax.annotate("%0.3f" % best_score, (param_range[best_index], best_score + 0.005))

        ax.legend(loc="best")

        fig.set_size_inches(10.5, 10.5)
        if save:
            save_as = '../figures/' + title + datetime.datetime.today().strftime('%d%H%M%S') + '.png'
            fig.savefig(save_as, dpi=100)
        # End leveraged from aschang3 ML project
    finally:
        # Always release the figure, also when plotting or saving fails.
        plt.close(fig)

    return
# reference from: https://scikit-learn.org/stable/auto_examples/model_selection/plot_validation_curve.html#sphx-glr-auto-examples-model-selection-plot-validation-curve-py

In [6]:
# Shared inputs for the experiment cells below: load the prepared fire records
# once and derive the feature matrix and the two classification targets.
df = fnc_get_data()
# Same feature list as MLBase.feature_cols.
feature_cols = ['LATITUDE', 'LONGITUDE', 'FIRE_YEAR', 'MONTH', 'DAY_OF_WEEK']
X = df[feature_cols]
y_cause = df['STAT_CAUSE_DESCR']   # target 1: cause category
y_size = df['FIRE_SIZE_CLASS']     # target 2: fire size class
# 10 random splits, each using 15% of the rows for training and 5% for testing;
# the fixed random_state keeps the splits repeatable.
cv = ShuffleSplit(n_splits=10, train_size=0.15, test_size=0.05, random_state=0)

In [7]:
# Disabled (`if False`) so that Run All skips this search; set to True to re-run.
# Random forest grid over n_estimators x max_depth, once per target
# (cause first, then size), scored with weighted F1 on the shared `cv` splits.
# The commented values at the bottom appear to be output recorded from an
# earlier run: best params, best score, parameter grid, mean test scores,
# mean fit times (same order as the print calls).
if False:
    # rf gridsearchcv
    for y in [y_cause, y_size]:
        parameters = {'n_estimators': [125,250], 'max_depth':[16,32]}
        rforest = RandomForestClassifier()
        clf = GridSearchCV(rforest, parameters, cv=cv, scoring='f1_weighted', verbose=10, n_jobs=4)
        clf.fit(X, y)
        print(clf.best_params_)
        print(clf.best_score_)
        print(clf.cv_results_['params'])
        print(clf.cv_results_['mean_test_score'])
        print(clf.cv_results_['mean_fit_time'])

    ## cause
    #{'max_depth': 32, 'n_estimators': 250}
    #0.6390040225861129
    #[{'max_depth': 16, 'n_estimators': 125}, {'max_depth': 16, 'n_estimators': 250}, {'max_depth': 32, 'n_estimators': 125}, {'max_depth': 32, 'n_estimators': 250}]
    #[0.63151787 0.63191176 0.63772845 0.63900402]
    #[31.2246248  62.86037166 37.67768614 71.36209855]

    ## size
    #{'max_depth': 16, 'n_estimators': 250}
    #0.5742286401718223
    #[{'max_depth': 16, 'n_estimators': 125}, {'max_depth': 16, 'n_estimators': 250}, {'max_depth': 32, 'n_estimators': 125}, {'max_depth': 32, 'n_estimators': 250}]
    #[0.57404182 0.57422864 0.57088753 0.57173373]
    #[39.71029224 75.89663596 52.66529212 85.22966387]

In [8]:
# Disabled (`if False`) so that Run All skips this search; set to True to re-run.
# Random forest grid over small n_estimators values (default depth), once per
# target (cause first, then size), scored with weighted F1 on the shared `cv`.
# The commented values at the bottom appear to be output recorded from an
# earlier run, in the same order as the print calls.
if False:
    # rf gridsearchcv
    for y in [y_cause, y_size]:
        parameters = {'n_estimators': [25,50]}
        rforest = RandomForestClassifier()
        clf = GridSearchCV(rforest, parameters, cv=cv, scoring='f1_weighted', verbose=10, n_jobs=4)
        clf.fit(X, y)
        print(clf.best_params_)
        print(clf.best_score_)
        print(clf.cv_results_['params'])
        print(clf.cv_results_['mean_test_score'])
        print(clf.cv_results_['mean_fit_time'])

    ## cause
    #{'n_estimators': 50}
    #0.6345049032411676
    #[{'n_estimators': 25}, {'n_estimators': 50}]
    #[0.63021669 0.6345049 ]
    #[ 8.67082019 17.25797555]

    # size
    #{'n_estimators': 50}
    #0.5680015043876222
    #[{'n_estimators': 25}, {'n_estimators': 50}]
    #[0.56341574 0.5680015 ]
    #[10.82792463 22.12725415]

In [9]:
# Disabled (`if False`) so that Run All skips this search; set to True to re-run.
# KNN grid over n_neighbors, once per target (cause first, then size), scored
# with weighted F1 on the shared `cv` splits.
# The commented values at the bottom appear to be output recorded from an
# earlier run, in the same order as the print calls.
if False:
    # knn gridsearchcv
    for y in [y_cause, y_size]:
        parameters = {'n_neighbors': [5,10,100]}
        knn = KNeighborsClassifier()
        clf = GridSearchCV(knn, parameters, cv=cv, scoring='f1_weighted', verbose=10, n_jobs=4)
        clf.fit(X, y)
        print(clf.best_params_)
        print(clf.best_score_)
        print(clf.cv_results_['params'])
        print(clf.cv_results_['mean_test_score'])
        print(clf.cv_results_['mean_fit_time'])

    ## cause
    #{'n_neighbors': 10}
    #0.5905766174878578
    #[{'n_neighbors': 5}, {'n_neighbors': 10}, {'n_neighbors': 100}]
    #[0.58365336 0.59057662 0.56308848]
    #[0.42246768 0.44235063 0.46920006]

    ## size
    #{'n_neighbors': 100}
    #0.5473554376900897
    #[{'n_neighbors': 5}, {'n_neighbors': 10}, {'n_neighbors': 100}]
    #[0.5309177  0.54406385 0.54735544]
    #[0.42958615 0.48844428 0.44010985]

In [10]:
# Disabled (`if False`) so that Run All skips this search; set to True to re-run.
# AdaBoost grid over n_estimators, once per target (cause first, then size),
# scored with weighted F1 on the shared `cv` splits.
# The commented values at the bottom appear to be output recorded from an
# earlier run, in the same order as the print calls.
if False:
    # adaptive boosting gridsearchcv
    for y in [y_cause, y_size]:
        parameters = {'n_estimators':[60,250,500,1000]}
        aboost = AdaBoostClassifier()
        clf = GridSearchCV(aboost, parameters, cv=cv, scoring='f1_weighted', verbose=10, n_jobs=4)
        clf.fit(X, y)
        print(clf.best_params_)
        print(clf.best_score_)
        print(clf.cv_results_['params'])
        print(clf.cv_results_['mean_test_score'])
        print(clf.cv_results_['mean_fit_time'])

    ## cause
    #{'n_estimators': 1000}
    #0.49302307229299175
    #[{'n_estimators': 60}, {'n_estimators': 250}, {'n_estimators': 500}, {'n_estimators': 1000}]
    #[0.44783861 0.47919785 0.4884043  0.49302307]
    #[ 18.51594372  74.46232986 152.80710304 305.32525001]

    ## size
    #{'n_estimators': 1000}
    #0.5237277189495368
    #[{'n_estimators': 60}, {'n_estimators': 250}, {'n_estimators': 500}, {'n_estimators': 1000}]
    #[0.5198465  0.52175404 0.52257134 0.52372772]
    #[ 13.92950158  60.43023548 132.85321066 245.61578233]

In [11]:
# Disabled (`if False`) so that Run All skips this search; set to True to re-run.
# Gradient boosting grid over max_depth, once per target (cause first, then
# size), scored with weighted F1 on the shared `cv` splits.
# The commented values appear to be output recorded from an earlier run, in the
# same order as the print calls; nothing is recorded under the "size" heading.
if False:
    # gradient boosting gridsearchcv
    for y in [y_cause, y_size]:
        parameters = {'max_depth':[8,12,16]}
        gboost = GradientBoostingClassifier()
        clf = GridSearchCV(gboost, parameters, cv=cv, scoring='f1_weighted', verbose=10, n_jobs=4)
        clf.fit(X, y)
        print(clf.best_params_)
        print(clf.best_score_)
        print(clf.cv_results_['params'])
        print(clf.cv_results_['mean_test_score'])
        print(clf.cv_results_['mean_fit_time'])
        
        ##cause
        #{'max_depth': 12}
        #0.641225636144493
        #[{'max_depth': 8}, {'max_depth': 12}, {'max_depth': 16}]
        #[0.62882442 0.64122564 0.63908153]
        #[ 317.25169902 1103.92032697 4077.71273634]
        
        ## size

In [12]:
# Disabled (`if False`) so that Run All skips this search; set to True to re-run.
# Histogram gradient boosting grid over max_depth x learning_rate, once per
# target (cause first, then size), scored with weighted F1 on the shared `cv`.
# No results are recorded for this search.
if False:
    # histogram gradient boosting gridsearchcv
    for y in [y_cause, y_size]:
        parameters = {'max_depth':[8,12,16], 'learning_rate':[0.05,0.1,0.5,1.0]}
        hgboost = HistGradientBoostingClassifier()
        clf = GridSearchCV(hgboost, parameters, cv=cv, scoring='f1_weighted', verbose=10, n_jobs=4)
        clf.fit(X, y)
        print(clf.best_params_)
        print(clf.best_score_)
        print(clf.cv_results_['params'])
        print(clf.cv_results_['mean_test_score'])
        print(clf.cv_results_['mean_fit_time'])

In [13]:
# Disabled (`if False`) so that Run All skips this search; set to True to re-run.
# RBF-kernel SVM grid over C, once per target (cause first, then size), scored
# with weighted F1 on the shared `cv` splits. Unlike the other searches this
# one runs on standardised features (X_scaled).
# No results are recorded for this search.
if False:
    # svm gridsearchcv
    # StandardScaler w/ Pandas DF Reference: https://stackoverflow.com/questions/35723472/how-to-use-sklearn-fit-transform-with-pandas-and-return-dataframe-instead-of-num
    # The scaler is fitted on all rows before the cross-validation splits are made.
    X_scaled = preprocessing.StandardScaler().fit_transform(X.values)
    for y in [y_cause, y_size]:
        parameters = {'C':[0.1,0.5,1.0,1.5,2.5,5.0]}
        svc = SVC(kernel='rbf', gamma="scale")
        clf = GridSearchCV(svc, parameters, cv=cv, scoring='f1_weighted', verbose=10, n_jobs=4)
        clf.fit(X_scaled, y)
        print(clf.best_params_)
        print(clf.best_score_)
        print(clf.cv_results_['params'])
        print(clf.cv_results_['mean_test_score'])
        print(clf.cv_results_['mean_fit_time'])

In [14]:
# Print, for each target in turn (cause, then size), the share of rows that
# falls into each class. np.unique returns the classes in sorted order, so the
# printed proportions follow that order.
for target in (y_cause, y_size):
    new_array, num_labels = np.unique(target.values, return_counts=True)
    label_distro = num_labels / len(target)
    print(label_distro)

[0.11503494 0.39921324 0.02245482 0.23564785 0.22764915]
[0.42816449 0.43247631 0.09598538 0.01989438 0.01191193 0.00758603
 0.00398147]


In [None]:
# Model-complexity (validation) curves. Only the SVM cause curve is active; the
# commented calls are the runs for the other models and can be re-enabled one
# at a time. Each call saves a figure under ../figures/ named after its title.
if True:
    # Same splitter settings as the shared `cv` defined earlier: 10 splits, 15% train / 5% test.
    cv = ShuffleSplit(n_splits=10, train_size=0.15, test_size=0.05, random_state=0)
    #rfc
    #plot_model_complexity(RandomForestClassifier(n_estimators=125), "Random Forest Cause Classifier Model Complexity Curve", X, y_cause, param_name='max_depth', param_range=[2,4,6,8,10,12,16], ylim=[0.3,0.8], cv=cv)
    #plot_model_complexity(RandomForestClassifier(n_estimators=125), "Random Forest Size Classifier Model Complexity Curve", X, y_size, param_name='max_depth', param_range=[2,4,6,8,10,12,16], ylim=[0.3,0.8], cv=cv)
    #knn
    #plot_model_complexity(KNeighborsClassifier(), "KNN Cause Classifier Model Complexity Curve", X, y_cause, param_name='n_neighbors', param_range=[1,3,5,7,10,15,25], ylim=[0.3,0.8], cv=cv)
    #plot_model_complexity(KNeighborsClassifier(), "KNN Size Classifier Model Complexity Curve", X, y_size, param_name='n_neighbors', param_range=[1,3,5,7,10,15,25], ylim=[0.3,0.8], cv=cv)
    #adaboost
    #plot_model_complexity(AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=7)), "AdaBoost Cause Classifier Model Complexity Curve Max Depth 7", X, y_cause, param_name='n_estimators', param_range=[3,5,10,18,25], ylim=[0.3,0.8], cv=cv)
    #plot_model_complexity(AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=7)), "AdaBoost Size Classifier Model Complexity Curve Max Depth 7", X, y_size, param_name='n_estimators', param_range=[3,5,10,18,25], ylim=[0.3,0.8], cv=cv)
    #hist gradient boosting
    #plot_model_complexity(HistGradientBoostingClassifier(), "Histogram Gradient Boosting Cause Classifier Model Complexity Curve", X, y_cause, param_name='max_depth', param_range=[2,4,8,10,12], ylim=[0.3,0.8], cv=cv)
    #plot_model_complexity(HistGradientBoostingClassifier(), "Histogram Gradient Boosting Size Classifier Model Complexity Curve", X, y_size, param_name='max_depth', param_range=[2,4,8,10,12], ylim=[0.3,0.8], cv=cv)
    #svm
    # The SVM runs use standardised features; the scaler is fitted on all rows.
    X_scaled = preprocessing.StandardScaler().fit_transform(X.values)
    # max_iter caps the solver iterations for each fit.
    plot_model_complexity(SVC(kernel='rbf', gamma='scale', max_iter=4000), "SVM Cause Classifier Model Complexity Curve", X_scaled, y_cause, param_name='C', param_range=[0.1,0.5,1.0,1.5,2.5,5.0], ylim=[0.3,0.8], cv=cv)
    #plot_model_complexity(SVC(kernel='rbf', gamma='scale', max_iter=1500), "SVM Size Classifier Model Complexity Curve", X_scaled, y_size, param_name='C', param_range=[0.1,0.5,1.0,1.5,2.5,5.0], ylim=[0.3,0.8], cv=cv)

In [16]:
# Learning curves. Only the two SVM curves are active; the commented calls are
# the runs for the other models and can be re-enabled one at a time. Each call
# saves a figure under ../figures/ named after its title.
if True:
    # Rebinds `cv` for the learning curves: 3 splits, 80% train / 20% test.
    cv = ShuffleSplit(n_splits=3, train_size=0.8, test_size=0.2, random_state=0)
    #rfc
    #plot_learning_curve(RandomForestClassifier(n_estimators=125, max_depth=10), "Random Forest Cause Classifier Learning Curve", X, y_cause, ylim=[0.3, 0.8], cv=cv)
    #plot_learning_curve(RandomForestClassifier(n_estimators=125, max_depth=10), "Random Forest Size Classifier Learning Curve", X, y_size, ylim=[0.3, 0.8], cv=cv)
    #knn
    #plot_learning_curve(KNeighborsClassifier(n_neighbors=10), "KNN Cause Classifier Learning Curve", X, y_cause, ylim=[0.3, 0.8], cv=cv)
    #plot_learning_curve(KNeighborsClassifier(n_neighbors=10), "KNN Size Classifier Learning Curve", X, y_size, ylim=[0.3, 0.8], cv=cv)
    #adaboost
    #plot_learning_curve(AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=7), n_estimators=10), "AdaBoost Cause Classifier Learning Curve Max Depth 7", X, y_cause, ylim=[0.3, 0.8], cv=cv)
    #plot_learning_curve(AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=7), n_estimators=10), "AdaBoost Size Classifier Learning Curve Max Depth 7", X, y_size, ylim=[0.3, 0.8], cv=cv)
    #histogram gradient boosting
    #plot_learning_curve(HistGradientBoostingClassifier(max_depth=8), "Histogram Gradient Boosting Cause Classifier Learning Curve", X, y_cause, ylim=[0.3, 0.8], cv=cv)
    #plot_learning_curve(HistGradientBoostingClassifier(max_depth=8), "Histogram Gradient Boosting Size Classifier Learning Curve", X, y_size, ylim=[0.3, 0.8], cv=cv)
    #svm
    # The SVM runs use standardised features; the scaler is fitted on all rows.
    X_scaled = preprocessing.StandardScaler().fit_transform(X.values)
    # max_iter caps the solver iterations for each fit.
    plot_learning_curve(SVC(kernel='rbf', gamma='scale', C=1.0, max_iter=1500), "SVM Cause Classifier Learning Curve", X_scaled, y_cause, ylim=[0.3, 0.8], cv=cv)
    plot_learning_curve(SVC(kernel='rbf', gamma='scale', C=1.0, max_iter=1500), "SVM Size Classifier Learning Curve", X_scaled, y_size, ylim=[0.3, 0.8], cv=cv)