In [2]:
# General imports
import pandas as pd, numpy as np, re, random, warnings, traceback
warnings.filterwarnings('ignore')

# Class imbalance rectification
from sklearn.svm import SVC
# from sklearn.decomposition import PCA, TruncatedSVD # imported later
from sklearn.preprocessing import StandardScaler, Normalizer, RobustScaler, MinMaxScaler
from imblearn.under_sampling import RandomUnderSampler, TomekLinks, RepeatedEditedNearestNeighbours
from imblearn.over_sampling import ADASYN, SVMSMOTE

# Optimization
import optuna

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# Dimensionality Reduction and remapping
from umap import UMAP
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.manifold import TSNE

# Model Eval
from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score, confusion_matrix

# Scalers
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer

In [3]:
# Read in the data
df = pd.read_csv('raw_data.csv')

In [4]:
# Fill in some null values
df['Priority'].fillna('No Priority Listed', inplace=True)
df["General_Contractor"].fillna('No General Contractor Listed', inplace=True)
df['Remarks'].fillna('', inplace=True)
df['EST'].fillna('No Estimator Listed', inplace=True)
df['Addendum'].fillna('No Addendum', inplace=True)
# Change to boolean
df['Addendum'] = [1 if x != 'No Addendum' else 0 for x in df['Addendum']]

# Strip whitespace, fill in null values
df['Results'] = [str(x).strip() for x in df['Results']]
df['Results'].fillna('No Result Listed', inplace=True)

In [5]:
# Function to clean the "Description" field of a ticket
# Keep only alphanumeric values, strip extra space, keep it lowerspace
def process_descriptions(txt):
    return re.sub("[^a-zA-Z] +", "", txt).lower().strip()   

def get_traceback(e):
    lines = traceback.format_exception(type(e), e, e.__traceback__)
    return ''.join(lines)

# Modeling

In [6]:
# Columns to use for modeling -- NOT stakeholder given
model_cols = ['General_Contractor', 'Remarks', 'Addendum', 'EST', 'Results']
# Split the dataframe
res_df = df[model_cols]
res_df = res_df.loc[res_df['Results'] != 'nan']

classes = [1 if x == 'Awarded' else 0 for x in res_df['Results']]

In [7]:
# Create term counts and one hot encoding objects
vec = CountVectorizer(decode_error='ignore', strip_accents='unicode', lowercase=True, stop_words='english')
ohe = OneHotEncoder(drop='first', handle_unknown='infrequent_if_exist', sparse=False)

In [8]:
# Transform the data
cat_data = pd.DataFrame(data=ohe.fit_transform(res_df[[x for x in model_cols if x not in ['Results', 'Remarks']]]), columns=list(ohe.get_feature_names_out()))
text_data = pd.DataFrame(data=vec.fit_transform(res_df['Remarks'].apply(process_descriptions)).todense(), columns=list(vec.get_feature_names_out()))
modeling_df = pd.concat([cat_data, text_data], axis=1)
modeling_df

Unnamed: 0,General_Contractor_Argo Systems,"General_Contractor_CBP Constructors, LLC",General_Contractor_CYMA Builders of CM,General_Contractor_Consolidated Coatings,General_Contractor_Consolidated Medical Services,General_Contractor_Emaryland,General_Contractor_G & I Drywall,General_Contractor_Herb Schafer Asphalt Pavement,"General_Contractor_Jeffrey Brown Contracting, LLC","General_Contractor_KasCon, Inc",...,enviro,file,job,jobs,onlybid,phasingjob,sent,shifts,submitted,wk
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0,0,1,0,0,0,0,0,0,0
72,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,1,0,0,0,0,0,0,0
73,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
74,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(modeling_df, classes, stratify=classes, test_size=0.2, random_state=8)

In [10]:
# Optimization function using optuna
def optimize_snow(trial, x_train, x_test, y_train, y_test):

    n_row = x_train.shape[0]
    n_col = x_train.shape[1]

    # General Settings

    # Class imbalance
    undersample = trial.suggest_categorical("undersample", ["Yes", "No"])
    oversample = trial.suggest_categorical("oversample", ["Yes", "No"])
    
    # Data Transformations / Dimensionality Reductions
    data_type = trial.suggest_categorical("data_type", ["PCA", "UMAP", "RAW"])    
    
    # Preprocessing
    scaling = trial.suggest_categorical("scaling", ["None", "Standard", "Robust"])
    normalizing = trial.suggest_categorical("normalizing", ["Yes", "No"])
    minmax = trial.suggest_categorical("minmax", ["Yes", "No"])
    
    algorithm_type = trial.suggest_categorical("algorithm_type", ['classification', 'anomaly_detection'])
    
    # Important to use try/except because a lot of these params/combos will not work well together. The RL technique needs to learn this.
    try:

        # Dimensionality / Data Representation, tune those as well
        if data_type == 'PCA':
            
            # Tune PCA
            pca_n_components = trial.suggest_int("pca_n_components", 1, n_col-1)
            pca_whiten = trial.suggest_categorical("pca_whiten", [True, False])
            pca_svd_solver = trial.suggest_categorical("pca_svd_solver", ["auto", "full", "arpack", "randomized"])

            transformer = PCA(n_components=pca_n_components, whiten=pca_whiten, svd_solver=pca_svd_solver, random_state=8)
            mm = MinMaxScaler(feature_range=(0,1))
            x_train = transformer.fit_transform(mm.fit_transform(x_train))
            x_test = transformer.transform(mm.transform(x_test))

        elif data_type == 'UMAP':
            
            # Tune UMAP
            umap_n_neighbors = trial.suggest_int("umap_n_neighbors", 2, 75)
            umap_min_dist = trial.suggest_float("umap_min_dist", 0.0, 1.0)
            umap_n_components = trial.suggest_int("umap_n_components", 1, n_col - 1)
            umap_metric = trial.suggest_categorical("umap_metric", ['minkowski', 'cosine', 'correlation', 'euclidean', 'p', 'manhattan', 
                                                                    'infinity', 'nan_euclidean', 'dice', 'kulsinski', 
                                                                    'matching', 'jaccard', 'seuclidean', 'wminkowski', 'mahalanobis', 
                                                                    'rogerstanimoto', 'l1', 'sqeuclidean', 'sokalsneath', 'sokalmichener', 
                                                                    'l2', 'yule', 'canberra', 'cityblock', 'haversine', 'russellrao', 
                                                                    'hamming', 'braycurtis', 'chebyshev'])

            transformer = UMAP(n_neighbors=umap_n_neighbors, min_dist=umap_min_dist, n_components=umap_n_components, metric=umap_metric, n_jobs=2)
            mm = MinMaxScaler(feature_range=(0,1))
            x_train = transformer.fit_transform(mm.fit_transform(x_train))
            x_test = transformer.transform(mm.transform(x_test))        

        # elif data_type == 'TSNE':
            
        #     # Tune TSNE
        #     data_n_components = trial.suggest_int("data_n_components", 1, x_train.shape[1] - 1)
        #     early_exaggeration = trial.suggest_float("early_exaggeration", 1.0, 25.0)
        #     perplexity = trial.suggest_float("perplexity", 5.0, 50.0)
        #     data_n_iter = trial.suggest_int("data_n_iter", 250, 5000)
        #     data_n_iter_without_progress = trial.suggest_int("data_n_iter_without_progress", 300, 5000)
        #     min_grad_norm = trial.suggest_float("min_grad_norm", .0000001, 0.999999)
        #     data_init = trial.suggest_categorical("data_init", ['random', 'pca'])

        #     transformer = TSNE(n_components=data_n_components, n_iter=data_n_iter, early_exaggeration=early_exaggeration,
        #                 perplexity=perplexity, n_iter_without_progress=data_n_iter_without_progress, n_jobs=2,
        #                 min_grad_norm=min_grad_norm, init=data_init)
        #     mm = MinMaxScaler(feature_range=(0,1))
        #     x_train = transformer.fit_transform(mm.fit_transform(x_train))
        #     x_test = transformer.transform(mm.transform(x_test))  

        else:
            x_train = x_train.to_numpy()
            x_test = x_test.to_numpy()

        # Scale
        if scaling == 'Standard':
            ss = StandardScaler()
            x_train = ss.fit_transform(x_train)
            x_test = ss.transform(x_test)
        elif scaling == 'Robust':
            rb = RobustScaler()
            x_train = rb.fit_transform(x_train)
            x_test = rb.transform(x_test)
        else:
            pass
        
        # Normalize
        if normalizing == 'Yes':
            nm = Normalizer()
            x_train = nm.fit_transform(x_train)
            x_test = nm.transform(x_test)
        else:
            pass

        # Minmax Scaling
        if minmax == 'Yes':
            mm = MinMaxScaler(feature_range=(-1,1))
            x_train = mm.fit_transform(x_train)
            x_test = mm.transform(x_test)
        else:
            pass

        # Note -- we will need to add support for multiclass eventually
        
        # Undersample the majority class
        if undersample == 'Yes':
            usampler = trial.suggest_categorical("usampler", ["Random", "Tomek", "Repeated"])

            if usampler == 'Random':
                random_replacement = trial.suggest_categorical("random_replacement", [True, False])

                under_sampler = RandomUnderSampler(random_state=8, sampling_strategy='majority', replacement=random_replacement)
                x_train, y_train = under_sampler.fit_resample(x_train, y_train)

            elif usampler == 'Tomek':
                under_sampler = TomekLinks(sampling_strategy='majority', n_jobs=4)
                x_train, y_train = under_sampler.fit_resample(x_train, y_train)
            elif usampler == 'Repeated':
                repeated_n_neighbors = trial.suggest_int("repeated_n_neighbors", 2, 100)
                repeated_max_iter = trial.suggest_int("repeated_max_iter", 2, 1000)
                repeated_kind_sel = trial.suggest_categorical("repeated_kind_sel", ['all', 'mode'])
                
                under_sampler = RepeatedEditedNearestNeighbours(sampling_strategy='majority', n_jobs=4, n_neighbors=repeated_n_neighbors, 
                                                                max_iter=repeated_max_iter, kind_sel=repeated_kind_sel)
                x_train, y_train = under_sampler.fit_resample(x_train, y_train)
            else:
                pass
        else:
            pass
        
        # Oversample the minority class
        if oversample == 'Yes':
            osampler = trial.suggest_categorical("osampler", ["ADASYN", "SVMSMOTE"])

            if osampler == 'ADASYN':
                oversample_n_neighbors = trial.suggest_int("oversample_n_neighbors", 2, 100)
                
                over_sampler = ADASYN(sampling_strategy='minority', random_state=8, n_jobs=4, n_neighbors=oversample_n_neighbors)
                x_train, y_train = over_sampler.fit_resample(x_train, y_train)
            elif osampler == 'SVMSMOTE':
                svmsmote_k_neighbors = trial.suggest_int("svmsmote_k_neighbors", 2, 100)
                svmsmote_m_neighbors = trial.suggest_int("svmsmote_m_neighbors", 2, 100)
                svmsmote_out_step = trial.suggest_float("svmsmote_out_step", 0.01, 0.99)

                svmsmote_svm_c = trial.suggest_float("svmsmote_svm_c", 0.01, 1.0)
                svmsmote_svm_kernel = trial.suggest_categorical("svmsmote_svm_kernel", ['linear', 'poly', 'rbf', 'sigmoid'])
                svmsmote_svm_gamma = trial.suggest_categorical("svmsmote_svm_gamma", ['auto', 'scale'])
                svmsmote_svm_shrinking = trial.suggest_categorical("svmsmote_svm_shrinking", [True, False])
                svmsmote_svm_max_iter = trial.suggest_int("svmsmote_svm_max_iter", 2, 1000)
                svmsmote_svm_decision_function = trial.suggest_categorical("svmsmote_svm_decision_function", ['ovo', 'ovr'])
                svm_obj = SVC(C=svmsmote_svm_c, kernel=svmsmote_svm_kernel, gamma=svmsmote_svm_gamma, shrinking=svmsmote_svm_shrinking,
                            max_iter=svmsmote_svm_max_iter, decision_function_shape=svmsmote_svm_decision_function, class_weight='balanced', cache_size=750, random_state=8)

                over_sampler = SVMSMOTE(k_neighbors=svmsmote_k_neighbors, m_neighbors=svmsmote_m_neighbors, sampling_strategy='minority', 
                                        out_step=svmsmote_out_step, random_state=8, n_jobs=4, svm_estimator=svm_obj)
                x_train, y_train = over_sampler.fit_resample(x_train, y_train)
            else:
                pass
        else:
            pass
        
        # run through some classification algorithms
        # obviously, we should expand the parameter selection & the algorithms list eventually
        if algorithm_type == 'classification':
            c_algorithm = trial.suggest_categorical("c_algorithm", ["Logistic Regression", "Random Forest", "Gradient Boosting", "Extra Trees", "Bernoulli", "Multinomial", "XGBoost", "LightGBM"])

            if c_algorithm == 'XGBoost':
                xgb_max_depth = trial.suggest_int("xgb_max_depth", 2, 50)
                xgb_min_child_weight = trial.suggest_int("xgb_min_child_weight", 0, 50)
                xgb_gamma = trial.suggest_int("xgb_gamma", 0, 100)
                xgb_subsample = trial.suggest_float("xgb_subsample", 0, 1)
                xgb_max_delta_step = trial.suggest_int("xgb_max_delta_step", 0, 10)

                algo = XGBClassifier(max_depth=xgb_max_depth, min_child_weight=xgb_min_child_weight, gamma=xgb_gamma, subsample=xgb_subsample, max_delta_step=xgb_max_delta_step)
            elif c_algorithm == 'LightGBM':
                lgb_lambda_l1 = trial.suggest_loguniform('lgb_lambda_l1', 1e-8, 10.0)
                lgb_lambda_l2 = trial.suggest_loguniform('lgb_lambda_l2', 1e-8, 10.0)
                lgb_num_leaves = trial.suggest_int('lgb_num_leaves', 2, 256)
                lgb_feature_fraction = trial.suggest_uniform('lgb_feature_fraction', 0.4, 1.0)
                lgb_bagging_fraction = trial.suggest_uniform('lgb_bagging_fraction', 0.4, 1.0)
                lgb_bagging_freq = trial.suggest_int('lgb_bagging_freq', 1, 7)
                lgb_min_child_samples = trial.suggest_int('lgb_min_child_samples', 5, 100)     

                algo = LGBMClassifier(lambda_l1=lgb_lambda_l1, lambda_l2=lgb_lambda_l2, num_leaves=lgb_num_leaves, feature_fraction=lgb_feature_fraction, 
                                      bagging_fraction=lgb_bagging_fraction, bagging_freq=lgb_bagging_freq, min_child_samples=lgb_min_child_samples)
            elif c_algorithm == 'Logistic Regression':
                lr_penalty = trial.suggest_categorical('lr_penalty', ['none', 'l2', 'l1', 'elasticnet'])
                lr_dual = trial.suggest_categorical('lr_dual', [True, False])
                # tol = trial.suggest_float('')
                lr_c = trial.suggest_float('lr_c', 0.01, 1.0)
                lr_fit_intercept = trial.suggest_categorical('lr_fit_intercept', [True, False])
                lr_solver = trial.suggest_categorical('lr_solver', ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'])
                lr_max_iter = trial.suggest_int('lr_max_iter', 2, 1000)
                lr_multi_class = trial.suggest_categorical('lr_multi_class', ['auto', 'ovr', 'multinomial'])

                algo = LogisticRegression(penalty=lr_penalty, dual=lr_dual, C=lr_c, fit_intercept=lr_fit_intercept, class_weight='balanced', random_state=8, n_jobs=4,
                                          solver=lr_solver, max_iter=lr_max_iter, multi_class=lr_multi_class)
            elif c_algorithm == 'Random Forest':
                rf_n_estimators = trial.suggest_int("rf_n_estimators", 2, 1000)
                rf_criterion = trial.suggest_categorical("rf_criterion", ["gini", "entropy", "log_loss"])
                rf_min_samples_split = trial.suggest_float("rf_min_samples_split", 0.01, 1.0)
                rf_min_samples_leaf = trial.suggest_float("rf_min_samples_leaf", 0.01, 1.0)
                rf_min_weight_fraction_leaf = trial.suggest_float("rf_min_weight_fraction_leaf", 0.0, 0.5)
                rf_max_features = trial.suggest_categorical("rf_max_features", ["sqrt", "log2", None])
                rf_bootstrap = trial.suggest_categorical("rf_bootstrap", [True, False])
                rf_oob_score = trial.suggest_categorical("rf_oob_score", [True, False])
                rf_class_weight = trial.suggest_categorical("rf_class_weight", ['balanced', 'balanced_subsample'])
                
                algo = RandomForestClassifier(n_estimators=rf_n_estimators, criterion=rf_criterion, min_samples_split=rf_min_samples_split,
                                              min_samples_leaf=rf_min_samples_leaf, max_features=rf_max_features, min_weight_fraction_leaf=rf_min_weight_fraction_leaf,
                                              bootstrap=rf_bootstrap, oob_score=rf_oob_score, class_weight=rf_class_weight, n_jobs=4, random_state=8)
            elif c_algorithm == 'Extra Trees':
                et_n_estimators = trial.suggest_int("et_n_estimators", 2, 1000)
                et_criterion = trial.suggest_categorical("et_criterion", ["gini", "entropy", "log_loss"])
                et_min_samples_split = trial.suggest_float("et_min_samples_split", 0.01, 1.0)
                et_min_samples_leaf = trial.suggest_float("et_min_samples_leaf", 0.01, 1.0)
                et_min_weight_fraction_leaf = trial.suggest_float("et_min_weight_fraction_leaf", 0.0, 0.5)
                et_max_features = trial.suggest_categorical("et_max_features", ["sqrt", "log2", None])
                et_bootstrap = trial.suggest_categorical("et_bootstrap", [True, False])
                et_oob_score = trial.suggest_categorical("et_oob_score", [True, False])
                et_class_weight = trial.suggest_categorical("et_class_weight", ['balanced', 'balanced_subsample'])
                
                algo = ExtraTreesClassifier(n_estimators=et_n_estimators, criterion=et_criterion, min_samples_split=et_min_samples_split,
                                              min_samples_leaf=et_min_samples_leaf, max_features=et_max_features, min_weight_fraction_leaf=et_min_weight_fraction_leaf,
                                              bootstrap=et_bootstrap, oob_score=et_oob_score, class_weight=et_class_weight, n_jobs=4, random_state=8)                                              
            elif c_algorithm == 'Gradient Boosting':
                gb_loss = trial.suggest_categorical("gb_loss", ['log_loss', 'deviance', 'exponential'])
                gb_n_estimators = trial.suggest_int("gb_n_estimators", 2, 1000)
                gb_criterion = trial.suggest_categorical("gb_criterion", ['friedman_mse', 'squared_error', 'mse'])
                gb_min_samples_split = trial.suggest_float("gb_min_samples_split", 0.01, 1.0)
                gb_min_samples_leaf = trial.suggest_float("gb_min_samples_leaf", 0.01, 1.0)
                gb_min_weight_fraction_leaf = trial.suggest_float("gb_min_weight_fraction_leaf", 0.0, 1.0)
                gb_max_features = trial.suggest_categorical("gb_max_features", ["sqrt", "log2", None])

                algo = GradientBoostingClassifier(loss=gb_loss, n_estimators=gb_n_estimators, criterion=gb_criterion, min_samples_split=gb_min_samples_split,
                                                  min_samples_leaf=gb_min_samples_leaf, min_weight_fraction_leaf=gb_min_weight_fraction_leaf, max_features=gb_max_features, 
                                                  random_state=8)
            elif c_algorithm == 'Bernoulli':
                bern_alpha = trial.suggest_float("bern_alpha", 0.0, 2.0)
                bern_binarize = trial.suggest_float("bern_binarize", 0.0, 1.0)
                bern_fit_prior = trial.suggest_categorical("bern_fit_prior", [True, False])
                
                algo = BernoulliNB(alpha=bern_alpha, binarize=bern_binarize, fit_prior=bern_fit_prior)

            elif c_algorithm == 'MultinomialNB':
                mn_alpha = trial.suggest_float("mn_alpha", 0.0, 2.0)
                mn_fit_prior = trial.suggest_categorical("mn_fit_prior", [True, False])                

                algo = MultinomialNB(alpha=mn_alpha, fit_prior=mn_fit_prior)

            else:
                c_algorithm = "None"

        elif algorithm_type == 'anomaly_detection':
            # a_algorithm = trial.suggest_categorical("a_algorithm", ['ABOD', 'COPOD', 'SOS', 'Sampling', 'GMM', 'MCD', 'CD', 'OCSVM', 'LMDD', 'LOF', 
            #                                                     'COF', 'CBLOF', 'HBOS', 'SOD', 'IForest', 'INNE', 'XGBOD', 'LODA', 'SUOD', 
            #                                                     'AutoEncoder', 'VAE', 'SO_GAAL', 'MO_GAAL', 'DeepSVDD', 'AnoGAN', 'RGraph', 'LUNAR'])
            a_algorithm = trial.suggest_categorical("a_algorithm", ['ABOD', 'COPOD', 'SOS', 'Sampling', 'GMM', 'MCD', 'CD', 'OCSVM', 'LMDD', 'LOF'])                                                              
            if a_algorithm == 'ABOD':
                abod_contamination = trial.suggest_float('abod_contamination', 0, 0.5)
                abod_n_neighbors = trial.suggest_int('abod_n_neighbors', 1, 100)

                algo = ABOD(contamination=abod_contamination, n_neighbors=abod_n_neighbors, method='fast')
            elif a_algorithm == 'COPOD':
                copod_contamination = trial.suggest_float('copod_contamination', 0, 0.5)
                copod_n_neighbors = trial.suggest_int('copod_n_neighbors', 1, 100)

                algo = COPOD(contamination=copod_contamination, n_neighbors=copod_n_neighbors)
            elif a_algorithm == 'SOS':
                sos_contamination = trial.suggest_float('sos_contamination', 0, 0.5)
                sos_perplexity = trial.suggest_float("sos_perplexity", 1.0, 100.0)                
                sos_metric = trial.suggest_categorical('sos_metric', ['braycurtis', 'canberra', 'chebyshev', 'correlation', 
                                                              'dice', 'hamming', 'jaccard', 'kulsinski', 'mahalanobis', 
                                                              'matching', 'minkowski', 'rogerstanimoto', 'russellrao', 
                                                              'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean', 
                                                              'yule', 'euclidean'])

                algo = SOS(contamination=sos_contamination, perplexity=sos_perplexity, metric=sos_metric)
            elif a_algorithm == 'Sampling':
                sampling_contamination = trial.suggest_float('sampling_contamination', 0, 0.5)
                sampling_subset_size = trial.suggest_float('sampling_subset_size', 0, 1.0)                
                sampling_metric = trial.suggest_categorical('sampling_metric', ['braycurtis', 'canberra', 'chebyshev', 'correlation', 
                                                              'dice', 'hamming', 'jaccard', 'kulsinski', 'mahalanobis', 
                                                              'matching', 'minkowski', 'rogerstanimoto', 'russellrao', 
                                                              'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean', 
                                                              'yule', 'euclidean', 'cityblock', 'cosine', 'euclidean', 'l1', 'l2', 
                                                              'manhattan'])

                algo = Sampling(contamination=sampling_contamination, subset_size=sampling_subset_size, metric=sampling_metric)

            elif a_algorithm == 'GMM':
                gmm_n_components = trial.suggest_int('gmm_n_components', 1, 100)
                gmm_covariance_type = trial.suggest_categorical('gmm_covariance_type', ['full', 'tied', 'diag', 'spherical'])
                gmm_tol = trial.suggest_float('gmm_tol', 1e-1, 1e-10)
                gmm_reg_covar = trial.suggest_float('gmm_reg_covar', 1e-1, 1e-10)
                gmm_max_iter = trial.suggest_int('gmm_max_iter', 2, 1000)
                gmm_init_params = trial.suggest_categorical('gmm_init_params', ['kmeans', 'random'])
                gmm_contamination = trial.suggest_float('gmm_contamination', 0, 0.5)

                algo = GMM(n_components=gmm_n_components, gmm_covariance_type=gmm_covariance_type, tol=gmm_tol, reg_covar=gmm_reg_covar,
                            max_iter=gmm_max_iter, init_params=gmm_init_params, contamination=gmm_contamination, random_state=8)
            elif a_algorithm == 'MCD':
                mcd_store_precision = trial.suggest_categorical('mcd_store_precision', [True, False])
                mcd_assume_centered = trial.suggest_categorical('mcd_assume_centered', [True, False])
                mcd_contamination = trial.suggest_float('mcd_contamination', 0, 0.5)

                algo = MCD(contamination=mcd_contamination, store_precision=mcd_store_precision, assume_centered=mcd_assume_centered, random_state=8)

            elif a_algorithm == 'CD':
                cd_n_components = trial.suggest_int('cd_n_components', 1, 100)
                cd_whiten = trial.suggest_categorical('cd_whiten', [True, False])
                cd_rule_of_thumb = trial.suggest_cateogorical('cd_rule_of_thumb', [True, False])
                cd_contamination = trial.suggest_float('cd_contamination', 0, 0.5)

                algo = CD(contamination=cd_contamination, n_components=cd_n_components, whiten=cd_whiten, rule_of_thumb=cd_rule_of_thumb)
            elif a_algorithm == 'OCSVM':
                ocsvm_svm_kernel = trial.suggest_categorical("ocsvm_svm_kernel", ['linear', 'poly', 'rbf', 'sigmoid'])
                ocsvm_svm_gamma = trial.suggest_categorical("ocsvm_svm_gamma", ['auto', 'scale'])
                ocsvm_svm_shrinking = trial.suggest_categorical("ocsvm_svm_gamma", [True, False])
                ocsvm_svm_max_iter = trial.suggest_int("ocsvm_svm_max_iter", 2, 1000)
                ocsvm_contamination = trial.suggest_float('ocsvm_contamination', 0, 0.5)

                algo = OCSVM(kernel=ocsvm_svm_kernel, gamma=ocsvm_svm_gamma, shrinking=ocsvm_svm_shrinking, max_iter=ocsvm_svm_max_iter, 
                              cache_size=750, random_state=8, contamination=ocsvm_contamination)
            elif a_algorithm == 'LMDD':
                lmdd_contamination = trial.suggest_float('lmdd_contamination', 0, 0.5)
                lmdd_n_iter = trial.suggest_int('lmdd_n_iter', 2, 1000)
                lmdd_dis_measure = trial.suggest_categorical('lmdd_dis_measure', ['aad', 'var', 'iqr'])

                algo = LMDD(contamination=lmdd_contamination, n_iter=lmdd_n_iter, dis_measure=lmdd_dis_measure, random_state=8)

            elif a_algorithm == 'LOF':
                lof_contamination = trial.suggest_float('lof_contamination', 0, 0.5)
                lof_n_neighbors = trial.suggest_int('lof_n_neighbors', 2, 100)
                lof_algorithm = trial.suggest_categorical('lof_algorithm', ['ball_tree', 'kd_tree', 'brute', 'auto'])
                lof_metric = trial.suggest_categorical('lof_metric', ['braycurtis', 'canberra', 'chebyshev', 'correlation', 
                                                              'dice', 'hamming', 'jaccard', 'kulsinski', 'mahalanobis', 
                                                              'matching', 'minkowski', 'rogerstanimoto', 'russellrao', 
                                                              'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean', 
                                                              'yule', 'euclidean', 'cityblock', 'cosine', 'euclidean', 'l1', 'l2', 
                                                              'manhattan'])
                
                algo = LOF(contamination=lof_contamination, n_neighbors=lof_n_neighbors, algorithm=lof_algorithm, n_jobs=4, metric=lof_metric)

            else:
                pass
        else:
            algo = XGBClassifier()
        
        algo.fit(x_train, y_train)
        predictions = algo.predict(x_test)
        score = balanced_accuracy_score(y_test, predictions)

        del algo, predictions, x_train, x_test, y_train, y_test
        

        return score
    
    # We need better logging with full tracebacks for better debugging
    except Exception as E:
        print("======================================LOGGING======================================")
        print("Error is: ")
        # print(get_traceback(E))
        traceback.print_exc()
        print("======================================LOGGING======================================")
        print("\n")

        return -100

In [11]:
fxn = lambda trial: optimize_dabbco(trial, X_train, X_test, y_train, y_test)
study = optuna.create_study(direction="maximize")
study.optimize(fxn, n_trials=10)

[32m[I 2022-11-30 00:23:46,817][0m A new study created in memory with name: no-name-4491efff-6024-40a9-8496-cd303ba90eb7[0m
[32m[I 2022-11-30 00:23:47,035][0m Trial 0 finished with value: -100.0 and parameters: {'undersample': 'Yes', 'oversample': 'No', 'data_type': 'PCA', 'scaling': 'Standard', 'normalizing': 'No', 'minmax': 'No', 'algorithm_type': 'classification', 'c_algorithm': 'Extra Trees', 'data_n_components': 25, 'whiten': False, 'svd_solver': 'full', 'usampler': 'Random', 'replacement': True, 'n_estimators': 670, 'criterion': 'gini', 'min_samples_split': 0.6751205504194089, 'min_samples_leaf': 0.6770612993475804, 'min_weight_fraction_leaf': 0.8202993254506294, 'max_features': 'log2', 'bootstrap': False, 'oob_score': False, 'class_weight': 'balanced'}. Best is trial 0 with value: -100.0.[0m
[32m[I 2022-11-30 00:23:47,039][0m Trial 1 finished with value: -100.0 and parameters: {'undersample': 'No', 'oversample': 'No', 'data_type': 'RAW', 'scaling': 'Standard', 'normalizi

Error is: 
Traceback (most recent call last):
  File "C:\Users\Kelly\AppData\Local\Temp\ipykernel_17300\885668637.py", line 253, in optimize_dabbco
    algo.fit(x_train, y_train)
  File "d:\Storage\Personal Files\Sinclair\_venv\lib\site-packages\sklearn\ensemble\_forest.py", line 476, in fit
    trees = Parallel(
  File "d:\Storage\Personal Files\Sinclair\_venv\lib\site-packages\joblib\parallel.py", line 1056, in __call__
    self.retrieve()
  File "d:\Storage\Personal Files\Sinclair\_venv\lib\site-packages\joblib\parallel.py", line 935, in retrieve
    self._output.extend(job.get(timeout=self.timeout))
  File "C:\Users\Kelly\AppData\Local\Programs\Python\Python310\lib\multiprocessing\pool.py", line 771, in get
    raise self._value
  File "C:\Users\Kelly\AppData\Local\Programs\Python\Python310\lib\multiprocessing\pool.py", line 125, in worker
    result = (True, func(*args, **kwds))
  File "d:\Storage\Personal Files\Sinclair\_venv\lib\site-packages\joblib\_parallel_backends.py", line 

[32m[I 2022-11-30 00:23:47,262][0m Trial 3 finished with value: 0.5 and parameters: {'undersample': 'Yes', 'oversample': 'No', 'data_type': 'RAW', 'scaling': 'Robust', 'normalizing': 'No', 'minmax': 'Yes', 'algorithm_type': 'classification', 'c_algorithm': 'XGBoost', 'usampler': 'Repeated', 'undersample_n_neighbors': 12, 'undersample_max_iter': 50, 'undersample_kind_sel': 'all', 'max_depth': 23, 'min_child_weight': 45, 'gamma': 93, 'subsample': 0.9106728411908478, 'max_delta_step': 6}. Best is trial 3 with value: 0.5.[0m
[32m[I 2022-11-30 00:23:47,268][0m Trial 4 finished with value: -100.0 and parameters: {'undersample': 'Yes', 'oversample': 'Yes', 'data_type': 'RAW', 'scaling': 'Standard', 'normalizing': 'Yes', 'minmax': 'Yes', 'algorithm_type': 'classification', 'c_algorithm': 'Multinomial', 'usampler': 'Random', 'replacement': False}. Best is trial 3 with value: 0.5.[0m
[32m[I 2022-11-30 00:23:47,288][0m Trial 5 finished with value: -100.0 and parameters: {'undersample': 'Y

Error is: 
Traceback (most recent call last):
  File "C:\Users\Kelly\AppData\Local\Temp\ipykernel_17300\885668637.py", line 253, in optimize_dabbco
    algo.fit(x_train, y_train)
UnboundLocalError: local variable 'algo' referenced before assignment



Error is: 
Traceback (most recent call last):
  File "C:\Users\Kelly\AppData\Local\Temp\ipykernel_17300\885668637.py", line 130, in optimize_dabbco
    x_train, y_train = under_sampler.fit_resample(x_train, y_train)
  File "d:\Storage\Personal Files\Sinclair\_venv\lib\site-packages\imblearn\base.py", line 83, in fit_resample
    output = self._fit_resample(X, y)
  File "d:\Storage\Personal Files\Sinclair\_venv\lib\site-packages\imblearn\under_sampling\_prototype_selection\_edited_nearest_neighbours.py", line 334, in _fit_resample
    X_enn, y_enn = self.enn_.fit_resample(X_, y_)
  File "d:\Storage\Personal Files\Sinclair\_venv\lib\site-packages\imblearn\base.py", line 83, in fit_resample
    output = self._fit_resample(X, y)
  File "d:\St

[32m[I 2022-11-30 00:23:51,789][0m Trial 6 finished with value: -100.0 and parameters: {'undersample': 'Yes', 'oversample': 'No', 'data_type': 'UMAP', 'scaling': 'Standard', 'normalizing': 'No', 'minmax': 'Yes', 'algorithm_type': 'classification', 'c_algorithm': 'Multinomial', 'data_n_neighbors': 60, 'min_dist': 0.2577743868193004, 'data_n_components': 63, 'data_metric': 'yule'}. Best is trial 3 with value: 0.5.[0m


Error is: 
Traceback (most recent call last):
  File "C:\Users\Kelly\AppData\Local\Temp\ipykernel_17300\885668637.py", line 56, in optimize_dabbco
    x_train = transformer.fit_transform(mm.fit_transform(x_train))
  File "d:\Storage\Personal Files\Sinclair\_venv\lib\site-packages\umap\umap_.py", line 2772, in fit_transform
    self.fit(X, y)
  File "d:\Storage\Personal Files\Sinclair\_venv\lib\site-packages\umap\umap_.py", line 2684, in fit
    self.embedding_, aux_data = self._fit_embed_data(
  File "d:\Storage\Personal Files\Sinclair\_venv\lib\site-packages\umap\umap_.py", line 2717, in _fit_embed_data
    return simplicial_set_embedding(
  File "d:\Storage\Personal Files\Sinclair\_venv\lib\site-packages\umap\umap_.py", line 1078, in simplicial_set_embedding
    initialisation = spectral_layout(
  File "d:\Storage\Personal Files\Sinclair\_venv\lib\site-packages\umap\spectral.py", line 332, in spectral_layout
    eigenvalues, eigenvectors = scipy.sparse.linalg.eigsh(
  File "d:\Storag

[32m[I 2022-11-30 00:24:00,340][0m Trial 7 finished with value: -100.0 and parameters: {'undersample': 'No', 'oversample': 'Yes', 'data_type': 'UMAP', 'scaling': 'Standard', 'normalizing': 'No', 'minmax': 'Yes', 'algorithm_type': 'classification', 'c_algorithm': 'XGBoost', 'data_n_neighbors': 73, 'min_dist': 0.7261172367177843, 'data_n_components': 14, 'data_metric': 'kulsinski', 'osampler': 'ADASYN', 'oversample_n_neighbors': 61}. Best is trial 3 with value: 0.5.[0m
[32m[I 2022-11-30 00:24:00,345][0m Trial 8 finished with value: -100.0 and parameters: {'undersample': 'Yes', 'oversample': 'Yes', 'data_type': 'RAW', 'scaling': 'None', 'normalizing': 'No', 'minmax': 'No', 'algorithm_type': 'classification', 'c_algorithm': 'Random Forest', 'usampler': 'Random', 'replacement': False, 'n_estimators': 587, 'criterion': 'gini', 'min_samples_split': 0.5285553576731011, 'min_samples_leaf': 0.11701812914742163, 'min_weight_fraction_leaf': 0.9408725304662513, 'max_features': 'log2', 'bootstr

Error is: 
Traceback (most recent call last):
  File "C:\Users\Kelly\AppData\Local\Temp\ipykernel_17300\885668637.py", line 142, in optimize_dabbco
    x_train, y_train = over_sampler.fit_resample(x_train, y_train)
  File "d:\Storage\Personal Files\Sinclair\_venv\lib\site-packages\imblearn\base.py", line 83, in fit_resample
    output = self._fit_resample(X, y)
  File "d:\Storage\Personal Files\Sinclair\_venv\lib\site-packages\imblearn\over_sampling\_adasyn.py", line 143, in _fit_resample
    nns = self.nn_.kneighbors(X_class, return_distance=False)[:, 1:]
  File "d:\Storage\Personal Files\Sinclair\_venv\lib\site-packages\sklearn\neighbors\_base.py", line 749, in kneighbors
    raise ValueError(
ValueError: Expected n_neighbors <= n_samples,  but n_samples = 60, n_neighbors = 62



Error is: 
Traceback (most recent call last):
  File "C:\Users\Kelly\AppData\Local\Temp\ipykernel_17300\885668637.py", line 253, in optimize_dabbco
    algo.fit(x_train, y_train)
  File "d:\Storage\Personal 

In [15]:
# Loop through all trials
all_trials = study.get_trials()

# Create a dict of the trial number and its p value
res_trials = {}
for _trial in all_trials:
    res_trials[_trial.number] = (_trial.value, _trial.params)

In [16]:
# Sort on top 10
parsed_trials = sorted(res_trials.items(), key=lambda x: x[1][0], reverse=True)[:25]        

parsed_trials

[(3,
  (0.5,
   {'undersample': 'Yes',
    'oversample': 'No',
    'data_type': 'RAW',
    'scaling': 'Robust',
    'normalizing': 'No',
    'minmax': 'Yes',
    'algorithm_type': 'classification',
    'c_algorithm': 'XGBoost',
    'usampler': 'Repeated',
    'undersample_n_neighbors': 12,
    'undersample_max_iter': 50,
    'undersample_kind_sel': 'all',
    'max_depth': 23,
    'min_child_weight': 45,
    'gamma': 93,
    'subsample': 0.9106728411908478,
    'max_delta_step': 6})),
 (0,
  (-100.0,
   {'undersample': 'Yes',
    'oversample': 'No',
    'data_type': 'PCA',
    'scaling': 'Standard',
    'normalizing': 'No',
    'minmax': 'No',
    'algorithm_type': 'classification',
    'c_algorithm': 'Extra Trees',
    'data_n_components': 25,
    'whiten': False,
    'svd_solver': 'full',
    'usampler': 'Random',
    'replacement': True,
    'n_estimators': 670,
    'criterion': 'gini',
    'min_samples_split': 0.6751205504194089,
    'min_samples_leaf': 0.6770612993475804,
    'min_

In [17]:
# Put parsed trials into a dataframe for further analysis
res_dfs = []
for t in all_trials:
    temp = pd.DataFrame.from_dict([t.params])
    temp['number'] = t.number
    temp['value'] = t.values[0] 

    res_dfs.append(temp)

final_results = pd.concat(res_dfs)

In [19]:
final_results.sort_values(by='value', inplace=True)
final_results

Unnamed: 0,undersample,oversample,data_type,scaling,normalizing,minmax,algorithm_type,c_algorithm,data_n_components,whiten,...,undersample_kind_sel,max_depth,min_child_weight,gamma,subsample,max_delta_step,data_n_neighbors,min_dist,data_metric,oversample_n_neighbors
0,Yes,No,PCA,Standard,No,No,classification,Extra Trees,25.0,False,...,,,,,,,,,,
0,No,No,RAW,Standard,Yes,No,classification,Gradient Boosting,,,...,,,,,,,,,,
0,No,Yes,RAW,Robust,Yes,No,classification,XGBoost,,,...,,,,,,,,,,
0,Yes,Yes,RAW,Standard,Yes,Yes,classification,Multinomial,,,...,,,,,,,,,,
0,Yes,No,PCA,Robust,No,No,classification,Extra Trees,47.0,False,...,all,,,,,,,,,
0,Yes,No,UMAP,Standard,No,Yes,classification,Multinomial,63.0,,...,,,,,,,60.0,0.257774,yule,
0,No,Yes,UMAP,Standard,No,Yes,classification,XGBoost,14.0,,...,,,,,,,73.0,0.726117,kulsinski,61.0
0,Yes,Yes,RAW,,No,No,classification,Random Forest,,,...,,,,,,,,,,
0,Yes,Yes,PCA,Standard,Yes,Yes,classification,XGBoost,72.0,False,...,,,,,,,,,,
0,Yes,No,RAW,Robust,No,Yes,classification,XGBoost,,,...,all,23.0,45.0,93.0,0.910673,6.0,,,,


In [24]:
final_results.head(n=1).to_clipboard(excel=True)

# Visualizations

In [12]:
# df['clean_remarks'] = df['Remarks'].apply(process_descriptions)
# s = pd.Series(df['clean_remarks'])
# # Join the cleaned strings together to pass to wordcloud
# corpus = ' '.join(s)

# # Generate the wordcloud object
# wc = WordCloud(width=500, height=250, max_words=50, normalize_plurals=True).generate(corpus)
# plt.imshow(wc, interpolation='bilinear')
# plt.axis("off")
# plt.title("Most common words in Bid Remarks")
# plt.show()

In [13]:
# # Make a fake $ column
# min = 1000
# max = 10000

# df['Cost'] = [round(random.uniform(min, max), 2) for _ in range(df.shape[0])]

In [14]:
# for col in ['Priority', "General_Contractor", 'EST']:
#     t = df.groupby(col).size().reset_index(name="Counts").sort_values(by="Counts", ascending=False)
#     # Should we use color here?
#     fig = px.bar(data_frame=t, x=str(col), y="Counts", barmode="group", color=str(col), title="Count of Bids by {}".format(str(col)))
#     fig.show()