# Project 3: Weather

Mengyu Jackson

## Overview



## Business Problem


## Data Understanding


In [1]:
!pip install --quiet --upgrade --user --upgrade-strategy=eager sktime

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pprint
import time

from sklearn.experimental import enable_iterative_imputer

from sklearn.pipeline import Pipeline
from sklearn.svm import SVC, NuSVC, LinearSVC
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer, MinMaxScaler, MaxAbsScaler
from sklearn.compose import ColumnTransformer,make_column_transformer, make_column_selector
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier, Perceptron, PassiveAggressiveClassifier, RidgeClassifierCV
from sklearn.svm import LinearSVR
from sklearn.neighbors import KNeighborsClassifier, RadiusNeighborsClassifier, NearestCentroid
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.impute import KNNImputer, SimpleImputer, IterativeImputer
from sklearn.pipeline import FeatureUnion
from sklearn.impute import MissingIndicator
from sklearn.model_selection import train_test_split
from sklearn.metrics import plot_confusion_matrix
from sktime.transformations.panel.rocket import MiniRocket
from sktime.utils.data_processing import from_2d_array_to_nested
from sktime.forecasting.model_selection import temporal_train_test_split
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import plot_precision_recall_curve
import matplotlib.pyplot as plt
from sklearn.metrics import average_precision_score, confusion_matrix, f1_score, accuracy_score, classification_report
from sktime.transformations.series.impute import Imputer
from sklearn.feature_selection import SelectFromModel, RFECV, SequentialFeatureSelector, SelectKBest, chi2

In [3]:
RANDOM_SEED = 0

## Data Preparation



In [4]:
weatherAUS = pd.read_csv('./data/weatherAUS.csv')
df = weatherAUS.copy()

In [5]:
weatherAUS.isna().sum()

Date                 0
Location             0
MinTemp           1485
MaxTemp           1261
Rainfall          3261
Evaporation      62790
Sunshine         69835
WindGustDir      10326
WindGustSpeed    10263
WindDir9am       10566
WindDir3pm        4228
WindSpeed9am      1767
WindSpeed3pm      3062
Humidity9am       2654
Humidity3pm       4507
Pressure9am      15065
Pressure3pm      15028
Cloud9am         55888
Cloud3pm         59358
Temp9am           1767
Temp3pm           3609
RainToday         3261
RainTomorrow      3267
dtype: int64

In [6]:
weatherAUS["Location"].unique()

array(['Albury', 'BadgerysCreek', 'Cobar', 'CoffsHarbour', 'Moree',
       'Newcastle', 'NorahHead', 'NorfolkIsland', 'Penrith', 'Richmond',
       'Sydney', 'SydneyAirport', 'WaggaWagga', 'Williamtown',
       'Wollongong', 'Canberra', 'Tuggeranong', 'MountGinini', 'Ballarat',
       'Bendigo', 'Sale', 'MelbourneAirport', 'Melbourne', 'Mildura',
       'Nhil', 'Portland', 'Watsonia', 'Dartmoor', 'Brisbane', 'Cairns',
       'GoldCoast', 'Townsville', 'Adelaide', 'MountGambier', 'Nuriootpa',
       'Woomera', 'Albany', 'Witchcliffe', 'PearceRAAF', 'PerthAirport',
       'Perth', 'SalmonGums', 'Walpole', 'Hobart', 'Launceston',
       'AliceSprings', 'Darwin', 'Katherine', 'Uluru'], dtype=object)

In [7]:
data_by_location = {
    loc: weatherAUS[weatherAUS["Location"]==loc].copy()
    for loc in weatherAUS["Location"].unique()
}

na_columns_by_loc = {}
for loc, df in data_by_location.items():
    na_columns_by_loc[loc] = [
        col for col in df.columns
        if all(df[col].isna())
    ]

for loc, na_columns in na_columns_by_loc.items():
    df = data_by_location[loc]
    print(f"{loc}: {na_columns}")
    for col in na_columns:

        df.pop(col)
        
pprint.pprint(na_columns_by_loc)

Albury: ['Evaporation', 'Sunshine']
BadgerysCreek: ['Evaporation', 'Sunshine', 'Cloud9am', 'Cloud3pm']
Cobar: []
CoffsHarbour: []
Moree: []
Newcastle: ['Evaporation', 'Sunshine', 'WindGustDir', 'WindGustSpeed', 'Pressure9am', 'Pressure3pm']
NorahHead: ['Evaporation', 'Sunshine', 'Cloud9am', 'Cloud3pm']
NorfolkIsland: []
Penrith: ['Evaporation', 'Sunshine', 'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm']
Richmond: ['Sunshine']
Sydney: []
SydneyAirport: []
WaggaWagga: []
Williamtown: []
Wollongong: ['Evaporation', 'Sunshine']
Canberra: []
Tuggeranong: ['Evaporation', 'Sunshine', 'Cloud9am', 'Cloud3pm']
MountGinini: ['Evaporation', 'Sunshine', 'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm']
Ballarat: ['Evaporation', 'Sunshine']
Bendigo: ['Sunshine']
Sale: []
MelbourneAirport: []
Melbourne: []
Mildura: []
Nhil: ['Evaporation', 'Sunshine', 'Cloud9am', 'Cloud3pm']
Portland: []
Watsonia: []
Dartmoor: ['Cloud9am', 'Cloud3pm']
Brisbane: []
Cairns: []
GoldCoast: ['Evaporation', 'Suns

In [8]:
all_na_columns = sorted(set(sum(na_columns_by_loc.values(), [])))

In [9]:
na_column_df = pd.DataFrame([
    [1 if col in na_columns_by_loc[loc] else 0 
             for col in all_na_columns ]
    for loc in na_columns_by_loc.keys()],
    index=list(na_columns_by_loc.keys()),
    columns=all_na_columns,
)

In [10]:
df['Date'] = pd.to_datetime(df['Date'])
df['Year'] = df['Date'].dt.year
df['Month'] = df['Date'].dt.month
df['Day'] = df['Date'].dt.day
df.pop('Date')

143882   2013-03-01
143883   2013-03-02
143884   2013-03-03
143885   2013-03-04
143886   2013-03-05
            ...    
145455   2017-06-21
145456   2017-06-22
145457   2017-06-23
145458   2017-06-24
145459   2017-06-25
Name: Date, Length: 1578, dtype: datetime64[ns]

In [11]:
len(df)

1578

In [12]:
df = df.dropna(subset=['RainToday', 'RainTomorrow'])
df['RainToday'] = df['RainToday'].replace('No', 0).replace('Yes', 1).astype(float)
df['RainTomorrow'] = df['RainTomorrow'].replace('No', 0).replace('Yes', 1).astype(float)

#df['RainTomorrow'].replace('No', 0).replace('Yes', 1).astype(float)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['RainToday'] = df['RainToday'].replace('No', 0).replace('Yes', 1).astype(float)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['RainTomorrow'] = df['RainTomorrow'].replace('No', 0).replace('Yes', 1).astype(float)


In [13]:
df['WindGustDir'] = df['WindGustDir'].fillna("NaN")
df['WindDir9am'] = df['WindDir9am'].fillna("NaN")
df['WindDir3pm'] = df['WindDir3pm'].fillna("NaN")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['WindGustDir'] = df['WindGustDir'].fillna("NaN")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['WindDir9am'] = df['WindDir9am'].fillna("NaN")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['WindDir3pm'] = df['WindDir3pm'].fillna("NaN")


In [14]:
#df = df.dropna()
len(df)

1502

In [15]:
x_test_full = df[df['Year']>=2016].copy()
y_test_full = x_test_full.pop('RainTomorrow')
x_train_full = df[df['Year']<2016].copy()
y_train_full = x_train_full.pop('RainTomorrow')


In [16]:
len(x_test_full)

527

In [17]:
df.dtypes

Location          object
MinTemp          float64
MaxTemp          float64
Rainfall         float64
WindGustDir       object
WindGustSpeed    float64
WindDir9am        object
WindDir3pm        object
WindSpeed9am     float64
WindSpeed3pm     float64
Humidity9am      float64
Humidity3pm      float64
Pressure9am      float64
Pressure3pm      float64
Cloud9am         float64
Cloud3pm         float64
Temp9am          float64
Temp3pm          float64
RainToday        float64
RainTomorrow     float64
Year               int64
Month              int64
Day                int64
dtype: object

In [18]:
# column_transformer = ColumnTransformer([
#     ("windgustdir",OneHotEncoder(), ["WindGustDir"]),
#     ("winddir9am",OneHotEncoder(), ["WindDir9am"]),
#     ("winddir3pm",OneHotEncoder(), ["WindDir3pm"]),
#     ("loc",OneHotEncoder(), ["Location"]),], 
#     remainder="passthrough",
# )
column_transformer = make_column_transformer(
    (MinMaxScaler(),
     make_column_selector(dtype_include=np.number)),
    (OneHotEncoder(handle_unknown="ignore"),
     make_column_selector(dtype_include=object)))

column_transformer.fit(x_train_full)

ColumnTransformer(transformers=[('minmaxscaler', MinMaxScaler(),
                                 <sklearn.compose._column_transformer.make_column_selector object at 0x0000024165B27E50>),
                                ('onehotencoder',
                                 OneHotEncoder(handle_unknown='ignore'),
                                 <sklearn.compose._column_transformer.make_column_selector object at 0x0000024165B27F70>)])

In [19]:
# foo = pd.DataFrame(data=column_transformer.transform(x_train).toarray())

# column_transformer.transform(x_train)
x_train_full

Unnamed: 0,Location,MinTemp,MaxTemp,Rainfall,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,WindSpeed9am,WindSpeed3pm,...,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,Year,Month,Day
143882,Uluru,19.7,30.0,0.8,ESE,48.0,E,E,30.0,24.0,...,1010.6,1007.5,,,21.7,28.4,0.0,2013,3,1
143883,Uluru,21.6,33.1,0.0,E,33.0,E,N,22.0,11.0,...,1010.5,1006.5,,,24.6,31.3,0.0,2013,3,2
143884,Uluru,21.3,36.1,0.0,E,33.0,ENE,SSE,24.0,13.0,...,1006.9,1002.7,,,27.6,34.5,0.0,2013,3,3
143885,Uluru,22.9,37.7,0.0,ENE,39.0,E,SSE,28.0,13.0,...,1006.0,1002.1,,,28.7,35.4,0.0,2013,3,4
143886,Uluru,24.0,39.0,0.0,S,39.0,E,S,20.0,19.0,...,1006.9,1003.5,,,29.9,37.3,0.0,2013,3,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
144913,Uluru,20.5,34.7,0.0,E,52.0,ESE,E,35.0,20.0,...,1013.2,1010.1,,,24.3,33.0,0.0,2015,12,27
144914,Uluru,18.0,36.4,0.0,ESE,54.0,E,ESE,30.0,31.0,...,1014.7,1010.9,,,26.7,35.0,0.0,2015,12,28
144915,Uluru,17.5,37.1,0.0,E,56.0,E,SE,33.0,22.0,...,1012.6,1007.5,,,28.1,34.7,0.0,2015,12,29
144916,Uluru,20.0,38.9,0.0,E,59.0,E,SSE,20.0,17.0,...,1007.2,1002.6,,1.0,31.3,38.4,0.0,2015,12,30


### First Model

* RandomForestClassifier

In [20]:
# rfc = make_pipeline(column_transformer, 
#                     FunctionTransformer(lambda x: x.todense(), accept_sparse=True),
#                     MinMaxScaler(),
#                     KNNImputer(), 
#                     RandomForestClassifier(random_state=random_state=None))
# rfc.fit(x_train, y_train)
# rfc.score(x_test, y_test)


# Score is 0.8453908984830805

In [21]:
X_new = SelectKBest(chi2, k=20).fit_transform(x_train_full, y_train_full)

ValueError: could not convert string to float: 'Uluru'

In [None]:
rfc = make_pipeline(column_transformer,
                    SimpleImputer(), 
                    RandomForestClassifier(random_state=RANDOM_SEED))

trimmed_rfc = SequentialFeatureSelector(estimator=rfc)
trimmed_rfc.fit(x_train_full, y_train_full)
trimmed_rfc.estimator_.score(x_test_full, y_test_full)

In [None]:
print(f"{x_train_full.columns[trimmed_rfc.get_support()]}")

In [None]:
model = SelectFromModel(rfc.named_steps.randomforestclassifier, prefit=True)

In [None]:
transform_pipeline_preprocessing = make_pipeline(column_transformer, 
                    MaxAbsScaler(),
                    SimpleImputer(),
                                  )
transform_pipeline_preprocessing.fit(x_train_full)
transform_pipeline = make_pipeline(transform_pipeline_preprocessing,
                                                                     model)

new_columns = transform_pipeline.transform(x_train_full)

In [None]:
model.n_features_in_

In [None]:
rfc.named_steps.randomforestclassifier.feature_importances_

In [None]:
for df in data_by_location.values():
    print(df.columns)
    df.dropna(subset=['RainToday', 'RainTomorrow'], inplace=True)
    df['RainToday'] = df['RainToday'].replace('No', 0).replace('Yes', 1).astype(float)
    df["Date"] = pd.to_datetime(df["Date"])
    df.set_index("Date", inplace=True)
    period_index = df.index.to_period("D")
    df.index = df.index.to_period("D")


In [None]:
y_by_location, X_by_location = {}, {}

for loc, df in data_by_location.items():
    X_by_location[loc] = df.copy()
    y_by_location[loc] = X_by_location[loc].pop("RainTomorrow")
    X_by_location[loc].pop("Location")

In [None]:
X_train_by_loc, X_test_by_loc, y_train_by_loc, y_test_by_loc = {}, {}, {}, {}

for loc in data_by_location.keys():
    X_train_by_loc[loc], X_test_by_loc[loc], y_train_by_loc[loc], y_test_by_loc[loc] = temporal_train_test_split(X_by_location[loc], y_by_location[loc], train_size=.85)

In [None]:
y_train_by_loc

In [None]:
column_transformer_by_loc = {}

for loc, X_train in X_train_by_loc.items():
    encoders = []
    for col in ["WindGustDir", "WindDir9am", "WindDir3pm"]:
        if col in X_train.columns:
            encoders.append((col.lower(), OneHotEncoder(handle_unknown = 'ignore'), [col]))
    
    column_transformer_by_loc[loc] = ColumnTransformer(encoders, remainder="passthrough", sparse_threshold=0)
    #colun_transformer_by_loc[loc].fit(X_train)

In [None]:
transform_pipeline_by_loc = {
    loc: make_pipeline(
        transformer,
        MaxAbsScaler(),
        KNNImputer(),
    )
    for loc, transformer in column_transformer_by_loc.items()
}

In [None]:
X_train_transformed_by_loc = {
    loc: from_2d_array_to_nested(transform_pipeline.fit_transform(X_train_by_loc[loc]))
    for loc, transform_pipeline in transform_pipeline_by_loc.items()
}

for loc, X_train_transformed in X_train_transformed_by_loc.items():
    X_train_transformed.index = X_train_by_loc[loc].index

In [None]:
start = time.time()

inference_pipeline_by_loc = {
    loc: make_pipeline(
        MiniRocket(), 
        RidgeClassifierCV(alphas=np.logspace(-3, 3, 10), normalize=True)
    )
    for loc in X_train_transformed_by_loc.keys()
}

for loc, inference_pipeline in inference_pipeline_by_loc.items():
    inference_pipeline.fit(X_train_transformed_by_loc[loc], y_train_by_loc[loc])
    
training_time = time.time() - start
print(f"Training took {training_time} seconds")

In [None]:
# Training time is over 12 hours; vs 202 seconds for the Rocket model.

# from sktime.classification.hybrid import HIVECOTEV1
# start2 = time.time()

# inference_pipeline_by_loc2 = {
#     loc: make_pipeline(
#         HIVECOTEV1()
#     )
#     for loc in X_train_transformed_by_loc.keys()
# }

# for loc, inference_pipeline in inference_pipeline_by_loc2.items():
#     inference_pipeline.fit(X_train_transformed_by_loc[loc], y_train_by_loc[loc])
    
# training_time2 = time.time() - start2
# print(f"Training took {training_time2} seconds")

In [None]:
X_test_transformed_by_loc = {
    loc: from_2d_array_to_nested(transform_pipeline.transform(X_test_by_loc[loc])) 
    #loc: transform_pipeline.transform(X_test_by_loc[loc])
    for loc, transform_pipeline in transform_pipeline_by_loc.items()
}

# for loc, transform_pipeline in transform_pipeline_by_loc.items():
#     print(loc)
#     transform_pipeline.transform(X_test_by_loc[loc])

In [None]:
y_pred_by_loc = {
    loc: inference_pipeline.predict(X_test_transformed_by_loc[loc])
    for loc, inference_pipeline in inference_pipeline_by_loc.items()
}
y_pred_proba_by_loc = {
    loc: inference_pipeline.predict_proba(X_test_transformed_by_loc[loc])
    for loc, inference_pipeline in inference_pipeline_by_loc.items()
}

In [None]:
len_by_loc = {loc: len(df) for loc, df in data_by_location.items()}
len_by_loc

In [None]:
f1_score_by_loc = {loc: f1_score(y_pred_by_loc[loc], y_test_by_loc[loc], pos_label="Yes") for loc in y_pred_by_loc.keys()}

pprint.pprint(f1_score_by_loc)

In [None]:
f1_scores_no_nas = pd.Series(sorted(score_by_loc[loc]
                   for loc, na_columns in na_columns_by_loc.items()
                   if not na_columns))
f1_scores_no_nas

f1_scores_no_nas.describe()

In [None]:
f1_scores_with_nas = pd.Series(sorted(score_by_loc[loc]
                   for loc, na_columns in na_columns_by_loc.items()
                   if na_columns))
f1_scores_with_nas.describe()

In [None]:
score_by_loc = {
    loc: inference_pipeline.score(X_test_transformed_by_loc[loc], y_test_by_loc[loc])
    for loc, inference_pipeline in inference_pipeline_by_loc.items()
}

confusion_matrix_by_loc = {
    loc: confusion_matrix(y_true, y_pred_by_loc[loc])
    for loc, y_true in y_test_by_loc.items()
}

#inference_pipeline["Albury"].score(X_test_transformed_by_loc["Albury"], y_test_by_loc["Albury"])
pprint.pprint(score_by_loc)
pprint.pprint(confusion_matrix_by_loc)

In [None]:
na_column_df_y = pd.Series((f1_score_by_loc[loc] for loc in na_column_df.index), index=na_column_df.index)



In [None]:
na_X_train, na_X_test, na_y_train, na_y_test = train_test_split(na_column_df, na_column_df_y, test_size=0.33, random_state=RANDOM_SEED)

In [None]:
na_regressor = LinearSVR()
na_regressor.fit(na_X_train, na_y_train)

In [None]:
na_regressor.score(na_X_test, na_y_test)

In [None]:
na_regressor.coef_

In [None]:
print(f"Confusion Matrix: {sum(confusion_matrix_by_loc.values()}")
print(f"Accuracy: )

In [None]:
# combined_y_true = []
# combined_y_pred = []

# for loc in y_test_by_loc.keys():

y_test_by_loc["Albury"].to_list() + y_test_by_loc["Melbourne"].to_list()
combined_y_test = sum((list(v) for v in y_test_by_loc.values()), [])
combined_y_pred = sum((list(v) for v in y_pred_by_loc.values()), [])
#y_pred_by_loc.keys()

In [None]:
print(f"Accuracy: {accuracy_score(combined_y_test, combined_y_pred)}")

In [None]:
print(classification_report(combined_y_test, combined_y_pred))

In [None]:
print(classification_report(y_test_by_loc["Woomera"], y_pred_by_loc["Woomera"]))

In [None]:
for loc, inference_pipeline in inference_pipeline_by_loc.items():
    print(f"{loc}")
    plot_confusion_matrix(inference_pipeline, X_test_transformed_by_loc[loc], y_test_by_loc[loc]) 
    
    
    

In [None]:
sum(score_by_loc.values()) / len(score_by_loc) for loc in score_by_loc

In [None]:
# average_precision = average_precision_score(x_test_transformed, ts_y_test)

# print('Average precision-recall score: {0:0.2f}'.format(
#       average_precision))
average_precision=0.0

In [None]:
loc = "Woomera"
disp = plot_precision_recall_curve(inference_pipeline_by_loc[loc], X_test_transformed_by_loc[loc], y_test_by_loc[loc])
disp.ax_.set_title('2-class Precision-Recall curve: '
                   'AP={0:0.2f}'.format(average_precision))

In [None]:
x_train.isna().sum()

In [None]:
column_transformer = ColumnTransformer([
    ("windgustdir",OneHotEncoder(handle_unknown = 'ignore'), ["WindGustDir"]),
    ("winddir9am",OneHotEncoder(handle_unknown = 'ignore'), ["WindDir9am"]),
    ("winddir3pm",OneHotEncoder(handle_unknown = 'ignore'), ["WindDir3pm"]),
    ("loc",OneHotEncoder(), ["Location"]),], 
    remainder="passthrough",
)

In [None]:
column_transformer.fit(x_train)

In [None]:
x_train_full

In [None]:
classifier_dict = {}

for i, classifier in enumerate([
    make_pipeline(column_transformer, 
                  MaxAbsScaler(),
                  SimpleImputer(), 
                  RandomForestClassifier()
                 ),
    make_pipeline(
        column_transformer,
        MaxAbsScaler(),
        SimpleImputer(), 
        SelectFromModel(LinearSVC(penalty="l1", dual=False)),
        RandomForestClassifier()
    )
]):
    if classifier_dict.get(type(classifier).__name__) is None:
        start = time.time()
        classifier.fit(x_train_full, y_train_full)
        score = classifier.score(x_test_full, y_test_full)
        training_time = time.time() - start
        c_name = f"{type(classifier).__name__}_{i}"
        print(f"{type(classifier).__name__} ({training_time} seconds): {score}")
        classifier_dict[type(classifier).__name__] = {"classifier": pipeline, "score": score, "training_time": training_time}
    else:
        pipeline = classifier_dict[type(classifier).__name__]
    plot_confusion_matrix(classifier, x_test, y_test) 

pprint.pprint(classifier_dict)

In [None]:
classifier_dict = {}

for classifier in [
    LogisticRegression(),
    SGDClassifier(),
    Perceptron(),
    PassiveAggressiveClassifier(),
    #NuSVC(nu=0.2),
    LinearSVC(),
    #SVC(), Too slow (600+ seconds) and outperformed by LinearSVC anyways
    KNeighborsClassifier(),
    #RadiusNeighborsClassifier(radius=1.5), # Nothing found in radius, even after increase
    NearestCentroid(),
    #MultinomialNB(), Negative values?
    BernoulliNB(),
    DecisionTreeClassifier(),
    BaggingClassifier(),
    RandomForestClassifier(),
    #ExtraTreesClassifier(), Too slow, (400+ seconds) and no meaningful difference from Bagging/RandomForest
    AdaBoostClassifier(),
    MLPClassifier(),
]:
    if classifier_dict.get(type(classifier).__name__) is None:
        start = time.time()
        pipeline = make_pipeline(column_transformer, 
                        MaxAbsScaler(),
                        SimpleImputer(), 
                        classifier)
        pipeline.fit(x_train, y_train)
        score = pipeline.score(x_test, y_test)
        training_time = time.time() - start
        c_name = f"{type(classifier).__name__}_{i}"
        print(f"{type(classifier).__name__} ({training_time} seconds): {score}")
        classifier_dict[type(classifier).__name__] = {"classifier": pipeline, "score": score, "training_time": training_time}
    else:
        pipeline = classifier_dict[type(classifier).__name__]
    plot_confusion_matrix(pipeline, x_test, y_test) 

pprint.pprint(classifier_dict)

In [None]:
classifier_dict = {}

for i, classifier in enumerate([
    MLPClassifier(learning_rate="adaptive", learning_rate_init=0.01, max_iter=1000, random_state=0),
    MLPClassifier(hidden_layer_sizes=(100,100),  learning_rate="adaptive", learning_rate_init=0.01, max_iter=1000, random_state=0),
    MLPClassifier(hidden_layer_sizes=(50,50),  learning_rate="adaptive", learning_rate_init=0.01, max_iter=1000, random_state=0),
    MLPClassifier(hidden_layer_sizes=(30,30,30),  learning_rate="adaptive", learning_rate_init=0.01, max_iter=1000, random_state=0)
]):
    start = time.time()
    pipeline = make_pipeline(column_transformer, 
                    MaxAbsScaler(),
                    SimpleImputer(), 
                    classifier)
    pipeline.fit(x_train, y_train)
    score = pipeline.score(x_test, y_test)
    training_time = time.time() - start
    c_name = f"{type(classifier).__name__}_{i}"
    print(f"{c_name} ({training_time} seconds): {score}")
    classifier_dict[c_name] = {"classifier": pipeline, "score": score, "training_time": training_time}
    plot_confusion_matrix(pipeline, x_test, y_test) 

pprint.pprint(classifier_dict)

In [None]:
classifier_dict = {}

for i, classifier in enumerate([
    RandomForestClassifier(n_estimators=1000, min_samples_leaf=100, oob_score=True, class_weight="balanced"),
]):
    if classifier_dict.get(type(classifier).__name__) is None:
        start = time.time()
        pipeline = make_pipeline(column_transformer, 
                        MaxAbsScaler(),
                        SimpleImputer(), 
                        classifier)
        pipeline.fit(x_train, y_train)
        score = pipeline.score(x_test, y_test)
        training_time = time.time() - start
        c_name = f"{type(classifier).__name__}_{i}"
        print(f"{type(classifier).__name__} ({training_time} seconds): {score}")
        classifier_dict[type(classifier).__name__] = {"classifier": pipeline, "score": score, "training_time": training_time}
    else:
        pipeline = classifier_dict[type(classifier).__name__]
    plot_confusion_matrix(pipeline, x_test, y_test) 

pprint.pprint(classifier_dict)

In [None]:
disp = plot_precision_recall_curve(classifier_dict["RandomForestClassifier"]["classifier"], x_test, y_test)

* LogisticRegression

In [None]:
lrc = make_pipeline(column_transformer, LogisticRegression(random_state=0))
lrc.fit(x_train, y_train)

In [None]:
lrc.score(x_test, y_test)

* DecisionTreeClassifier

In [None]:
drc = make_pipeline(column_transformer, DecisionTreeClassifier(random_state=0))
drc.fit(x_train, y_train)

In [None]:
drc.score(x_test, y_test)

## Evaluation

## Conclusions



# Linear Model

## Linear Model Feature Engineering



## Linear Models

