In [1]:
import pandas as pd
import numpy as np
import pickle

from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split

from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score

from sklearn.metrics import f1_score, balanced_accuracy_score, precision_score, recall_score

import warnings


# Load pre-processed data

## Dataset 1: Biomed data on muscular dystropy


In [3]:
with open('data/biomed/preprocessed_biomed_data.pickle', 'rb') as handle:
    x_biomed_train, y_biomed_train, x_biomed_test, y_biomed_test = pickle.load(handle)


## Dataset 2: Fertility

In [4]:
with open('data/fertility/fertility_preprocessed.pickle', 'rb') as handle:
    x_fert_train, y_fert_train, x_fert_test, y_fert_test = pickle.load(handle)

## Dataset 3: Amazon Reviews

In [5]:
with open('data/reviews/preprocessed_reviews_data.pickle', 'rb') as handle:
    x_reviews_train, y_reviews_train, x_reviews_test, y_reviews_test = pickle.load(handle)

## Dataset 4: Congress

In [6]:
with open('data/congress/preprocessed_congress_data.pickle', 'rb') as handle:
    x_congress_train, y_congress_train, x_congress_test, y_congress_test = pickle.load(handle)

# Prediction

The following datasets ahve the following variables to use for the prediction:

- Biomed: `x_biomed_train`, `y_biomed_train`, `x_biomed_test`, `y_biomed_test`
- Fertility: `x_fert_train`, `y_fert_train`, `x_fert_test`, `y_fert_test`
- Reviews: `x_reviews_train`, `y_reviews_train`, `x_reviews_test`
- Congress: `x_congress_train`, `y_congress_train`, `x_congress_test`




## Classifier 3: Multilayer Perceptron

https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html#sklearn.neural_network.MLPClassifier

In [48]:

# parameters, as lists, so that I can change them more easily
activation_functions = ["logistic", "tanh", "relu"]
solvers = ["lbfgs", "sgd", "adam"]

# assigning my faetures and classes to generic names for easier switching
x_train = x_biomed_train
y_train = y_biomed_train

# more parameters
hidden_layers = (8,6,1)

# creating the model
mlp = MLPClassifier(hidden_layer_sizes = hidden_layers, 
                    max_iter=300, # epochs
                    activation = activation_functions[1], 
                    solver=solvers[-1], 
                    random_state=123 # kind of seed
                   )
# creating a pipeline
pipe = Pipeline(steps=[('mlpc', mlp)])
# training
pipe.fit(x_train, y_train)

# printing the score (its the accuracy, that can be changed somewhere)
print(pipe.score(x_biomed_test, y_biomed_test))

# making predictions
y_biomed_test_pred = pipe.predict(x_biomed_test)
# printing out the results of my prediction vs the actual class values
for i,x in enumerate(y_biomed_test):
    print(x, "-", y_biomed_test_pred[i])

0.8571428571428571
1 - 1
0 - 0
0 - 0
0 - 0
0 - 0
1 - 0
1 - 0
0 - 0
0 - 0
1 - 1
1 - 0
0 - 0
0 - 0
0 - 0
1 - 0
0 - 0
0 - 0
1 - 1
0 - 0
1 - 1
0 - 0
1 - 0
0 - 0
1 - 1
0 - 0
0 - 0
1 - 1
0 - 0
1 - 1
0 - 0
1 - 1
0 - 0
0 - 0
0 - 0
0 - 0
0 - 0
1 - 0
0 - 0
0 - 0
0 - 0
0 - 0
1 - 1




In [49]:
activation_functions = ["logistic", "tanh", "relu"]
solvers = ["lbfgs", "sgd", "adam"]
learning_rates = ["constant", "invscaling", "adaptive"]
learning_rate_init = 0.001 # default 0.001

x_train = x_biomed_train
y_train = y_biomed_train


hidden_layers = (8,6,1)

mlp = MLPClassifier(hidden_layer_sizes = hidden_layers, 
                    max_iter=300, # epochs
                    activation = activation_functions[1], 
                    solver=solvers[-1], 
                    random_state=1234 # kind of seed
                   )

# cross validating
# first param is the model incl its params, second is features, 
# third is class labels, cv is the k of k-fold-CV, scoring is the wanted scores
# https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter
print(cross_val_score(mlp, x_train, y_train, cv=5, scoring="recall"))
print(cross_val_score(mlp, x_train, y_train, cv=5, scoring="f1_weighted"))




[0.75       0.91666667 0.66666667 0.75       0.83333333]




[0.9082782  0.94117647 0.84271284 0.90562771 0.87878788]




### how many hidden neurons should we use?

https://stats.stackexchange.com/questions/181/how-to-choose-the-number-of-hidden-layers-and-nodes-in-a-feedforward-neural-netw

#### Suggestion (rule of thumb):

$\frac{N_s}{(\alpha*(N_i+N_o))}$

$N_i$ = number of input neurons

$N_o$ = number of output neurons

$N_s$ = number of samples in training data

$\alpha$ = scaling factor $\in [2,10]$



### MLP: biomed

In [51]:
# wie viele layers und wie groß?

mlp = MLPClassifier(hidden_layer_sizes = (2), 
                                        max_iter=500, # epochs
                                        activation = "relu", 
                                        solver="lbfgs", 
                                        #learning_rate = learning_rate,
                                        #learning_rate_init = learning_rate_init,
                                        random_state=1234 # kind of seed
                                       )
print(cross_val_score(mlp, x_train, y_train, cv=10, scoring="balanced_accuracy").mean())
print(cross_val_score(mlp, x_train, y_train, cv=10, scoring="f1_weighted").mean())
print(cross_val_score(mlp, x_train, y_train, cv=10, scoring="recall_weighted").mean())
print()

mlp = MLPClassifier(hidden_layer_sizes = (4), 
                                        max_iter=500, # epochs
                                        activation = "relu", 
                                        solver="lbfgs", 
                                        #learning_rate = learning_rate,
                                        #learning_rate_init = learning_rate_init,
                                        random_state=1234 # kind of seed
                                       )
print(cross_val_score(mlp, x_train, y_train, cv=10, scoring="balanced_accuracy").mean())
print(cross_val_score(mlp, x_train, y_train, cv=10, scoring="f1_weighted").mean())
print(cross_val_score(mlp, x_train, y_train, cv=10, scoring="recall_weighted").mean())
print()

mlp = MLPClassifier(hidden_layer_sizes = (6), 
                                        max_iter=500, # epochs
                                        activation = "relu", 
                                        solver="lbfgs", 
                                        #learning_rate = learning_rate,
                                        #learning_rate_init = learning_rate_init,
                                        random_state=1234 # kind of seed
                                       )
print(cross_val_score(mlp, x_train, y_train, cv=10, scoring="balanced_accuracy").mean())
print(cross_val_score(mlp, x_train, y_train, cv=10, scoring="f1_weighted").mean())
print(cross_val_score(mlp, x_train, y_train, cv=10, scoring="recall_weighted").mean())
print()

mlp = MLPClassifier(hidden_layer_sizes = (4, 2), 
                                        max_iter=500, # epochs
                                        activation = "relu", 
                                        solver="lbfgs", 
                                        #learning_rate = learning_rate,
                                        #learning_rate_init = learning_rate_init,
                                        random_state=1234 # kind of seed
                                       )
print(cross_val_score(mlp, x_train, y_train, cv=10, scoring="balanced_accuracy").mean())
print(cross_val_score(mlp, x_train, y_train, cv=10, scoring="f1_weighted").mean())
print(cross_val_score(mlp, x_train, y_train, cv=10, scoring="recall_weighted").mean())
print()

mlp = MLPClassifier(hidden_layer_sizes = (4, 3), 
                                        max_iter=500, # epochs
                                        activation = "relu", 
                                        solver="lbfgs", 
                                        #learning_rate = learning_rate,
                                        #learning_rate_init = learning_rate_init,
                                        random_state=1234 # kind of seed
                                       )
print(cross_val_score(mlp, x_train, y_train, cv=10, scoring="balanced_accuracy").mean())
print(cross_val_score(mlp, x_train, y_train, cv=10, scoring="f1_weighted").mean())
print(cross_val_score(mlp, x_train, y_train, cv=10, scoring="recall_weighted").mean())
print()


mlp = MLPClassifier(hidden_layer_sizes = (6, 2), 
                                        max_iter=500, # epochs
                                        activation = "logistic", 
                                        solver="lbfgs", 
                                        #learning_rate = learning_rate,
                                        #learning_rate_init = learning_rate_init,
                                        random_state=1234 # kind of seed
                                       )
print(cross_val_score(mlp, x_train, y_train, cv=10, scoring="balanced_accuracy").mean())
print(cross_val_score(mlp, x_train, y_train, cv=10, scoring="f1_weighted").mean())
print(cross_val_score(mlp, x_train, y_train, cv=10, scoring="recall_weighted").mean())
print()

mlp = MLPClassifier(hidden_layer_sizes = (6, 4), 
                                        max_iter=500, # epochs
                                        activation = "relu", 
                                        solver="lbfgs", 
                                        #learning_rate = learning_rate,
                                        #learning_rate_init = learning_rate_init,
                                        random_state=1234 # kind of seed
                                       )
print(cross_val_score(mlp, x_train, y_train, cv=10, scoring="balanced_accuracy").mean())
print(cross_val_score(mlp, x_train, y_train, cv=10, scoring="f1_weighted").mean())
print(cross_val_score(mlp, x_train, y_train, cv=10, scoring="recall_weighted").mean())
print()



0.8622727272727273
0.8774547773758974
0.8805147058823529

0.8577272727272728
0.8718309893991683
0.8746323529411765



ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


0.8756060606060606


ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


0.8899547773758973


ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


0.8930147058823529

0.8622727272727273
0.8774547773758974
0.8805147058823529

0.8506060606060606
0.8680187428222157
0.8746323529411765



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("

0.8639393939393939


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("

0.8831721620238244


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("

0.8871323529411764

0.8925757575757576
0.9078433107103183
0.9102941176470589



#### CV

In [52]:
from sklearn.model_selection import cross_validate

# performing cross-fold validation on the training data to evaluate best parameters

biomed_results = pd.DataFrame(columns=["classifier", "balanced_accuracy", 
                                       "f1_weighted", "recall", 
                                       "precision", "time_taken", "activation_function", 
                                       "solver", "learning_rate", "hidden_layer"])

# the scores we want
scoring = {'balanced_accuracy': 'balanced_accuracy',
           'f1_weighted': 'f1_weighted',
           'precision_weighted': 'precision_weighted',
           'recall_weighted': 'recall_weighted'}

# filter out warnings (not sure if this is a good idea)
warnings.filterwarnings('ignore')

activation_functions = ["logistic", "tanh", "relu"]
solvers = ["lbfgs", "sgd", "adam"]
learning_rates = ["constant", "invscaling", "adaptive"]

x_train = x_biomed_train
y_train = y_biomed_train

#hidden_layers = (8,6,1)
#hidden_layers = (100)
hidden_layers = [
    (2),
    (4),
    (6),
    (4, 2),
    (4, 4),
    (6, 2),
    (6, 4),
    (6, 4, 2),
]

# running index
i = 0

# cross-validation k
k = 10


# only do the computation, if the following flag is true
# it takes an hour or so to compute this, not necessary
if True:
    # iterate through all parameter permutations
    # save accuracy, f1, precisiona and recall
    for activation_function in activation_functions:
        for solver in solvers:
            for learning_rate in learning_rates:
                for hidden_layer in hidden_layers:
                    
                    # make a model
                    mlp = MLPClassifier(hidden_layer_sizes = hidden_layer, 
                                        max_iter=300, # epochs
                                        activation = activation_function, 
                                        solver=solver, 
                                        learning_rate = learning_rate,
                                        learning_rate_init = learning_rate_init,
                                        random_state=1234 # kind of seed
                                       )

                    # cross_validate() returns a dictionary with results
                    # cv = k for f-fold cross-validation
                    cv_results = cross_validate(mlp, x_train, y_train, cv=k, scoring=scoring, return_train_score=True)
                    
                    # extract the result values and take the mean of the k iterations
                    fit_time = cv_results["fit_time"].mean()
                    balanced_accuracy = cv_results["test_balanced_accuracy"].mean()
                    f1_weighted = cv_results["test_f1_weighted"].mean()
                    recall = cv_results["test_recall_weighted"].mean()
                    precision = cv_results["test_precision_weighted"].mean()
              
                    # print out the running number, the accuracy and the parameters
                    print(i, "- acc:", balanced_accuracy, "-, time:",  fit_time, end="\r")


                    # save everything
                    biomed_results = pd.concat([biomed_results, pd.DataFrame({
                        "classifier": "mlp",
                        "balanced_accuracy": balanced_accuracy,
                        "f1_weighted": [f1_weighted],
                        "recall": [recall],
                        "precision": [precision],
                        "time_taken": fit_time,
                        "activation_function": activation_function,
                        "solver": solver,
                        "learning_rate": learning_rate,
                        "hidden_layer": str(hidden_layer)                        
                    })], ignore_index=True)
                    i += 1;
    
    # saving the results as a pickle
    with open('data/biomed/biomed_results.pickle', 'wb') as handle:
        pickle.dump(biomed_results, handle)


In [53]:
biomed_results.to_csv("data/biomed/biomed_results.csv")

In [54]:
# loading the saved results
with open('data/biomed/biomed_results.pickle', 'rb') as handle:
    biomed_results = pickle.load(handle)

display(biomed_results.sort_values("balanced_accuracy", ascending=False).head(10))
display(biomed_results.sort_values("recall", ascending=False).head(10))

Unnamed: 0,classifier,balanced_accuracy,f1_weighted,recall,precision,time_taken,activation_function,solver,learning_rate,hidden_layer
158,mlp,0.892576,0.907843,0.910294,0.919991,0.085247,relu,lbfgs,invscaling,"(6, 4)"
166,mlp,0.892576,0.907843,0.910294,0.919991,0.082983,relu,lbfgs,adaptive,"(6, 4)"
150,mlp,0.892576,0.907843,0.910294,0.919991,0.084893,relu,lbfgs,constant,"(6, 4)"
23,mlp,0.890455,0.912551,0.916544,0.926143,0.151449,logistic,lbfgs,adaptive,"(6, 4, 2)"
7,mlp,0.890455,0.912551,0.916544,0.926143,0.148197,logistic,lbfgs,constant,"(6, 4, 2)"
15,mlp,0.890455,0.912551,0.916544,0.926143,0.153861,logistic,lbfgs,invscaling,"(6, 4, 2)"
5,mlp,0.886364,0.906814,0.911029,0.917648,0.11727,logistic,lbfgs,constant,"(6, 2)"
21,mlp,0.886364,0.906814,0.911029,0.917648,0.125265,logistic,lbfgs,adaptive,"(6, 2)"
13,mlp,0.886364,0.906814,0.911029,0.917648,0.121541,logistic,lbfgs,invscaling,"(6, 2)"
86,mlp,0.884848,0.896403,0.899632,0.917579,0.133092,tanh,lbfgs,invscaling,"(6, 4)"


Unnamed: 0,classifier,balanced_accuracy,f1_weighted,recall,precision,time_taken,activation_function,solver,learning_rate,hidden_layer
23,mlp,0.890455,0.912551,0.916544,0.926143,0.151449,logistic,lbfgs,adaptive,"(6, 4, 2)"
15,mlp,0.890455,0.912551,0.916544,0.926143,0.153861,logistic,lbfgs,invscaling,"(6, 4, 2)"
7,mlp,0.890455,0.912551,0.916544,0.926143,0.148197,logistic,lbfgs,constant,"(6, 4, 2)"
72,mlp,0.878788,0.905071,0.911029,0.924786,0.057864,tanh,lbfgs,constant,2
80,mlp,0.878788,0.905071,0.911029,0.924786,0.062135,tanh,lbfgs,invscaling,2
5,mlp,0.886364,0.906814,0.911029,0.917648,0.11727,logistic,lbfgs,constant,"(6, 2)"
88,mlp,0.878788,0.905071,0.911029,0.924786,0.059005,tanh,lbfgs,adaptive,2
21,mlp,0.886364,0.906814,0.911029,0.917648,0.125265,logistic,lbfgs,adaptive,"(6, 2)"
13,mlp,0.886364,0.906814,0.911029,0.917648,0.121541,logistic,lbfgs,invscaling,"(6, 2)"
166,mlp,0.892576,0.907843,0.910294,0.919991,0.082983,relu,lbfgs,adaptive,"(6, 4)"


In [55]:
# bad example
# this is the model with the highest recall
# obviously it go there by just saying everyone has muscular dystrophy lol
x_train = x_biomed_train
y_train = y_biomed_train
# make a model
mlp = MLPClassifier(hidden_layer_sizes = (4), 
                    max_iter=300, # epochs
                    activation = "logistic", 
                    solver="sgd", 
                    learning_rate = "invscaling",
                    random_state=1234 # kind of seed
                   )

print("f1:", cross_val_score(mlp, x_train, y_train, cv=10, scoring="f1_weighted").mean())
print("recall", cross_val_score(mlp, x_train, y_train, cv=10, scoring="recall").mean())

mlp.fit(x_train, y_train)

pd.crosstab(y_biomed_test, mlp.predict(x_biomed_test), colnames=["predicted"], rownames=["actual"])

f1: 0.19026389211811207
recall 1.0


predicted,1
actual,Unnamed: 1_level_1
1,15
0,27


In [56]:
# visualize

x_train = x_biomed_train
y_train = y_biomed_train

mlp = MLPClassifier(hidden_layer_sizes = (6, 4, 2), 
                    max_iter=500, # epochs
                    activation = "logistic", 
                    solver="lbfgs", 
                    verbose=True,
                     learning_rate_init=.001,

                    random_state=1234 # kind of seed
                   )
mlp.fit(x_train, y_train)

mlp.score(x_train, y_train)
mlp.loss_

#fig, axes = plt.subplots(2, 2, figsize=(15, 10))


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =           77     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  6.81197D-01    |proj g|=  1.17790D-01

At iterate    1    f=  6.52992D-01    |proj g|=  5.17792D-04

At iterate    2    f=  6.52991D-01    |proj g|=  3.08558D-04

At iterate    3    f=  6.52981D-01    |proj g|=  9.28675D-04

At iterate    4    f=  6.52962D-01    |proj g|=  2.30968D-03

At iterate    5    f=  6.52905D-01    |proj g|=  4.84661D-03

At iterate    6    f=  6.52726D-01    |proj g|=  9.13231D-03

At iterate    7    f=  6.49557D-01    |proj g|=  8.05513D-03

At iterate    8    f=  6.49516D-01    |proj g|=  2.27553D-02

At iterate    9    f=  6.38260D-01    |proj g|=  2.34352D-02

At iterate   10    f=  6.30874D-01    |proj g|=  6.08264D-02

At iterate   11    f=  5.20699D-01    |proj g|=  1.23179D-01

At iterate   12    f=  5.20131D-01    |proj g|=  1.30946D-01

At iterate   13    f=  4.1

 This problem is unconstrained.


-01    |proj g|=  2.84189D-03

At iterate   95    f=  1.82271D-01    |proj g|=  2.11433D-03

At iterate   96    f=  1.82211D-01    |proj g|=  5.37546D-03

At iterate   97    f=  1.82129D-01    |proj g|=  5.84879D-03

At iterate   98    f=  1.82103D-01    |proj g|=  1.12551D-02

At iterate   99    f=  1.81963D-01    |proj g|=  1.95619D-02

At iterate  100    f=  1.81941D-01    |proj g|=  4.25585D-03

At iterate  101    f=  1.81892D-01    |proj g|=  3.58800D-03

At iterate  102    f=  1.81832D-01    |proj g|=  3.57263D-03

At iterate  103    f=  1.81775D-01    |proj g|=  1.23247D-02

At iterate  104    f=  1.81651D-01    |proj g|=  4.34341D-03

At iterate  105    f=  1.81551D-01    |proj g|=  1.67673D-02

At iterate  106    f=  1.81506D-01    |proj g|=  6.12692D-03

At iterate  107    f=  1.81456D-01    |proj g|=  3.08793D-03

At iterate  108    f=  1.81432D-01    |proj g|=  3.09173D-03

At iterate  109    f=  1.81402D-01    |proj g|=  5.75400D-03

At iterate  110    f=  1.81363D-01    |

0.17488151710303415

#### best parameters & testing on holdout set
the disease is very rare, in fact, few patients in the sample even have it. We shouldd therefore aim to minimize false negatives, so we do not miss anyone having it. Therefore, we want to maximize recall.

in training, the learning rate never made a difference

In [57]:
x_train = x_biomed_train
y_train = y_biomed_train
x_test = x_biomed_test
y_test = y_biomed_test


print("best accuracy, 3d best recall")
mlp = MLPClassifier(hidden_layer_sizes = (6, 4), 
                    max_iter=300, # epochs
                    activation = "relu", 
                    solver="lbfgs", 
                    random_state=1234 # kind of seed
                   )
mlp.fit(x_train, y_train)
y_pred = mlp.predict(x_test)
#pd.crosstab(y_biomed_test, y_pred, colnames=["predicted"], rownames=["actual"])
print("(6, 4)", "relu", "lbfgs")
print("Accuracy balanced:", balanced_accuracy_score(y_test, y_pred))
print("Recall balanced:", recall_score(y_test, y_pred, average="weighted"))
print("------")


print("best recall, 2nd best accuracy")
mlp = MLPClassifier(hidden_layer_sizes = (6, 4, 2), 
                    max_iter=300, # epochs
                    activation = "logistic", 
                    solver="lbfgs", 
                    random_state=1234 # kind of seed
                   )
mlp.fit(x_train, y_train)
y_pred = mlp.predict(x_test)
#pd.crosstab(y_biomed_test, y_pred, colnames=["predicted"], rownames=["actual"])
print("(6, 4, 2)", "logistic", "lbfgs")
print("Accuracy balanced:", balanced_accuracy_score(y_test, y_pred))
print("Recall balanced:", recall_score(y_test, y_pred, average="weighted"))
print("------")


print("3d best accuracy")
mlp = MLPClassifier(hidden_layer_sizes = (6, 2), 
                    max_iter=300, # epochs
                    activation = "logistic", 
                    solver="lbfgs", 
                    random_state=1234 # kind of seed
                   )
mlp.fit(x_train, y_train)
y_pred = mlp.predict(x_test)
#pd.crosstab(y_biomed_test, y_pred, colnames=["predicted"], rownames=["actual"])
print("(6, 2)", "logistic", "lbfgs")
print("Accuracy balanced:", balanced_accuracy_score(y_test, y_pred))
print("Recall balanced:", recall_score(y_test, y_pred, average="weighted"))
print("------")


print("2nd best recall")
mlp = MLPClassifier(hidden_layer_sizes = (2), 
                    max_iter=300, # epochs
                    activation = "tanh", 
                    solver="lbfgs", 
                    random_state=1234 # kind of seed
                   )
mlp.fit(x_train, y_train)
y_pred = mlp.predict(x_test)
#pd.crosstab(y_biomed_test, y_pred, colnames=["predicted"], rownames=["actual"])
print("(2)", "tanh", "lbfgs")
print("Accuracy balanced:", balanced_accuracy_score(y_test, y_pred))
print("Recall balanced:", recall_score(y_test, y_pred, average="weighted"))
print("------")


best accuracy, 3d best recall

At iterate  305    f=  1.75671D-01    |proj g|=  1.89624D-03

At iterate  306    f=  1.75630D-01    |proj g|=  4.08641D-03

At iterate  307    f=  1.75555D-01    |proj g|=  8.62967D-04

At iterate  308    f=  1.75533D-01    |proj g|=  1.99795D-03

At iterate  309    f=  1.75485D-01    |proj g|=  3.35670D-03

At iterate  310    f=  1.75444D-01    |proj g|=  3.13307D-03

At iterate  311    f=  1.75397D-01    |proj g|=  3.56955D-03

At iterate  312    f=  1.75298D-01    |proj g|=  8.51299D-04

At iterate  313    f=  1.75258D-01    |proj g|=  3.25341D-03

At iterate  314    f=  1.75218D-01    |proj g|=  5.23578D-04

At iterate  315    f=  1.75203D-01    |proj g|=  3.81703D-04

At iterate  316    f=  1.75195D-01    |proj g|=  8.03955D-04

At iterate  317    f=  1.75190D-01    |proj g|=  5.94270D-04

At iterate  318    f=  1.75181D-01    |proj g|=  7.11230D-04

At iterate  319    f=  1.75161D-01    |proj g|=  4.23975D-04

At iterate  320    f=  1.75130D-01    |

In [58]:
# other good example
x_train = x_biomed_train
y_train = y_biomed_train
# make a model
mlp = MLPClassifier(hidden_layer_sizes = (4), 
                    max_iter=300, # epochs
                    activation = "tanh", 
                    solver="sgd", 
                    learning_rate = "invscaling",
                    random_state=1234 # kind of seed
                   )
print("f1:", cross_val_score(mlp, x_train, y_train, cv=10, scoring="f1_weighted").mean())
print("recall", cross_val_score(mlp, x_train, y_train, cv=10, scoring="recall").mean())

mlp.fit(x_train, y_train)
print(np.array([y_biomed_test, mlp.predict(x_biomed_test)]))

print()
# confusion matrix
pd.crosstab(y_biomed_test, mlp.predict(x_biomed_test), colnames=["predicted"], rownames=["actual"])

f1: 0.5388797219292576
recall 0.9333333333333333
[[1 0 0 0 0 1 1 0 0 1 1 0 0 0 1 0 0 1 0 1 0 1 0 1 0 0 1 0 1 0 1 0 0 0 0 0
  1 0 0 0 0 1]
 [1 0 1 1 1 1 1 0 1 1 1 0 1 0 1 0 1 1 1 1 1 1 1 1 0 0 1 0 1 1 1 0 1 0 0 0
  1 1 1 1 0 1]]



predicted,0,1
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0,15
0,13,14


### MLP: fertility
class labels: 0 is "normal", 1 is "altered"

intuition for evaluation: it's maybe better to falsely inform people that they are fertile, even though they may not be, than the other way around, telling potentially fertile people they aren't fertile. People failing while trying for a baby is common and typically less of a big deal than people accidentally getting pregnant believing they are unable to and also not wanting to. In other words, unwanted pregnancy with our model at fault would be the bigger issue than giving false hopes. On the other hand, the more realistic setting of this model in action is that it would advise anyone, where fertility could potentially be altered to go see a physician, so the model would not ever be blamed for an unwanted pregnancy anyways.
"Normal fertility" corresponds with 0 in our class labels, therefore avoiding mistakingly classifying a patient as having "altered fertility" means avoiding false positives, therefore we aim for high precision over recall. 


In [59]:
activation_functions = ["logistic", "tanh", "relu"]
solvers = ["lbfgs", "sgd", "adam"]
learning_rates = ["constant", "invscaling", "adaptive"]
learning_rate_init = 0.001 # default 0.001

x_train = x_fert_train
y_train = y_fert_train


hidden_layers = (8,6,1)
hidden_layers = (21)


mlp = MLPClassifier(hidden_layer_sizes = hidden_layers, 
                    max_iter=300, # epochs
                    activation = activation_functions[1], 
                    solver=solvers[-1], 
                    random_state=1234 # kind of seed
                   )

# cross validating
# first param is the model incl its params, second is features, 
# third is class labels, cv is the k of k-fold-CV, scoring is the wanted scores
# https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter
print(cross_val_score(mlp, x_train, y_train, cv=5, scoring="accuracy"))
print(cross_val_score(mlp, x_train, y_train, cv=5, scoring="f1_weighted"))


[0.9375 0.875  0.875  0.8125 0.75  ]
[0.92816092 0.81666667 0.81666667 0.78448276 0.75      ]


#### CV

In [60]:
# testing params for fertility
fertility_results = pd.DataFrame(columns=["classifier", "balanced_accuracy", 
                                       "f1_weighted", "recall", 
                                       "precision", "time_taken", "activation_function", 
                                       "solver", "learning_rate", 
                                       "hidden_layer"])
# the scores we want
scoring = {'balanced_accuracy': 'balanced_accuracy',
           'f1_weighted': 'f1_weighted',
           'precision_weighted': 'precision_weighted',
           'recall_weighted': 'recall_weighted'}

# performing cross-fold validation on the training data to evaluate best parameters

# filter out warnings (not sure if this is a good idea)
warnings.filterwarnings('ignore')

activation_functions = ["logistic", "tanh", "relu"]
solvers = ["lbfgs", "sgd", "adam"]
learning_rates = ["constant", "invscaling", "adaptive"]

x_train = x_fert_train
y_train = y_fert_train

hidden_layers = [
    (2),
    (10),
    (21),
    (10, 5),
    (10, 10),
    (21, 2),
    (21, 10),
    (21, 10, 2),
]

# cross-validation
k = 5

# running number to count iterations/permutations
i = 0


# only do the computation, if the following flag is true
# it takes an hour or so to compute this, not necessary
if True:
    # iterate through all parameter permutations
    # save accuracy, f1, precisiona and recall
    for activation_function in activation_functions:
        for solver in solvers:
            for learning_rate in learning_rates:
                for hidden_layer in hidden_layers:
                    
                    # make a model
                    mlp = MLPClassifier(hidden_layer_sizes = hidden_layer, 
                                        max_iter=300, # epochs
                                        activation = activation_function, 
                                        solver=solver, 
                                        learning_rate = learning_rate,
                                        learning_rate_init = learning_rate_init,
                                        random_state=1234 # kind of seed
                                       )

                    # cross_validate() returns a dictionary with results
                    # cv = k for f-fold cross-validation
                    cv_results = cross_validate(mlp, x_train, y_train, cv=k, scoring=scoring, return_train_score=True)
                    
                    # extract the result values and take the mean of the 5 iterations
                    fit_time = cv_results["fit_time"].mean()
                    balanced_accuracy = cv_results["test_balanced_accuracy"].mean()
                    f1_weighted = cv_results["test_f1_weighted"].mean()
                    recall = cv_results["test_recall_weighted"].mean()
                    precision = cv_results["test_precision_weighted"].mean()
                    
                    # print out the running number, the accuracy and the parameters
                    print(i, "- acc:", balanced_accuracy, "-, time:",  fit_time, end="\r")
                    
                    # save everything
                    fertility_results = pd.concat([fertility_results, pd.DataFrame({
                        "classifier": "mlp",
                        "balanced_accuracy": [balanced_accuracy],
                        "f1_weighted": [f1_weighted],
                        "recall": [recall],
                        "precision": [precision],
                        "time_taken": fit_time,
                        "activation_function": activation_function,
                        "solver": solver,
                        "learning_rate": learning_rate,
                        "hidden_layer": str(hidden_layer)                        
                    })], ignore_index=True)
                    i += 1;
 
    display(fertility_results)
    
    # saving the results as a pickle
    with open('data/fertility/fertility_results_unscaled.pickle', 'wb') as f:
        pickle.dump(fertility_results, f)


In [61]:
fertility_results[fertility_results.hidden_layer=="2"]

Unnamed: 0,classifier,balanced_accuracy,f1_weighted,recall,precision,time_taken,activation_function,solver,learning_rate,hidden_layer


First training run mit k=10

Wow, the results so far are pretty bad. F1 scores are looking ok-ish, but accuracy never even reaches 0.7. Precision is quite low, meaning we get heaps of false positives. Upon crying over these results, we remembered that the training data only contains 80 samples, so doing 10-fold cross-validation might have been a mistake. Now rerunning with k=5.

In [62]:
with open('data/fertility/fertility_results_k5.pickle', 'rb') as f:
        fertility_results = pickle.load(f)

display(fertility_results.sort_values("balanced_accuracy", ascending=False).head(10))
display(fertility_results.sort_values("f1_weighted", ascending=False).head(10))

Unnamed: 0,classifier,balanced_accuracy,f1_weighted,recall,precision,time_taken,activation_function,solver,learning_rate,hidden_layer
32,mlp,0.7,0.747264,0.7,0.858598,0.011627,logistic,sgd,invscaling,2
110,mlp,0.635714,0.652808,0.5875,0.838694,0.115512,tanh,sgd,invscaling,"(21, 10)"
0,mlp,0.6,0.817196,0.825,0.81633,0.039305,logistic,lbfgs,constant,2
8,mlp,0.6,0.817196,0.825,0.81633,0.027684,logistic,lbfgs,invscaling,2
16,mlp,0.6,0.817196,0.825,0.81633,0.027929,logistic,lbfgs,adaptive,2
19,mlp,0.592857,0.807928,0.8125,0.813754,0.035992,logistic,lbfgs,adaptive,"(10, 5)"
11,mlp,0.592857,0.807928,0.8125,0.813754,0.042255,logistic,lbfgs,invscaling,"(10, 5)"
3,mlp,0.592857,0.807928,0.8125,0.813754,0.037799,logistic,lbfgs,constant,"(10, 5)"
179,mlp,0.585714,0.278273,0.275,0.893571,0.079978,relu,sgd,invscaling,"(10, 5)"
199,mlp,0.571429,0.830862,0.85,0.818125,0.127086,relu,adam,constant,"(21, 10, 2)"


Unnamed: 0,classifier,balanced_accuracy,f1_weighted,recall,precision,time_taken,activation_function,solver,learning_rate,hidden_layer
210,mlp,0.55,0.838966,0.8875,0.800833,0.099973,relu,adam,adaptive,21
202,mlp,0.55,0.838966,0.8875,0.800833,0.100424,relu,adam,invscaling,21
194,mlp,0.55,0.838966,0.8875,0.800833,0.098712,relu,adam,constant,21
215,mlp,0.571429,0.830862,0.85,0.818125,0.125675,relu,adam,adaptive,"(21, 10, 2)"
142,mlp,0.571429,0.830862,0.85,0.818125,0.127235,tanh,adam,adaptive,"(21, 10)"
207,mlp,0.571429,0.830862,0.85,0.818125,0.128002,relu,adam,invscaling,"(21, 10, 2)"
199,mlp,0.571429,0.830862,0.85,0.818125,0.127086,relu,adam,constant,"(21, 10, 2)"
126,mlp,0.571429,0.830862,0.85,0.818125,0.125987,tanh,adam,constant,"(21, 10)"
127,mlp,0.571429,0.830862,0.85,0.818125,0.146762,tanh,adam,constant,"(21, 10, 2)"
134,mlp,0.571429,0.830862,0.85,0.818125,0.129492,tanh,adam,invscaling,"(21, 10)"


accuracy is still isnt ascending over 0.7, but we got way higher precision this time


#### best params & testing
here the learning rate DID make a difference



In [63]:
x_train = x_fert_train
y_train = y_fert_train
x_test = x_fert_test
y_test = y_fert_test

print("best accuracy")
mlp = MLPClassifier(hidden_layer_sizes = (2), 
                    max_iter=300, # epochs
                    activation = "logistic", 
                    solver="sgd", 
                    learning_rate="invscaling",
                    random_state=1234 # kind of seed
                   )
mlp.fit(x_train, y_train)
y_pred = mlp.predict(x_test)
#pd.crosstab(y_biomed_test, y_pred, colnames=["predicted"], rownames=["actual"])
print("(2)", "logistic", "sgd", "invscaling learning rate")
print("Accuracy balanced:", balanced_accuracy_score(y_test, y_pred))
print("F1 balanced:", f1_score(y_test, y_pred, average="weighted"))
print("Precision balanced:", precision_score(y_test, y_pred, average="weighted"))
print("------")


print("best f1 weighted and best precision")
mlp = MLPClassifier(hidden_layer_sizes = (21), 
                    max_iter=300, # epochs
                    activation = "relu", 
                    solver="adam", 
                  random_state=1234 # kind of seed
                   )
mlp.fit(x_train, y_train)
y_pred = mlp.predict(x_test)
#pd.crosstab(y_biomed_test, y_pred, colnames=["predicted"], rownames=["actual"])
print("(21)", "relu", "adam")
print("Accuracy balanced:", balanced_accuracy_score(y_test, y_pred))
print("F1 balanced:", f1_score(y_test, y_pred, average="weighted"))
print("Precision balanced:", precision_score(y_test, y_pred, average="weighted"))
print("------")





best accuracy
(2) logistic sgd invscaling learning rate
Accuracy balanced: 0.5833333333333333
F1 balanced: 0.7189964157706094
Precision balanced: 0.845054945054945
------
best f1 weighted and best precision
(21) relu adam
Accuracy balanced: 0.4444444444444444
F1 balanced: 0.8
Precision balanced: 0.8
------


### MLP: reviews
class labels are not binary, so we cannot use f1, recall or precision but rather have to rely on weighted accuracy

following our rules of thumb, we should use few hidden layers with nodes between 50 and 10,000


In [64]:
x_train

Unnamed: 0,age,child_diseases,accident,surgery,hours_sitting,fall,spring,summer,winter,fever_never,fever_not_recent,fever_recent,smoking_daily,smoking_never,smoking_occasionally,alcohol_daily,alcohol_rarely_or_never,alcohol_several_daily,alcohol_several_weekly,alcohol_weekly
35,0.78,0,0,1,0.38,0,0,0,1,1,0,0,0,1,0,0,0,0,1,0
13,0.81,0,1,1,0.38,1,0,0,0,0,1,0,0,1,0,0,1,0,0,0
21,0.75,0,1,1,0.25,1,0,0,0,0,1,0,0,0,1,0,0,0,1,0
26,0.67,0,1,0,0.38,1,0,0,0,0,1,0,0,1,0,0,0,0,1,0
50,0.67,0,1,0,0.19,0,1,0,0,0,1,0,0,1,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7,1.00,0,0,0,0.38,0,1,0,0,0,1,0,0,1,0,0,0,0,1,0
10,0.67,0,0,1,0.31,1,0,0,0,0,0,1,0,0,1,0,0,0,0,1
45,0.53,0,1,1,0.44,0,0,0,1,1,0,0,0,0,1,0,1,0,0,0
17,0.69,0,1,0,0.25,1,0,0,0,0,1,0,0,1,0,0,0,0,0,1


In [65]:
# grid search

# testing params for reviews
reviews_results = pd.DataFrame(columns=["classifier", "balanced_accuracy", 
                                       "time_taken", "activation_function", 
                                       "solver", "learning_rate", 
                                       "hidden_layer", "iterations"])
# the scores we want
# no point in checking for precision or recall
# arguably, one may be worse than the other, but not for the sake of the research :D
scoring = {'balanced_accuracy': 'balanced_accuracy'}

# performing cross-fold validation on the training data to evaluate best parameters

# filter out warnings (not sure if this is a good idea)
warnings.filterwarnings('ignore')

activation_function = "logistic"
solvers = ["lbfgs", "sgd", "adam"]
learning_rates = ["constant"]

x_train = x_reviews_train
y_train = y_reviews_train





hidden_layers = [
    (50, 50, 50),
    (200, 100, 50),
    (1000, 1000),
    (1000, 1000, 50),
    (2000, 500, 50),
#    (5000),
#    (1000),
#    (10000),
]

# cross-validation
k = 10

# running number to count iterations/permutations
i = 0


# only do the computation, if the following flag is true
# it takes an hour or so to compute this, not necessary
if True:
    # iterate through all parameter permutations
    # save accuracy, f1, precisiona and recall
    for max_iter in [500, 800]:
        for solver in solvers:
            for learning_rate in learning_rates:
                for hidden_layer in hidden_layers:
                    
                    # make a model
                    mlp = MLPClassifier(hidden_layer_sizes = hidden_layer, 
                                        max_iter=max_iter, # epochs
                                        activation = activation_function, 
                                        solver=solver, 
                                        verbose=True,
                                        learning_rate = learning_rate,
                                        learning_rate_init = learning_rate_init,
                                        random_state=1234 # kind of seed
                                       )

                    # cross_validate() returns a dictionary with results
                    # cv = k for f-fold cross-validation
                    cv_results = cross_validate(mlp, x_train, y_train, cv=k, scoring=scoring, return_train_score=True)
                    
                    # extract the result values and take the mean of the k iterations
                    fit_time = cv_results["fit_time"].mean()
                    balanced_accuracy = cv_results["test_balanced_accuracy"].mean()
                    
                    # print out the running number, the accuracy and the parameters
                    print(i, "- acc:", balanced_accuracy, "-, time:",  fit_time, end="\r")
                    
                    # save everything
                    reviews_results = pd.concat([reviews_results, pd.DataFrame({
                        "classifier": "mlp",
                        "balanced_accuracy": [balanced_accuracy],
                        "time_taken": fit_time,
                        "activation_function": activation_function,
                        "solver": solver,
                        "iterations": max_iter,
                        "learning_rate": learning_rate,
                        "hidden_layer": str(hidden_layer)                        
                    })], ignore_index=True)
                    i += 1;
 
    display(reviews_results)
    
    # saving the results as a pickle
    with open(f'data/reviews/reviews_results_k{k}_4.pickle', 'wb') as f:
        pickle.dump(reviews_results, f)


In [66]:
# loading the saved results
with open('data/reviews/reviews_results_k10.pickle', 'rb') as handle:
    reviews_results_1 = pickle.load(handle)
display(reviews_results_1.sort_values("balanced_accuracy", ascending=False))

# loading the saved results
with open('data/reviews/reviews_results_k10_2.pickle', 'rb') as handle:
    reviews_results_2 = pickle.load(handle)
display(reviews_results_2.sort_values("balanced_accuracy", ascending=False))

# loading the saved results
with open('data/reviews/reviews_results_k10_3.pickle', 'rb') as handle:
    reviews_results_3 = pickle.load(handle)
display(reviews_results_3.sort_values("balanced_accuracy", ascending=False))

# loading the saved results
with open('data/reviews/reviews_results_k10_4.pickle', 'rb') as handle:
    reviews_results_4 = pickle.load(handle)
display(reviews_results_4.sort_values("balanced_accuracy", ascending=False))

Unnamed: 0,classifier,balanced_accuracy,time_taken,activation_function,solver,learning_rate,hidden_layer
1,mlp,0.740408,101.882982,logistic,adam,constant,1000
3,mlp,0.686245,67.672742,tanh,adam,constant,1000
5,mlp,0.657619,69.126942,relu,adam,constant,1000
0,mlp,0.61183,63.347642,logistic,adam,constant,50
2,mlp,0.50581,42.704505,tanh,adam,constant,50
4,mlp,0.25149,43.051773,relu,adam,constant,50


Unnamed: 0,classifier,balanced_accuracy,time_taken,activation_function,solver,learning_rate,hidden_layer
2,mlp,0.61183,71.385762,logistic,adam,constant,50
5,mlp,0.61183,84.956902,logistic,adam,constant,50
4,mlp,0.497667,204.181449,logistic,sgd,constant,50
1,mlp,0.454939,152.37984,logistic,sgd,constant,50
0,mlp,0.164449,132.227634,logistic,lbfgs,constant,50
3,mlp,0.161449,165.89905,logistic,lbfgs,constant,50


Unnamed: 0,classifier,balanced_accuracy,time_taken,activation_function,solver,learning_rate,hidden_layer,iterations
0,mlp,0.688497,166.714588,logistic,adam,constant,"(1000, 1000)",500
1,mlp,0.688497,150.525065,logistic,adam,constant,"(1000, 1000)",800


Unnamed: 0,classifier,balanced_accuracy,time_taken,activation_function,solver,learning_rate,hidden_layer,iterations
2,mlp,0.688497,126.050256,logistic,adam,constant,"(1000, 1000)",500
7,mlp,0.688497,145.618657,logistic,adam,constant,"(1000, 1000)",800
4,mlp,0.463415,1628.143984,logistic,adam,constant,"(2000, 500, 50)",500
9,mlp,0.463415,1611.926593,logistic,adam,constant,"(2000, 500, 50)",800
3,mlp,0.42315,576.500537,logistic,adam,constant,"(1000, 1000, 50)",500
8,mlp,0.42315,581.137089,logistic,adam,constant,"(1000, 1000, 50)",800
1,mlp,0.348694,207.914831,logistic,adam,constant,"(200, 100, 50)",500
6,mlp,0.348694,227.487648,logistic,adam,constant,"(200, 100, 50)",800
0,mlp,0.266429,105.945277,logistic,adam,constant,"(50, 50, 50)",500
5,mlp,0.261408,131.797761,logistic,adam,constant,"(50, 50, 50)",800


#### what are the best params??

In [67]:
# save results for kaggle
x_train = x_reviews_train
y_train = y_reviews_train


x_test = x_reviews_test
y_test = y_fert_test

print("model 1")
mlp = MLPClassifier(hidden_layer_sizes = (50), 
                    max_iter=500, # epochs
                    activation = "logistic", # acc to sklearn logistic is best for big datasets
                    solver="adam", 
                    random_state=1234 # kind of seed
                   )
mlp.fit(x_train, y_train)
y_pred = pd.DataFrame({"ID": y_test, "Class": mlp.predict(x_test)})
y_pred.to_csv("data/reviews/reviews_pred_mlp1.csv", index=False)
print("model 1: logistic, adam, (50)")



print("model 2")
mlp = MLPClassifier(hidden_layer_sizes = (500), 
                    max_iter=500, # epochs
                    activation = "logistic", # acc to sklearn logistic is best for big datasets
                    solver="adam", 
                    random_state=1234 # kind of seed
                   )
mlp.fit(x_train, y_train)
y_pred = pd.DataFrame({"ID": y_test, "Class": mlp.predict(x_test)})
y_pred.to_csv("data/reviews/reviews_pred_mlp2.csv", index=False)
print("model 2: logistic, adam, (500)")
print("model 3")
mlp = MLPClassifier(hidden_layer_sizes = (5000), 
                    max_iter=500, # epochs
                    activation = "logistic", # acc to sklearn logistic is best for big datasets
                    solver="adam", 
                    verbose=True,
                    random_state=1234 # kind of seed
                   )
mlp.fit(x_train, y_train)
y_pred = pd.DataFrame({"ID": y_test, "Class": mlp.predict(x_test)})
y_pred.to_csv("data/reviews/reviews_pred_mlp3.csv", index=False)
print("model 3: logistic, adam, (5000)")
print("model 4")
mlp = MLPClassifier(hidden_layer_sizes = (1000, 1000), 
                    max_iter=500, # epochs
                    activation = "logistic", # acc to sklearn logistic is best for big datasets
                    solver="adam", 
                    #verbose=True,
                    random_state=1234 # kind of seed
                   )
mlp.fit(x_train, y_train)
y_pred = pd.DataFrame({"ID": y_test, "Class": mlp.predict(x_test)})
y_pred.to_csv("data/reviews/reviews_pred_mlp4.csv", index=False)
print("model 4: logistic, adam, (1000, 1000)")
print("model 5")
mlp = MLPClassifier(hidden_layer_sizes = (2000, 500, 50), 
                    max_iter=500, # epochs
                    activation = "logistic", # acc to sklearn logistic is best for big datasets
                    solver="adam", 
                    #verbose=True,
                    random_state=1234 # kind of seed
                   )
mlp.fit(x_train, y_train)
y_pred = pd.DataFrame({"ID": y_test, "Class": mlp.predict(x_test)})
y_pred.to_csv("data/reviews/reviews_pred_mlp5.csv", index=False)
print("model 5: logistic, adam, (2000, 500, 50)")

model 1


### MLP: congress
we go from 16 features to 2 labels

rule of thumb suggests to use anything between 2 and 6 hidden layers

In [68]:
# recoding the labels to be 0 or 1
# 0 is democrat
# 1 is repulican
y_congress_train.replace({"democrat": 0, "republican": 1}, inplace=True)
# this way I can reuse my classification code from earlier

In [69]:
display(x_congress_train)
display(y_congress_train)
display(x_congress_test)


Unnamed: 0,handicapped-infants,water-project-cost-sharing,adoption-of-the-budget-resolution,physician-fee-freeze,el-salvador-aid,religious-groups-in-schools,anti-satellite-test-ban,aid-to-nicaraguan-contras,mx-missile,immigration,synfuels-crporation-cutback,education-spending,superfund-right-to-sue,crime,duty-free-exports,export-administration-act-south-africa
0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0
1,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0
2,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0
3,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0
4,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
213,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0
214,0.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,0.5,1.0
215,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0
216,1.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0


100    0
215    1
139    1
178    0
15     1
      ..
106    0
14     0
92     1
179    0
102    1
Name: class, Length: 218, dtype: int64

Unnamed: 0,handicapped-infants,water-project-cost-sharing,adoption-of-the-budget-resolution,physician-fee-freeze,el-salvador-aid,religious-groups-in-schools,anti-satellite-test-ban,aid-to-nicaraguan-contras,mx-missile,immigration,synfuels-crporation-cutback,education-spending,superfund-right-to-sue,crime,duty-free-exports,export-administration-act-south-africa
0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0
1,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0
2,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0
3,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0
4,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
212,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0
213,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0
214,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
215,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0


In [70]:
activation_functions = ["logistic", "tanh", "relu"]
solvers = ["lbfgs", "sgd", "adam"]
learning_rates = ["constant", "invscaling", "adaptive"]
learning_rate_init = 0.001 # default 0.001

x_train = x_congress_train
y_train = y_congress_train


hidden_layers = (16) # best so far (16, 16, 2)

mlp = MLPClassifier(hidden_layer_sizes = hidden_layers, 
                    max_iter=500, # epochs
                    activation = activation_functions[1], 
                    solver=solvers[0], 
                    # verbose=True, # show progress
                    random_state=1234 # kind of seed
                   )

# cross validating
# first param is the model incl its params, second is features, 
# third is class labels, cv is the k of k-fold-CV, scoring is the wanted scores
# https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter
print(cross_val_score(mlp, x_train, y_train, cv=5, scoring="accuracy").mean())



0.9495771670190274


In [71]:
# grid search

# testing params for congress
congress_results = pd.DataFrame(columns=["classifier", "balanced_accuracy", 
                                       "time_taken", "activation_function", 
                                       "solver", "learning_rate", 
                                       "hidden_layer"])
# the scores we want
# no point in checking for precision or recall
# arguably, one may be worse than the other, but not for the sake of the research :D
scoring = {'balanced_accuracy': 'balanced_accuracy'}

# performing cross-fold validation on the training data to evaluate best parameters

# filter out warnings (not sure if this is a good idea)
warnings.filterwarnings('ignore')

activation_functions = ["logistic", "tanh", "relu"]
solvers = ["lbfgs", "sgd", "adam"]
learning_rates = ["constant", "invscaling", "adaptive"]

x_train = x_congress_train
y_train = y_congress_train




hidden_layers = [
    (2),
    (8),
    (16),
    (2, 2),
    (8, 2),
    (16, 8),
    (16, 8, 2),
    (16, 16, 8),
    (16, 16, 2),
    (16, 16, 16, 2),
    (16, 16, 16, 16)
]

# cross-validation
k = 10

# running number to count iterations/permutations
i = 0


# only do the computation, if the following flag is true
# it takes an hour or so to compute this, not necessary
if True:
    # iterate through all parameter permutations
    # save accuracy, f1, precisiona and recall
    for activation_function in activation_functions:
        for solver in solvers:
            for learning_rate in learning_rates:
                for hidden_layer in hidden_layers:
                    
                    # make a model
                    mlp = MLPClassifier(hidden_layer_sizes = hidden_layer, 
                                        max_iter=300, # epochs
                                        activation = activation_function, 
                                        solver=solver, 
                                        learning_rate = learning_rate,
                                        learning_rate_init = learning_rate_init,
                                        random_state=1234 # kind of seed
                                       )

                    # cross_validate() returns a dictionary with results
                    # cv = k for f-fold cross-validation
                    cv_results = cross_validate(mlp, x_train, y_train, cv=k, scoring=scoring, return_train_score=True)
                    
                    # extract the result values and take the mean of the k iterations
                    fit_time = cv_results["fit_time"].mean()
                    balanced_accuracy = cv_results["test_balanced_accuracy"].mean()
                    
                    # print out the running number, the accuracy and the parameters
                    print(i, "- acc:", balanced_accuracy, "-, time:",  fit_time, end="\r")
                    
                    # save everything
                    congress_results = pd.concat([congress_results, pd.DataFrame({
                        "classifier": "mlp",
                        "balanced_accuracy": [balanced_accuracy],
                        "time_taken": fit_time,
                        "activation_function": activation_function,
                        "solver": solver,
                        "learning_rate": learning_rate,
                        "hidden_layer": str(hidden_layer)                        
                    })], ignore_index=True)
                    i += 1;
 
    display(congress_results)
    
    # saving the results as a pickle
    with open('data/congress/congress_results_k10.pickle', 'wb') as f:
        pickle.dump(congress_results, f)


In [72]:
# loading the saved results
with open('data/congress/congress_results_k10.pickle', 'rb') as handle:
    congress_results = pickle.load(handle)

display(congress_results.sort_values("balanced_accuracy", ascending=False).head(30))

Unnamed: 0,classifier,balanced_accuracy,time_taken,activation_function,solver,learning_rate,hidden_layer
10,mlp,0.975893,0.047264,logistic,lbfgs,constant,"(16, 16, 16, 16)"
32,mlp,0.975893,0.084928,logistic,lbfgs,adaptive,"(16, 16, 16, 16)"
21,mlp,0.975893,0.044197,logistic,lbfgs,invscaling,"(16, 16, 16, 16)"
16,mlp,0.973214,0.014908,logistic,lbfgs,invscaling,"(16, 8)"
5,mlp,0.973214,0.014848,logistic,lbfgs,constant,"(16, 8)"
27,mlp,0.973214,0.014618,logistic,lbfgs,adaptive,"(16, 8)"
24,mlp,0.969643,0.008914,logistic,lbfgs,adaptive,16
2,mlp,0.969643,0.009134,logistic,lbfgs,constant,16
13,mlp,0.969643,0.009614,logistic,lbfgs,invscaling,16
112,mlp,0.962225,0.014998,tanh,lbfgs,invscaling,16


#### best params & testing
learning_rate apparently makes no difference



In [4]:
x_train = x_congress_train
y_train = y_congress_train


x_test = x_congress_test
y_test = y_fert_test

print("best accuracy")
mlp = MLPClassifier(hidden_layer_sizes = (16, 16, 16, 16), 
                    max_iter=300, # epochs
                    activation = "logistic", 
                    solver="lbfgs", 
                    random_state=1234 # kind of seed
                   )
mlp.fit(x_train, y_train)
y_pred = pd.DataFrame({"ID": x_test, "class": mlp.predict(x_test)})
y_pred["class"] = y_pred["class"].apply(lambda x: "republican" if x else "democrat")
y_pred.to_csv("data/congress/congress_pred_mlp1.csv", index=False)
print("model 1: logistic, lbfgs, (16, 16, 16, 16)")




print("2nd best accuracy")
mlp = MLPClassifier(hidden_layer_sizes = (16, 8), 
                    max_iter=300, # epochs
                    activation = "logistic", 
                    solver="lbfgs", 
                    random_state=1234 # kind of seed
                   )
mlp.fit(x_train, y_train)
y_pred = pd.DataFrame({"ID": x_test, "class": mlp.predict(x_test)})
y_pred["class"] = y_pred["class"].apply(lambda x: "republican" if x else "democrat")
y_pred.to_csv("data/congress/congress_pred_mlp2.csv", index=False)
print("model 2: logistic, lbfgs, (16, 8)")





print("3rd best accuracy")
mlp = MLPClassifier(hidden_layer_sizes = (16), 
                    max_iter=300, # epochs
                    activation = "logistic", 
                    solver="lbfgs", 
                    random_state=1234 # kind of seed
                   )
mlp.fit(x_train, y_train)
y_pred = pd.DataFrame({"ID": x_congress_test_ID, "class": mlp.predict(x_test)})
y_pred["class"] = y_pred["class"].apply(lambda x: "republican" if x else "democrat")
y_pred.to_csv("data/congress/congress_pred_mlp3.csv", index=False)
print("model 3: logistic, lbfgs, (16)")






print("4th best accuracy")
mlp = MLPClassifier(hidden_layer_sizes = (16), 
                    max_iter=300, # epochs
                    activation = "tanh", 
                    solver="lbfgs", 
                    random_state=1234 # kind of seed
                   )
mlp.fit(x_train, y_train)
y_pred = pd.DataFrame({"ID": x_congress_test_ID, "class": mlp.predict(x_test)})
y_pred["class"] = y_pred["class"].apply(lambda x: "republican" if x else "democrat")
y_pred.to_csv("data/congress/congress_pred_mlp4.csv", index=False)
print("model 4: tanh, lbfgs, (16)")




print("5th best accuracy")
mlp = MLPClassifier(hidden_layer_sizes = (16, 8, 2), 
                    max_iter=300, # epochs
                    activation = "relu", 
                    solver="adam", 
                    random_state=1234 # kind of seed
                   )
mlp.fit(x_train, y_train)
y_pred = pd.DataFrame({"ID": x_test, "class": mlp.predict(x_test)})
y_pred["class"] = y_pred["class"].apply(lambda x: "republican" if x else "democrat")
y_pred.to_csv("data/congress/congress_pred_mlp5.csv", index=False)
print("model 5: relu, adam, (16, 8, 2)")




NameError: name 'x_congress_train' is not defined

In [74]:
x_congress_test

Unnamed: 0,handicapped-infants,water-project-cost-sharing,adoption-of-the-budget-resolution,physician-fee-freeze,el-salvador-aid,religious-groups-in-schools,anti-satellite-test-ban,aid-to-nicaraguan-contras,mx-missile,immigration,synfuels-crporation-cutback,education-spending,superfund-right-to-sue,crime,duty-free-exports,export-administration-act-south-africa
0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0
1,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0
2,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0
3,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,,0.0,1.0,
4,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
212,,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,,0.0,0.0,1.0,
213,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0
214,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
215,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,


In [75]:
x_train = x_congress_train
y_train = y_congress_train


x_test = x_congress_test
y_test = y_fert_test


mlp = MLPClassifier(hidden_layer_sizes = (8), 
                    max_iter=500, # epochs
                    activation = "logistic", 
                    solver="lbfgs", 
                  #  verbose=True,
                    random_state=1234 # kind of seed
                   )
mlp.fit(x_train, y_train)
#y_pred = pd.DataFrame({"ID": x_congress_test_ID, "class": mlp.predict(x_test)})
#y_pred["class"] = y_pred["class"].apply(lambda x: "republican" if x else "democrat")
#y_pred.to_csv("data/congress/congress_pred_mlp5.csv", index=False)
mlp.loss_


0.000594898348559961