# In [1]:  (notebook cell prompt left over from the export)
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn.compose as sc
import sklearn.preprocessing as sp
import sklearn.linear_model as sl
import sklearn.metrics as sm
import sklearn.pipeline as spipe
import sklearn.model_selection as sms
import tensorflow.keras.models as km
import tensorflow.keras.layers as kl
import tensorflow.keras.wrappers.scikit_learn as kw

## Seed NumPy's global random number generator before anything else runs
np.random.seed(42)

## Training inputs and targets, read with np.genfromtxt's default settings
X, t = (np.genfromtxt(path) for path in ("data/trainX.dat", "data/traint.dat"))

## Data frame copy of the inputs plus a 'target' column, for wrangling and plotting
df = pd.DataFrame(X).assign(target=t)

## Summary statistics (only displayed when run interactively / in a notebook)
df.describe()

## Column indices treated as continuous; every other index below 19 is
## treated as categorical.
quant_cols = [1, 4, 12, 19]
qual_cols = tuple(col for col in range(20) if col not in quant_cols)

## Optional exploratory plot of the continuous columns, coloured by target
#sns.pairplot(df, vars = quant_cols, hue = 'target');

## Preprocessing: standardise the continuous columns and one-hot encode each
## categorical column over a fixed, explicitly listed range of integer codes.
## Each row of the table below is (column index, first code, one past the last
## code), i.e. the encoder for that column uses np.arange(first, stop).
## NOTE(review): scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to
## `sparse_output`; confirm which scikit-learn version this script targets.
trans = sc.make_column_transformer(
    (sp.StandardScaler(), quant_cols),
    *(
        (sp.OneHotEncoder(categories=[np.arange(first, stop)], sparse=False),
         [column])
        for column, first, stop in (
            (0, 1, 5),
            (2, 0, 5),
            (3, 0, 11),
            (5, 1, 6),
            (6, 1, 6),
            (7, 1, 5),
            (8, 1, 5),
            (9, 1, 4),
            (10, 1, 5),
            (11, 1, 5),
            (13, 1, 4),
            (14, 1, 4),
            (15, 1, 5),
            (16, 1, 5),
            (17, 1, 3),
            (18, 1, 3),
        )
    )
)
    
## Baseline classifier: the column transformer followed by a liblinear
## logistic regression.
model = spipe.Pipeline(steps = [('recode', trans),
                                ('classify', sl.LogisticRegression(solver = "liblinear"))])
## GridSearch to find regularizer and strength via CV
grid = sms.GridSearchCV(model, {'classify__penalty': ["l1", "l2"], 'classify__C': [1, 10, 100, 1000]}, cv = 10)

## Fit the grid search on the training data; `fit` is the value returned by
## grid.fit and is used below for prediction.
fit = grid.fit(X, t)
## Selected hyper-parameters (only displayed when run interactively)
fit.best_params_

## Predict class probabilities once (the exported notebook repeated this call
## with nothing changing in between).
prob = fit.predict_proba(X)
## Flag a sample when the probability in the first column drops below 1/6.
## This matches the 5:1 weighting used by loss(): flagging costs 5 * p0 in
## expectation, not flagging costs 1 - p0, and 5 * p0 < 1 - p0 iff p0 < 1/6.
pred = prob[:, 0] < 1 / 6.

## Always look at the confusion matrix!
cm = sm.confusion_matrix(t, pred)

def loss(cm):
    """Return the asymmetric misclassification cost of a 2x2 confusion matrix.

    The upper-right entry ``cm[0, 1]`` is weighted five times as heavily as
    the lower-left entry ``cm[1, 0]``; the diagonal does not contribute.
    For matrices produced by ``sklearn.metrics.confusion_matrix`` (rows are
    true classes, columns are predicted classes), as used in this file,
    ``cm[0, 1]`` counts samples of the first class predicted as the second.

    ``cm`` must support two-dimensional indexing such as ``cm[0, 1]``.
    """
    upper_right = cm[0, 1]
    lower_left = cm[1, 0]
    return 5. * upper_right + lower_left

## Report the confusion matrix followed by its cost, one per line
print(cm, loss(cm), sep="\n")

## Fit the column transformer on the training inputs to learn the size of the
## transformed data: N rows and D columns (D is read by nn_model below)
N, D = np.shape(trans.fit_transform(X))

def nn_model():
    """Build and compile the feed-forward network used as the Keras classifier.

    The network takes ``D`` input features (``D`` is the module-level column
    count of the transformed data) and stacks:
    Dense(100, relu) -> Dropout(0.4) -> Dense(10, relu) -> Dense(1, sigmoid).
    It is compiled with binary cross-entropy loss, the Adam optimizer and
    accuracy as the reported metric.

    Returns the compiled ``Sequential`` model.
    """
    layers = [
        kl.Dense(100, input_dim=D, activation='relu'),
        kl.Dropout(rate=0.4),
        kl.Dense(10, activation='relu'),
        kl.Dense(1, activation='sigmoid'),
    ]
    network = km.Sequential(layers)
    network.compile(loss='binary_crossentropy',
                    optimizer='adam',
                    metrics=['accuracy'])
    return network

## Same preprocessing as the logistic model, with the Keras network as the
## final classifier step
nn = spipe.Pipeline(
    steps=[
        ('preprocess', trans),
        ('net', kw.KerasClassifier(build_fn=nn_model,
                                   epochs=30,
                                   batch_size=32,
                                   verbose=1)),
    ]
)
nn_fit = nn.fit(X, t)

## Training-set evaluation: flag a sample when the probability in the first
## column is below 1/6, then report the confusion matrix and its cost
nn_prob = nn.predict_proba(X)
nn_pred = nn_prob[:, 0] < 1 / 6.
nn_cm = sm.confusion_matrix(t, nn_pred)
print(nn_cm, loss(nn_cm), sep="\n")

## Test-set evaluation with the same threshold and the same report
X_test, t_test = (np.genfromtxt(path) for path in ("data/testX.dat", "data/testt.dat"))
nn_pred_test = nn.predict_proba(X_test)[:, 0] < 1 / 6.
nn_cm_test = sm.confusion_matrix(t_test, nn_pred_test)
print(nn_cm_test, loss(nn_cm_test), sep="\n")

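# Captured notebook output (not part of the program):
#   'Matplotlib is building the font cache using fc-list. '
#
# ModuleNotFoundError: No module named 'sklearn.compose'
# (sklearn.compose was added in scikit-learn 0.20, so this error usually means
#  an older scikit-learn is installed.)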