### 5-fold Cross Validation

In [41]:
from math import floor
from statistics import mean
from numpy import array
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV

# Function to scale input vectors
def scale(scaler, input_vector):
    scaler.fit(input_vector)
    scaled_features = scaler.transform(input_vector)
    return scaled_features

# Function to calculate number of nodes in hidden layer of MLP
def calc_node_num(input_len, output_len):
    return floor(mean([input_len, output_len]))

# Function to construct params dictionary to pass to cross validation
def construct_params(node_num):
    return {'hidden_layer_sizes': [(node_num,)], 
            'activation': ['logistic', 'tanh', 'relu'],
            'solver': ['lbfgs', 'sgd']}

# Function to run 5-fold cross validation (with MLP) to determine optimal hyperparameters for MLP classifer
def cross_validation(x_features, y_labels, params):
    clf = GridSearchCV(MLPClassifier(), params, cv=3, # Fix this with actual dataset
                       scoring='accuracy')
    clf.fit(x_features, y_labels)
    return clf.best_params_

# Return model with given params
def get_model(params):
    return MLPClassifier(hidden_layer_sizes=params['hidden_layer_sizes'], 
                         activation=params['activation'], 
                         solver=params['solver'])

# Train model on entire training set
def train_model(model, x_vector, y_vector):
    model.fit(x_vector, y_vector)
    return model

# Steps for age and time period classifiers
# (0) Scale input/output data (?)
scaler = StandardScaler()
age_features = [[1, 2, 3, 4, 5], [6, 7, 8, 9, 10], [1, 2, 3, 4, 5], [1, 2, 3, 4, 5], [1, 2, 3, 4, 5], [6, 7, 8, 9, 10]]
age_labels = [1, 2, 2, 1, 1, 2]
scaled_age_features = scale(scaler, age_features)
# print(scaled_age_features)

time_period_features = [[1, 2, 3, 4, 5], [6, 7, 8, 9, 10], [6, 7, 8, 9, 10], [6, 7, 8, 9, 10], [6, 7, 8, 9, 10], [6, 7, 8, 9, 10]]
time_period_labels = [1, 2, 1, 2, 1, 2]
scaled_time_period_features = scale(scaler, time_period_features)
# print(scaled_time_period_features)

# (1) Perform cross validation with all chosen parameters
age_node_num = calc_node_num(len(age_features[0]), len(age_labels))
age_params = construct_params(age_node_num)
best_age_params = cross_validation(scaled_age_features, age_labels, age_params)

tp_node_num = calc_node_num(len(time_period_features[0]), len(time_period_labels))
time_period_params = construct_params(tp_node_num)
best_tp_params = cross_validation(scaled_time_period_features, time_period_labels, time_period_params)

# (2) After determining optimal params based on accuracy, retrain model with those params
age_model = get_model(best_age_params)
trained_age_model = train_model(age_model, scaled_age_features, age_labels)

tp_model = get_model(best_tp_params)
trained_tp_model = train_model(tp_model, scaled_time_period_features, time_period_labels)




MLPClassifier(activation='logistic', alpha=0.0001, batch_size='auto',
       beta_1=0.9, beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(5,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='lbfgs', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)




### Metrics

In [47]:
# Function to predict labels for given test set and return classification report of precision, recall and F1
def predict_values(model, test_features, test_labels):
    predictions = model.predict(test_features)
    print(classification_report(test_labels, predictions))

# (3) Split training data into training/test set, use test set to return precision, recall, and f1 scores (?)
# test_age_features = [[1, 2, 3, 4, 5]]
# test_age_labels = [[2]]
# scaled_test_age_features = scale(scaler, test_age_features)
# predict_values(trained_age_model, test_age_features, test_age_labels)

# test_tp_features = [[6, 7, 8, 9, 10]]
# test_tp_labels = [[2]]
# scaled_test_tp_features = scale(scaler, test_tp_features)

[2]
              precision    recall  f1-score   support

           2       1.00      1.00      1.00         1

   micro avg       1.00      1.00      1.00         1
   macro avg       1.00      1.00      1.00         1
weighted avg       1.00      1.00      1.00         1



### Prediction

In [None]:
# (4) Use trained models to predict age and time period of selected authors/novels
# test_age_features = [[1, 2, 3, 4, 5]]
# test_age_labels = [[1]]
# scaled_test_age_features = scale(scaler, test_age_features)

# test_tp_features = [[6, 7, 8, 9, 10]]
# test_tp_labels = [[2]]
# scaled_test_tp_features = scale(scaler, test_tp_features)