In [None]:
import csv
import os
import re

import numpy as np
import matplotlib.pyplot as plt

from skopt import BayesSearchCV
from skopt.space import Real

from sklearn.svm import SVC
from sklearn.datasets import load_files
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import FunctionTransformer, MinMaxScaler
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score, plot_confusion_matrix

### Loading training and test sets as sklearn bunch objects

In [None]:
# Loads, in this order, the training-set, the cross-genre test-set and the
# same-genre test-set. Every folder is read with load_files, decoding the
# documents as UTF-8.
train_dataset, test_dataset_cross, test_dataset_same = (
    load_files(folder, encoding="utf-8")
    for folder in (
        "CoarseTask/training_set/",
        "CoarseTask/test_cross",
        "CoarseTask/test_same",
    )
)

### Defining the feature space

In [None]:
def get_document_length(dataset):
    """ Counts the whitespace-delimited word tokens of each document.

    Parameters
    ----------
    dataset : iterable of str
        The documents to measure.

    Returns
    -------
    numpy.ndarray of shape (n_documents, 1)
        One row per document, holding its token count.
    """
    # str.split() without an argument splits on any run of whitespace (spaces,
    # tabs, newlines) and never yields empty strings. split(" ") would count
    # consecutive spaces as extra tokens, would not split on newlines or tabs,
    # and would report one token for an empty document.
    token_counts = [len(doc.split()) for doc in dataset]
    # dtype=int keeps an integer column even when `dataset` is empty.
    return np.array(token_counts, dtype=int).reshape(-1, 1)

# Feature space: case-sensitive TF-IDF over word 1-2-grams and character
# 3-5-grams, plus the min-max-scaled document length.
word_ngrams = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), lowercase=False)
char_ngrams = TfidfVectorizer(analyzer='char', ngram_range=(3, 5), lowercase=False)

# document length is computed by get_document_length and then rescaled
scaled_document_len = Pipeline(steps=[
    ("transf", FunctionTransformer(get_document_length, validate=False)),
    ("scaler", MinMaxScaler()),
])

# the three feature blocks are combined side by side
combined_features = FeatureUnion(
    transformer_list=[
        ("tfidf_word", word_ngrams),
        ("tfidf_char", char_ngrams),
        ("document_len", scaled_document_len),
    ],
    n_jobs=-1,
)

### Tuning hyper-parameters

In [None]:
# Holds out 20% of the training data for validation. The split is stratified on
# the labels so the validation set keeps the class proportions of the training
# data, consistent with the stratified folds used by the search below.
X_train, X_val, y_train, y_val = train_test_split(
    train_dataset.data,
    train_dataset.target,
    test_size=0.2,
    stratify=train_dataset.target,
    random_state=42,
)

pipeline = Pipeline([
    ("features", combined_features),
    ("clf", SVC(kernel='linear', decision_function_shape='ovo', tol=1e-12, class_weight='balanced', random_state=42))
])

# hyper-parameters search space (every dimension is sampled with a log-uniform prior)
parameters = {
    "features__tfidf_word__max_df": Real(.3, .9, prior='log-uniform'),
    "features__tfidf_word__min_df": Real(.001, .1, prior='log-uniform'),

    "features__tfidf_char__max_df": Real(.3, .9, prior='log-uniform'),
    "features__tfidf_char__min_df": Real(.001, .1, prior='log-uniform'),

    "clf__C": Real(1e-6, 100.0, prior='log-uniform')
}

stratified_kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# random_state seeds the optimizer as well, so that the whole search (and not
# only the data split, the folds and the classifier) is reproducible.
bayes_search = BayesSearchCV(
    pipeline,
    parameters,
    cv=stratified_kf,
    n_iter=30,
    n_jobs=-1,
    verbose=1,
    scoring="f1_macro",
    random_state=42,
)
bayes_search.fit(X_train, y_train)

# prints optimized hyper-parameters
best_parameters = bayes_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    # three significant digits: rounding to three decimals would print small
    # values (the C range starts at 1e-6) as 0.0
    print('{:30} {:.3g}'.format(param_name, best_parameters[param_name]))

# predicts labels for the validation set
tuned_clf = bayes_search.best_estimator_
val_prediction = tuned_clf.predict(X_val)

# prints relevant statistics and plots the confusion matrix
print('\n{:30}{}'.format('accuracy:', round((accuracy_score(y_val, val_prediction)), 2)))
print('{:30}{}'.format('macro-averaged precision:', round(precision_score(y_val, val_prediction, average='macro'), 2)))
print('{:30}{}'.format('macro-averaged recall:', round(recall_score(y_val, val_prediction, average='macro'), 2)))
print('{:30}{}'.format('macro-averaged f1 score:', round(f1_score(y_val, val_prediction, average='macro'), 2)))

fig, ax = plt.subplots(figsize=(10, 10))
# NOTE(review): these labels are presumably listed in the same order as
# train_dataset.target_names - TODO confirm.
labels = ["1901-1918", "1919-1926", "1927-1942", "1943-1947", "1948-1954"]
# NOTE(review): plot_confusion_matrix is deprecated in recent scikit-learn
# releases in favour of ConfusionMatrixDisplay.from_estimator - confirm against
# the installed version before upgrading.
plot_confusion_matrix(tuned_clf, X_val, y_val, display_labels=labels, cmap=plt.cm.Blues, ax=ax)
plt.show()

### Predicting labels for same-genre and cross-genre test-sets

In [None]:
# documents and gold labels of the two test-sets (cross-genre, then same-genre)
X_test_cross_genre, y_test_cross_genre = test_dataset_cross.data, test_dataset_cross.target
X_test_same_genre, y_test_same_genre = test_dataset_same.data, test_dataset_same.target

# from here on X_train / y_train refer to the whole training set
X_train, y_train = train_dataset.data, train_dataset.target

# the tuned classifier is fitted again, this time on every training document
tuned_clf.fit(X_train, y_train)

# predicted labels for both test-sets
predicted_same = tuned_clf.predict(X_test_same_genre)
predicted_cross = tuned_clf.predict(X_test_cross_genre)

### Storing predictions as tsv files

In [None]:
def _write_predictions(out_path, filenames, predictions):
    """ Stores predictions as a tab-separated file, one row per document.

    Each row holds the document's file name (without its directory part) and
    the label 'Class<k>', where k is the predicted class index plus one.

    Parameters
    ----------
    out_path : str
        Path of the tsv file to create (an existing file is overwritten).
    filenames : iterable of str
        Paths of the classified documents.
    predictions : iterable of int
        Predicted class indices, aligned with `filenames`.
    """
    # newline='' leaves line endings to the csv module; without it every row
    # would end in '\r\r\n' on Windows. The encoding is set explicitly so that
    # the output does not depend on the platform's default.
    with open(out_path, 'wt', newline='', encoding='utf-8') as out_file:
        tsv_writer = csv.writer(out_file, delimiter='\t')
        for name, pred in zip(filenames, predictions):
            # basename follows the platform's path rules, so the directory part
            # is also removed when the path contains back-slashes (Windows)
            tsv_writer.writerow([os.path.basename(name), f'Class{pred+1}'])


# cross-genre predictions
_write_predictions('2_matteo-brv_2.tsv', test_dataset_cross.filenames, predicted_cross)

# same-genre predictions
_write_predictions('1_matteo-brv_2.tsv', test_dataset_same.filenames, predicted_same)