In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch
from torch import nn
from torch.utils.data import DataLoader

import sklearn
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('universal_tagset')

from scripts.feature_extraction import load_features, perform_feature_extraction, save_features
from scripts.network import Pan2425Dataset, AuthorIdentificationNetwork

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/maxneerken/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /Users/maxneerken/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     /Users/maxneerken/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


## Data retrieval

In [2]:
# Read the dev/train/test CSV files once and keep them together, keyed by split name.
data = dict(
    dev=pd.read_csv('data/pan2425_dev_data.csv'),
    train=pd.read_csv('data/pan2425_train_data.csv'),
    test=pd.read_csv('data/pan2425_test_data.csv'),
)

## Feature extraction

In [3]:
# Set to True to recompute the features from the raw data and overwrite the
# cached copies under ./data; leave False to load the cached copies instead.
extract = False

# Assigns the indices 0-19 to twenty integer ids (presumably author id -> class
# label; verify against perform_feature_extraction).
author_mapping = {560480: 0, 512464: 1, 2750536: 2, 1112924: 3, 29783: 4, 910821: 5, 1497577: 6, 748687: 7,
                  1220273: 8, 870118: 9, 1276465: 10, 2943978: 11, 806976: 12, 2855986: 13, 6234395: 14,
                  240213: 15, 583064: 16, 583994: 17, 967934: 18, 3439302: 19}

if extract:
    train_features, train_labels = perform_feature_extraction(data['train'], author_mapping)
    save_features(train_features, train_labels, './data', 'train')

    # Validation features come from the 'dev' split and are cached under 'val',
    # the name the load branch below reads back. Extracting data['train'] again
    # and saving it as 'train' would overwrite the train cache and leave 'val'
    # stale or missing.
    val_features, val_labels = perform_feature_extraction(data['dev'], author_mapping)
    save_features(val_features, val_labels, './data', 'val')

    test_features, test_labels = perform_feature_extraction(data['test'], author_mapping)
    save_features(test_features, test_labels, './data', 'test')
else:
    train_features, train_labels = load_features('./data', 'train')
    val_features, val_labels = load_features('./data', 'val')
    test_features, test_labels = load_features('./data', 'test')

## Model training and evaluation

In [61]:
from scripts.network import train, plot_training

# NOTE(review): the recorded output of this cell is an ImportError for `train`;
# confirm that scripts/network.py exports `train` before relying on this cell.

# One constant for the epoch count so the training run and the plot stay in sync.
N_EPOCHS = 100

# Labels are flattened before being wrapped in the dataset.
train_set = Pan2425Dataset(train_features, train_labels.flatten())
val_set = Pan2425Dataset(val_features, val_labels.flatten())

# Only the training data is shuffled; validation order is kept fixed.
train_loader = DataLoader(train_set, shuffle=True, batch_size=64)
val_loader = DataLoader(val_set, shuffle=False, batch_size=32)

net = AuthorIdentificationNetwork()
train_loss, val_loss, train_acc, val_acc = train(net, train_loader, val_loader, epochs=N_EPOCHS)
plot_training(train_loss, val_loss, train_acc, val_acc, epochs=N_EPOCHS)

ImportError: cannot import name 'train' from 'scripts.network' (/Users/maxneerken/Documents/Text & Multimedia Mining/Assignment 4/scripts/network.py)

In [46]:

def evaluate(net: nn.Module, test_loader: DataLoader):
    """Compute macro-averaged recall, precision and F1 of ``net`` on ``test_loader``.

    The model is switched to evaluation mode for the duration of the call and
    its previous train/eval mode is restored afterwards, so layers such as
    dropout or batch-norm behave deterministically while scoring.

    Args:
        net: Model whose output carries one score per class along dim 1; the
            predicted class is the argmax over that dimension.
        test_loader: Yields ``(x, y)`` batches, where ``y`` holds the true
            class labels (shape ``(batch,)`` or ``(batch, 1)``).

    Returns:
        Tuple ``(recall, precision, f1)``, each macro-averaged.
    """
    # Imported here because a bare `import sklearn` does not guarantee that the
    # `sklearn.metrics` submodule has been loaded.
    from sklearn.metrics import f1_score, precision_score, recall_score

    labels = []
    predictions = []

    was_training = net.training
    net.eval()
    try:
        with torch.no_grad():
            for x, y in test_loader:
                # Call the module itself (not .forward) so registered hooks run.
                output = net(x)
                prediction = torch.argmax(output, dim=1)

                predictions.extend(prediction.detach().cpu().numpy())
                labels.extend(y.detach().cpu().numpy())
    finally:
        # Leave the model in whatever mode the caller had it in.
        net.train(was_training)

    # ravel() collapses (N, 1) label columns so labels line up with the 1-D predictions.
    labels = np.array(labels).ravel()
    predictions = np.array(predictions)

    recall = recall_score(labels, predictions, average='macro')
    precision = precision_score(labels, predictions, average='macro')
    f1 = f1_score(labels, predictions, average='macro')

    return recall, precision, f1


In [50]:
# Flatten the labels the same way the training cell does for the train/val
# sets, so every dataset is built from labels of the same shape.
test_set = Pan2425Dataset(test_features, test_labels.flatten())
test_loader = DataLoader(test_set, shuffle=False, batch_size=32)

# Score the trained network on the held-out test split.
model_recall, model_precision, model_f_score = evaluate(net, test_loader=test_loader)

print(f'Recall: {round(model_recall, 2)}')
print(f'Precision: {round(model_precision, 2)}')
print(f'F-Score: {round(model_f_score, 2)}')

Recall: 0.67
Precision: 0.71
F-Score: 0.66


## Ablation analysis

In [None]:
# Number of input features; each one is removed in turn and a fresh model is
# trained on the remaining n_features - 1.
n_features = 175
ablation_results = []

# Labels are flattened the same way the training cell does for its datasets.
train_set = Pan2425Dataset(train_features, train_labels.flatten())
val_set = Pan2425Dataset(val_features, val_labels.flatten())
test_set = Pan2425Dataset(test_features, test_labels.flatten())

# NOTE(review): `train` is imported in the training cell; that cell must have
# run successfully before this one.
for i in range(n_features):
    print(f'Ablating feature [{i+1}/{n_features}]')

    # Ablate feature i in every split.
    # NOTE(review): this relies on remove_feature() returning the dataset — confirm.
    train_set = train_set.remove_feature(i)
    val_set = val_set.remove_feature(i)
    test_set = test_set.remove_feature(i)

    train_loader = DataLoader(train_set, shuffle=True, batch_size=64)
    val_loader = DataLoader(val_set, shuffle=False, batch_size=64)
    test_loader = DataLoader(test_set, shuffle=False, batch_size=64)

    # Fit a fresh model; its input width is derived from n_features so the two
    # cannot drift apart.
    model = AuthorIdentificationNetwork(input_features=n_features - 1)
    train(model, train_loader, val_loader, epochs=40)

    # Test model: evaluate() returns (recall, precision, f-score).
    metrics = evaluate(model, test_loader)
    ablation_results.append(metrics)

    # Restore the ablated feature before the next iteration.
    # NOTE(review): this relies on restore_feature() returning the dataset — confirm.
    train_set = train_set.restore_feature()
    val_set = val_set.restore_feature()
    test_set = test_set.restore_feature()

# One row per ablated feature, in feature-index order.
ablation_df = pd.DataFrame(ablation_results, columns=['Recall', 'Precision', 'F-Score'])
ablation_df.to_csv('Ablation_results.csv', index=False)

In [None]:
# Three independent, initially empty lists of feature indices (one per feature group).
symbols_indices, alphabets_indices, pos_indices = [], [], []

In [58]:
# Upper-case tag labels, one per line (they look like Universal POS tags — TODO confirm).
chars = [
    'ADJ',
    'ADP',
    'ADV',
    'AUX',
    'CCONJ',
    'DET',
    'INTJ',
    'NOUN',
    'NUM',
    'PART',
    'PRON',
    'PROPN',
    'PUNCT',
    'SCONJ',
    'SYM',
    'VERB',
    'X',
]
# Displayed as the cell output: the number of tags.
len(chars)

17

In [51]:
import plotly.express as px

# Rebuild the results frame from the ablation loop's (recall, precision, f-score) tuples.
ablation_df = pd.DataFrame(ablation_results, columns=['Recall', 'Precision', 'F-Score'])
f_scores = ablation_df['F-Score'].to_numpy()
n_ablated = len(f_scores)

# The baseline is `model_f_score`, the full-feature model's F-score measured in
# the test-evaluation cell. It is deliberately NOT overwritten with a hard-coded
# placeholder here, which would shift every bar by the placeholder's error.
# NOTE(review): the baseline model is trained for 100 epochs while each ablated
# model is trained for 40, so the differences also include a training-budget effect.
f_score_diff = f_scores - model_f_score

# One row per removed feature; the long name is shown in the hover tooltip.
plotly_df = pd.DataFrame({
    'Feature': [f"F{i+1}" for i in range(n_ablated)],  # Shorter feature labels
    'F-Score Difference': f_score_diff,
    'Detailed Feature Name': [f"Feature {i+1}" for i in range(n_ablated)]  # For tooltips
})

# Interactive bar chart: one bar per removed feature, coloured by the change.
fig = px.bar(
    plotly_df,
    x='Feature',
    y='F-Score Difference',
    title="Effect of Feature Removal on F-Score",
    labels={'Feature': 'Removed Feature', 'F-Score Difference': 'F-Score change'},
    color='F-Score Difference',
    color_continuous_scale='RdYlGn',
    hover_data=['Detailed Feature Name'],  # actually surface the tooltip column
)

# Horizontal line at zero: bars above it improved on the baseline, bars below it got worse.
fig.add_hline(
    y=0,
    line_dash="solid",
    line_color="black",
    annotation_text=f"Baseline F-Score: {model_f_score:.2f}",
    annotation_position="top left"
)

# Reduce x-axis label clutter
fig.update_layout(
    xaxis=dict(
        tickmode='array',
        tickvals=list(range(0, n_ablated, 10)),  # Show every 10th label
        ticktext=[f"F{i+1}" for i in range(0, n_ablated, 10)],  # Corresponding shorter labels
        title='Removed Feature'
    ),
    yaxis=dict(
        title='Change in F-Score'
    ),
    height=600,  # Adjust height for better visualization
    margin=dict(l=40, r=40, t=40, b=100)
)

# Show the plot
fig.show()
