# Droyßen Authorship-Attribution

## Importe

In [1]:
import os

import sklearn.feature_extraction.text as text
import sklearn.preprocessing as preprocessing

import sklearn.model_selection as model_selection
import sklearn.metrics as metrics

import scipy.spatial.distance as scidist
import sklearn.decomposition
import scipy.cluster.hierarchy as hierarchy

import matplotlib.pyplot as plt
import numpy as np

%matplotlib inline

## Preprocessing

### Helferfunktion zum Laden der Daten

In [2]:
def load_directory(directory, max_length):
    """Load all ``.txt`` files of a directory as fixed-size text segments.

    Every file is lower-cased, split on whitespace and cut into consecutive,
    non-overlapping segments of exactly ``max_length`` tokens. A trailing
    remainder shorter than ``max_length`` is discarded.

    File names are expected to look like ``<author>_<title>.txt``: the
    author label is the first character of the file name and the title is
    the part between the first and second underscore.

    Arguments:
        directory: path of the directory to scan (not recursive).
        max_length: number of tokens per segment; must be positive.

    Returns:
        tuple: three parallel lists ``(documents, authors, titles)``; each
            document is a space-joined segment and each title has the form
            ``"<title>-<segment number>"`` (numbering starts at 1).

    Raises:
        ValueError: if ``max_length`` is not positive.
        IndexError: if a file that yields at least one segment has no
            underscore in its name.
    """
    if max_length <= 0:
        # A non-positive segment length would never advance through the text.
        raise ValueError('max_length must be a positive number of tokens')

    documents, authors, titles = [], [], []

    # os.scandir yields entries in arbitrary order; sort by name so that the
    # segment order (and any seeded split built on it) is reproducible, and
    # use the context manager so the directory handle is closed.
    with os.scandir(directory) as entries:
        text_files = sorted(
            (entry for entry in entries if entry.name.endswith('.txt')),
            key=lambda entry: entry.name,
        )

    for entry in text_files:
        # Explicit encoding: do not depend on the platform's locale default.
        with open(entry.path, encoding='utf-8') as f:
            lemmas = f.read().lower().split()

        # Number of complete segments; a text whose length is an exact
        # multiple of max_length keeps its final full segment.
        n_segments = len(lemmas) // max_length
        if n_segments == 0:
            continue

        stem, _ = os.path.splitext(entry.name)
        author = stem[0]
        title = entry.name.replace('.txt', '').split('_')[1]

        for segm_cnt in range(1, n_segments + 1):
            start_idx = (segm_cnt - 1) * max_length
            documents.append(' '.join(lemmas[start_idx:start_idx + max_length]))
            authors.append(author)
            titles.append(f"{title}-{segm_cnt}")

    return documents, authors, titles

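Vokabular: Es werden verschiedene Wortlisten verwendet. Ausgangspunkt ist die Liste der häufigsten Wörter (most frequent words). Davon ausgehend werden schrittweise neue Listen erstellt, aus denen einzelne Wörter herausgenommen werden (jeweils mit Begründung im Text); diese Wörter werden dann per Code von der Ursprungsliste „abgezogen“.
Anders als bei Kestemont: Dort wurden die Wörter direkt in der Liste mit „##“ aussortiert.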

## Laden der Daten

In [1]:
# Cut every text in data/texts into consecutive segments of 10000 tokens.
documents, authors, titles = load_directory(directory='data/texts', max_length=10000)

NameError: name 'load_directory' is not defined

## Erstelle Vokabular

In [None]:
# Read the word list (one word per line). Blank lines and lines commented
# out with '#' are skipped; of the remaining words only the first 65 are kept.
# The line is stripped before the '#' test so that an indented comment line
# is not mistaken for a vocabulary word, and the file is closed (and decoded
# as UTF-8) explicitly.
with open('data/wordlist.txt', encoding='utf-8') as wordlist_file:
    vocab = [
        word
        for word in map(str.strip, wordlist_file)
        if word and not word.startswith('#')
    ][:65]

In [None]:
# Instantiate: only the words in `vocab` are counted. The custom token
# pattern also matches tokens that are a single character long.
vectorizer = text.CountVectorizer(vocabulary=vocab, token_pattern=r"(?u)\b\w+\b")

# Fit and transform in one step, then convert the result to a dense array
v_documents = vectorizer.fit_transform(documents).toarray()

# Check: shape of the count matrix and the first five feature names
print(v_documents.shape)
print(vectorizer.get_feature_names_out()[:5])

In [None]:
# Normalise each row with the L1 norm (counts become relative frequencies)
counts_as_float = v_documents.astype(float)
n_v_documents = preprocessing.normalize(counts_as_float, norm='l1')

# Check: the shape must be unchanged by the normalisation
print(n_v_documents.shape)

In [None]:
# z-transformation: learn the per-feature statistics first, then apply them
scaler = preprocessing.StandardScaler()
scaler.fit(n_v_documents)
s_documents = scaler.transform(n_v_documents)

## Helferfunktion für Plotten

In [None]:
def plot_z_scores(nchunk=0):
    """Draw a bar chart of the z-scores of one text chunk.

    Reads the notebook-level variables ``vectorizer`` (feature names),
    ``s_documents`` (z-scored feature matrix) and ``authors`` / ``titles``
    (chunk labels used in the plot title).

    Arguments:
        nchunk (int, optional): row index of the chunk in ``s_documents``.
            Defaults to 0.
    """
    fig, ax = plt.subplots(figsize=(16, 6))

    labels = vectorizer.get_feature_names_out()
    # One bar per feature: derive the positions from the actual number of
    # features instead of assuming a vocabulary of exactly 65 words.
    x = np.arange(len(labels))

    ax.bar(x=x, height=s_documents[nchunk])
    ax.set_xticks(x)
    ax.set_xticklabels(labels, rotation=90, fontsize=12)
    ax.set_title(f'Textchunk {authors[nchunk]}_{titles[nchunk]}')
    ax.set_xlabel('Features', fontsize=14)
    ax.set_ylabel('z-Werte', fontsize=14)
    # Fixed y-range so that plots of different chunks are directly comparable
    ax.set_ylim(-3.5, 3.5)
    ax.yaxis.grid()

In [None]:
# Example plot for the first chunk, as a quick test
plot_z_scores(0)

## Cityblock-Distanzen zu allen Chunks

# The first chunk serves as the test document ...
test_doc = s_documents[0]

# ... and is compared with every other chunk; distances[i] therefore
# belongs to chunk i + 1.
distances = [
    scidist.cityblock(test_doc, other_doc)
    for other_doc in s_documents[1:]
]

In [None]:
# List the distance of every remaining chunk together with its label
# (labels are shifted by one because chunk 0 is the test document).
for distance, (author, title) in zip(distances, zip(authors[1:], titles[1:])):
    print(f'{distance} => {author}_{title}')

In [None]:
# Position of the smallest distance; + 1 maps it back to the chunk index
# because the test chunk (index 0) is not part of `distances`.
nearest = np.argmin(distances)
print(f'geringste Cityblock-Distanz: {distances[nearest]} => {authors[nearest + 1]}_{titles[nearest + 1]}')

### Plotte Testchunk und die jeweils nächsten und entferntesten Chunks

In [None]:
# Plot, in this order: the test chunk, its nearest chunk and its farthest
# chunk (+ 1 because `distances` starts at chunk 1).
for chunk_index in (0, np.argmin(distances) + 1, np.argmax(distances) + 1):
    plot_z_scores(nchunk=chunk_index)

## Delta-Objekt erstellen

In [None]:
class Delta:
    """Delta-based authorship attributer (nearest-neighbour classifier).

    Training documents are stored after scaling; a test document receives
    the author of the training document it is closest to.
    """

    def fit(self, X, y):
        """Store the scaled training documents and their authors.

        Arguments:
            X: two-dimensional array of shape NxV (N training documents,
               V features).
            y: list or NumPy array with the known author of each row of X.

        Returns:
            Delta: this instance, so that calls can be chained.

        """
        self.train_y = np.array(y)
        # Scaling without mean-centring (with_mean=False); the fitted scaler
        # is kept so that test documents are scaled the same way.
        scaler = preprocessing.StandardScaler(with_mean=False)
        self.scaler = scaler
        self.train_X = scaler.fit_transform(X)
        return self

    def predict(self, X, metric='cityblock'):
        """Return the author of the nearest training document for each row.

        Arguments:
            X: two-dimensional matrix of shape NxV (N test documents, V
               features as used in ``fit``).
            metric (str, optional): distance metric handed to
               ``scipy.spatial.distance.cdist``. Defaults to 'cityblock'.

        Returns:
            ndarray: one predicted author per test document.

        """
        scaled = self.scaler.transform(X)
        # Rows: test documents, columns: training documents
        pairwise = scidist.cdist(scaled, self.train_X, metric=metric)
        nearest = pairwise.argmin(axis=1)
        return self.train_y[nearest]

## Train-Test Split

In [None]:
# Hold out two chunks per distinct author for testing
n_authors = len(set(authors))
test_size = n_authors * 2

# Stratified split on the normalised (not yet z-scored) data
train_documents, test_documents, train_authors, test_authors = model_selection.train_test_split(
    n_v_documents,
    authors,
    test_size=test_size,
    stratify=authors,
    random_state=42,
)

print(f'N={test_documents.shape[0]} Test Dokumente mit '
      f'V={test_documents.shape[1]} Features.')

print(f'N={train_documents.shape[0]} Training Dokumente mit '
      f'V={train_documents.shape[1]} Features.')

## Evaluation

In [None]:
delta = Delta()                             # instantiate the Delta classifier
delta.fit(train_documents, train_authors)   # fit it on the training chunks
preds = delta.predict(test_documents)       # classify the held-out test chunks

# Report every prediction next to the true author
for true, pred in zip(test_authors, preds):
    _connector = 'ABER' if true != pred else 'und'
    print(f'Der Autor ist {true} {_connector} {pred} wurde vorhergesagt.')

# scikit-learn metrics take (y_true, y_pred) in that order. Accuracy is
# symmetric, so the value is the same, but the documented order keeps this
# cell correct if the metric is ever swapped for an asymmetric one.
accuracy = metrics.accuracy_score(test_authors, preds)
print(f'\nAccuracy der Vorhersagen: {accuracy:.1f}')

# Anwendung

In [None]:
# Read the training data, cut into segments of 3301 tokens
train_documents, train_authors, train_titles = load_directory(directory='data/texts', max_length=3301)

# Count the vocabulary words per segment ...
vectorizer = text.CountVectorizer(vocabulary=vocab, token_pattern=r"(?u)\b\w+\b")
v_train_documents = vectorizer.fit_transform(train_documents).toarray()

# ... and turn the counts into relative frequencies (L1 normalisation)
v_train_documents = preprocessing.normalize(v_train_documents.astype(float), norm='l1')

# Fit the classifier on all training segments
delta = Delta()
delta.fit(v_train_documents, train_authors)

In [None]:
# Read the test data, cut into segments of 3301 tokens
test_docs, test_authors, test_titles = load_directory(directory='data/texts/test', max_length=3301)

# Reuse the vectorizer from the training cell, then L1-normalise the counts
v_test_docs = vectorizer.transform(test_docs).toarray()
v_test_docs = preprocessing.normalize(v_test_docs.astype(float), norm='l1')

## Klassifizieren mit Cityblock-Distanz

In [None]:
# Classify every test segment by its nearest training segment
predictions = delta.predict(v_test_docs, metric='cityblock')

# Print each source segment together with its predicted author
for (test_author, test_title), prediction in zip(zip(test_authors, test_titles), predictions):
    print(f'Quelle: {test_author}_{test_title} => klassifiziert als {prediction}')