In [2]:
from pathlib import Path
import os
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
import pylab

In [3]:
data = []
base_dir = 'corpus_preprocessed'

for target in ['aimait', 'terribles']:
    target_label = 1 if target == 'aimait' else 0
    
    target_dir = os.path.join(base_dir, target)
    
    for author in os.listdir(target_dir):
        author_dir = os.path.join(target_dir, author)
        
        if os.path.isdir(author_dir):
            for filename in os.listdir(author_dir):
                if filename.endswith('.txt'):
                    file_path = os.path.join(author_dir, filename)
                    
                    with open(file_path, 'r', encoding='utf-8') as file:
                        text = file.read()
                    data.append({
                        'target': target_label,
                        'author': author,
                        'title': filename[:-4],
                        'text': text
                    })

df = pd.DataFrame(data)
display(df)

Unnamed: 0,target,author,title,text
0,1,prudentius,prud.psycho,praefatio senex fidelis primus credo uia abram...
1,1,sidonius,sidonius3,epistula sidonius auitus salus multus uinculum...
2,1,sidonius,sidonius2,epistula sidonius ecdicio salus duo nunc parit...
3,1,sidonius,sidonius1,epistula sidonius constantio salus praecipio d...
4,1,sidonius,sidonius5,epistula sidonius petronius salus audio lectit...
...,...,...,...,...
893,0,tacitus,tac.ann15,interea rex parthi uologaeses cognosco corbulo...
894,0,tacitus,tac.ann11,ualerium asiaticum bis consul quondam adulter ...
895,0,tacitus,tac.ann12,caedes messalinae convulsus princeps domus ori...
896,0,tacitus,tac.ann13,primus nouus principatus mors iunii silanus pr...


In [5]:
data = []
base_dir = 'other_preprocessed'

for target in ['expected_aimait', 'expected_terribles']:
    target_label = 1 if target == 'expected_aimait' else 0
    
    target_dir = os.path.join(base_dir, target)
    
    for author in os.listdir(target_dir):
        author_dir = os.path.join(target_dir, author)
        
        if os.path.isdir(author_dir):
            for filename in os.listdir(author_dir):
                if filename.endswith('.txt'):
                    file_path = os.path.join(author_dir, filename)
                    
                    with open(file_path, 'r', encoding='utf-8') as file:
                        text = file.read()
                    data.append({
                        'expected target': target_label,
                        'author': author,
                        'title': filename[:-4],
                        'text': text
                    })

df_other = pd.DataFrame(data)
display(df_other)

Unnamed: 0,expected target,author,title,text
0,1,dracontius,carmen_de_deo,liber primus cupio animus placidus rescio tono...
1,1,dracontius,hexameron_dracontii,sanctus eugenii epistula chindasuintum rex inc...
2,1,dracontius,satisfactio_ad_guntharium,satisfactio guntharium rex immense deus cuncto...
3,1,eugyppius,vita_sancti_severini,epistula eugyppii paschasium diaconum dominus ...
4,1,eugyppius,thesaurus,tomus prior caput primus sententia iacobi apos...
5,1,eugyppius,epistola_dedicatoria,epistula dedicatoria excerptor codico nonnullu...
6,1,mamertus,hymnus_de_passione_domini,hymnus passio dominus pango lingua gloriosus p...
7,1,mamertus,epistolae,epistula primus claudianus sidonio papae salus...
8,1,mamertus,de_statu_animae,praefatio praefectorio patricius doceo bonus u...
9,1,ennodius,panegyricus_theoderico_regi_dictus,panegyricus theoderico rex dico princeps uener...


In [6]:
author_list = list(df_other['author'].unique())
print(author_list)

['dracontius', 'eugyppius', 'mamertus', 'ennodius', 'avitus', 'paulinus', 'damasus', 'hieronymus', 'hilarius']


In [8]:
author = 'dracontius'
targ = df_other[df_other['author'] == author]['expected target'][0]
targ

1

In [17]:
def make_predict_table(author, df_other, models, tfidfs):
    df_temp = df_other[df_other['author'] == author].copy()
    
    targ = df_temp['expected target'].iloc[0] 
    print(f"{author}, expected target: {targ}")

    prediction_table = pd.DataFrame()
    prediction_table['title'] = df_temp['title'].values

    for i, (model, tfidf) in enumerate(zip(models, tfidfs)):
        X = tfidf.transform(df_temp['text']).toarray()
        y_pred = model.predict(X)
        prediction_table[f'Model {i}'] = y_pred

    means = prediction_table.iloc[:, 1:].mean()
    mean_row = pd.DataFrame([['mean'] + list(means)], columns=prediction_table.columns)
    prediction_table = pd.concat([prediction_table, mean_row], ignore_index=True)

    return prediction_table


In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import RidgeClassifier
from sklearn.svm import SVC

less = min(df.target.value_counts())
balanced_df = pd.concat((df[df['target'] == 0].sample(less, random_state=9), df[df['target'] != 0].sample(less, random_state=9)))

models = []
tfidfs = []

vocab = 50000
ngram = 4
model = AdaBoostClassifier(estimator=LogisticRegression(), n_estimators=100, algorithm='SAMME', random_state=11)
tfidf = TfidfVectorizer(max_features=vocab, ngram_range=(1, ngram))
X = tfidf.fit_transform(balanced_df['text']).toarray()
y = np.asarray(balanced_df['target'])
model.fit(X, y)
models.append(model)
tfidfs.append(tfidf)

vocab = 30000
ngram = 1
model = AdaBoostClassifier(estimator=BernoulliNB(), n_estimators=100, algorithm='SAMME', random_state=11)
tfidf = TfidfVectorizer(max_features=vocab, ngram_range=(1, ngram))
X = tfidf.fit_transform(balanced_df['text']).toarray()
y = np.asarray(balanced_df['target'])
model.fit(X, y)
models.append(model)
tfidfs.append(tfidf)

vocab = 100
ngram = 4
model = RandomForestClassifier()
tfidf = TfidfVectorizer(max_features=vocab, ngram_range=(1, ngram))
X = tfidf.fit_transform(balanced_df['text']).toarray()
y = np.asarray(balanced_df['target'])
model.fit(X, y)
models.append(model)
tfidfs.append(tfidf)

estimators=[('svm', SVC(probability=True)), 
             ('log', LogisticRegression()),
             ('rid', RidgeClassifier(alpha=5))]
vocab = 20000
ngram = 1
model = StackingClassifier(estimators=estimators, final_estimator=RandomForestClassifier())
tfidf = TfidfVectorizer(max_features=vocab, ngram_range=(1, ngram))
X = tfidf.fit_transform(balanced_df['text']).toarray()
y = np.asarray(balanced_df['target'])
model.fit(X, y)
models.append(model)
tfidfs.append(tfidf)

In [18]:
author = 'dracontius'
make_predict_table(author, df_other, models, tfidfs)

dracontius, expected target: 1


Unnamed: 0,title,Model 0,Model 1,Model 2,Model 3
0,carmen_de_deo,1.0,1.0,1.0,1.0
1,hexameron_dracontii,1.0,1.0,1.0,1.0
2,satisfactio_ad_guntharium,1.0,0.0,1.0,1.0
3,mean,1.0,0.666667,1.0,1.0


In [19]:
author = 'eugyppius'
make_predict_table(author, df_other, models, tfidfs)

eugyppius, expected target: 1


Unnamed: 0,title,Model 0,Model 1,Model 2,Model 3
0,vita_sancti_severini,1.0,1.0,1.0,1.0
1,thesaurus,0.0,1.0,0.0,0.0
2,epistola_dedicatoria,0.0,1.0,0.0,1.0
3,mean,0.333333,1.0,0.333333,0.666667


In [None]:
['dracontius', 'eugyppius', 'mamertus', 'ennodius', 'avitus', 'paulinus', 'damasus', 'hieronymus', 'hilarius']

In [20]:
author = 'mamertus'
make_predict_table(author, df_other, models, tfidfs)

mamertus, expected target: 1


Unnamed: 0,title,Model 0,Model 1,Model 2,Model 3
0,hymnus_de_passione_domini,0.0,1.0,0.0,1.0
1,epistolae,0.0,0.0,0.0,1.0
2,de_statu_animae,0.0,0.0,1.0,1.0
3,mean,0.0,0.333333,0.333333,1.0


In [21]:
author = 'ennodius'
make_predict_table(author, df_other, models, tfidfs)

ennodius, expected target: 1


Unnamed: 0,title,Model 0,Model 1,Model 2,Model 3
0,panegyricus_theoderico_regi_dictus,1.0,1.0,1.0,1.0
1,dictiones,1.0,1.0,1.0,1.0
2,petitorium,0.0,1.0,0.0,1.0
3,eucharisticum,1.0,0.0,1.0,1.0
4,paraenesis_didascalia,1.0,0.0,1.0,1.0
5,epistolae,1.0,1.0,0.0,1.0
6,vita_b._antonii,1.0,0.0,1.0,1.0
7,carmina,1.0,1.0,1.0,1.0
8,benedictio_cerei,0.0,1.0,0.0,1.0
9,libellus_apologeticus_pro_synodo,1.0,1.0,0.0,1.0


In [22]:
author = 'avitus'
make_predict_table(author, df_other, models, tfidfs)

avitus, expected target: 1


Unnamed: 0,title,Model 0,Model 1,Model 2,Model 3
0,fragmenta,1.0,1.0,1.0,1.0
1,homilia_de_rogationibus,0.0,0.0,1.0,1.0
2,epistolae,0.0,0.0,1.0,1.0
3,fragmenta_libri_de_divinitate_spiritus_sancti,0.0,1.0,0.0,1.0
4,collatio_episcoporum,1.0,0.0,0.0,1.0
5,de_mosaicae_historiae_gestis,1.0,1.0,1.0,1.0
6,sermo_in_rogationibus,0.0,0.0,0.0,1.0
7,mean,0.428571,0.428571,0.571429,1.0


In [23]:
author = 'paulinus'
make_predict_table(author, df_other, models, tfidfs)

paulinus, expected target: 0


Unnamed: 0,title,Model 0,Model 1,Model 2,Model 3
0,adversus,0.0,1.0,1.0,1.0
1,vita_ambrosii,1.0,1.0,1.0,1.0
2,de_benedictionibus,0.0,1.0,1.0,0.0
3,mean,0.333333,1.0,1.0,0.666667


In [24]:
author = 'damasus'
make_predict_table(author, df_other, models, tfidfs)

damasus, expected target: 0


Unnamed: 0,title,Model 0,Model 1,Model 2,Model 3
0,epigrammata,1.0,1.0,1.0,1.0
1,mean,1.0,1.0,1.0,1.0


In [25]:
author = 'hieronymus'
make_predict_table(author, df_other, models, tfidfs)

hieronymus, expected target: 0


Unnamed: 0,title,Model 0,Model 1,Model 2,Model 3
0,notitia_historica,0.0,0.0,0.0,0.0
1,de_situ_et_nominibus_locorum_hebraicorum,0.0,1.0,1.0,0.0
2,contra_vigilantium,0.0,0.0,0.0,0.0
3,apologia_adv_lib_rufini,0.0,0.0,0.0,0.0
4,de_viris_illustribus,0.0,1.0,1.0,1.0
5,vita_hilarii,0.0,1.0,0.0,1.0
6,contra_ioannem,0.0,1.0,1.0,0.0
7,vita_s._pauli_primi_eremitae,0.0,0.0,1.0,1.0
8,dialogus_contra_luciferianos,1.0,0.0,1.0,1.0
9,quaestiones_hebraicae_in_genesim,0.0,1.0,1.0,0.0


In [26]:
author = 'hilarius'
make_predict_table(author, df_other, models, tfidfs)

hilarius, expected target: 0


Unnamed: 0,title,Model 0,Model 1,Model 2,Model 3
0,de_synodis,1.0,0.0,1.0,1.0
1,fragmenta,0.0,0.0,0.0,1.0
2,pologetica_responsa,0.0,1.0,0.0,1.0
3,de_dedicatione,0.0,1.0,0.0,1.0
4,contra_arianos,1.0,0.0,1.0,1.0
5,secundus_ad_constantium,0.0,0.0,0.0,0.0
6,epistola_ad_libellus,1.0,0.0,0.0,0.0
7,de_trinitate,1.0,1.0,1.0,1.0
8,contra_constantium,1.0,0.0,0.0,0.0
9,primus_ad_constantium,0.0,0.0,0.0,0.0
