Проведем подробные тесты для тех моделей, которые показали хорошую точность на предварительных тестах

In [1]:
from pathlib import Path
import os
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
import pylab

In [2]:
data = []
base_dir = 'corpus_preprocessed'

for target in ['aimait', 'terribles']:
    target_label = 1 if target == 'aimait' else 0
    
    target_dir = os.path.join(base_dir, target)
    
    for author in os.listdir(target_dir):
        author_dir = os.path.join(target_dir, author)
        
        if os.path.isdir(author_dir):
            for filename in os.listdir(author_dir):
                if filename.endswith('.txt'):
                    file_path = os.path.join(author_dir, filename)
                    
                    with open(file_path, 'r', encoding='utf-8') as file:
                        text = file.read()
                    data.append({
                        'target': target_label,
                        'author': author,
                        'title': filename[:-4],
                        'text': text
                    })

df = pd.DataFrame(data)
display(df)

Unnamed: 0,target,author,title,text
0,1,prudentius,prud.psycho,praefatio senex fidelis primus credo uia abram...
1,1,sidonius,sidonius3,epistula sidonius auitus salus multus uinculum...
2,1,sidonius,sidonius2,epistula sidonius ecdicio salus duo nunc parit...
3,1,sidonius,sidonius1,epistula sidonius constantio salus praecipio d...
4,1,sidonius,sidonius5,epistula sidonius petronius salus audio lectit...
...,...,...,...,...
893,0,tacitus,tac.ann15,interea rex parthi uologaeses cognosco corbulo...
894,0,tacitus,tac.ann11,ualerium asiaticum bis consul quondam adulter ...
895,0,tacitus,tac.ann12,caedes messalinae convulsus princeps domus ori...
896,0,tacitus,tac.ann13,primus nouus principatus mors iunii silanus pr...


In [15]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize
import statistics

def fit_compute(vocab, ngram, model, test_list, df):

    less = min(df.target.value_counts())
    balanced_df = pd.concat((df[df['target'] == 0].sample(less, random_state=7), df[df['target'] != 0].sample(less, random_state=7)))
    
    tfidf = TfidfVectorizer(max_features=vocab, ngram_range=(1, ngram))
    tfidf.fit(balanced_df['text'])
    
    accs = []
    prec = []
    rec = []
    f1 = []
    
    for author in test_list:
        
        train_df = df[df['author'] != author]
        test_df = df[df['author'] == author]
        
        less = min(train_df.target.value_counts())
        balanced_df = pd.concat((train_df[train_df['target'] == 0].sample(less, random_state=9), train_df[train_df['target'] != 0].sample(less, random_state=9)))

        X_train = tfidf.transform(balanced_df['text']).toarray()
        y_train = np.asarray(balanced_df['target'])
        
        X_test = tfidf.transform(test_df['text']).toarray()  
        y_test = np.asarray(test_df['target'])

        model.fit(X_train, y_train)
        y_pre = model.predict(X_test)
        
        accs.append(accuracy_score(y_test,y_pre))
        prec.append(precision_score(y_test,y_pre, zero_division=1.0))
        rec.append(recall_score(y_test,y_pre, zero_division=1.0))
        f1.append(f1_score(y_test,y_pre, zero_division=1.0))
    
    return statistics.fmean(accs), statistics.fmean(prec), statistics.fmean(rec), statistics.fmean(f1)

In [4]:
authors_list = list(df['author'].unique())
authors_list

['prudentius',
 'sidonius',
 'gregorytours',
 'ermoldus',
 'tertullianus',
 'vitae',
 'rutulius',
 'orientus',
 'bonifatius',
 'commodianus',
 'regino',
 'paulinus',
 'apuleius',
 'venantius',
 'claudian',
 'aldhelmus',
 'floridus',
 'tatuinus',
 'strabo',
 'ausonius',
 'sedulius',
 'iordanes',
 'flavius_merobaudes',
 'boethius',
 'eusebius',
 'pauldeacon',
 'vita_cuthberti',
 'fredegarus',
 'freculphus',
 'macer_floridus',
 'abbo',
 'petronius',
 'terentius',
 'symmachus',
 'victor',
 'martial',
 'lactantius',
 'cyprianus',
 'plinius',
 'sallustius',
 'quintilian',
 'caesar',
 'propertius',
 'suetonius',
 'tibullus',
 'fronto',
 'statius',
 'minucius',
 'persius',
 'ovid',
 'juvenal',
 'horace',
 'vergil',
 'livy',
 'arnobius',
 'gellius',
 'alcuin',
 'macrobius',
 'catullus',
 'plautus',
 'sen',
 'cicero',
 'eginhardus',
 'ambrosius',
 'tacitus',
 'ammianus']

Будем проверять модели в порядке убывания наилучшего результата, достигнутого на предварительных тестах

In [16]:
# AdaBoost - logreg
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression

model = AdaBoostClassifier(estimator=LogisticRegression(), n_estimators=100, algorithm='SAMME', random_state=11)
print('AdaBoost - logreg\n')

for vocab, n in [(1000, 4), (50000, 4)]:
    acc, prec, rec, f1 = fit_compute(vocab, n, model, authors_list, df)
    print(f"{vocab}\t{n}\t{acc}\t{prec}\t{rec}\t{f1}")

AdaBoost - logreg

1000	4	0.7234304971584363	0.6363636363636364	0.9054533429533429	0.5567733276824186
50000	4	0.7862173599628184	0.7575757575757576	0.8988275613275614	0.6700545142221432


In [25]:
# AdaBoost - logreg
model = AdaBoostClassifier(estimator=LogisticRegression(), n_estimators=100, algorithm='SAMME', random_state=11)
print('AdaBoost - logreg\n')

for vocab, n in [(20000, 1), (30000, 1), (50000, 1)]:
    acc, prec, rec, f1 = fit_compute(vocab, n, model, authors_list, df)
    print(f"{vocab}\t{n}\t{acc}\t{prec}\t{rec}\t{f1}")

AdaBoost - logreg

20000	1	0.7275769596212326	0.7121212121212122	0.8687469937469938	0.592362621429607
30000	1	0.7251566346243781	0.696969696969697	0.8789381914381914	0.5912438885166158
50000	1	0.7421506390287522	0.7121212121212122	0.8903318903318903	0.6131876500750019


In [26]:
model = AdaBoostClassifier(estimator=BernoulliNB(), n_estimators=100, algorithm='SAMME', random_state=11)
print('AdaBoost - NB\n')

for vocab, n in [(30000, 1)]:
    acc, prec, rec, f1 = fit_compute(vocab, n, model, authors_list, df)
    print(f"{vocab}\t{n}\t{acc}\t{prec}\t{rec}\t{f1}")

AdaBoost - NB

30000	1	0.800991590250709	0.696969696969697	0.96257215007215	0.6663237951361118


In [17]:
# Bernoulli NB
from sklearn.naive_bayes import BernoulliNB

model = BernoulliNB()
print('Bernoulli NB\n')

for vocab, n in [(100000, 4), (100000, 1), (50000, 4)]:
    acc, prec, rec, f1 = fit_compute(vocab, n, model, authors_list, df)
    print(f"{vocab}\t{n}\t{acc}\t{prec}\t{rec}\t{f1}")

Bernoulli NB

100000	4	0.6457148515972045	0.8484848484848485	0.7368326118326118	0.5895669986579077
100000	1	0.6285956647411756	0.803030303030303	0.7566137566137566	0.5661036199313711
50000	4	0.6113244805361876	0.7272727272727273	0.7580868205868205	0.4935045888683902


In [28]:
model = BernoulliNB()
print('Bernoulli NB\n')

for vocab, n in [(30000, 1)]:
    acc, prec, rec, f1 = fit_compute(vocab, n, model, authors_list, df)
    print(f"{vocab}\t{n}\t{acc}\t{prec}\t{rec}\t{f1}")

Bernoulli NB

30000	1	0.6073776446291561	0.696969696969697	0.7881433381433381	0.49594708167133184


In [20]:
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier

estimators=[('knn', KNeighborsClassifier(n_neighbors=3, metric='cosine')), 
             ('rfc', RandomForestClassifier())]

model = StackingClassifier(estimators=estimators, final_estimator=XGBClassifier(n_estimators=100, max_depth=3))
print("KNN, RFC -> XGB\n")

for vocab, n in [(100, 1), (500, 1)]:
    acc, prec, rec, f1 = fit_compute(vocab, n, model, authors_list, df)
    print(f"{vocab}\t{n}\t{acc}\t{prec}\t{rec}\t{f1}")

KNN, RFC -> XGB

100	1	0.7234495753660781	0.6363636363636364	0.9111050986050986	0.5629895950751566
500	1	0.7177410045063947	0.6212121212121212	0.9004810004810004	0.5371830057236755


In [21]:
from sklearn.svm import SVC

model = StackingClassifier(estimators=estimators, final_estimator=SVC())
print("KNN, RFC -> SVM\n")

for vocab, n in [(500, 1)]:
    acc, prec, rec, f1 = fit_compute(vocab, n, model, authors_list, df)
    print(f"{vocab}\t{n}\t{acc}\t{prec}\t{rec}\t{f1}")

KNN, RFC -> SVM

500	1	0.7354138198803647	0.6666666666666666	0.9007635882635883	0.5815850815850816


In [22]:
model = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression())
print("KNN, RFC -> logreg\n")

for vocab, n in [(500, 1)]:
    acc, prec, rec, f1 = fit_compute(vocab, n, model, authors_list, df)
    print(f"{vocab}\t{n}\t{acc}\t{prec}\t{rec}\t{f1}")

KNN, RFC -> logreg

500	1	0.7411178878033798	0.696969696969697	0.8816137566137566	0.5921789321789321


In [29]:
model = SVC()
print('SVM\n')

for vocab, n in [(10000, 1), (10000, 2), (100000, 1), (100000, 2)]:
    acc, prec, rec, f1 = fit_compute(vocab, n, model, authors_list, df)
    print(f"{vocab}\t{n}\t{acc}\t{prec}\t{rec}\t{f1}")

SVM

10000	1	0.7432111402607834	0.7121212121212122	0.8959235209235209	0.621048834206729
10000	2	0.7420824764695809	0.7121212121212122	0.8942400192400193	0.6189827185042496
100000	1	0.7322128164415196	0.696969696969697	0.8917147667147667	0.6018110013325324
100000	2	0.72687300535302	0.696969696969697	0.8893999518999518	0.5988143988143989


In [30]:
model = RandomForestClassifier()
print('RFC\n')

for vocab, n in [(100, 4), (50, 3), (100, 2), (500, 4)]:
    acc, prec, rec, f1 = fit_compute(vocab, n, model, authors_list, df)
    print(f"{vocab}\t{n}\t{acc}\t{prec}\t{rec}\t{f1}")

RFC

100	4	0.7715759599416657	0.6515151515151515	0.9203944203944204	0.585188246097337
50	3	0.7205302845633367	0.6212121212121212	0.9117664742664743	0.5456643165734075
100	2	0.7695230731198325	0.6666666666666666	0.9177489177489178	0.5987144168962351
500	4	0.762749032029271	0.696969696969697	0.9240921115921116	0.6347430103295206


In [31]:
model = KNeighborsClassifier(n_neighbors=3, metric='cosine')
print('KNN\n')

for vocab, n in [(1000, 1), (1000, 2), (100, 1), (100, 2)]:
    acc, prec, rec, f1 = fit_compute(vocab, n, model, authors_list, df)
    print(f"{vocab}\t{n}\t{acc}\t{prec}\t{rec}\t{f1}")

KNN

1000	1	0.6959448738067538	0.6666666666666666	0.8478835978835979	0.523241909605546
1000	2	0.6917702224554254	0.6666666666666666	0.8472522847522848	0.5229195369427477
100	1	0.6831755475207035	0.696969696969697	0.8581830206830207	0.5687547775769008
100	2	0.6831755475207035	0.696969696969697	0.8581830206830207	0.5687547775769008


In [32]:
model = LogisticRegression()
print("logreg\n")

for vocab, n in [(100000, 1), (50000, 1), (30000, 1)]:
    acc, prec, rec, f1 = fit_compute(vocab, n, model, authors_list, df)
    print(f"{vocab}\t{n}\t{acc}\t{prec}\t{rec}\t{f1}")

logreg

100000	1	0.7149560572165581	0.696969696969697	0.8921356421356421	0.5989111898202807
50000	1	0.7185509702252879	0.696969696969697	0.8938191438191438	0.6019414928505837
30000	1	0.723101650647231	0.696969696969697	0.8980278980278981	0.6057982421618785


In [33]:
from sklearn.linear_model import RidgeClassifier

model = RidgeClassifier(alpha=1)
print("Ridge, alpha = 1")

for vocab, n in [(1000, 1)]:
    acc, prec, rec, f1 = fit_compute(vocab, n, model, authors_list, df)
    print(f"{vocab}\t{n}\t{acc}\t{prec}\t{rec}\t{f1}")

Ridge, alpha = 1
1000	1	0.7498689313755034	0.6818181818181818	0.908519721019721	0.6032450691541601


In [34]:
model = RidgeClassifier(alpha=5)
print("Ridge, alpha = 5")

for vocab, n in [(10000, 1)]:
    acc, prec, rec, f1 = fit_compute(vocab, n, model, authors_list, df)
    print(f"{vocab}\t{n}\t{acc}\t{prec}\t{rec}\t{f1}")

Ridge, alpha = 5
10000	1	0.7228608262776692	0.696969696969697	0.8980278980278981	0.6057982421618785


In [37]:
estimators=[('svm', SVC(probability=True)), 
             ('nb', BernoulliNB()), 
             ('log', LogisticRegression())]

In [38]:
model = StackingClassifier(estimators=estimators, final_estimator=RandomForestClassifier())

for vocab, n in [(1000, 1), (10000, 1), (20000, 1), (30000, 1)]:
    acc, prec, rec, f1 = fit_compute(vocab, n, model, authors_list, df)
    print(f"{vocab}\t{n}\t{acc}\t{prec}\t{rec}\t{f1}")

1000	1	0.7481699592399489	0.6818181818181818	0.9076779701779701	0.6025702537734624
10000	1	0.7468806126111851	0.7121212121212122	0.8980880230880232	0.624712403838678
20000	1	0.749497682523304	0.6818181818181818	0.9040103415103414	0.5960164122219458
30000	1	0.7657894485810887	0.696969696969697	0.9222582972582973	0.6310056392228225


In [39]:
model = StackingClassifier(estimators=estimators, final_estimator=SVC())

for vocab, n in [(10000, 1), (20000, 1), (30000, 1)]:
    acc, prec, rec, f1 = fit_compute(vocab, n, model, authors_list, df)
    print(f"{vocab}\t{n}\t{acc}\t{prec}\t{rec}\t{f1}")

10000	1	0.7334160904849162	0.696969696969697	0.89496151996152	0.6050888389977218
20000	1	0.7523149389557376	0.7121212121212122	0.8984487734487735	0.6227017267687124
30000	1	0.7322131376418318	0.7121212121212122	0.8976070226070226	0.621816248610507


In [40]:
estimators=[('svm', SVC(probability=True)), 
             ('log', LogisticRegression()),
             ('rid', RidgeClassifier(alpha=5))]

In [41]:
model = StackingClassifier(estimators=estimators, final_estimator=RandomForestClassifier())

for vocab, n in [(20000, 1), (30000, 1)]:
    acc, prec, rec, f1 = fit_compute(vocab, n, model, authors_list, df)
    print(f"{vocab}\t{n}\t{acc}\t{prec}\t{rec}\t{f1}")

20000	1	0.7706649630667408	0.696969696969697	0.9138407888407888	0.6232127897797755
30000	1	0.7541644586961304	0.6818181818181818	0.9104737854737854	0.6056957218674506


нилучшие значения точности у следующих классификаторов:   
1) AdaBoost с BernoulliNB (30000, 1) : 0.801   
2) AdaBoost c LogisticRegression (50000, 4) : 0.786   
3) RandomForest (100, 4) : 0.772   
4) SVM, LogReg, Ridge -> RandomForest (20000, 1) : 0.771   