In [1]:
# :(

from pathlib import Path
import os
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
import pylab

In [29]:
data = []
base_dir = 'corpus_preprocessed'

for target in ['aimait', 'terribles']:
    target_label = 1 if target == 'aimait' else 0
    
    target_dir = os.path.join(base_dir, target)
    
    for author in os.listdir(target_dir):
        author_dir = os.path.join(target_dir, author)
        
        if os.path.isdir(author_dir):
            for filename in os.listdir(author_dir):
                if filename.endswith('.txt'):
                    file_path = os.path.join(author_dir, filename)
                    
                    with open(file_path, 'r', encoding='utf-8') as file:
                        text = file.read()
                    data.append({
                        'target': target_label,
                        'author': author,
                        'title': filename[:-4],
                        'text': text
                    })

df = pd.DataFrame(data)

In [30]:
display(df)

Unnamed: 0,target,author,title,text
0,1,prudentius,prud.psycho,praefatio senex fidelis primus credo uia abram...
1,1,sidonius,sidonius3,epistula sidonius auitus salus multus uinculum...
2,1,sidonius,sidonius2,epistula sidonius ecdicio salus duo nunc parit...
3,1,sidonius,sidonius1,epistula sidonius constantio salus praecipio d...
4,1,sidonius,sidonius5,epistula sidonius petronius salus audio lectit...
...,...,...,...,...
893,0,tacitus,tac.ann15,interea rex parthi uologaeses cognosco corbulo...
894,0,tacitus,tac.ann11,ualerium asiaticum bis consul quondam adulter ...
895,0,tacitus,tac.ann12,caedes messalinae convulsus princeps domus ori...
896,0,tacitus,tac.ann13,primus nouus principatus mors iunii silanus pr...


In [49]:
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize
import statistics

def compute_accuracy(vocab, ngram, model, test_list, df, l2=False):
    
    accs = []
    
    for author in test_list:
        
        train_df = df[df['author'] != author]
        test_df = df[df['author'] == author]
        
        less = min(train_df.target.value_counts())
        balanced_df = pd.concat((train_df[train_df['target'] == 0].sample(less, random_state=8), train_df[train_df['target'] != 0].sample(less, random_state=8)))
        
        tfidf = TfidfVectorizer(max_features=vocab, ngram_range=(1, ngram))

        X_train = tfidf.fit_transform(balanced_df['text']).toarray()
        if l2:
            X_train = normalize(X_train, norm='l2')
        y_train = np.asarray(balanced_df['target'])
        
        X_test = tfidf.transform(test_df['text']).toarray()
        if l2:
            X_test = normalize(X_test, norm='l2')   
        y_test = np.asarray(test_df['target'])

        model.fit(X_train, y_train)
        y_pre = model.predict(X_test)
        accs.append(accuracy_score(y_test,y_pre))
    
    return statistics.fmean(accs)

In [20]:
vocab_dim = [500, 1000, 5000, 10000, 20000, 30000, 50000, 100000]
ngram = [1, 2, 3, 4]
models = [] # logreg, bernoulli NB, SVM, random forest, stacking (from process), extra trees, KNN Bonus: neural networkss (LSTM) 

Принцип оценки качества моделей: выбираем случайно несколько авторов. Для каждого автора тренируем модель на всех остальных авторах, применяем модель на текстах выбранного автора, считаем среднее арифметическое по accuracy. итоговая accuracy это среднее по всем выбранным авторам.

Для чистоты эксперимента выберем авторов для валидации заранее, одних и тех же для всех моделей. 

In [5]:
authors_class_0 = df[df['target'] == 0]['author'].unique()
authors_class_1 = df[df['target'] == 1]['author'].unique()

print(f"0 - target: {len(authors_class_0)}")
print(f"1 - target: {len(authors_class_1)}")

0 - target: 34
1 - target: 32


Выберем по 6 авторов для валидации

In [8]:
import random

random.seed(8)

selected_authors_0 = random.sample(list(authors_class_0), 6)
selected_authors_1 = random.sample(list(authors_class_1), 6)

print("0 - target:", selected_authors_0)
print("1 - target:", selected_authors_1)

0 - target: ['statius', 'gellius', 'alcuin', 'lactantius', 'plinius', 'arnobius']
1 - target: ['gregorytours', 'petronius', 'tertullianus', 'orientus', 'pauldeacon', 'floridus']


In [9]:
print(f"number of 0 - target val texts: {df[df['author'].isin(selected_authors_0)].shape[0]}")
print(f"number of 1 - target val texts: {df[df['author'].isin(selected_authors_1)].shape[0]}")

number of 0 - target val texts: 69
number of 1 - target val texts: 38


In [17]:
test_list = selected_authors_0 + selected_authors_1

**Оценка моделей**

Начнем по порядку оценивать качество моделей, варьируя гиперпараметры

In [18]:
from sklearn.linear_model import LogisticRegression

In [34]:
model = LogisticRegression()
logreg_list = []

for vocab in vocab_dim:
    for n in ngram:

        accuracy = compute_accuracy(vocab, n, model, test_list, df)
        
        print(f"Vocabulary size: {vocab}, max ngram: {n}, accuracy: {accuracy}")
        logreg_list.append((vocab, n, accuracy))

Vocabulary size: 500, max ngram: 1, accuracy: 0.7122120046849149
Vocabulary size: 500, max ngram: 2, accuracy: 0.7122120046849149
Vocabulary size: 500, max ngram: 3, accuracy: 0.7122120046849149
Vocabulary size: 500, max ngram: 4, accuracy: 0.7122120046849149
Vocabulary size: 1000, max ngram: 1, accuracy: 0.7207412935934608
Vocabulary size: 1000, max ngram: 2, accuracy: 0.7172690713712385
Vocabulary size: 1000, max ngram: 3, accuracy: 0.7172690713712385
Vocabulary size: 1000, max ngram: 4, accuracy: 0.7172690713712385
Vocabulary size: 5000, max ngram: 1, accuracy: 0.7402918038560453
Vocabulary size: 5000, max ngram: 2, accuracy: 0.7310325445967861
Vocabulary size: 5000, max ngram: 3, accuracy: 0.7310325445967861
Vocabulary size: 5000, max ngram: 4, accuracy: 0.7310325445967861
Vocabulary size: 10000, max ngram: 1, accuracy: 0.7266465796845054
Vocabulary size: 10000, max ngram: 2, accuracy: 0.7231743574622831
Vocabulary size: 10000, max ngram: 3, accuracy: 0.7231743574622831
Vocabulary 

best: (100000, 1), (5000, 1), (50000, 1)
OK: all

при увеличении размера н-грамм результаты всегда ухудшаются, поэтому для этой модели имеет смысл рассматривать только 1-граммы

In [36]:
from sklearn import svm

model = svm.SVC()
svm_list = []

for vocab in vocab_dim:
    for n in ngram:

        accuracy = compute_accuracy(vocab, n, model, test_list, df)
        
        print(f"Vocabulary size: {vocab}, max ngram: {n}, accuracy: {accuracy}")
        svm_list.append((vocab, n, accuracy))

Vocabulary size: 500, max ngram: 1, accuracy: 0.6827372761970286
Vocabulary size: 500, max ngram: 2, accuracy: 0.6827372761970286
Vocabulary size: 500, max ngram: 3, accuracy: 0.6827372761970286
Vocabulary size: 500, max ngram: 4, accuracy: 0.6827372761970286
Vocabulary size: 1000, max ngram: 1, accuracy: 0.7237512695136533
Vocabulary size: 1000, max ngram: 2, accuracy: 0.7237512695136533
Vocabulary size: 1000, max ngram: 3, accuracy: 0.7237512695136533
Vocabulary size: 1000, max ngram: 4, accuracy: 0.7237512695136533
Vocabulary size: 5000, max ngram: 1, accuracy: 0.7299299105607155
Vocabulary size: 5000, max ngram: 2, accuracy: 0.7189388913460121
Vocabulary size: 5000, max ngram: 3, accuracy: 0.7189388913460121
Vocabulary size: 5000, max ngram: 4, accuracy: 0.7224111135682344
Vocabulary size: 10000, max ngram: 1, accuracy: 0.7299299105607155
Vocabulary size: 10000, max ngram: 2, accuracy: 0.725056616213737
Vocabulary size: 10000, max ngram: 3, accuracy: 0.725056616213737
Vocabulary si

best: (100000, 2), (10000, 1), (10000, 2-4)
OK: vocab: 1000-30000, (100000, 1-2), (50000, 1)

Для этой модели тоже будем рассматривать только 1-граммы

In [37]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier()
forest_list = []


for vocab in vocab_dim:
    for n in ngram:

        accuracy = compute_accuracy(vocab, n, model, test_list, df)
        
        print(f"Vocabulary size: {vocab}, max ngram: {n}, accuracy: {accuracy}")
        forest_list.append((vocab, n, accuracy))

Vocabulary size: 500, max ngram: 1, accuracy: 0.7384530566612609
Vocabulary size: 500, max ngram: 2, accuracy: 0.6753490138745556
Vocabulary size: 500, max ngram: 3, accuracy: 0.7009983086801972
Vocabulary size: 500, max ngram: 4, accuracy: 0.756173522040395
Vocabulary size: 1000, max ngram: 1, accuracy: 0.6949066907464739
Vocabulary size: 1000, max ngram: 2, accuracy: 0.657317824790735
Vocabulary size: 1000, max ngram: 3, accuracy: 0.6456577514046554
Vocabulary size: 1000, max ngram: 4, accuracy: 0.6690173145281504
Vocabulary size: 5000, max ngram: 1, accuracy: 0.6960457721100136
Vocabulary size: 5000, max ngram: 2, accuracy: 0.6536506707946336
Vocabulary size: 5000, max ngram: 3, accuracy: 0.6621624527003783
Vocabulary size: 5000, max ngram: 4, accuracy: 0.655734004127967
Vocabulary size: 10000, max ngram: 1, accuracy: 0.7081358420561207
Vocabulary size: 10000, max ngram: 2, accuracy: 0.655953302373581
Vocabulary size: 10000, max ngram: 3, accuracy: 0.6792974060969418
Vocabulary size

best: (100, 4), (50, 3), (100, 2), (500, 4)
OK: vocab: 500, 50, 100

результаты местами хорошие, но не очень стабильные. общей зависимости качества от размера н-грамм нет

In [54]:
model = RandomForestClassifier()

for vocab in [50, 100, 300]:
    for n in ngram:

        accuracy = compute_accuracy(vocab, n, model, test_list, df)
        
        print(f"Vocabulary size: {vocab}, max ngram: {n}, accuracy: {accuracy}")

Vocabulary size: 50, max ngram: 1, accuracy: 0.7491744066047472
Vocabulary size: 50, max ngram: 2, accuracy: 0.7396306329549364
Vocabulary size: 50, max ngram: 3, accuracy: 0.774828001375989
Vocabulary size: 50, max ngram: 4, accuracy: 0.7265562804068996
Vocabulary size: 100, max ngram: 1, accuracy: 0.7217499426671253
Vocabulary size: 100, max ngram: 2, accuracy: 0.7645539502350648
Vocabulary size: 100, max ngram: 3, accuracy: 0.7458813496158697
Vocabulary size: 100, max ngram: 4, accuracy: 0.7773262813897489
Vocabulary size: 300, max ngram: 1, accuracy: 0.5894295583730568
Vocabulary size: 300, max ngram: 2, accuracy: 0.6715757940603142
Vocabulary size: 300, max ngram: 3, accuracy: 0.609493505004341
Vocabulary size: 300, max ngram: 4, accuracy: 0.6833039502350647


In [38]:
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB

model = BernoulliNB()
bernoulli_list = []

for vocab in vocab_dim:
    for n in ngram:

        accuracy = compute_accuracy(vocab, n, model, test_list, df)
        
        print(f"Vocabulary size: {vocab}, max ngram: {n}, accuracy: {accuracy}")
        bernoulli_list.append((vocab, n, accuracy))

Vocabulary size: 500, max ngram: 1, accuracy: 0.5590972938883155
Vocabulary size: 500, max ngram: 2, accuracy: 0.5590972938883155
Vocabulary size: 500, max ngram: 3, accuracy: 0.5590972938883155
Vocabulary size: 500, max ngram: 4, accuracy: 0.5590972938883155
Vocabulary size: 1000, max ngram: 1, accuracy: 0.591777032450407
Vocabulary size: 1000, max ngram: 2, accuracy: 0.591777032450407
Vocabulary size: 1000, max ngram: 3, accuracy: 0.591777032450407
Vocabulary size: 1000, max ngram: 4, accuracy: 0.591777032450407
Vocabulary size: 5000, max ngram: 1, accuracy: 0.6667225662194703
Vocabulary size: 5000, max ngram: 2, accuracy: 0.6722781217750259
Vocabulary size: 5000, max ngram: 3, accuracy: 0.6722781217750259
Vocabulary size: 5000, max ngram: 4, accuracy: 0.6722781217750259
Vocabulary size: 10000, max ngram: 1, accuracy: 0.6404153766769866
Vocabulary size: 10000, max ngram: 2, accuracy: 0.6404153766769866
Vocabulary size: 10000, max ngram: 3, accuracy: 0.6404153766769866
Vocabulary size

best: (100000, 2-3), (100000, 1), (50000, 3-4)
OK: vocab: 50000-100000

Здесь явно улучшается качество при увеличении размера н-грамм, будем рассмотривать только 4-граммы

In [39]:
model = GaussianNB()
gaussian_list = []

for vocab in vocab_dim:
    for n in ngram:

        accuracy = compute_accuracy(vocab, n, model, test_list, df)
        
        print(f"Vocabulary size: {vocab}, max ngram: {n}, accuracy: {accuracy}")
        gaussian_list.append((vocab, n, accuracy))

Vocabulary size: 500, max ngram: 1, accuracy: 0.6238905064949957
Vocabulary size: 500, max ngram: 2, accuracy: 0.6238905064949957
Vocabulary size: 500, max ngram: 3, accuracy: 0.6238905064949957
Vocabulary size: 500, max ngram: 4, accuracy: 0.6238905064949957
Vocabulary size: 1000, max ngram: 1, accuracy: 0.618972881550281
Vocabulary size: 1000, max ngram: 2, accuracy: 0.618972881550281
Vocabulary size: 1000, max ngram: 3, accuracy: 0.618972881550281
Vocabulary size: 1000, max ngram: 4, accuracy: 0.618972881550281
Vocabulary size: 5000, max ngram: 1, accuracy: 0.6390702041050339
Vocabulary size: 5000, max ngram: 2, accuracy: 0.6274717635592248
Vocabulary size: 5000, max ngram: 3, accuracy: 0.6330273191147804
Vocabulary size: 5000, max ngram: 4, accuracy: 0.6330273191147804
Vocabulary size: 10000, max ngram: 1, accuracy: 0.6090184611856438
Vocabulary size: 10000, max ngram: 2, accuracy: 0.6420221304896228
Vocabulary size: 10000, max ngram: 3, accuracy: 0.6315646141497534
Vocabulary size

best: (500000, 4)
OK: vocab>=50000, n>=3

плохая точность, выкинем из рассмотрения эту модель

In [40]:
model = MultinomialNB()
multinomial_list = []

for vocab in vocab_dim:
    for n in ngram:

        accuracy = compute_accuracy(vocab, n, model, test_list, df)
        
        print(f"Vocabulary size: {vocab}, max ngram: {n}, accuracy: {accuracy}")
        multinomial_list.append((vocab, n, accuracy))

Vocabulary size: 500, max ngram: 1, accuracy: 0.6762453519419463
Vocabulary size: 500, max ngram: 2, accuracy: 0.6599546251249038
Vocabulary size: 500, max ngram: 3, accuracy: 0.6599546251249038
Vocabulary size: 500, max ngram: 4, accuracy: 0.6599546251249038
Vocabulary size: 1000, max ngram: 1, accuracy: 0.7110458949661735
Vocabulary size: 1000, max ngram: 2, accuracy: 0.7061439341818598
Vocabulary size: 1000, max ngram: 3, accuracy: 0.7061439341818598
Vocabulary size: 1000, max ngram: 4, accuracy: 0.7061439341818598
Vocabulary size: 5000, max ngram: 1, accuracy: 0.7092184095860566
Vocabulary size: 5000, max ngram: 2, accuracy: 0.7092184095860566
Vocabulary size: 5000, max ngram: 3, accuracy: 0.7092184095860566
Vocabulary size: 5000, max ngram: 4, accuracy: 0.7092184095860566
Vocabulary size: 10000, max ngram: 1, accuracy: 0.7077303143479613
Vocabulary size: 10000, max ngram: 2, accuracy: 0.6685438064114534
Vocabulary size: 10000, max ngram: 3, accuracy: 0.7042580921257392
Vocabulary 

плохая точность, тоже не будем рассматривать эту модель

In [41]:
from sklearn.ensemble import ExtraTreesClassifier

model = ExtraTreesClassifier(n_estimators=50, random_state=2)
trees_list = []

for vocab in vocab_dim:
    for n in ngram:

        accuracy = compute_accuracy(vocab, n, model, test_list, df)
        
        print(f"Vocabulary size: {vocab}, max ngram: {n}, accuracy: {accuracy}")
        trees_list.append((vocab, n, accuracy))

Vocabulary size: 500, max ngram: 1, accuracy: 0.6450711132406178
Vocabulary size: 500, max ngram: 2, accuracy: 0.6756746850787098
Vocabulary size: 500, max ngram: 3, accuracy: 0.6997974920962537
Vocabulary size: 500, max ngram: 4, accuracy: 0.6581308254295871
Vocabulary size: 1000, max ngram: 1, accuracy: 0.7111600488148476
Vocabulary size: 1000, max ngram: 2, accuracy: 0.6231484552885481
Vocabulary size: 1000, max ngram: 3, accuracy: 0.7237698003177879
Vocabulary size: 1000, max ngram: 4, accuracy: 0.6280217496355267
Vocabulary size: 5000, max ngram: 1, accuracy: 0.7028146346257801
Vocabulary size: 5000, max ngram: 2, accuracy: 0.6926056972496601
Vocabulary size: 5000, max ngram: 3, accuracy: 0.7156147107965993
Vocabulary size: 5000, max ngram: 4, accuracy: 0.6006345520664406
Vocabulary size: 10000, max ngram: 1, accuracy: 0.636530664897538
Vocabulary size: 10000, max ngram: 2, accuracy: 0.7159940291906236
Vocabulary size: 10000, max ngram: 3, accuracy: 0.6986713106295149
Vocabulary s

Результаты местами неплохие, но нестабильные

best: (30000, 3), (10000, 4), (50000, 4)

In [46]:
from sklearn.neighbors import KNeighborsClassifier

for num in [3, 5, 10, 15]:
    print(f'\n {num} neighbors:')
    model = KNeighborsClassifier(n_neighbors=num, metric='cosine')
    for vocab in [100, 300, 500, 1000, 3000, 5000, 10000]:
        for n in [1, 2]:
    
            accuracy = compute_accuracy(vocab, n, model, test_list, df)
            
            print(f"Vocabulary size: {vocab}, max ngram: {n}, accuracy: {accuracy}")


 3 neighbors:
Vocabulary size: 100, max ngram: 1, accuracy: 0.7612449219453863
Vocabulary size: 100, max ngram: 2, accuracy: 0.7612449219453863
Vocabulary size: 300, max ngram: 1, accuracy: 0.7151805166511048
Vocabulary size: 300, max ngram: 2, accuracy: 0.7438920012449425
Vocabulary size: 500, max ngram: 1, accuracy: 0.7286332661719658
Vocabulary size: 500, max ngram: 2, accuracy: 0.7286332661719658
Vocabulary size: 1000, max ngram: 1, accuracy: 0.7771071879044015
Vocabulary size: 1000, max ngram: 2, accuracy: 0.7771071879044015
Vocabulary size: 3000, max ngram: 1, accuracy: 0.70281094894098
Vocabulary size: 3000, max ngram: 2, accuracy: 0.690906187036218
Vocabulary size: 5000, max ngram: 1, accuracy: 0.7429111995675464
Vocabulary size: 5000, max ngram: 2, accuracy: 0.7071969138532607
Vocabulary size: 10000, max ngram: 1, accuracy: 0.7248800104837256
Vocabulary size: 10000, max ngram: 2, accuracy: 0.7081019542319852

 5 neighbors:
Vocabulary size: 100, max ngram: 1, accuracy: 0.56052

best: (3, 1000, 1-2), (3, 100, 1-2), (3, 5000, 1)
OK: num of neighbors = 3

будем рассматривать только 1-граммы для этой модели

In [58]:
from sklearn.linear_model import RidgeClassifier

for alpha in [0.1, 1, 5, 10]:
    model = RidgeClassifier(alpha=alpha)
    print(f"\n alpha = {alpha}")
    for vocab in vocab_dim:
    
        accuracy = compute_accuracy(vocab, 1, model, test_list, df)
            
        print(f"Vocabulary size: {vocab}, max ngram: 1, accuracy: {accuracy}")


 alpha = 0.1
Vocabulary size: 500, max ngram: 1, accuracy: 0.658588362245483
Vocabulary size: 1000, max ngram: 1, accuracy: 0.7461336142644193
Vocabulary size: 5000, max ngram: 1, accuracy: 0.6687700665061346
Vocabulary size: 10000, max ngram: 1, accuracy: 0.664384101593854
Vocabulary size: 20000, max ngram: 1, accuracy: 0.6617385989483513
Vocabulary size: 30000, max ngram: 1, accuracy: 0.6617385989483513
Vocabulary size: 50000, max ngram: 1, accuracy: 0.6617385989483513
Vocabulary size: 100000, max ngram: 1, accuracy: 0.6709978582076106

 alpha = 1
Vocabulary size: 500, max ngram: 1, accuracy: 0.6594934026242075
Vocabulary size: 1000, max ngram: 1, accuracy: 0.7519590438514587
Vocabulary size: 5000, max ngram: 1, accuracy: 0.6972788384359592
Vocabulary size: 10000, max ngram: 1, accuracy: 0.6928928735236785
Vocabulary size: 20000, max ngram: 1, accuracy: 0.697766167870657
Vocabulary size: 30000, max ngram: 1, accuracy: 0.6933802029583763
Vocabulary size: 50000, max ngram: 1, accuracy

Итого мы оставляем для дальнейшего исследования:

1) логистическая регрессия: (5000-100000, 1)
2) SVM: (5000-10000, 1)
3) ~случайный лес: (1-4, 50-500)
4) бернулли: (50000-100000, 4)
5) KNN: (3, 100-1000, 1)

Теперь рассмотрим некоторые ансамбли

best: (1, 1000), (5, 100000), (0.1, 1000)
OK: alpha: 5, (10, 5000-100000)

In [74]:
import xgboost as xgb

def acc_xgb(vocab, ngram, test_list, df):
    accs = []
        
    for author in test_list:
        
        train_df = df[df['author'] != author]
        test_df = df[df['author'] == author]
        
        less = min(train_df.target.value_counts())
        balanced_df = pd.concat((train_df[train_df['target'] == 0].sample(less, random_state=8), train_df[train_df['target'] != 0].sample(less, random_state=8)))
        
        tfidf = TfidfVectorizer(max_features=vocab, ngram_range=(1, ngram))
    
        X_train = tfidf.fit_transform(balanced_df['text']).toarray()
        y_train = np.asarray(balanced_df['target'])
        dtrain = xgb.DMatrix(X_train, label=y_train)
        
        X_test = tfidf.transform(test_df['text']).toarray() 
        y_test = np.asarray(test_df['target'])
        dtest = xgb.DMatrix(X_test, label=y_test)
    
        params = {
            'objective': 'binary:logistic',
            'eval_metric': 'logloss',
            'use_label_encoder': False,
            'verbosity': 0
        }
    
        model = xgb.train(params, dtrain, num_boost_round=100)
        y_proba = model.predict(dtest)
        y_pre = (y_proba > 0.5).astype(int)
        accs.append(accuracy_score(y_test,y_pre))
    
    return statistics.fmean(accs)

In [75]:
for vocab in vocab_dim:
    for n in ngram:

        accuracy = acc_xgb(vocab, n, test_list, df)
        
        print(f"Vocabulary size: {vocab}, max ngram: {n}, accuracy: {accuracy}")

Vocabulary size: 500, max ngram: 1, accuracy: 0.701053082051534
Vocabulary size: 500, max ngram: 2, accuracy: 0.6224660916998378
Vocabulary size: 500, max ngram: 3, accuracy: 0.6685187232787851
Vocabulary size: 500, max ngram: 4, accuracy: 0.6268520566121185
Vocabulary size: 1000, max ngram: 1, accuracy: 0.7028222107556473
Vocabulary size: 1000, max ngram: 2, accuracy: 0.7262723802971481
Vocabulary size: 1000, max ngram: 3, accuracy: 0.7218864153848674
Vocabulary size: 1000, max ngram: 4, accuracy: 0.7262723802971481
Vocabulary size: 5000, max ngram: 1, accuracy: 0.7269711247071928
Vocabulary size: 5000, max ngram: 2, accuracy: 0.7453043351843661
Vocabulary size: 5000, max ngram: 3, accuracy: 0.7180225891526201
Vocabulary size: 5000, max ngram: 4, accuracy: 0.738301534063918
Vocabulary size: 10000, max ngram: 1, accuracy: 0.7118674955362262
Vocabulary size: 10000, max ngram: 2, accuracy: 0.7194149589660426
Vocabulary size: 10000, max ngram: 3, accuracy: 0.7247146665683818
Vocabulary si

In [81]:
model = AdaBoostClassifier(estimator=LogisticRegression(), n_estimators=100, algorithm='SAMME', random_state=12)

for vocab in vocab_dim:

    accuracy = compute_accuracy(vocab, 1, model, test_list, df)
        
    print(f"Vocabulary size: {vocab}, max ngram: {1}, accuracy: {accuracy}")

Vocabulary size: 500, max ngram: 1, accuracy: 0.7236381394663128
Vocabulary size: 1000, max ngram: 1, accuracy: 0.618448183366914
Vocabulary size: 5000, max ngram: 1, accuracy: 0.7160845332284961
Vocabulary size: 10000, max ngram: 1, accuracy: 0.6999678526381312
Vocabulary size: 20000, max ngram: 1, accuracy: 0.7071559617999247
Vocabulary size: 30000, max ngram: 1, accuracy: 0.6277212639441742
Vocabulary size: 50000, max ngram: 1, accuracy: 0.6731647337297493
Vocabulary size: 100000, max ngram: 1, accuracy: 0.6683871648074434


In [83]:
model = AdaBoostClassifier(estimator=LogisticRegression(), n_estimators=100, algorithm='SAMME', random_state=12)

for vocab in vocab_dim:

    accuracy = compute_accuracy(vocab, 4, model, test_list, df)
        
    print(f"Vocabulary size: {vocab}, max ngram: {4}, accuracy: {accuracy}")

Vocabulary size: 500, max ngram: 4, accuracy: 0.6703538871689027
Vocabulary size: 1000, max ngram: 4, accuracy: 0.8303806902878111
Vocabulary size: 5000, max ngram: 4, accuracy: 0.7471631488852851
Vocabulary size: 10000, max ngram: 4, accuracy: 0.7805599579012892
Vocabulary size: 20000, max ngram: 4, accuracy: 0.7085570339246811
Vocabulary size: 30000, max ngram: 4, accuracy: 0.6291658476256
Vocabulary size: 50000, max ngram: 4, accuracy: 0.8708638426130686
Vocabulary size: 100000, max ngram: 4, accuracy: 0.7184341572886465


In [85]:
model = AdaBoostClassifier(estimator=BernoulliNB(), n_estimators=100, algorithm='SAMME', random_state=12)

for vocab in vocab_dim:

    accuracy = compute_accuracy(vocab, 1, model, test_list, df)
        
    print(f"Vocabulary size: {vocab}, max ngram: {1}, accuracy: {accuracy}")

Vocabulary size: 500, max ngram: 1, accuracy: 0.6273906784936197
Vocabulary size: 1000, max ngram: 1, accuracy: 0.719421101774043
Vocabulary size: 5000, max ngram: 1, accuracy: 0.6452974757154324
Vocabulary size: 10000, max ngram: 1, accuracy: 0.7464629711533736
Vocabulary size: 20000, max ngram: 1, accuracy: 0.6791180361033303
Vocabulary size: 30000, max ngram: 1, accuracy: 0.7641884531590414
Vocabulary size: 50000, max ngram: 1, accuracy: 0.6514161220043573
Vocabulary size: 100000, max ngram: 1, accuracy: 0.6886009959539372


In [89]:
from sklearn.ensemble import VotingClassifier, StackingClassifier

estimators=[('svm', svm.SVC(kernel='sigmoid', gamma=1.0, probability=True)), 
             ('nb', BernoulliNB()), 
             ('log', LogisticRegression())]

model = StackingClassifier(estimators=estimators, final_estimator=RandomForestClassifier())


for vocab in vocab_dim:

    accuracy = compute_accuracy(vocab, 1, model, test_list, df)
        
    print(f"Vocabulary size: {vocab}, max ngram: {1}, accuracy: {accuracy}")

Vocabulary size: 500, max ngram: 1, accuracy: 0.6400908316542991
Vocabulary size: 1000, max ngram: 1, accuracy: 0.7540885506249283
Vocabulary size: 5000, max ngram: 1, accuracy: 0.6791294002981308
Vocabulary size: 10000, max ngram: 1, accuracy: 0.7687944329778694
Vocabulary size: 20000, max ngram: 1, accuracy: 0.7857936303176242
Vocabulary size: 30000, max ngram: 1, accuracy: 0.7104080667354661
Vocabulary size: 50000, max ngram: 1, accuracy: 0.6038134552066441
Vocabulary size: 100000, max ngram: 1, accuracy: 0.6999652931347978


In [90]:
estimators=[('svm', svm.SVC(probability=True)), 
             ('nb', BernoulliNB()), 
             ('log', LogisticRegression()),
             ('rid', RidgeClassifier(alpha=5))]

model = StackingClassifier(estimators=estimators, final_estimator=RandomForestClassifier())


for vocab in vocab_dim:

    accuracy = compute_accuracy(vocab, 1, model, test_list, df)
        
    print(f"Vocabulary size: {vocab}, max ngram: {1}, accuracy: {accuracy}")

Vocabulary size: 500, max ngram: 1, accuracy: 0.6903430963028486
Vocabulary size: 1000, max ngram: 1, accuracy: 0.6377315838616148
Vocabulary size: 5000, max ngram: 1, accuracy: 0.655807205923305
Vocabulary size: 10000, max ngram: 1, accuracy: 0.7810995012039904
Vocabulary size: 20000, max ngram: 1, accuracy: 0.7814685815846807
Vocabulary size: 30000, max ngram: 1, accuracy: 0.7688553491572067
Vocabulary size: 50000, max ngram: 1, accuracy: 0.7264699739544941
Vocabulary size: 100000, max ngram: 1, accuracy: 0.722860152833063


In [92]:
estimators=[('svm', svm.SVC(probability=True)), 
             ('nb', BernoulliNB()), 
             ('log', LogisticRegression())]

model = StackingClassifier(estimators=estimators, final_estimator=RandomForestClassifier())


for vocab in [5000, 10000, 20000, 30000, 50000]:

    accuracy = compute_accuracy(vocab, 1, model, test_list, df)
        
    print(f"Vocabulary size: {vocab}, max ngram: {1}, accuracy: {accuracy}")

Vocabulary size: 5000, max ngram: 1, accuracy: 0.683088951955051
Vocabulary size: 10000, max ngram: 1, accuracy: 0.7762262068570118
Vocabulary size: 20000, max ngram: 1, accuracy: 0.76432144495225
Vocabulary size: 30000, max ngram: 1, accuracy: 0.7480307181352073
Vocabulary size: 50000, max ngram: 1, accuracy: 0.7347545743443576


In [93]:
estimators=[('svm', svm.SVC(probability=True)), 
             ('log', LogisticRegression()),
             ('rid', RidgeClassifier(alpha=5))]

model = StackingClassifier(estimators=estimators, final_estimator=RandomForestClassifier())


for vocab in [5000, 10000, 20000, 30000, 50000]:

    accuracy = compute_accuracy(vocab, 1, model, test_list, df)
        
    print(f"Vocabulary size: {vocab}, max ngram: {1}, accuracy: {accuracy}")

Vocabulary size: 5000, max ngram: 1, accuracy: 0.6672942568840402
Vocabulary size: 10000, max ngram: 1, accuracy: 0.6651560477992366
Vocabulary size: 20000, max ngram: 1, accuracy: 0.7476626615558505
Vocabulary size: 30000, max ngram: 1, accuracy: 0.7590514275885792
Vocabulary size: 50000, max ngram: 1, accuracy: 0.7644407178075908


In [96]:
from xgboost import XGBClassifier

estimators=[('svm', svm.SVC(probability=True)), 
             ('nb', BernoulliNB()), 
             ('log', LogisticRegression()),
             ('rid', RidgeClassifier(alpha=5))]

model = StackingClassifier(estimators=estimators, final_estimator=XGBClassifier(n_estimators=100, max_depth=3))


for vocab in [5000, 10000, 20000, 30000, 50000]:

    accuracy = compute_accuracy(vocab, 1, model, test_list, df)
        
    print(f"Vocabulary size: {vocab}, max ngram: {1}, accuracy: {accuracy}")

Vocabulary size: 5000, max ngram: 1, accuracy: 0.6117987984667551
Vocabulary size: 10000, max ngram: 1, accuracy: 0.6860825470539093
Vocabulary size: 20000, max ngram: 1, accuracy: 0.7557322636657001
Vocabulary size: 30000, max ngram: 1, accuracy: 0.7818366381640375
Vocabulary size: 50000, max ngram: 1, accuracy: 0.694229548544564


In [97]:
estimators=[('svm', svm.SVC(probability=True)), 
             ('nb', BernoulliNB()), 
             ('log', LogisticRegression()),
             ('rid', RidgeClassifier(alpha=5))]

model = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression())


for vocab in [5000, 10000, 20000, 30000, 50000]:

    accuracy = compute_accuracy(vocab, 1, model, test_list, df)
        
    print(f"Vocabulary size: {vocab}, max ngram: {1}, accuracy: {accuracy}")

Vocabulary size: 5000, max ngram: 1, accuracy: 0.6612251625796518
Vocabulary size: 10000, max ngram: 1, accuracy: 0.7310612110341212
Vocabulary size: 20000, max ngram: 1, accuracy: 0.665332653529248
Vocabulary size: 30000, max ngram: 1, accuracy: 0.7205575212541157
Vocabulary size: 50000, max ngram: 1, accuracy: 0.741234212983439


In [98]:
estimators=[('svm', svm.SVC(probability=True)), 
             ('nb', BernoulliNB()), 
             ('log', LogisticRegression()),
             ('rid', RidgeClassifier(alpha=5))]

model = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression(penalty='l2', solver='liblinear'))


for vocab in [5000, 10000, 20000, 30000, 50000]:

    accuracy = compute_accuracy(vocab, 1, model, test_list, df)
        
    print(f"Vocabulary size: {vocab}, max ngram: {1}, accuracy: {accuracy}")

Vocabulary size: 5000, max ngram: 1, accuracy: 0.6559254549773126
Vocabulary size: 10000, max ngram: 1, accuracy: 0.733619690566285
Vocabulary size: 20000, max ngram: 1, accuracy: 0.7443670450636395
Vocabulary size: 30000, max ngram: 1, accuracy: 0.724029743476338
Vocabulary size: 50000, max ngram: 1, accuracy: 0.7324622831588776


In [99]:
estimators=[('svm', svm.SVC(probability=True)), 
             ('nb', BernoulliNB()), 
             ('log', LogisticRegression()),
             ('rid', RidgeClassifier(alpha=5))]

model = StackingClassifier(estimators=estimators, final_estimator=svm.SVC())


for vocab in [5000, 10000, 20000, 30000, 50000]:

    accuracy = compute_accuracy(vocab, 1, model, test_list, df)
        
    print(f"Vocabulary size: {vocab}, max ngram: {1}, accuracy: {accuracy}")

Vocabulary size: 5000, max ngram: 1, accuracy: 0.6885069086113978
Vocabulary size: 10000, max ngram: 1, accuracy: 0.7849981366815731
Vocabulary size: 20000, max ngram: 1, accuracy: 0.7806121717692925
Vocabulary size: 30000, max ngram: 1, accuracy: 0.7810995012039904
Vocabulary size: 50000, max ngram: 1, accuracy: 0.7688553491572067


In [103]:
estimators=[('knn', KNeighborsClassifier(n_neighbors=3, metric='cosine')), 
             ('rfc', RandomForestClassifier())]

model = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression(penalty='l2', solver='liblinear'))

for vocab in [100, 300, 500, 1000]:
        
            accuracy = compute_accuracy(vocab, 1, model, test_list, df)
        
            print(f"Vocabulary size: {vocab}, max ngram: {1}, accuracy: {accuracy}")

Vocabulary size: 100, max ngram: 1, accuracy: 0.7235567472603076
Vocabulary size: 300, max ngram: 1, accuracy: 0.6797595500188379
Vocabulary size: 500, max ngram: 1, accuracy: 0.7973876685176995
Vocabulary size: 1000, max ngram: 1, accuracy: 0.7061746482218618


In [104]:
estimators=[('knn', KNeighborsClassifier(n_neighbors=3, metric='cosine')), 
             ('rfc', RandomForestClassifier())]

model = StackingClassifier(estimators=estimators, final_estimator=svm.SVC())

for vocab in [100, 300, 500, 1000]:
        
            accuracy = compute_accuracy(vocab, 1, model, test_list, df)
        
            print(f"Vocabulary size: {vocab}, max ngram: {1}, accuracy: {accuracy}")

Vocabulary size: 100, max ngram: 1, accuracy: 0.7082980121873311
Vocabulary size: 300, max ngram: 1, accuracy: 0.6952348190738283
Vocabulary size: 500, max ngram: 1, accuracy: 0.810994500139237
Vocabulary size: 1000, max ngram: 1, accuracy: 0.6796628007928317


In [106]:
estimators=[('svm', svm.SVC(probability=True)), 
             ('nb', BernoulliNB()), 
             ('log', LogisticRegression())]

model = StackingClassifier(estimators=estimators, final_estimator=svm.SVC())


for vocab in [5000, 10000, 20000, 30000, 50000]:

    accuracy = compute_accuracy(vocab, 1, model, test_list, df)
        
    print(f"Vocabulary size: {vocab}, max ngram: {1}, accuracy: {accuracy}")

Vocabulary size: 5000, max ngram: 1, accuracy: 0.6885069086113978
Vocabulary size: 10000, max ngram: 1, accuracy: 0.7893841015938539
Vocabulary size: 20000, max ngram: 1, accuracy: 0.7762262068570118
Vocabulary size: 30000, max ngram: 1, accuracy: 0.7898714310285517
Vocabulary size: 50000, max ngram: 1, accuracy: 0.7688553491572067


In [105]:
estimators=[('knn', KNeighborsClassifier(n_neighbors=3, metric='cosine')), 
             ('rfc', RandomForestClassifier())]

model = StackingClassifier(estimators=estimators, final_estimator=XGBClassifier(n_estimators=100, max_depth=3))

for vocab in [100, 300, 500, 1000]:
        
            accuracy = compute_accuracy(vocab, 1, model, test_list, df)
        
            print(f"Vocabulary size: {vocab}, max ngram: {1}, accuracy: {accuracy}")

Vocabulary size: 100, max ngram: 1, accuracy: 0.8446537094369911
Vocabulary size: 300, max ngram: 1, accuracy: 0.6805888290988911
Vocabulary size: 500, max ngram: 1, accuracy: 0.8392319647157108
Vocabulary size: 1000, max ngram: 1, accuracy: 0.7277493161007093


Теперь отбираем несколько лучших моделей, и протестируем их "полностью", осуществив leave-one-out по всем авторам без исключения.