In [1]:
import re
import spacy
import joblib
import pandas as pd
from spellchecker import SpellChecker
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import VotingClassifier, RandomForestClassifier, GradientBoostingClassifier

In [None]:
nlp = spacy.load('en_core_web_md')
spell = SpellChecker()

def preprocess_text(text):
    '''Removes punctuation, stop words, whitespaces, newlines, corrects typos, and lemmatizes the tokens.'''

    text = re.sub(r'[\n\r]+', ' ', text)
    doc = nlp(text.strip())
    filtered_tokens = [token.lemma_.lower() for token in doc if not token.is_punct and not token.is_stop]
    misspelled = spell.unknown(filtered_tokens)

    for word in misspelled:
        corrected_word = spell.correction(word)  # problem was here
        if corrected_word != None:
            filtered_tokens[filtered_tokens.index(word)] = corrected_word
   
    return ' '.join(filtered_tokens)

def data_pipeline(df):
    '''Input: DataFrame with `text` and `generated` column.
       Output: DataFrame of word2vec representation with shape (n_samples, 300) and target `generated` Series.'''
    
    df['processed_text'] = df['text'].map(preprocess_text)
    df.drop('text', axis=1, inplace=True)
    
    df['word2vec_doc'] = df['processed_text'].apply(lambda text: nlp(text).vector)
    df.drop('processed_text', axis=1, inplace=True)

    X = df['word2vec_doc'].apply(pd.Series)
    y = df['generated']
    
    return X, y

In [2]:
daigt = pd.read_csv('data/train_v2_drcat_02.csv')
mistral = pd.read_csv('data/Mistral7B_CME_v7.csv')
mine = pd.read_csv('data/train_essays_chatgpt_manual.csv')

daigt.rename(columns={'label': 'generated'}, inplace=True)

for df in (daigt, mistral, mine):
    df.drop(df.columns.difference(['text', 'generated']), axis=1, inplace=True)

all_data = pd.concat([daigt, mistral, mine], ignore_index=True)
del daigt, mistral, mine

X, y = data_pipeline(all_data)

In [3]:
estimators = [('lr', LogisticRegression()),
              ('rf', RandomForestClassifier()),
              ('knn', KNeighborsClassifier()),
              ('gb', GradientBoostingClassifier()),
              ('xgb', XGBClassifier(objective='binary:logistic')),
              ('lgb', LGBMClassifier()),
              ('cb', CatBoostClassifier(loss_function='Logloss'))
             ]

model = VotingClassifier(estimators=estimators, voting='soft', verbose=False)

model.fit(X, y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[LightGBM] [Info] Number of positive: 22497, number of negative: 27371
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.077553 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 76500
[LightGBM] [Info] Number of data points in the train set: 49868, number of used features: 300
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.451131 -> initscore=-0.196102
[LightGBM] [Info] Start training from score -0.196102
Learning rate set to 0.05469
0:	learn: 0.6479776	total: 269ms	remaining: 4m 29s
1:	learn: 0.6092254	total: 313ms	remaining: 2m 35s
2:	learn: 0.5733756	total: 356ms	remaining: 1m 58s
3:	learn: 0.5424883	total: 402ms	remaining: 1m 40s
4:	learn: 0.5159419	total: 447ms	remaining: 1m 29s
5:	learn: 0.4912640	total: 494ms	remaining: 1m 21s
6:	learn: 0.4714630	total: 538ms	remaining: 1m 16s
7:	learn: 0.4523211	total: 585ms	remaining: 1m 12s
8:	learn: 0.4347050	total: 630ms	remaining: 1m 9s
9:	learn: 0.41825

In [4]:
joblib.dump(model, 'models/model_w2v_vc_all_data.joblib')

['models/model_w2v_vc_all_data']