In [1]:
import re
import spacy
import joblib
import pandas as pd
import lightgbm as lgb
from spellchecker import SpellChecker
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import VotingClassifier, RandomForestClassifier, GradientBoostingClassifier

nlp = spacy.load('en_core_web_md')
spell = SpellChecker()

def preprocess_text(text):
    '''Removes punctuation, stop words, whitespaces, newlines, corrects typos, and lemmatizes the tokens.'''

    text = re.sub(r'[\n\r]+', ' ', text)
    doc = nlp(text.strip())
    filtered_tokens = [token.lemma_.lower() for token in doc if not token.is_punct and not token.is_stop]
    misspelled = spell.unknown(filtered_tokens)

    for word in misspelled:
        corrected_word = spell.correction(word)  # problem was here
        if corrected_word != None:
            filtered_tokens[filtered_tokens.index(word)] = corrected_word
   
    return ' '.join(filtered_tokens)

def data_pipeline(df):
    '''Input: DataFrame with `text` column and optional 'generated' column.
       Output: 'word2vec_doc' Series with word2vec representation of shape (n_samples, 300) along with 'generated' Series if present.'''
    
    df['processed_text'] = df['text'].map(preprocess_text)
    df.drop('text', axis=1, inplace=True)
    
    df['word2vec_doc'] = df['processed_text'].apply(lambda text: nlp(text).vector)
    df.drop('processed_text', axis=1, inplace=True)
        
    X_train, X_val, y_train, y_val = train_test_split(df['word2vec_doc'].apply(pd.Series), df['generated'], stratify=df['generated'], test_size=0.2, shuffle=True)
    
    return X_train, X_val, y_train, y_val

In [2]:
daigt = pd.read_csv('data/train_v2_drcat_02.csv')
mistral = pd.read_csv('data/Mistral7B_CME_v7.csv')
mine = pd.read_csv('data/train_essays_chatgpt_manual.csv')

daigt.rename(columns={'label': 'generated'}, inplace=True)

for df in (daigt, mistral, mine):
    df.drop(df.columns.difference(['text', 'generated']), axis=1, inplace=True)

all_data = pd.concat([daigt, mistral, mine], ignore_index=True)
del daigt, mistral, mine

X_train, X_val, y_train, y_val = data_pipeline(all_data)

In [7]:
folder_path = 'data/best_data'

X_train.to_csv(f'{folder_path}/X_train.csv', index=False)
X_val.to_csv(f'{folder_path}/X_val.csv', index=False)
y_train.to_csv(f'{folder_path}/y_train.csv', index=False)
y_val.to_csv(f'{folder_path}/y_val.csv', index=False)

In [3]:
estimators = [('lr', LogisticRegression()),
              ('rf', RandomForestClassifier()),
              ('knn', KNeighborsClassifier()),
              ('gb', GradientBoostingClassifier()),
              ('xgb', XGBClassifier(objective='binary:logistic')),
              ('lgb', lgb.LGBMClassifier()),
              ('cb', CatBoostClassifier(loss_function='Logloss'))
             ]

model = VotingClassifier(estimators=estimators, voting='soft', verbose=False)

model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[LightGBM] [Info] Number of positive: 17997, number of negative: 21897
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.044145 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 76500
[LightGBM] [Info] Number of data points in the train set: 39894, number of used features: 300
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.451120 -> initscore=-0.196145
[LightGBM] [Info] Start training from score -0.196145
Learning rate set to 0.049719
0:	learn: 0.6538881	total: 263ms	remaining: 4m 22s
1:	learn: 0.6181003	total: 310ms	remaining: 2m 34s
2:	learn: 0.5843245	total: 352ms	remaining: 1m 56s
3:	learn: 0.5532080	total: 394ms	remaining: 1m 38s
4:	learn: 0.5282405	total: 439ms	remaining: 1m 27s
5:	learn: 0.5038493	total: 484ms	remaining: 1m 20s
6:	learn: 0.4847390	total: 528ms	remaining: 1m 14s
7:	learn: 0.4669962	total: 574ms	remaining: 1m 11s
8:	learn: 0.4505226	total: 619ms	remaining: 1m 8s
9:	learn: 0.4362

In [4]:
y_pred = model.predict(X_val)
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

           0       0.97      0.99      0.98      5474
           1       0.99      0.97      0.98      4500

    accuracy                           0.98      9974
   macro avg       0.98      0.98      0.98      9974
weighted avg       0.98      0.98      0.98      9974



In [5]:
joblib.dump(model, 'models/model_w2v_vc.joblib')

['models/model_w2v_vc']