In [None]:
import spacy
import pandas as pd
import lightgbm as lgb
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import VotingClassifier, RandomForestClassifier, GradientBoostingClassifier

# Load the spaCy pipeline once from the attached Kaggle input directory; the resulting
# `nlp` object is shared by `preprocess_text` and `data_pipeline` below.
nlp = spacy.load('/kaggle/input/models/en_core_web_md_model')

def preprocess_text(text):
    '''Return `text` as a space-joined string of lemmas, skipping punctuation and stop words.'''
    kept_lemmas = []
    for tok in nlp(text):
        # Drop tokens that spaCy flags as punctuation or as stop words.
        if tok.is_punct or tok.is_stop:
            continue
        kept_lemmas.append(tok.lemma_)
    return ' '.join(kept_lemmas)

daigt = pd.read_csv('/kaggle/input/kaggle-llm-competition-collected-data/train_v2_drcat_02.csv')  # contains student written essays
mistral = pd.read_csv('/kaggle/input/kaggle-llm-competition-collected-data/Mistral7B_CME_v7.csv')  # all generated
mine = pd.read_csv('/kaggle/input/kaggle-llm-competition-collected-data/train_essays_chatgpt_manual.csv')

# This file names its target column `label`; rename it so all three sources share `generated`.
daigt = daigt.rename(columns={'label': 'generated'})
# Keep only the rows whose `RDizzl3_seven` flag is True (explicit comparison kept from the
# original so non-boolean / missing flags are excluded rather than raising).
daigt = daigt[daigt['RDizzl3_seven'] == True]

# Reduce every source to the `text` / `generated` columns with a non-mutating selection.
# The filtered `daigt` above is the result of boolean indexing, so dropping columns on it
# in place can raise SettingWithCopyWarning; selecting into new frames avoids that.
# Using a comprehension also keeps the loop variable from staying bound to the last frame
# after the `del` below.
all_data = pd.concat(
    [frame.loc[:, frame.columns.isin(['text', 'generated'])] for frame in (daigt, mistral, mine)],
    ignore_index=True,
)

# The per-source frames are no longer needed once merged.
del daigt, mistral, mine

def data_pipeline(df):
    '''Turn raw essays into document-vector features.

       Each entry of `df['text']` is cleaned with `preprocess_text`, and the cleaned string is
       run through `nlp` again to take its document `.vector`.

       Input: DataFrame with `text` column and optional 'generated' column. The frame is only
              read, never modified, so the same frame can safely be passed again (for example
              when a notebook cell is re-run).
       Output: DataFrame of shape (n_samples, vector_dim) with one column per vector component
               and the same index as `df` (vector_dim is 300 according to the original notes
               for the loaded model), along with the 'generated' Series if present,
               i.e. `(X, y)` for labelled data and just `X` otherwise.'''

    # Work on derived Series instead of adding/dropping columns on the caller's frame.
    processed_text = df['text'].map(preprocess_text)
    doc_vectors = processed_text.map(lambda text: nlp(text).vector)

    # Build the feature matrix in one constructor call: each vector becomes one row.
    # This replaces the per-row `Series` construction of `.apply(pd.Series)`.
    X = pd.DataFrame(doc_vectors.tolist(), index=df.index)

    if 'generated' in df.columns:
        return X, df['generated']

    # This is for X_test.
    return X

In [None]:
# `all_data` is expected to carry a `generated` column, in which case `data_pipeline`
# returns both the feature matrix and the label Series.
X, y = data_pipeline(all_data)

In [None]:
# Seed for the scikit-learn ensembles below whose default `random_state=None` would
# otherwise produce a different fit (and different submission) on every run.
SEED = 42

# (name, estimator) pairs combined by the voting ensemble. Estimators without an explicit
# seed are left at their library defaults.
estimators = [('lr', LogisticRegression()),
              ('rf', RandomForestClassifier(random_state=SEED)),
              ('knn', KNeighborsClassifier()),
              ('gb', GradientBoostingClassifier(random_state=SEED)),
              ('xgb', XGBClassifier(objective='binary:logistic')),
              ('lgb', lgb.LGBMClassifier()),
              ('cb', CatBoostClassifier(loss_function='Logloss'))
             ]

# Soft voting combines the members' predicted class probabilities rather than their
# hard class votes; `verbose=True` reports progress while each member is fitted.
model = VotingClassifier(estimators=estimators, voting='soft', verbose=True)

model.fit(X, y)

In [None]:
# Read the competition test essays and featurize them; without a `generated` column
# `data_pipeline` returns only the feature matrix.
test_data = pd.read_csv('/kaggle/input/llm-detect-ai-generated-text/test_essays.csv')
X_test = data_pipeline(test_data)

# Take the second probability column (presumably the `generated == 1` class - confirm
# against `model.classes_`) and label it with the submission column name.
y_test_pred = pd.Series(model.predict_proba(X_test)[:, 1]).rename('generated')

# Pair each essay id with its predicted probability and write the submission file.
output_df = pd.concat([test_data['id'], y_test_pred], axis='columns')
output_df.to_csv(path_or_buf='submission.csv', index=False)