In [1]:
# This submission uses a Word2Vec-style text representation (spaCy document vectors) and a Gradient Boosting model.
# The final submission must be a .csv file with columns=['id', 'generated'], where 'generated' is a probability.

In [2]:
import pandas as pd
import spacy
import numpy as np
import joblib
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report

# spaCy pipeline used later to turn each text into a document vector (`Doc.vector`).
SPACY_MODEL_NAME = 'en_core_web_md'
nlp = spacy.load(SPACY_MODEL_NAME)

In [3]:
# Load the pre-split train/validation sets and build the model inputs:
# the prompt_id column followed by one column per spaCy document-vector dimension.
train_data = pd.read_csv('data/compiled_processed_train_data.csv')
val_data = pd.read_csv('data/compiled_processed_val_data.csv')


def _embed_texts(texts):
    """Return a 2-D array holding one spaCy document vector per input text.

    Texts are streamed through ``nlp.pipe`` so they are processed in batches
    instead of issuing one ``nlp(text)`` call per row.

    Parameters
    ----------
    texts : iterable of str
        Raw documents to embed.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(texts), vector_dim)``.
    """
    vectors = [doc.vector for doc in nlp.pipe(texts)]
    if not vectors:
        # np.stack rejects an empty list; return a well-formed (0, dim) array instead.
        return np.empty((0, nlp.vocab.vectors_length), dtype=np.float32)
    return np.stack(vectors)


def _build_features(data):
    """Build the feature frame for one split and attach the vectors to ``data``.

    Side effect: adds/overwrites the ``'word2vec_doc'`` column on ``data`` with
    the per-row document vectors, as the original cell did.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the ``'text'`` and ``'prompt_id'`` columns.

    Returns
    -------
    pandas.DataFrame
        ``'prompt_id'`` followed by columns ``'0'``, ``'1'``, ... (string labels),
        one per vector dimension, aligned on ``data.index``.
    """
    embeddings = _embed_texts(data['text'])
    # Keep the per-row vector column so later cells that read it still work.
    data['word2vec_doc'] = list(embeddings)
    # Build the whole matrix in one shot rather than expanding a pd.Series per row.
    vector_frame = pd.DataFrame(
        embeddings,
        index=data.index,
        columns=[str(i) for i in range(embeddings.shape[1])],
    )
    return pd.concat([data['prompt_id'], vector_frame], axis=1)


X_train = _build_features(train_data)
X_val = _build_features(val_data)

y_train = train_data.generated
y_val = val_data.generated

In [4]:
# Fix the seed so that Restart & Run All reproduces the same fitted model and report.
RANDOM_STATE = 42

model = GradientBoostingClassifier(random_state=RANDOM_STATE)
model.fit(X_train, y_train)

# Hard class predictions are used only for this validation report.
# NOTE(review): the features include prompt_id and the recorded validation scores
# are near-perfect — confirm prompt_id does not leak the label before relying on them.
y_pred = model.predict(X_val)
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       275
           1       1.00      0.99      1.00       161

    accuracy                           1.00       436
   macro avg       1.00      1.00      1.00       436
weighted avg       1.00      1.00      1.00       436



In [5]:
# Persist the fitted classifier to disk; joblib.dump's return value is shown as the cell output.
MODEL_PATH = 'models/model_w2v_gb_promptid.joblib'
joblib.dump(model, MODEL_PATH)

['models/model_word2vec_GB_with_promptid.joblib']