In [1]:
# This notebook trains and saves a Gradient Boosting classifier whose text features are spaCy `en_core_web_md` document vectors (the "Word2Vec" text representation).

In [1]:
import pandas as pd
import spacy
import numpy as np
import joblib
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report

# Load the spaCy `en_core_web_md` pipeline once for the whole notebook; its
# per-document `.vector` is used as the text representation when the data is loaded.
nlp = spacy.load('en_core_web_md')

In [2]:
# Load the pre-split train/test CSVs and derive the model inputs from them.
train_data = pd.read_csv('data/compiled_processed_train_data.csv')
test_data = pd.read_csv('data/compiled_processed_test_data.csv')


def _embed_texts(texts: pd.Series) -> pd.Series:
    """Return one spaCy document vector per entry of ``texts``.

    The texts are streamed through ``nlp.pipe`` so spaCy can process them in
    batches instead of being invoked once per row; each element of the result
    is the ``doc.vector`` of the corresponding text.

    Parameters
    ----------
    texts : pd.Series
        Raw text strings. Missing values are not handled here.

    Returns
    -------
    pd.Series
        Object-dtype Series of vectors carrying the same index as ``texts``.
    """
    vectors = [doc.vector for doc in nlp.pipe(texts)]
    # dtype=object keeps each vector as a single cell rather than letting
    # pandas try to interpret the list of arrays as 2-D data.
    return pd.Series(vectors, index=texts.index, dtype=object)


def _to_feature_frame(vectors: pd.Series) -> pd.DataFrame:
    """Expand a Series of equal-length vectors into one column per dimension.

    Stacking the vectors with ``np.vstack`` builds the matrix in one step,
    replacing the much slower ``Series.apply(pd.Series)`` which creates a new
    Series object for every row.

    Parameters
    ----------
    vectors : pd.Series
        Series whose cells are 1-D arrays of identical length.

    Returns
    -------
    pd.DataFrame
        Frame indexed like ``vectors`` with string column labels
        ``'0', '1', ...``.
    """
    features = pd.DataFrame(np.vstack(vectors.tolist()), index=vectors.index)
    # Column labels are cast to str, as before — presumably so scikit-learn
    # sees string feature names; anything that feeds the saved model must use
    # the same labels.
    features.columns = features.columns.astype(str)
    return features


# Keep the raw vectors on the frames as well as building the feature matrices.
for data in (train_data, test_data):
    data['word2vec_doc'] = _embed_texts(data['text'])

X_train = _to_feature_frame(train_data['word2vec_doc'])
X_test = _to_feature_frame(test_data['word2vec_doc'])

# Target column: `generated` (two classes, 0 and 1, in the report below).
y_train = train_data.generated
y_test = test_data.generated

In [4]:
# Fixed seed so that re-running the notebook trains the same model; without it
# the estimator draws from the global NumPy random state and the saved model
# and the report are not guaranteed to be reproducible.
RANDOM_STATE = 42

model = GradientBoostingClassifier(random_state=RANDOM_STATE)
model.fit(X_train, y_train)

# Evaluate on the held-out test split. classification_report returns a
# formatted string, so it is printed rather than displayed as a repr.
# NOTE(review): the report recorded below came from an unseeded run, so the
# numbers may shift slightly on the first seeded re-run.
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       275
           1       0.99      0.99      0.99       161

    accuracy                           1.00       436
   macro avg       1.00      1.00      1.00       436
weighted avg       1.00      1.00      1.00       436



In [5]:
# Persist the fitted classifier to disk. The call is left as the cell's last
# expression so the list of written file paths it returns is displayed.
joblib.dump(
    value=model,
    filename='models/model_word2vec_GB_no_promptid.joblib',
)

['models/model_word2vec_GB_no_promptid.joblib']