In [None]:
# This file trains different models with different text representation techniques and compares their F1 scores.

In [1]:
import pandas as pd
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression

In [None]:
train_data = pd.read_csv('data/compiled_processed_train_data.csv')
val_data = pd.read_csv('data/compiled_processed_val_data.csv')

nlp = spacy.load('en_core_web_md')

bow_vectorizer = CountVectorizer()
uni_bi_gram_vectorizer = CountVectorizer(ngram_range=(1, 2))
tfidf_vectorizer = TfidfVectorizer()

# initializing text representation features
for rep, vectorizer in zip(['bow', 'uni_bi_gram', 'tfidf'], [bow_vectorizer, uni_bi_gram_vectorizer, tfidf_vectorizer]):
    train_data[rep] = vectorizer.fit_transform(train_data['processed_text']).toarray().tolist()
    val_data[rep] = vectorizer.transform(val_data['processed_text']).toarray().tolist()

for data in (train_data, val_data):
    data['word2vec_doc'] = data['text'].apply(lambda text: nlp(text).vector)

In [104]:
model_logistic = LogisticRegression()
model_rf = RandomForestClassifier()
model_gb = GradientBoostingClassifier()

In [112]:
y_train = train_data.generated
y_val = val_data.generated

for model in (model_logistic, model_rf, model_gb):
    for rep in ('bow', 'uni_bi_gram', 'tfidf', 'word2vec_doc'):
        # flattening text representation column from lists into separate columns
        X_train = pd.concat([train_data['prompt_id'], train_data[rep].apply(lambda x: pd.Series(x))], axis=1)
        X_train.columns = X_train.columns.astype(str)
        X_val = pd.concat([val_data['prompt_id'], val_data[rep].apply(lambda x: pd.Series(x))], axis=1)
        X_val.columns = X_val.columns.astype(str)
        
        model.fit(X_train, y_train)
        
        y_pred = model.predict(X_val)
        print(f'Model - {model} | Representation - {rep}')
        print(classification_report(y_val, y_pred))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Model - LogisticRegression() | Representation - bow
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       275
           1       0.99      0.99      0.99       161

    accuracy                           1.00       436
   macro avg       1.00      1.00      1.00       436
weighted avg       1.00      1.00      1.00       436



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Model - LogisticRegression() | Representation - uni_bi_gram
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       275
           1       0.99      0.99      0.99       161

    accuracy                           1.00       436
   macro avg       1.00      1.00      1.00       436
weighted avg       1.00      1.00      1.00       436

Model - LogisticRegression() | Representation - tfidf
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       275
           1       1.00      0.99      1.00       161

    accuracy                           1.00       436
   macro avg       1.00      1.00      1.00       436
weighted avg       1.00      1.00      1.00       436



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Model - LogisticRegression() | Representation - word2vec_doc
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       275
           1       1.00      0.99      1.00       161

    accuracy                           1.00       436
   macro avg       1.00      1.00      1.00       436
weighted avg       1.00      1.00      1.00       436

Model - RandomForestClassifier() | Representation - bow
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       275
           1       1.00      0.99      1.00       161

    accuracy                           1.00       436
   macro avg       1.00      1.00      1.00       436
weighted avg       1.00      1.00      1.00       436

Model - RandomForestClassifier() | Representation - uni_bi_gram
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       275
           1       1.00      0.99      1.00       161

 