In [10]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [11]:
import os

os.chdir('/content/drive/MyDrive/NLP Project/src')
os.getcwd()

'/content/drive/MyDrive/NLP Project/src'

In [12]:
from bert_tfidf import *
from transformers import AutoModel
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import numpy as np
def load_model(model_path, labels2id, test_features, model_name="roberta-base"):
  pretrained_model = AutoModel.from_pretrained(model_name)
  concat_model = BERT_TFIDF_Classifier(pretrained_model, num_classes=len(labels2id), tfidf_size=test_features.shape[1])
  concat_model = nn.DataParallel(concat_model)
  
  concat_model.load_state_dict(torch.load(model_path))
  return concat_model

def plot_cm(preds,true_label):
  cm = confusion_matrix(true_label, preds)

  # plot the confusion matrix
  classes = ["Human", "Machine"]
  plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
  plt.title('Confusion Matrix')
  plt.colorbar()
  tick_marks = np.arange(len(classes))
  plt.xticks(tick_marks, classes, rotation=45)
  plt.yticks(tick_marks, classes)
  thresh = cm.max() / 2.
  for i, j in np.ndindex(cm.shape):
        plt.text(j, i, format(cm[i, j], 'd'),
                horizontalalignment="center",
                color="white" if cm[i, j] > thresh else "black")
  plt.xlabel('Predicted Label')
  plt.ylabel('True Label')
  plt.show()

In [13]:
train_df = pd.read_csv('../nlp-data/liwc_pos_dep_tr.csv')
test_df = pd.read_csv('../nlp-data/liwc_pos_dep_eval.csv')
reddit_df = pd.read_csv('../nlp-data/liwc_pos_dep_reddit.csv')

labels2id = {l: i for i, l in enumerate(train_df['alg'].unique())}

vectorizer = joblib.load("../New/models/roberta_base_pos_dep_liwc/vectorizer.pkl")
test_features = get_features_test(test_df, vectorizer, numerical_fields=["semantic_coherence", "Analytic", "WPS", "article", "Period"])
model = load_model("../New/models_state/roberta_base_pos_dep_liwc/checkpoint_epoch=4-val_loss=0.16928044552332722.ckpt",labels2id,test_features)
test_dataloader, reddit_dataloader = data_batcher_evaluate(test_df, reddit_df, vectorizer, labels2id, numerical_fields=["semantic_coherence", "Analytic", "WPS", "article", "Period"], batch_size=32, model_name="roberta-base")

# For AA Paper
preds, true_label = evaluate_test(model, test_dataloader, labels2id, test_df["alg"])
preds = ['machine' if item != 'human' else item for item in preds]
true_label = true_label.replace(['fair', 'grover', 'gpt2', 'gpt3', 'instructgpt', 'gpt', 'ctrl', 'pplm', 'xlnet', 'xlm'],'machine')
print("-------------------AA PAPER-----------------")
print(classification_report(preds,true_label))
print()

plot_cm(preds,true_label)

# For Reddit
preds, true_label = evaluate_test(model, reddit_dataloader, labels2id, reddit_df["alg"])
preds = ['machine' if item != 'human' else item for item in preds]
true_label = true_label.replace(['fair', 'grover', 'gpt2', 'gpt3', 'instructgpt', 'gpt', 'ctrl', 'pplm', 'xlnet', 'xlm'],'machine')
print("-------------------REDDIT DATA-----------------")
print(classification_report(preds,true_label))
print()
plot_cm(preds,true_label)



Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  tfidf_vector = torch.tensor(batch['tfidf_vector'], dtype=torch.float).to(device)


              precision    recall  f1-score   support

        ctrl       0.99      0.98      0.99       107
        fair       0.98      0.95      0.96       111
         gpt       1.00      1.00      1.00       106
        gpt2       0.96      0.96      0.96       107
        gpt3       0.98      0.73      0.84       143
      grover       0.93      0.98      0.96       102
       human       0.99      0.97      0.98       216
 instructgpt       0.63      0.99      0.77        68
        pplm       0.99      1.00      1.00       105
         xlm       0.99      1.00      1.00       106
       xlnet       1.00      0.99      1.00       108

    accuracy                           0.95      1279
   macro avg       0.95      0.96      0.95      1279
weighted avg       0.96      0.95      0.95      1279

-------------------AA PAPER-----------------
              precision    recall  f1-score   support

       human       0.99      0.97      0.98       216
     machine       0.99      1.00

  tfidf_vector = torch.tensor(batch['tfidf_vector'], dtype=torch.float).to(device)


              precision    recall  f1-score   support

        ctrl       0.00      0.00      0.00         1
         gpt       0.00      0.00      0.00        63
        gpt2       0.00      0.00      0.00         1
        gpt3       0.95      0.68      0.79      1245
      grover       0.00      0.00      0.00        30
       human       0.37      1.00      0.54       326
 instructgpt       0.56      0.50      0.53       971
       xlnet       0.00      0.00      0.00         7

    accuracy                           0.63      2644
   macro avg       0.23      0.27      0.23      2644
weighted avg       0.70      0.63      0.63      2644

-------------------REDDIT DATA-----------------
              precision    recall  f1-score   support

       human       0.99      0.97      0.98       216
     machine       0.99      1.00      1.00      1063

    accuracy                           0.99      1279
   macro avg       0.99      0.98      0.99      1279
weighted avg       0.99      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


1. CTRL

In [14]:
test_df = pd.read_csv('../nlp-data/liwc_pos_dep_eval.csv')
test_df = test_df[test_df['alg'].isin(['ctrl','human'])]

test_features = get_features_test(test_df, vectorizer, numerical_fields=["semantic_coherence", "Analytic", "WPS", "article", "Period"])
test_dataloader, reddit_dataloader = data_batcher_evaluate(test_df, reddit_df, vectorizer, labels2id, numerical_fields=["semantic_coherence", "Analytic", "WPS", "article", "Period"], batch_size=32, model_name="roberta-base")

# For AA Paper
preds, true_label = evaluate_test(model, test_dataloader, labels2id, test_df["alg"])
preds = ['machine' if item != 'human' else item for item in preds]
true_label = true_label.replace(['fair', 'grover', 'gpt2', 'gpt3', 'instructgpt', 'gpt', 'ctrl', 'pplm', 'xlnet', 'xlm'],'machine')
print("-------------------AA PAPER-----------------")
print(classification_report(preds,true_label))
print()
plot_cm(preds,true_label)


  tfidf_vector = torch.tensor(batch['tfidf_vector'], dtype=torch.float).to(device)


              precision    recall  f1-score   support

        ctrl       0.99      1.00      1.00       105
        fair       0.00      0.00      0.00         2
      grover       0.00      0.00      0.00         2
       human       0.99      1.00      0.99       210

    accuracy                           0.99       319
   macro avg       0.49      0.50      0.50       319
weighted avg       0.98      0.99      0.98       319

-------------------AA PAPER-----------------
              precision    recall  f1-score   support

       human       0.99      1.00      0.99       210
     machine       1.00      0.97      0.99       109

    accuracy                           0.99       319
   macro avg       0.99      0.99      0.99       319
weighted avg       0.99      0.99      0.99       319




  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


2. FAIR

In [15]:
test_df = pd.read_csv('../nlp-data/liwc_pos_dep_eval.csv')
test_df = test_df[test_df['alg'].isin(['fair','human'])]

test_features = get_features_test(test_df, vectorizer, numerical_fields=["semantic_coherence", "Analytic", "WPS", "article", "Period"])
test_dataloader, reddit_dataloader = data_batcher_evaluate(test_df, reddit_df, vectorizer, labels2id, numerical_fields=["semantic_coherence", "Analytic", "WPS", "article", "Period"], batch_size=32, model_name="roberta-base")

# For AA Paper
preds, true_label = evaluate_test(model, test_dataloader, labels2id, test_df["alg"])
preds = ['machine' if item != 'human' else item for item in preds]
true_label = true_label.replace(['fair', 'grover', 'gpt2', 'gpt3', 'instructgpt', 'gpt', 'ctrl', 'pplm', 'xlnet', 'xlm'],'machine')
print("-------------------AA PAPER-----------------")
print(classification_report(preds,true_label))
print()
plot_cm(preds,true_label)


  tfidf_vector = torch.tensor(batch['tfidf_vector'], dtype=torch.float).to(device)


              precision    recall  f1-score   support

        fair       0.98      0.99      0.99       106
         gpt       0.00      0.00      0.00         1
        gpt2       0.00      0.00      0.00         2
      grover       0.00      0.00      0.00         2
       human       0.98      1.00      0.99       209

    accuracy                           0.98       320
   macro avg       0.39      0.40      0.40       320
weighted avg       0.97      0.98      0.97       320

-------------------AA PAPER-----------------
              precision    recall  f1-score   support

       human       0.98      1.00      0.99       209
     machine       1.00      0.96      0.98       111

    accuracy                           0.99       320
   macro avg       0.99      0.98      0.99       320
weighted avg       0.99      0.99      0.99       320




  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


3. GROVER

In [17]:
test_df = pd.read_csv('../nlp-data/liwc_pos_dep_eval.csv')
test_df = test_df[test_df['alg'].isin(['grover','human'])]

test_features = get_features_test(test_df, vectorizer, numerical_fields=["semantic_coherence", "Analytic", "WPS", "article", "Period"])
test_dataloader, reddit_dataloader = data_batcher_evaluate(test_df, reddit_df, vectorizer, labels2id, numerical_fields=["semantic_coherence", "Analytic", "WPS", "article", "Period"], batch_size=32, model_name="roberta-base")

# For AA Paper
preds, true_label = evaluate_test(model, test_dataloader, labels2id, test_df["alg"])
preds = ['machine' if item != 'human' else item for item in preds]
true_label = true_label.replace(['fair', 'grover', 'gpt2', 'gpt3', 'instructgpt', 'gpt', 'ctrl', 'pplm', 'xlnet', 'xlm'],'machine')
print("-------------------AA PAPER-----------------")
print(classification_report(preds,true_label))
print()
plot_cm(preds,true_label)


  tfidf_vector = torch.tensor(batch['tfidf_vector'], dtype=torch.float).to(device)


              precision    recall  f1-score   support

        ctrl       0.00      0.00      0.00         1
        fair       0.00      0.00      0.00         2
      grover       0.94      0.98      0.96       103
       human       0.99      0.98      0.98       214

    accuracy                           0.97       320
   macro avg       0.48      0.49      0.49       320
weighted avg       0.96      0.97      0.97       320

-------------------AA PAPER-----------------
              precision    recall  f1-score   support

       human       0.99      0.98      0.98       214
     machine       0.96      0.97      0.97       106

    accuracy                           0.98       320
   macro avg       0.97      0.98      0.98       320
weighted avg       0.98      0.98      0.98       320




  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


4. GPT2

In [18]:
test_df = pd.read_csv('../nlp-data/liwc_pos_dep_eval.csv')
test_df = test_df[test_df['alg'].isin(['gpt2','human'])]

test_features = get_features_test(test_df, vectorizer, numerical_fields=["semantic_coherence", "Analytic", "WPS", "article", "Period"])
test_dataloader, reddit_dataloader = data_batcher_evaluate(test_df, reddit_df, vectorizer, labels2id, numerical_fields=["semantic_coherence", "Analytic", "WPS", "article", "Period"], batch_size=32, model_name="roberta-base")

# For AA Paper
preds, true_label = evaluate_test(model, test_dataloader, labels2id, test_df["alg"])
preds = ['machine' if item != 'human' else item for item in preds]
true_label = true_label.replace(['fair', 'grover', 'gpt2', 'gpt3', 'instructgpt', 'gpt', 'ctrl', 'pplm', 'xlnet', 'xlm'],'machine')
print("-------------------AA PAPER-----------------")
print(classification_report(preds,true_label))
print()
plot_cm(preds,true_label)


  tfidf_vector = torch.tensor(batch['tfidf_vector'], dtype=torch.float).to(device)


              precision    recall  f1-score   support

        fair       0.00      0.00      0.00         5
        gpt2       0.95      1.00      0.98       102
      grover       0.00      0.00      0.00         3
       human       0.98      1.00      0.99       209
       xlnet       0.00      0.00      0.00         1

    accuracy                           0.97       320
   macro avg       0.39      0.40      0.39       320
weighted avg       0.94      0.97      0.96       320

-------------------AA PAPER-----------------
              precision    recall  f1-score   support

       human       0.98      1.00      0.99       209
     machine       1.00      0.96      0.98       111

    accuracy                           0.99       320
   macro avg       0.99      0.98      0.99       320
weighted avg       0.99      0.99      0.99       320




  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


5. GPT3

In [19]:
test_df = pd.read_csv('../nlp-data/liwc_pos_dep_eval.csv')
test_df = test_df[test_df['alg'].isin(['gpt3','human'])]
reddit_df = pd.read_csv('../nlp-data/liwc_pos_dep_reddit.csv')
reddit_df = reddit_df[reddit_df['alg'].isin(['gpt3','human'])]

test_features = get_features_test(test_df, vectorizer, numerical_fields=["semantic_coherence", "Analytic", "WPS", "article", "Period"])
test_dataloader, reddit_dataloader = data_batcher_evaluate(test_df, reddit_df, vectorizer, labels2id, numerical_fields=["semantic_coherence", "Analytic", "WPS", "article", "Period"], batch_size=32, model_name="roberta-base")

# For AA Paper
preds, true_label = evaluate_test(model, test_dataloader, labels2id, test_df["alg"])
preds = ['machine' if item != 'human' else item for item in preds]
true_label = true_label.replace(['fair', 'grover', 'gpt2', 'gpt3', 'instructgpt', 'gpt', 'ctrl', 'pplm', 'xlnet', 'xlm'],'machine')
print("-------------------AA PAPER-----------------")
print(classification_report(preds,true_label))
print()
plot_cm(preds,true_label)

# For Reddit
preds, true_label = evaluate_test(model, reddit_dataloader, labels2id, reddit_df["alg"])
preds = ['machine' if item != 'human' else item for item in preds]
true_label = true_label.replace(['fair', 'grover', 'gpt2', 'gpt3', 'instructgpt', 'gpt', 'ctrl', 'pplm', 'xlnet', 'xlm'],'machine')
print("-------------------REDDIT DATA-----------------")
print(classification_report(preds,true_label))
print()
plot_cm(preds,true_label)


  tfidf_vector = torch.tensor(batch['tfidf_vector'], dtype=torch.float).to(device)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  tfidf_vector = torch.tensor(batch['tfidf_vector'], dtype=torch.float).to(device)


              precision    recall  f1-score   support

        fair       0.00      0.00      0.00         1
        gpt2       0.00      0.00      0.00         1
        gpt3       0.98      1.00      0.99       105
      grover       0.00      0.00      0.00         2
       human       0.99      1.00      0.99       210
 instructgpt       0.00      0.00      0.00         1

    accuracy                           0.98       320
   macro avg       0.33      0.33      0.33       320
weighted avg       0.97      0.98      0.98       320

-------------------AA PAPER-----------------
              precision    recall  f1-score   support

       human       0.99      1.00      0.99       210
     machine       1.00      0.97      0.99       110

    accuracy                           0.99       320
   macro avg       0.99      0.99      0.99       320
weighted avg       0.99      0.99      0.99       320


              precision    recall  f1-score   support

        ctrl       0.00      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


6. InstructGPT

In [20]:
test_df = pd.read_csv('../nlp-data/liwc_pos_dep_eval.csv')
test_df = test_df[test_df['alg'].isin(['instructgpt','human'])]
reddit_df = pd.read_csv('../nlp-data/liwc_pos_dep_reddit.csv')
reddit_df = reddit_df[reddit_df['alg'].isin(['instructgpt','human'])]

test_features = get_features_test(test_df, vectorizer, numerical_fields=["semantic_coherence", "Analytic", "WPS", "article", "Period"])
test_dataloader, reddit_dataloader = data_batcher_evaluate(test_df, reddit_df, vectorizer, labels2id, numerical_fields=["semantic_coherence", "Analytic", "WPS", "article", "Period"], batch_size=32, model_name="roberta-base")

# For AA Paper
preds, true_label = evaluate_test(model, test_dataloader, labels2id, test_df["alg"])
preds = ['machine' if item != 'human' else item for item in preds]
true_label = true_label.replace(['fair', 'grover', 'gpt2', 'gpt3', 'instructgpt', 'gpt', 'ctrl', 'pplm', 'xlnet', 'xlm'],'machine')
print("-------------------AA PAPER-----------------")
print(classification_report(preds,true_label))
print()
plot_cm(preds,true_label)

# For Reddit
preds, true_label = evaluate_test(model, reddit_dataloader, labels2id, reddit_df["alg"])
preds = ['machine' if item != 'human' else item for item in preds]
true_label = true_label.replace(['fair', 'grover', 'gpt2', 'gpt3', 'instructgpt', 'gpt', 'ctrl', 'pplm', 'xlnet', 'xlm'],'machine')
print("-------------------REDDIT DATA-----------------")
print(classification_report(preds,true_label))
print()
plot_cm(preds,true_label)


  tfidf_vector = torch.tensor(batch['tfidf_vector'], dtype=torch.float).to(device)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  tfidf_vector = torch.tensor(batch['tfidf_vector'], dtype=torch.float).to(device)


              precision    recall  f1-score   support

        fair       0.00      0.00      0.00         1
        gpt3       0.00      0.00      0.00        37
      grover       0.00      0.00      0.00         1
       human       0.99      1.00      0.99       212
 instructgpt       0.64      1.00      0.78        68

    accuracy                           0.87       319
   macro avg       0.33      0.40      0.35       319
weighted avg       0.80      0.87      0.83       319

-------------------AA PAPER-----------------
              precision    recall  f1-score   support

       human       0.99      1.00      0.99       212
     machine       0.99      0.98      0.99       107

    accuracy                           0.99       319
   macro avg       0.99      0.99      0.99       319
weighted avg       0.99      0.99      0.99       319


              precision    recall  f1-score   support

        ctrl       0.00      0.00      0.00         2
         gpt       0.00      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


7. GPT

In [21]:
test_df = pd.read_csv('../nlp-data/liwc_pos_dep_eval.csv')
reddit_df = pd.read_csv('../nlp-data/liwc_pos_dep_reddit.csv')
test_df = test_df[test_df['alg'].isin(['gpt','human'])]

test_features = get_features_test(test_df, vectorizer, numerical_fields=["semantic_coherence", "Analytic", "WPS", "article", "Period"])
test_dataloader, reddit_dataloader = data_batcher_evaluate(test_df, reddit_df, vectorizer, labels2id, numerical_fields=["semantic_coherence", "Analytic", "WPS", "article", "Period"], batch_size=32, model_name="roberta-base")

# For AA Paper
preds, true_label = evaluate_test(model, test_dataloader, labels2id, test_df["alg"])
preds = ['machine' if item != 'human' else item for item in preds]
true_label = true_label.replace(['fair', 'grover', 'gpt2', 'gpt3', 'instructgpt', 'gpt', 'ctrl', 'pplm', 'xlnet', 'xlm'],'machine')
print("-------------------AA PAPER-----------------")
print(classification_report(preds,true_label))
print()
plot_cm(preds,true_label)


  tfidf_vector = torch.tensor(batch['tfidf_vector'], dtype=torch.float).to(device)


              precision    recall  f1-score   support

        fair       0.00      0.00      0.00         1
         gpt       1.00      1.00      1.00       106
      grover       0.00      0.00      0.00         2
       human       0.99      1.00      0.99       210

    accuracy                           0.99       319
   macro avg       0.50      0.50      0.50       319
weighted avg       0.98      0.99      0.99       319

-------------------AA PAPER-----------------
              precision    recall  f1-score   support

       human       0.99      1.00      0.99       210
     machine       1.00      0.97      0.99       109

    accuracy                           0.99       319
   macro avg       0.99      0.99      0.99       319
weighted avg       0.99      0.99      0.99       319




  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


8. PPLM

In [22]:
test_df = pd.read_csv('../nlp-data/liwc_pos_dep_eval.csv')
test_df = test_df[test_df['alg'].isin(['pplm','human'])]

test_features = get_features_test(test_df, vectorizer, numerical_fields=["semantic_coherence", "Analytic", "WPS", "article", "Period"])
test_dataloader, reddit_dataloader = data_batcher_evaluate(test_df, reddit_df, vectorizer, labels2id, numerical_fields=["semantic_coherence", "Analytic", "WPS", "article", "Period"], batch_size=32, model_name="roberta-base")

# For AA Paper
preds, true_label = evaluate_test(model, test_dataloader, labels2id, test_df["alg"])
preds = ['machine' if item != 'human' else item for item in preds]
true_label = true_label.replace(['fair', 'grover', 'gpt2', 'gpt3', 'instructgpt', 'gpt', 'ctrl', 'pplm', 'xlnet', 'xlm'],'machine')
print("-------------------AA PAPER-----------------")
print(classification_report(preds,true_label))
print()
plot_cm(preds,true_label)


  tfidf_vector = torch.tensor(batch['tfidf_vector'], dtype=torch.float).to(device)


              precision    recall  f1-score   support

        fair       0.00      0.00      0.00         1
      grover       0.00      0.00      0.00         2
       human       0.99      1.00      0.99       210
        pplm       1.00      1.00      1.00       106

    accuracy                           0.99       319
   macro avg       0.50      0.50      0.50       319
weighted avg       0.98      0.99      0.99       319

-------------------AA PAPER-----------------
              precision    recall  f1-score   support

       human       0.99      1.00      0.99       210
     machine       1.00      0.97      0.99       109

    accuracy                           0.99       319
   macro avg       0.99      0.99      0.99       319
weighted avg       0.99      0.99      0.99       319




  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


9. XLNET

In [23]:
test_df = pd.read_csv('../nlp-data/liwc_pos_dep_eval.csv')
test_df = test_df[test_df['alg'].isin(['xlnet','human'])]

test_features = get_features_test(test_df, vectorizer, numerical_fields=["semantic_coherence", "Analytic", "WPS", "article", "Period"])
test_dataloader, reddit_dataloader = data_batcher_evaluate(test_df, reddit_df, vectorizer, labels2id, numerical_fields=["semantic_coherence", "Analytic", "WPS", "article", "Period"], batch_size=32, model_name="roberta-base")

# For AA Paper
preds, true_label = evaluate_test(model, test_dataloader, labels2id, test_df["alg"])
preds = ['machine' if item != 'human' else item for item in preds]
true_label = true_label.replace(['fair', 'grover', 'gpt2', 'gpt3', 'instructgpt', 'gpt', 'ctrl', 'pplm', 'xlnet', 'xlm'],'machine')
print("-------------------AA PAPER-----------------")
print(classification_report(preds,true_label))
print()
plot_cm(preds,true_label)


  tfidf_vector = torch.tensor(batch['tfidf_vector'], dtype=torch.float).to(device)


              precision    recall  f1-score   support

        fair       0.00      0.00      0.00         1
      grover       0.00      0.00      0.00         1
       human       0.99      1.00      1.00       211
       xlnet       1.00      1.00      1.00       107

    accuracy                           0.99       320
   macro avg       0.50      0.50      0.50       320
weighted avg       0.99      0.99      0.99       320

-------------------AA PAPER-----------------
              precision    recall  f1-score   support

       human       0.99      1.00      1.00       211
     machine       1.00      0.98      0.99       109

    accuracy                           0.99       320
   macro avg       1.00      0.99      0.99       320
weighted avg       0.99      0.99      0.99       320




  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


10. XLM

In [24]:
test_df = pd.read_csv('../nlp-data/liwc_pos_dep_eval.csv')
test_df = test_df[test_df['alg'].isin(['xlm','human'])]

test_features = get_features_test(test_df, vectorizer, numerical_fields=["semantic_coherence", "Analytic", "WPS", "article", "Period"])
test_dataloader, reddit_dataloader = data_batcher_evaluate(test_df, reddit_df, vectorizer, labels2id, numerical_fields=["semantic_coherence", "Analytic", "WPS", "article", "Period"], batch_size=32, model_name="roberta-base")

# For AA Paper
preds, true_label = evaluate_test(model, test_dataloader, labels2id, test_df["alg"])
preds = ['machine' if item != 'human' else item for item in preds]
true_label = true_label.replace(['fair', 'grover', 'gpt2', 'gpt3', 'instructgpt', 'gpt', 'ctrl', 'pplm', 'xlnet', 'xlm'],'machine')
print("-------------------AA PAPER-----------------")
print(classification_report(preds,true_label))
print()
plot_cm(preds,true_label)


  tfidf_vector = torch.tensor(batch['tfidf_vector'], dtype=torch.float).to(device)


              precision    recall  f1-score   support

        fair       0.00      0.00      0.00         1
      grover       0.00      0.00      0.00         2
       human       0.99      1.00      0.99       210
         xlm       1.00      1.00      1.00       107

    accuracy                           0.99       320
   macro avg       0.50      0.50      0.50       320
weighted avg       0.98      0.99      0.99       320

-------------------AA PAPER-----------------
              precision    recall  f1-score   support

       human       0.99      1.00      0.99       210
     machine       1.00      0.97      0.99       110

    accuracy                           0.99       320
   macro avg       0.99      0.99      0.99       320
weighted avg       0.99      0.99      0.99       320




  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# POS Models

In [None]:
train_df = pd.read_csv('../nlp-data/liwc_pos_dep_tr.csv')
test_df = pd.read_csv('../nlp-data/liwc_pos_dep_eval.csv')
reddit_df = pd.read_csv('../nlp-data/liwc_pos_dep_reddit.csv')

labels2id = {l: i for i, l in enumerate(train_df['alg'].unique())}

vectorizer = joblib.load("../New/models/roberta_base_pos/vectorizer.pkl")
test_features = get_features_test(test_df, vectorizer)
model = load_model("../New/models_state/roberta_base_pos/checkpoint_epoch=3-val_loss=0.ckpt",labels2id,test_features)
test_dataloader, reddit_dataloader = data_batcher_evaluate(test_df, reddit_df, vectorizer, labels2id, batch_size=32, model_name="roberta-base")

# For AA Paper
preds, true_label = evaluate_test(model, test_dataloader, labels2id, test_df["alg"])
preds = ['machine' if item != 'human' else item for item in preds]
true_label = true_label.replace(['fair', 'grover', 'gpt2', 'gpt3', 'instructgpt', 'gpt', 'ctrl', 'pplm', 'xlnet', 'xlm'],'machine')
print("-------------------AA PAPER-----------------")
print(classification_report(preds,true_label))
print()
plot_cm(preds,true_label)

# For Reddit
preds, true_label = evaluate_test(model, reddit_dataloader, labels2id, reddit_df["alg"])
preds = ['machine' if item != 'human' else item for item in preds]
true_label = true_label.replace(['fair', 'grover', 'gpt2', 'gpt3', 'instructgpt', 'gpt', 'ctrl', 'pplm', 'xlnet', 'xlm'],'machine')
print("-------------------REDDIT DATA-----------------")
print(classification_report(preds,true_label))
print()
plot_cm(preds,true_label)




1. CTRL

In [None]:
test_df = pd.read_csv('../nlp-data/liwc_pos_dep_eval.csv')
test_df = test_df[test_df['alg'].isin(['ctrl','human'])]

test_features = get_features_test(test_df, vectorizer)
test_dataloader, reddit_dataloader = data_batcher_evaluate(test_df, reddit_df, vectorizer, labels2id, batch_size=32, model_name="roberta-base")

# For AA Paper
preds, true_label = evaluate_test(model, test_dataloader, labels2id, test_df["alg"])
preds = ['machine' if item != 'human' else item for item in preds]
true_label = true_label.replace(['fair', 'grover', 'gpt2', 'gpt3', 'instructgpt', 'gpt', 'ctrl', 'pplm', 'xlnet', 'xlm'],'machine')
print("-------------------AA PAPER-----------------")
print(classification_report(preds,true_label))
print()
plot_cm(preds,true_label)


  tfidf_vector = torch.tensor(batch['tfidf_vector'], dtype=torch.float).to(device)


              precision    recall  f1-score   support

        ctrl       0.99      1.00      1.00       105
        fair       0.00      0.00      0.00         2
      grover       0.00      0.00      0.00         2
       human       0.99      1.00      0.99       210

    accuracy                           0.99       319
   macro avg       0.49      0.50      0.50       319
weighted avg       0.98      0.99      0.98       319

-------------------AA PAPER-----------------
              precision    recall  f1-score   support

       human       0.99      1.00      0.99       210
     machine       1.00      0.97      0.99       109

    accuracy                           0.99       319
   macro avg       0.99      0.99      0.99       319
weighted avg       0.99      0.99      0.99       319




  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


2. FAIR

In [None]:
test_df = pd.read_csv('../nlp-data/liwc_pos_dep_eval.csv')
test_df = test_df[test_df['alg'].isin(['fair','human'])]

test_features = get_features_test(test_df, vectorizer)
test_dataloader, reddit_dataloader = data_batcher_evaluate(test_df, reddit_df, vectorizer, labels2id, batch_size=32, model_name="roberta-base")

# For AA Paper
preds, true_label = evaluate_test(model, test_dataloader, labels2id, test_df["alg"])
preds = ['machine' if item != 'human' else item for item in preds]
true_label = true_label.replace(['fair', 'grover', 'gpt2', 'gpt3', 'instructgpt', 'gpt', 'ctrl', 'pplm', 'xlnet', 'xlm'],'machine')
print("-------------------AA PAPER-----------------")
print(classification_report(preds,true_label))
print()
plot_cm(preds,true_label)


  tfidf_vector = torch.tensor(batch['tfidf_vector'], dtype=torch.float).to(device)


              precision    recall  f1-score   support

        fair       0.98      0.99      0.99       106
         gpt       0.00      0.00      0.00         1
        gpt2       0.00      0.00      0.00         2
      grover       0.00      0.00      0.00         2
       human       0.98      1.00      0.99       209

    accuracy                           0.98       320
   macro avg       0.39      0.40      0.40       320
weighted avg       0.97      0.98      0.97       320

-------------------AA PAPER-----------------
              precision    recall  f1-score   support

       human       0.98      1.00      0.99       209
     machine       1.00      0.96      0.98       111

    accuracy                           0.99       320
   macro avg       0.99      0.98      0.99       320
weighted avg       0.99      0.99      0.99       320




  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


3. GROVER

In [None]:
test_df = pd.read_csv('../nlp-data/liwc_pos_dep_eval.csv')
test_df = test_df[test_df['alg'].isin(['grover','human'])]

test_features = get_features_test(test_df, vectorizer)
test_dataloader, reddit_dataloader = data_batcher_evaluate(test_df, reddit_df, vectorizer, labels2id, batch_size=32, model_name="roberta-base")

# For AA Paper
preds, true_label = evaluate_test(model, test_dataloader, labels2id, test_df["alg"])
preds = ['machine' if item != 'human' else item for item in preds]
true_label = true_label.replace(['fair', 'grover', 'gpt2', 'gpt3', 'instructgpt', 'gpt', 'ctrl', 'pplm', 'xlnet', 'xlm'],'machine')
print("-------------------AA PAPER-----------------")
print(classification_report(preds,true_label))
print()
plot_cm(preds,true_label)


  tfidf_vector = torch.tensor(batch['tfidf_vector'], dtype=torch.float).to(device)


              precision    recall  f1-score   support

        ctrl       0.00      0.00      0.00         1
        fair       0.00      0.00      0.00         2
      grover       0.94      0.98      0.96       103
       human       0.99      0.98      0.98       214

    accuracy                           0.97       320
   macro avg       0.48      0.49      0.49       320
weighted avg       0.96      0.97      0.97       320

-------------------AA PAPER-----------------
              precision    recall  f1-score   support

       human       0.99      0.98      0.98       214
     machine       0.96      0.97      0.97       106

    accuracy                           0.98       320
   macro avg       0.97      0.98      0.98       320
weighted avg       0.98      0.98      0.98       320




  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


4. GPT2

In [None]:
test_df = pd.read_csv('../nlp-data/liwc_pos_dep_eval.csv')
test_df = test_df[test_df['alg'].isin(['gpt2','human'])]

test_features = get_features_test(test_df, vectorizer)
test_dataloader, reddit_dataloader = data_batcher_evaluate(test_df, reddit_df, vectorizer, labels2id, batch_size=32, model_name="roberta-base")

# For AA Paper
preds, true_label = evaluate_test(model, test_dataloader, labels2id, test_df["alg"])
preds = ['machine' if item != 'human' else item for item in preds]
true_label = true_label.replace(['fair', 'grover', 'gpt2', 'gpt3', 'instructgpt', 'gpt', 'ctrl', 'pplm', 'xlnet', 'xlm'],'machine')
print("-------------------AA PAPER-----------------")
print(classification_report(preds,true_label))
print()
plot_cm(preds,true_label)


  tfidf_vector = torch.tensor(batch['tfidf_vector'], dtype=torch.float).to(device)


              precision    recall  f1-score   support

        fair       0.00      0.00      0.00         5
        gpt2       0.95      1.00      0.98       102
      grover       0.00      0.00      0.00         3
       human       0.98      1.00      0.99       209
       xlnet       0.00      0.00      0.00         1

    accuracy                           0.97       320
   macro avg       0.39      0.40      0.39       320
weighted avg       0.94      0.97      0.96       320

-------------------AA PAPER-----------------
              precision    recall  f1-score   support

       human       0.98      1.00      0.99       209
     machine       1.00      0.96      0.98       111

    accuracy                           0.99       320
   macro avg       0.99      0.98      0.99       320
weighted avg       0.99      0.99      0.99       320




  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


5. GPT3

In [None]:
test_df = pd.read_csv('../nlp-data/liwc_pos_dep_eval.csv')
test_df = test_df[test_df['alg'].isin(['gpt3','human'])]
reddit_df = pd.read_csv('../nlp-data/liwc_pos_dep_reddit.csv')
reddit_df = reddit_df[reddit_df['alg'].isin(['gpt3','human'])]

test_features = get_features_test(test_df, vectorizer)
test_dataloader, reddit_dataloader = data_batcher_evaluate(test_df, reddit_df, vectorizer, labels2id, batch_size=32, model_name="roberta-base")

# For AA Paper
preds, true_label = evaluate_test(model, test_dataloader, labels2id, test_df["alg"])
preds = ['machine' if item != 'human' else item for item in preds]
true_label = true_label.replace(['fair', 'grover', 'gpt2', 'gpt3', 'instructgpt', 'gpt', 'ctrl', 'pplm', 'xlnet', 'xlm'],'machine')
print("-------------------AA PAPER-----------------")
print(classification_report(preds,true_label))
print()
plot_cm(preds,true_label)

# For Reddit
preds, true_label = evaluate_test(model, reddit_dataloader, labels2id, reddit_df["alg"])
preds = ['machine' if item != 'human' else item for item in preds]
true_label = true_label.replace(['fair', 'grover', 'gpt2', 'gpt3', 'instructgpt', 'gpt', 'ctrl', 'pplm', 'xlnet', 'xlm'],'machine')
print("-------------------REDDIT DATA-----------------")
print(classification_report(preds,true_label))
print()
plot_cm(preds,true_label)


  tfidf_vector = torch.tensor(batch['tfidf_vector'], dtype=torch.float).to(device)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  tfidf_vector = torch.tensor(batch['tfidf_vector'], dtype=torch.float).to(device)


              precision    recall  f1-score   support

        fair       0.00      0.00      0.00         1
        gpt2       0.00      0.00      0.00         1
        gpt3       0.98      1.00      0.99       105
      grover       0.00      0.00      0.00         2
       human       0.99      1.00      0.99       210
 instructgpt       0.00      0.00      0.00         1

    accuracy                           0.98       320
   macro avg       0.33      0.33      0.33       320
weighted avg       0.97      0.98      0.98       320

-------------------AA PAPER-----------------
              precision    recall  f1-score   support

       human       0.99      1.00      0.99       210
     machine       1.00      0.97      0.99       110

    accuracy                           0.99       320
   macro avg       0.99      0.99      0.99       320
weighted avg       0.99      0.99      0.99       320


              precision    recall  f1-score   support

        ctrl       0.00      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


6. InstructGPT

In [None]:
test_df = pd.read_csv('../nlp-data/liwc_pos_dep_eval.csv')
test_df = test_df[test_df['alg'].isin(['instructgpt','human'])]
reddit_df = pd.read_csv('../nlp-data/liwc_pos_dep_reddit.csv')
reddit_df = reddit_df[reddit_df['alg'].isin(['instructgpt','human'])]

test_features = get_features_test(test_df, vectorizer)
test_dataloader, reddit_dataloader = data_batcher_evaluate(test_df, reddit_df, vectorizer, labels2id, batch_size=32, model_name="roberta-base")

# For AA Paper
preds, true_label = evaluate_test(model, test_dataloader, labels2id, test_df["alg"])
preds = ['machine' if item != 'human' else item for item in preds]
true_label = true_label.replace(['fair', 'grover', 'gpt2', 'gpt3', 'instructgpt', 'gpt', 'ctrl', 'pplm', 'xlnet', 'xlm'],'machine')
print("-------------------AA PAPER-----------------")
print(classification_report(preds,true_label))
print()
plot_cm(preds,true_label)

# For Reddit
preds, true_label = evaluate_test(model, reddit_dataloader, labels2id, reddit_df["alg"])
preds = ['machine' if item != 'human' else item for item in preds]
true_label = true_label.replace(['fair', 'grover', 'gpt2', 'gpt3', 'instructgpt', 'gpt', 'ctrl', 'pplm', 'xlnet', 'xlm'],'machine')
print("-------------------REDDIT DATA-----------------")
print(classification_report(preds,true_label))
print()
plot_cm(preds,true_label)


  tfidf_vector = torch.tensor(batch['tfidf_vector'], dtype=torch.float).to(device)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  tfidf_vector = torch.tensor(batch['tfidf_vector'], dtype=torch.float).to(device)


              precision    recall  f1-score   support

        fair       0.00      0.00      0.00         1
        gpt3       0.00      0.00      0.00        37
      grover       0.00      0.00      0.00         1
       human       0.99      1.00      0.99       212
 instructgpt       0.64      1.00      0.78        68

    accuracy                           0.87       319
   macro avg       0.33      0.40      0.35       319
weighted avg       0.80      0.87      0.83       319

-------------------AA PAPER-----------------
              precision    recall  f1-score   support

       human       0.99      1.00      0.99       212
     machine       0.99      0.98      0.99       107

    accuracy                           0.99       319
   macro avg       0.99      0.99      0.99       319
weighted avg       0.99      0.99      0.99       319


              precision    recall  f1-score   support

        ctrl       0.00      0.00      0.00         2
         gpt       0.00      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


7. GPT

In [None]:
test_df = pd.read_csv('../nlp-data/liwc_pos_dep_eval.csv')
reddit_df = pd.read_csv('../nlp-data/liwc_pos_dep_reddit.csv')
test_df = test_df[test_df['alg'].isin(['gpt','human'])]

test_features = get_features_test(test_df, vectorizer)
test_dataloader, reddit_dataloader = data_batcher_evaluate(test_df, reddit_df, vectorizer, labels2id, batch_size=32, model_name="roberta-base")

# For AA Paper
preds, true_label = evaluate_test(model, test_dataloader, labels2id, test_df["alg"])
preds = ['machine' if item != 'human' else item for item in preds]
true_label = true_label.replace(['fair', 'grover', 'gpt2', 'gpt3', 'instructgpt', 'gpt', 'ctrl', 'pplm', 'xlnet', 'xlm'],'machine')
print("-------------------AA PAPER-----------------")
print(classification_report(preds,true_label))
print()
plot_cm(preds,true_label)


  tfidf_vector = torch.tensor(batch['tfidf_vector'], dtype=torch.float).to(device)


              precision    recall  f1-score   support

        fair       0.00      0.00      0.00         1
         gpt       1.00      1.00      1.00       106
      grover       0.00      0.00      0.00         2
       human       0.99      1.00      0.99       210

    accuracy                           0.99       319
   macro avg       0.50      0.50      0.50       319
weighted avg       0.98      0.99      0.99       319

-------------------AA PAPER-----------------
              precision    recall  f1-score   support

       human       0.99      1.00      0.99       210
     machine       1.00      0.97      0.99       109

    accuracy                           0.99       319
   macro avg       0.99      0.99      0.99       319
weighted avg       0.99      0.99      0.99       319




  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


8. PPLM

In [None]:
test_df = pd.read_csv('../nlp-data/liwc_pos_dep_eval.csv')
test_df = test_df[test_df['alg'].isin(['pplm','human'])]

test_features = get_features_test(test_df, vectorizer)
test_dataloader, reddit_dataloader = data_batcher_evaluate(test_df, reddit_df, vectorizer, labels2id, batch_size=32, model_name="roberta-base")

# For AA Paper
preds, true_label = evaluate_test(model, test_dataloader, labels2id, test_df["alg"])
preds = ['machine' if item != 'human' else item for item in preds]
true_label = true_label.replace(['fair', 'grover', 'gpt2', 'gpt3', 'instructgpt', 'gpt', 'ctrl', 'pplm', 'xlnet', 'xlm'],'machine')
print("-------------------AA PAPER-----------------")
print(classification_report(preds,true_label))
print()
plot_cm(preds,true_label)


  tfidf_vector = torch.tensor(batch['tfidf_vector'], dtype=torch.float).to(device)


              precision    recall  f1-score   support

        fair       0.00      0.00      0.00         1
      grover       0.00      0.00      0.00         2
       human       0.99      1.00      0.99       210
        pplm       1.00      1.00      1.00       106

    accuracy                           0.99       319
   macro avg       0.50      0.50      0.50       319
weighted avg       0.98      0.99      0.99       319

-------------------AA PAPER-----------------
              precision    recall  f1-score   support

       human       0.99      1.00      0.99       210
     machine       1.00      0.97      0.99       109

    accuracy                           0.99       319
   macro avg       0.99      0.99      0.99       319
weighted avg       0.99      0.99      0.99       319




  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


9. XLNET

In [None]:
test_df = pd.read_csv('../nlp-data/liwc_pos_dep_eval.csv')
test_df = test_df[test_df['alg'].isin(['xlnet','human'])]

test_features = get_features_test(test_df, vectorizer)
test_dataloader, reddit_dataloader = data_batcher_evaluate(test_df, reddit_df, vectorizer, labels2id, batch_size=32, model_name="roberta-base")

# For AA Paper
preds, true_label = evaluate_test(model, test_dataloader, labels2id, test_df["alg"])
preds = ['machine' if item != 'human' else item for item in preds]
true_label = true_label.replace(['fair', 'grover', 'gpt2', 'gpt3', 'instructgpt', 'gpt', 'ctrl', 'pplm', 'xlnet', 'xlm'],'machine')
print("-------------------AA PAPER-----------------")
print(classification_report(preds,true_label))
print()
plot_cm(preds,true_label)


  tfidf_vector = torch.tensor(batch['tfidf_vector'], dtype=torch.float).to(device)


              precision    recall  f1-score   support

        fair       0.00      0.00      0.00         1
      grover       0.00      0.00      0.00         1
       human       0.99      1.00      1.00       211
       xlnet       1.00      1.00      1.00       107

    accuracy                           0.99       320
   macro avg       0.50      0.50      0.50       320
weighted avg       0.99      0.99      0.99       320

-------------------AA PAPER-----------------
              precision    recall  f1-score   support

       human       0.99      1.00      1.00       211
     machine       1.00      0.98      0.99       109

    accuracy                           0.99       320
   macro avg       1.00      0.99      0.99       320
weighted avg       0.99      0.99      0.99       320




  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


10. XLM

In [None]:
test_df = pd.read_csv('../nlp-data/liwc_pos_dep_eval.csv')
test_df = test_df[test_df['alg'].isin(['xlm','human'])]

test_features = get_features_test(test_df, vectorizer)
test_dataloader, reddit_dataloader = data_batcher_evaluate(test_df, reddit_df, vectorizer, labels2id, batch_size=32, model_name="roberta-base")

# For AA Paper
preds, true_label = evaluate_test(model, test_dataloader, labels2id, test_df["alg"])
preds = ['machine' if item != 'human' else item for item in preds]
true_label = true_label.replace(['fair', 'grover', 'gpt2', 'gpt3', 'instructgpt', 'gpt', 'ctrl', 'pplm', 'xlnet', 'xlm'],'machine')
print("-------------------AA PAPER-----------------")
print(classification_report(preds,true_label))
print()
plot_cm(preds,true_label)


  tfidf_vector = torch.tensor(batch['tfidf_vector'], dtype=torch.float).to(device)


              precision    recall  f1-score   support

        fair       0.00      0.00      0.00         1
      grover       0.00      0.00      0.00         2
       human       0.99      1.00      0.99       210
         xlm       1.00      1.00      1.00       107

    accuracy                           0.99       320
   macro avg       0.50      0.50      0.50       320
weighted avg       0.98      0.99      0.99       320

-------------------AA PAPER-----------------
              precision    recall  f1-score   support

       human       0.99      1.00      0.99       210
     machine       1.00      0.97      0.99       110

    accuracy                           0.99       320
   macro avg       0.99      0.99      0.99       320
weighted avg       0.99      0.99      0.99       320




  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# POS Dependency Tfidf +  ROBERTA

In [None]:
train_df = pd.read_csv('../nlp-data/liwc_pos_dep_tr.csv')
test_df = pd.read_csv('../nlp-data/liwc_pos_dep_eval.csv')
reddit_df = pd.read_csv('../nlp-data/liwc_pos_dep_reddit.csv')

labels2id = {l: i for i, l in enumerate(train_df['alg'].unique())}

vectorizer = joblib.load("../New/models/roberta_base_pos_dep/vectorizer.pkl")
test_features = get_features_test(test_df, vectorizer)
model = load_model("../New/models_state/roberta_base_pos_dep/checkpoint_epoch=2-val_loss=0.21201472314229855.ckpt",labels2id,test_features)
test_dataloader, reddit_dataloader = data_batcher_evaluate(test_df, reddit_df, vectorizer, labels2id, batch_size=32, model_name="roberta-base")

# For AA Paper
preds, true_label = evaluate_test(model, test_dataloader, labels2id, test_df["alg"])
preds = ['machine' if item != 'human' else item for item in preds]
true_label = true_label.replace(['fair', 'grover', 'gpt2', 'gpt3', 'instructgpt', 'gpt', 'ctrl', 'pplm', 'xlnet', 'xlm'],'machine')
print("-------------------AA PAPER-----------------")
print(classification_report(preds,true_label))
print()
plot_cm(preds,true_label)

# For Reddit
preds, true_label = evaluate_test(model, reddit_dataloader, labels2id, reddit_df["alg"])
preds = ['machine' if item != 'human' else item for item in preds]
true_label = true_label.replace(['fair', 'grover', 'gpt2', 'gpt3', 'instructgpt', 'gpt', 'ctrl', 'pplm', 'xlnet', 'xlm'],'machine')
print("-------------------REDDIT DATA-----------------")
print(classification_report(preds,true_label))
print()
plot_cm(preds,true_label)


1. CTRL

In [None]:
test_df = pd.read_csv('../nlp-data/liwc_pos_dep_eval.csv')
test_df = test_df[test_df['alg'].isin(['ctrl','human'])]

test_features = get_features_test(test_df, vectorizer)
test_dataloader, reddit_dataloader = data_batcher_evaluate(test_df, reddit_df, vectorizer, labels2id, batch_size=32, model_name="roberta-base")

# For AA Paper
preds, true_label = evaluate_test(model, test_dataloader, labels2id, test_df["alg"])
preds = ['machine' if item != 'human' else item for item in preds]
true_label = true_label.replace(['fair', 'grover', 'gpt2', 'gpt3', 'instructgpt', 'gpt', 'ctrl', 'pplm', 'xlnet', 'xlm'],'machine')
print("-------------------AA PAPER-----------------")
print(classification_report(preds,true_label))
print()
plot_cm(preds,true_label)


  tfidf_vector = torch.tensor(batch['tfidf_vector'], dtype=torch.float).to(device)


              precision    recall  f1-score   support

        ctrl       0.99      1.00      1.00       105
        fair       0.00      0.00      0.00         2
      grover       0.00      0.00      0.00         2
       human       0.99      1.00      0.99       210

    accuracy                           0.99       319
   macro avg       0.49      0.50      0.50       319
weighted avg       0.98      0.99      0.98       319

-------------------AA PAPER-----------------
              precision    recall  f1-score   support

       human       0.99      1.00      0.99       210
     machine       1.00      0.97      0.99       109

    accuracy                           0.99       319
   macro avg       0.99      0.99      0.99       319
weighted avg       0.99      0.99      0.99       319




  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


2. FAIR

In [None]:
test_df = pd.read_csv('../nlp-data/liwc_pos_dep_eval.csv')
test_df = test_df[test_df['alg'].isin(['fair','human'])]

test_features = get_features_test(test_df, vectorizer)
test_dataloader, reddit_dataloader = data_batcher_evaluate(test_df, reddit_df, vectorizer, labels2id, batch_size=32, model_name="roberta-base")

# For AA Paper
preds, true_label = evaluate_test(model, test_dataloader, labels2id, test_df["alg"])
preds = ['machine' if item != 'human' else item for item in preds]
true_label = true_label.replace(['fair', 'grover', 'gpt2', 'gpt3', 'instructgpt', 'gpt', 'ctrl', 'pplm', 'xlnet', 'xlm'],'machine')
print("-------------------AA PAPER-----------------")
print(classification_report(preds,true_label))
print()
plot_cm(preds,true_label)


  tfidf_vector = torch.tensor(batch['tfidf_vector'], dtype=torch.float).to(device)


              precision    recall  f1-score   support

        fair       0.98      0.99      0.99       106
         gpt       0.00      0.00      0.00         1
        gpt2       0.00      0.00      0.00         2
      grover       0.00      0.00      0.00         2
       human       0.98      1.00      0.99       209

    accuracy                           0.98       320
   macro avg       0.39      0.40      0.40       320
weighted avg       0.97      0.98      0.97       320

-------------------AA PAPER-----------------
              precision    recall  f1-score   support

       human       0.98      1.00      0.99       209
     machine       1.00      0.96      0.98       111

    accuracy                           0.99       320
   macro avg       0.99      0.98      0.99       320
weighted avg       0.99      0.99      0.99       320




  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


3. GROVER

In [None]:
test_df = pd.read_csv('../nlp-data/liwc_pos_dep_eval.csv')
test_df = test_df[test_df['alg'].isin(['grover','human'])]

test_features = get_features_test(test_df, vectorizer)
test_dataloader, reddit_dataloader = data_batcher_evaluate(test_df, reddit_df, vectorizer, labels2id, batch_size=32, model_name="roberta-base")

# For AA Paper
preds, true_label = evaluate_test(model, test_dataloader, labels2id, test_df["alg"])
preds = ['machine' if item != 'human' else item for item in preds]
true_label = true_label.replace(['fair', 'grover', 'gpt2', 'gpt3', 'instructgpt', 'gpt', 'ctrl', 'pplm', 'xlnet', 'xlm'],'machine')
print("-------------------AA PAPER-----------------")
print(classification_report(preds,true_label))
print()
plot_cm(preds,true_label)


  tfidf_vector = torch.tensor(batch['tfidf_vector'], dtype=torch.float).to(device)


              precision    recall  f1-score   support

        ctrl       0.00      0.00      0.00         1
        fair       0.00      0.00      0.00         2
      grover       0.94      0.98      0.96       103
       human       0.99      0.98      0.98       214

    accuracy                           0.97       320
   macro avg       0.48      0.49      0.49       320
weighted avg       0.96      0.97      0.97       320

-------------------AA PAPER-----------------
              precision    recall  f1-score   support

       human       0.99      0.98      0.98       214
     machine       0.96      0.97      0.97       106

    accuracy                           0.98       320
   macro avg       0.97      0.98      0.98       320
weighted avg       0.98      0.98      0.98       320




  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


4. GPT2

In [None]:
test_df = pd.read_csv('../nlp-data/liwc_pos_dep_eval.csv')
test_df = test_df[test_df['alg'].isin(['gpt2','human'])]

test_features = get_features_test(test_df, vectorizer)
test_dataloader, reddit_dataloader = data_batcher_evaluate(test_df, reddit_df, vectorizer, labels2id, batch_size=32, model_name="roberta-base")

# For AA Paper
preds, true_label = evaluate_test(model, test_dataloader, labels2id, test_df["alg"])
preds = ['machine' if item != 'human' else item for item in preds]
true_label = true_label.replace(['fair', 'grover', 'gpt2', 'gpt3', 'instructgpt', 'gpt', 'ctrl', 'pplm', 'xlnet', 'xlm'],'machine')
print("-------------------AA PAPER-----------------")
print(classification_report(preds,true_label))
print()
plot_cm(preds,true_label)


  tfidf_vector = torch.tensor(batch['tfidf_vector'], dtype=torch.float).to(device)


              precision    recall  f1-score   support

        fair       0.00      0.00      0.00         5
        gpt2       0.95      1.00      0.98       102
      grover       0.00      0.00      0.00         3
       human       0.98      1.00      0.99       209
       xlnet       0.00      0.00      0.00         1

    accuracy                           0.97       320
   macro avg       0.39      0.40      0.39       320
weighted avg       0.94      0.97      0.96       320

-------------------AA PAPER-----------------
              precision    recall  f1-score   support

       human       0.98      1.00      0.99       209
     machine       1.00      0.96      0.98       111

    accuracy                           0.99       320
   macro avg       0.99      0.98      0.99       320
weighted avg       0.99      0.99      0.99       320




  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


5. GPT3

In [None]:
test_df = pd.read_csv('../nlp-data/liwc_pos_dep_eval.csv')
test_df = test_df[test_df['alg'].isin(['gpt3','human'])]
reddit_df = pd.read_csv('../nlp-data/liwc_pos_dep_reddit.csv')
reddit_df = reddit_df[reddit_df['alg'].isin(['gpt3','human'])]

test_features = get_features_test(test_df, vectorizer)
test_dataloader, reddit_dataloader = data_batcher_evaluate(test_df, reddit_df, vectorizer, labels2id, batch_size=32, model_name="roberta-base")

# For AA Paper
preds, true_label = evaluate_test(model, test_dataloader, labels2id, test_df["alg"])
preds = ['machine' if item != 'human' else item for item in preds]
true_label = true_label.replace(['fair', 'grover', 'gpt2', 'gpt3', 'instructgpt', 'gpt', 'ctrl', 'pplm', 'xlnet', 'xlm'],'machine')
print("-------------------AA PAPER-----------------")
print(classification_report(preds,true_label))
print()
plot_cm(preds,true_label)

# For Reddit
preds, true_label = evaluate_test(model, reddit_dataloader, labels2id, reddit_df["alg"])
preds = ['machine' if item != 'human' else item for item in preds]
true_label = true_label.replace(['fair', 'grover', 'gpt2', 'gpt3', 'instructgpt', 'gpt', 'ctrl', 'pplm', 'xlnet', 'xlm'],'machine')
print("-------------------REDDIT DATA-----------------")
print(classification_report(preds,true_label))
print()
plot_cm(preds,true_label)


  tfidf_vector = torch.tensor(batch['tfidf_vector'], dtype=torch.float).to(device)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  tfidf_vector = torch.tensor(batch['tfidf_vector'], dtype=torch.float).to(device)


              precision    recall  f1-score   support

        fair       0.00      0.00      0.00         1
        gpt2       0.00      0.00      0.00         1
        gpt3       0.98      1.00      0.99       105
      grover       0.00      0.00      0.00         2
       human       0.99      1.00      0.99       210
 instructgpt       0.00      0.00      0.00         1

    accuracy                           0.98       320
   macro avg       0.33      0.33      0.33       320
weighted avg       0.97      0.98      0.98       320

-------------------AA PAPER-----------------
              precision    recall  f1-score   support

       human       0.99      1.00      0.99       210
     machine       1.00      0.97      0.99       110

    accuracy                           0.99       320
   macro avg       0.99      0.99      0.99       320
weighted avg       0.99      0.99      0.99       320


              precision    recall  f1-score   support

        ctrl       0.00      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


6. InstructGPT

In [None]:
test_df = pd.read_csv('../nlp-data/liwc_pos_dep_eval.csv')
test_df = test_df[test_df['alg'].isin(['instructgpt','human'])]
reddit_df = pd.read_csv('../nlp-data/liwc_pos_dep_reddit.csv')
reddit_df = reddit_df[reddit_df['alg'].isin(['instructgpt','human'])]

test_features = get_features_test(test_df, vectorizer)
test_dataloader, reddit_dataloader = data_batcher_evaluate(test_df, reddit_df, vectorizer, labels2id, batch_size=32, model_name="roberta-base")

# For AA Paper
preds, true_label = evaluate_test(model, test_dataloader, labels2id, test_df["alg"])
preds = ['machine' if item != 'human' else item for item in preds]
true_label = true_label.replace(['fair', 'grover', 'gpt2', 'gpt3', 'instructgpt', 'gpt', 'ctrl', 'pplm', 'xlnet', 'xlm'],'machine')
print("-------------------AA PAPER-----------------")
print(classification_report(preds,true_label))
print()
plot_cm(preds,true_label)

# For Reddit
preds, true_label = evaluate_test(model, reddit_dataloader, labels2id, reddit_df["alg"])
preds = ['machine' if item != 'human' else item for item in preds]
true_label = true_label.replace(['fair', 'grover', 'gpt2', 'gpt3', 'instructgpt', 'gpt', 'ctrl', 'pplm', 'xlnet', 'xlm'],'machine')
print("-------------------REDDIT DATA-----------------")
print(classification_report(preds,true_label))
print()
plot_cm(preds,true_label)


  tfidf_vector = torch.tensor(batch['tfidf_vector'], dtype=torch.float).to(device)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  tfidf_vector = torch.tensor(batch['tfidf_vector'], dtype=torch.float).to(device)


              precision    recall  f1-score   support

        fair       0.00      0.00      0.00         1
        gpt3       0.00      0.00      0.00        37
      grover       0.00      0.00      0.00         1
       human       0.99      1.00      0.99       212
 instructgpt       0.64      1.00      0.78        68

    accuracy                           0.87       319
   macro avg       0.33      0.40      0.35       319
weighted avg       0.80      0.87      0.83       319

-------------------AA PAPER-----------------
              precision    recall  f1-score   support

       human       0.99      1.00      0.99       212
     machine       0.99      0.98      0.99       107

    accuracy                           0.99       319
   macro avg       0.99      0.99      0.99       319
weighted avg       0.99      0.99      0.99       319


              precision    recall  f1-score   support

        ctrl       0.00      0.00      0.00         2
         gpt       0.00      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


7. GPT

In [None]:
test_df = pd.read_csv('../nlp-data/liwc_pos_dep_eval.csv')
reddit_df = pd.read_csv('../nlp-data/liwc_pos_dep_reddit.csv')
test_df = test_df[test_df['alg'].isin(['gpt','human'])]

test_features = get_features_test(test_df, vectorizer)
test_dataloader, reddit_dataloader = data_batcher_evaluate(test_df, reddit_df, vectorizer, labels2id, batch_size=32, model_name="roberta-base")

# For AA Paper
preds, true_label = evaluate_test(model, test_dataloader, labels2id, test_df["alg"])
preds = ['machine' if item != 'human' else item for item in preds]
true_label = true_label.replace(['fair', 'grover', 'gpt2', 'gpt3', 'instructgpt', 'gpt', 'ctrl', 'pplm', 'xlnet', 'xlm'],'machine')
print("-------------------AA PAPER-----------------")
print(classification_report(preds,true_label))
print()
plot_cm(preds,true_label)


  tfidf_vector = torch.tensor(batch['tfidf_vector'], dtype=torch.float).to(device)


              precision    recall  f1-score   support

        fair       0.00      0.00      0.00         1
         gpt       1.00      1.00      1.00       106
      grover       0.00      0.00      0.00         2
       human       0.99      1.00      0.99       210

    accuracy                           0.99       319
   macro avg       0.50      0.50      0.50       319
weighted avg       0.98      0.99      0.99       319

-------------------AA PAPER-----------------
              precision    recall  f1-score   support

       human       0.99      1.00      0.99       210
     machine       1.00      0.97      0.99       109

    accuracy                           0.99       319
   macro avg       0.99      0.99      0.99       319
weighted avg       0.99      0.99      0.99       319




  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


8. PPLM

In [None]:
test_df = pd.read_csv('../nlp-data/liwc_pos_dep_eval.csv')
test_df = test_df[test_df['alg'].isin(['pplm','human'])]

test_features = get_features_test(test_df, vectorizer)
test_dataloader, reddit_dataloader = data_batcher_evaluate(test_df, reddit_df, vectorizer, labels2id, batch_size=32, model_name="roberta-base")

# For AA Paper
preds, true_label = evaluate_test(model, test_dataloader, labels2id, test_df["alg"])
preds = ['machine' if item != 'human' else item for item in preds]
true_label = true_label.replace(['fair', 'grover', 'gpt2', 'gpt3', 'instructgpt', 'gpt', 'ctrl', 'pplm', 'xlnet', 'xlm'],'machine')
print("-------------------AA PAPER-----------------")
print(classification_report(preds,true_label))
print()
plot_cm(preds,true_label)


  tfidf_vector = torch.tensor(batch['tfidf_vector'], dtype=torch.float).to(device)


              precision    recall  f1-score   support

        fair       0.00      0.00      0.00         1
      grover       0.00      0.00      0.00         2
       human       0.99      1.00      0.99       210
        pplm       1.00      1.00      1.00       106

    accuracy                           0.99       319
   macro avg       0.50      0.50      0.50       319
weighted avg       0.98      0.99      0.99       319

-------------------AA PAPER-----------------
              precision    recall  f1-score   support

       human       0.99      1.00      0.99       210
     machine       1.00      0.97      0.99       109

    accuracy                           0.99       319
   macro avg       0.99      0.99      0.99       319
weighted avg       0.99      0.99      0.99       319




  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


9. XLNET

In [None]:
test_df = pd.read_csv('../nlp-data/liwc_pos_dep_eval.csv')
test_df = test_df[test_df['alg'].isin(['xlnet','human'])]

test_features = get_features_test(test_df, vectorizer)
test_dataloader, reddit_dataloader = data_batcher_evaluate(test_df, reddit_df, vectorizer, labels2id, batch_size=32, model_name="roberta-base")

# For AA Paper
preds, true_label = evaluate_test(model, test_dataloader, labels2id, test_df["alg"])
preds = ['machine' if item != 'human' else item for item in preds]
true_label = true_label.replace(['fair', 'grover', 'gpt2', 'gpt3', 'instructgpt', 'gpt', 'ctrl', 'pplm', 'xlnet', 'xlm'],'machine')
print("-------------------AA PAPER-----------------")
print(classification_report(preds,true_label))
print()
plot_cm(preds,true_label)


  tfidf_vector = torch.tensor(batch['tfidf_vector'], dtype=torch.float).to(device)


              precision    recall  f1-score   support

        fair       0.00      0.00      0.00         1
      grover       0.00      0.00      0.00         1
       human       0.99      1.00      1.00       211
       xlnet       1.00      1.00      1.00       107

    accuracy                           0.99       320
   macro avg       0.50      0.50      0.50       320
weighted avg       0.99      0.99      0.99       320

-------------------AA PAPER-----------------
              precision    recall  f1-score   support

       human       0.99      1.00      1.00       211
     machine       1.00      0.98      0.99       109

    accuracy                           0.99       320
   macro avg       1.00      0.99      0.99       320
weighted avg       0.99      0.99      0.99       320




  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


10. XLM

In [None]:
test_df = pd.read_csv('../nlp-data/liwc_pos_dep_eval.csv')
test_df = test_df[test_df['alg'].isin(['xlm','human'])]

test_features = get_features_test(test_df, vectorizer)
test_dataloader, reddit_dataloader = data_batcher_evaluate(test_df, reddit_df, vectorizer, labels2id, batch_size=32, model_name="roberta-base")

# For AA Paper
preds, true_label = evaluate_test(model, test_dataloader, labels2id, test_df["alg"])
preds = ['machine' if item != 'human' else item for item in preds]
true_label = true_label.replace(['fair', 'grover', 'gpt2', 'gpt3', 'instructgpt', 'gpt', 'ctrl', 'pplm', 'xlnet', 'xlm'],'machine')
print("-------------------AA PAPER-----------------")
print(classification_report(preds,true_label))
print()
plot_cm(preds,true_label)


  tfidf_vector = torch.tensor(batch['tfidf_vector'], dtype=torch.float).to(device)


              precision    recall  f1-score   support

        fair       0.00      0.00      0.00         1
      grover       0.00      0.00      0.00         2
       human       0.99      1.00      0.99       210
         xlm       1.00      1.00      1.00       107

    accuracy                           0.99       320
   macro avg       0.50      0.50      0.50       320
weighted avg       0.98      0.99      0.99       320

-------------------AA PAPER-----------------
              precision    recall  f1-score   support

       human       0.99      1.00      0.99       210
     machine       1.00      0.97      0.99       110

    accuracy                           0.99       320
   macro avg       0.99      0.99      0.99       320
weighted avg       0.99      0.99      0.99       320




  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
