## Import and Preprocess Data

In [23]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
import tensorflow as tf
import random
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences


In [27]:
domain_data_1 = "./data/domain1_train.json"
domain_data_2 = "./data/domain2_train.json"
test_data = "./data/test_set.json"

def prepare_data(data, model=None):
    if model == 'lstm':
        texts = pad_sequences([i['text'] for i in data], padding="post")
        labels = np.array([i['label'] for i in data])
        return texts, labels.reshape(-1, 1)
    else:
        texts = [i['text'] for i in data]
        labels = np.array([i['label'] for i in data])
        return texts, labels

def build_model(train_x, dim=128):
    model = Sequential(
        [
            layers.Embedding(input_dim=5000, output_dim=dim, input_length=train_x.shape[1]),
            layers.SimpleRNN(128, return_sequences=True),
            layers.Dropout(0.5),
            layers.Dense(1, activation='sigmoid')
        ]
    )
    model.compile(optimizer='adam', loss='binary_crossentropy',  metrics=['accuracy'])

    return model

def to_csv(predictions, name='./data/result.csv'):
    predictions = np.where(predictions > 0.5, 1, 0).reshape(-1)
    df = pd.DataFrame({'id': range(len(predictions)), 'label': predictions})
    df.to_csv(name, index=False)

def load(path):
    data = []
    with open(path, 'r') as f:
        for line in f:
            data.append(json.loads(line))
    return data

def augment(data):
    machine_txts = []
    human_txts = []
    for i in data:
        if i['label'] == 0:
            machine_txts.append(i)
        else:
            human_txts.append(i)
    num = len(machine_txts) - len(human_txts)

    expand = np.random.choice(human_txts, num)
    human_txts.extend(expand)

    print(f"Humans: {len(human_txts)}, Machines: {len(machine_txts)}")

    new_data = human_txts + machine_txts
    random.shuffle(new_data)
    return new_data

def vectorize(x_train, x_val, n_grams=(1, 2)):
    x_train_txt = [' '.join(map(str, seq)) for seq in x_train]
    x_val_txt = [' '.join(map(str, seq)) for seq in x_val]

    vectorizer = CountVectorizer(ngram_range=n_grams)
    x_train_transformed = vectorizer.fit_transform(x_train_txt)
    x_val_transformed = vectorizer.transform(x_val_txt)

    return x_train_transformed, x_val_transformed


In [36]:
train = load(domain_data_1)
texts, labels = prepare_data(train, model='lstm')

x_train, x_val, y_train, y_val = train_test_split(texts, labels, test_size=0.2, random_state=42)

## RNN Model

In [37]:
model_r = build_model(x_train)
early_stopping = keras.callbacks.EarlyStopping(monitor='val_loss', patience=1, restore_best_weights=True)

In [38]:
history = model_r.fit(x_train, y_train, epochs=10, batch_size=100, validation_data=(x_val, y_val), callbacks=[early_stopping])

Epoch 1/10
Epoch 2/10
Epoch 3/10


In [39]:
predications = model_r.predict(x_val)



## Logistic Regression Model

In [47]:
train = load(domain_data_2)
train = augment(train)
texts, labels = prepare_data(train)

x_train, x_val, y_train, y_val = train_test_split(texts, labels, test_size=0.2, random_state=42)
x_train, x_val = vectorize(x_train, x_val)

Humans: 12750, Machines: 12750


In [48]:
lg_model = LogisticRegression(
    penalty='l2',
    max_iter=1000,
    class_weight='balanced',
)

gridsearch = GridSearchCV(
    lg_model,
    param_grid={
        'C': [0.01, 0.1, 1, 10],
    },
    cv=10,
    scoring='accuracy',
    verbose=1,
)

gridsearch.fit(x_train, y_train.ravel())
best_model = gridsearch.best_estimator_
predictions_lr = best_model.predict(x_val)
acc = accuracy_score(y_val, predictions_lr)
print(f"Accuracy: {acc}")


Fitting 10 folds for each of 4 candidates, totalling 40 fits
Accuracy: 0.9494117647058824


## Bagging Model

In [49]:

base_clf = DecisionTreeClassifier(max_depth=10)
bag_clf = BaggingClassifier(estimator=base_clf, n_estimators=100, max_samples=0.5, n_jobs=-1, bootstrap=True, bootstrap_features=True)
bag_clf.fit(x_train, y_train)

predictions_b = bag_clf.predict(x_val)
acc = accuracy_score(y_val, predictions_b)
print(f"Accuracy: {acc}")


Accuracy: 0.7243137254901961


## Output