In [25]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import csv
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
import gensim

In [26]:
df = pd.read_csv(r'../data/adversarial_swap_train_final.csv')
df.head(5)
df.dropna(how='any', inplace=True)

In [27]:
X_train_raw, X_test_raw, y_train_raw, y_test_raw = train_test_split(df['clean_text'], df['classification'], test_size=0.2)
print(X_train_raw.shape)
print(X_test_raw.shape)
print(y_train_raw.shape)
print(y_test_raw.shape)

(13077,)
(3270,)
(13077,)
(3270,)


In [28]:
print(X_train_raw)

7219            oh u get must lol mum still isnt convinced
12531           coming myspaceee yr work though dnt bother
2872         ftw awh im sorry im probably going thing haha
8663                 love itthats going one new fave quote
14453                               happy mother day mommy
                               ...                        
7775     yes chloe youtube may th made feel x better ha...
5893     hey fused gaming fusedgaming forum delayed pm ...
2916                  cant stop smiling im best mood right
15690                                 woot woot super cool
743                                               arm hurt
Name: clean_text, Length: 13077, dtype: object


In [29]:
t = Tokenizer()
t.fit_on_texts(X_train_raw)
vocab_size = len(t.word_index) + 1
# integer encode the documents
encoded_docs = t.texts_to_sequences(X_train_raw)
max_length = 23
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')

In [95]:
def tokenize_data(train):
    X_train = []
    vocab = []
    for x in train:
        x = x.split(' ')
        for word in x:
            if word not in vocab:
                vocab.append(word)
        X_train.append(x)
    return X_train

X_train_tokens = tokenize_data(X_train_raw)
X_test_tokens = tokenize_data(X_test_raw)

In [31]:
dense_model = gensim.models.Word2Vec(X_train_tokens, window=5, min_count=1, seed=1)
# dense_model.wv.save_word2vec_format('test.txt', binary=False)
dim = 100

In [32]:
def create_data(train_data, label_data, model):
    n = 100
    m = len(train_data)
    X = np.zeros((m, n))
    y = label_data
    
    ind = 0
    for tokens in train_data:
        vec = np.zeros((n))
        count = 0
        for token in tokens:
            if token in model.wv:
                count += 1
                vec += model.wv[token]
        if count != 0:
            vec = np.divide(vec, count)
        X[ind] = vec
        ind += 1
    return X, y

In [34]:
X_train, y_train = create_data(X_train_tokens, y_train_raw, dense_model)

In [35]:
print(X_train.shape)

(13077, 100)


In [36]:
print(X_train_raw.shape)

(13077,)


In [37]:
X_test, y_test = create_data(X_test, y_test_raw, dense_model)

NameError: name 'X_test' is not defined

In [38]:
def load_glove_model(File):
    print("Loading Glove Model")
    glove_model = {}
    with open(File,'r') as f:
        for line in f:
            split_line = line.split()
            word = split_line[0]
            embedding = np.array(split_line[1:], dtype=np.float64)
            glove_model[word] = embedding
    print(f"{len(glove_model)} words loaded!")
    return glove_model

In [39]:
sparse_model = load_glove_model("output.txt")

Loading Glove Model
15872 words loaded!


In [96]:
print(sparse_model['cyprus'])

KeyError: 'cyprus'

In [92]:
def create_data_sparse(train_data, label_data, model):
    n = 100
    m = len(train_data)
    X = np.zeros((m, n))
    y = label_data
    
    ind = 0
    for tokens in train_data:
        vec = np.zeros((n))
        count = 0
        for token in tokens:
            if token in model:
                count += 1
                vec += model[token]
        if count != 0:
            vec = np.divide(vec, count)
        X[ind] = vec
        ind += 1
    return X, y

In [73]:
X_train, y_train = create_data_sparse(X_train_tokens, y_train_raw, sparse_model)
X_test, y_test = create_data_sparse(X_test_tokens, y_test_raw, sparse_model)

# Baselines

## AdaBoost

In [76]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score, f1_score

In [77]:
ada_model = AdaBoostClassifier(n_estimators=800, random_state = 1)
ada_model.fit(X_train, y_train)

AdaBoostClassifier(n_estimators=800, random_state=1)

In [78]:
test_pred = ada_model.predict(X_test)
print(accuracy_score(y_test, test_pred))
print(f1_score(y_test, test_pred))

0.6314984709480123
0.6053062561415001


## Sparse Linear Regression

In [80]:
from sklearn.linear_model import LogisticRegression

In [85]:
clf = LogisticRegression(random_state=0, solver='saga', penalty='l1', max_iter=7000)
clf.fit(X_train, y_train)

LogisticRegression(max_iter=7000, penalty='l1', random_state=0, solver='saga')

In [86]:
clf_pred = clf.predict(X_test)
print(accuracy_score(y_test, clf_pred))
print(f1_score(y_test, clf_pred))

0.5935779816513761
0.5912027068594278


In [89]:
clf_orig = LogisticRegression(random_state=0, solver='saga', penalty='none', max_iter=7000)
clf_orig.fit(X_train, y_train)



LogisticRegression(max_iter=7000, penalty='none', random_state=0, solver='saga')

In [90]:
clf_orig_pred = clf_orig.predict(X_test)
print(accuracy_score(y_test, clf_orig_pred))
print(f1_score(y_test, clf_orig_pred))

0.591131498470948
0.5900030665440049
