## Problem: Detection of aggressive tweets

Training dataset: 12776 tweets<br/> 
Validation dataset: 3194 tweets<br/> 
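Test dataset: 3993 tweets<br/>
Note: the training and validation sets are merged (15970 tweets in total) before the saved models are scored below, so the reported train accuracy covers both.<br/>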
Tweets are labeled (by human annotators) as:
* 1 (Cyber-Aggressive)
* 0 (Non Cyber-Aggressive)

In [53]:
import pandas as pd
import numpy as np

import pickle
import os
from keras.models import load_model
from keras.utils import print_summary

from mytextpreprocessing import TextPreprocessor
from mytextpreprocessing import FrequencyExtractor
from mytextpreprocessing import WordToIndexTransformer

## Data

In [3]:
# Each split is stored as its own JSON file inside the data directory.
data_path = './Data'
train_file, valid_file, test_file = (
    os.path.join(data_path, json_name)
    for json_name in ('train.json', 'valid.json', 'test.json')
)

# Read every split into a separate DataFrame.
data_train = pd.read_json(train_file)
data_valid = pd.read_json(valid_file)
data_test = pd.read_json(test_file)

In [4]:
# Inputs come from the `content` column and targets from the `label` column
# (0/1 as described in the notebook introduction).
X_train, X_valid, X_test = (
    frame.content for frame in (data_train, data_valid, data_test)
)
y_train, y_valid, y_test = (
    frame.label for frame in (data_train, data_valid, data_test)
)

# Append the validation split to the training split; from here on
# X_train / y_train are NumPy arrays holding train followed by valid.
X_train, y_train = np.r_[X_train, X_valid], np.r_[y_train, y_valid]

In [5]:
# Class balance of the merged (train + valid) labels: distinct label values and how often each occurs.
np.unique(y_train, return_counts=True)

(array([0, 1]), array([9681, 6289]))

In [6]:
# Class balance of the test labels: distinct label values and how often each occurs.
np.unique(y_test, return_counts=True)

(array([0, 1]), array([2460, 1533]))

## Testing of Models

In [7]:
models_path = 'Models'
file_names = ['logisticregression.p', 'rbfsvm.p', 'baggingtree.p', 'xgb.p']

# One row per saved model, indexed by its file name. The Keras evaluation
# cell further down adds its own rows to this same frame.
results = pd.DataFrame(columns=['train_accuracy', 'test_accuracy'])
for file_name in file_names:
    print(file_name)
    # SECURITY: pickle.load can execute arbitrary code contained in the file;
    # only load model files that you produced yourself or otherwise trust.
    # NOTE(review): unpickling presumably needs the mytextpreprocessing classes
    # imported at the top of the notebook to be importable -- confirm.
    with open(os.path.join(models_path, file_name), 'rb') as model_file:
        loaded_model = pickle.load(model_file)
    # Scoring happens after the file is closed, so the handle is not held
    # open for the duration of the (potentially slow) predictions.
    # The models are scored on the raw X arrays, so each pickled object
    # presumably bundles its own text preprocessing -- TODO confirm.
    train_score = loaded_model.score(X_train, y_train)
    test_score = loaded_model.score(X_test, y_test)
    results.loc[file_name] = [train_score, test_score]

logisticregression.p
rbfsvm.p
baggingtree.p
xgb.p


  if diff:
  if diff:


In [45]:
# Load the pickled word-to-index transformer from the same directory as the
# saved models (models_path is defined in the model-scoring cell above).
# SECURITY: pickle.load can execute arbitrary code contained in the file;
# only load files that you produced yourself or otherwise trust.
with open(os.path.join(models_path, 'wordToIndex.p'), 'rb') as file:
    wordToIndex = pickle.load(file)

# Run both splits through the same transformer so that the neural models
# below receive identically encoded inputs.
X_rnn_train = wordToIndex.transform(X_train)
X_rnn_test = wordToIndex.transform(X_test)

In [57]:
# Evaluate the saved Keras models on the index-encoded inputs and add one
# row per model to the shared `results` frame.
file_names = ['birnn.h5', 'lstmrnn.h5']
for file_name in file_names:
    print(file_name)
    loaded_model = load_model(os.path.join(models_path, file_name))
    # Element [1] of the evaluate() result is stored as the accuracy.
    # NOTE(review): this assumes each model was compiled with accuracy as its
    # first (or only) metric, so the result is [loss, accuracy] -- confirm.
    train_score = loaded_model.evaluate(X_rnn_train, y_train)[1]
    test_score = loaded_model.evaluate(X_rnn_test, y_test)[1]
    results.loc[file_name] = [train_score, test_score]

birnn.h5
lstmrnn.h5


In [61]:
# Display the train/test scores collected for every evaluated model (one row per model file).
results

Unnamed: 0,train_accuracy,test_accuracy
logisticregression.p,0.976268,0.890308
rbfsvm.p,0.997245,0.96544
baggingtree.p,0.742142,0.700977
xgb.p,0.987664,0.916103
birnn.h5,0.989167,0.893814
lstmrnn.h5,0.975579,0.885049
