## Problem: Detection of aggressive tweets

Training dataset: 12776 tweets<br/> 
Validation dataset: 3194 tweets<br/> 
Test dataset: 3993 tweets<br/>
Tweets are labeled by human annotators as:
* 1 (Cyber-Aggressive)
* 0 (Non Cyber-Aggressive)

In [2]:
import pandas as pd
import numpy as np

import pickle
import os
from keras.models import load_model
from keras.utils import print_summary

from mytextpreprocessing import TextPreprocessor
from mytextpreprocessing import FrequencyExtractor
from mytextpreprocessing import WordToIndexTransformer

In [3]:
import warnings

# Silence DeprecationWarning messages for the rest of the notebook so that
# cell outputs are not cluttered by them.
warnings.simplefilter('ignore', category=DeprecationWarning)

## Data

In [4]:
# Directory containing the JSON splits, relative to the working directory.
data_path = './Data'

# Read the three splits in a fixed order: train, validation, test.
data_train, data_valid, data_test = [
    pd.read_json(os.path.join(data_path, split_file))
    for split_file in ('train.json', 'valid.json', 'test.json')
]

In [5]:
# For each split, take the `content` column as model input and the `label`
# column as target.
X_train, y_train = data_train['content'], data_train['label']
X_valid, y_valid = data_valid['content'], data_valid['label']
X_test, y_test = data_test['content'], data_test['label']

# Fold the validation split into the training data. From here on X_train and
# y_train are NumPy arrays holding train + validation samples.
X_train = np.concatenate([X_train, X_valid])
y_train = np.concatenate([y_train, y_valid])

In [6]:
# Distinct label values and how often each occurs in y_train, which at this
# point already contains the validation labels appended to the training labels.
np.unique(y_train, return_counts=True)

(array([0, 1]), array([9681, 6289]))

In [7]:
# Distinct label values and how often each occurs in the test labels.
np.unique(y_test, return_counts=True)

(array([0, 1]), array([2460, 1533]))

## Testing of Models

In [8]:
# Directory with the saved models and the pickled models to evaluate.
models_path = 'Models'
file_names = [
    'logisticregression2.p', 'logisticregression18.p',
    'rbfsvm2.p', 'rbfsvm18.p',
    'baggingtree2.p', 'baggingtree25.p',
    'xgb2.p', 'xgb15.p', 'xgb31.p',
]

# One row per model file, holding its score on the merged train+validation
# data and on the test data.
results = pd.DataFrame(columns=['train_accuracy', 'test_accuracy'])
for file_name in file_names:
    model_location = os.path.join(models_path, file_name)
    # NOTE(review): pickle.load can execute arbitrary code; only load model
    # files that come from a trusted source.
    with open(model_location, 'rb') as file:
        print(file_name)
        loaded_model = pickle.load(file)

    # The models receive the raw X arrays directly, so they presumably bundle
    # their own text preprocessing, and .score presumably returns accuracy
    # (scikit-learn convention) -- TODO confirm against the training notebook.
    train_score = loaded_model.score(X_train, y_train)
    test_score = loaded_model.score(X_test, y_test)
    results.loc[file_name] = [train_score, test_score]

logisticregression2.p
logisticregression18.p
rbfsvm2.p
rbfsvm18.p
baggingtree2.p
baggingtree25.p
xgb2.p
xgb15.p
xgb31.p


In [10]:
# Load the pickled word-to-index transformer used to prepare input for the RNNs.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
word_to_index_location = os.path.join(models_path, 'wordToIndex.p')
with open(word_to_index_location, 'rb') as file:
    wordToIndex = pickle.load(file)

# Apply the transformer to the training data first, then to the test data.
X_rnn_train, X_rnn_test = (
    wordToIndex.transform(texts) for texts in (X_train, X_test)
)

In [11]:
# Saved Keras models to evaluate; their scores are appended to `results`.
file_names = ['bi30rnn.h5', 'lstm30rnn.h5']
for file_name in file_names:
    print(file_name)
    loaded_model = load_model(os.path.join(models_path, file_name))

    train_metrics = loaded_model.evaluate(X_rnn_train, y_train)
    test_metrics = loaded_model.evaluate(X_rnn_test, y_test)

    # Index 1 is the second value returned by evaluate() -- presumably the
    # accuracy metric following the loss; confirm against the model's compile().
    train_score = train_metrics[1]
    test_score = test_metrics[1]
    results.loc[file_name] = [train_score, test_score]

bi30rnn.h5
lstm30rnn.h5


In [13]:
# Display the collected scores rounded to three decimal places.
results.round(3)

Unnamed: 0,train_accuracy,test_accuracy
logisticregression2.p,0.976,0.89
logisticregression18.p,0.798,0.744
rbfsvm2.p,0.997,0.965
rbfsvm18.p,0.997,0.962
baggingtree2.p,0.742,0.701
baggingtree25.p,0.751,0.716
xgb2.p,0.988,0.916
xgb15.p,0.983,0.914
xgb31.p,0.99,0.916
bi30rnn.h5,0.993,0.906
