In [75]:
from __future__ import print_function

import math

from IPython import display
from matplotlib import cm
from matplotlib import gridspec
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from sklearn import metrics
import tensorflow as tf
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from tensorflow.python.data import Dataset
import db
import sys
import pymongo
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from tensorflow.python.keras import models
from tensorflow.python.keras.layers import Dense
from tensorflow.python.keras.layers import Dropout
from IPython.core.interactiveshell import InteractiveShell
# Notebook-wide IPython setting: with "all", every top-level expression in a
# cell is echoed as output, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

# Connect to the MongoDB server on localhost and open the "transferdb" database.
myclient = pymongo.MongoClient("mongodb://localhost:27017/")
transferdb = myclient["transferdb"]

# Two source collections; judging by their names they hold the negatively and
# positively labelled tweets respectively (NOTE(review): confirm against the DB).
collfalse = transferdb["labelled__false_tweets"]
colltrue = transferdb["labelled_tweets"]

# Materialise every document of each collection into its own DataFrame.
df_false = pd.DataFrame(list(collfalse.find()))
df_true = pd.DataFrame(list(colltrue.find()))

# ignore_index=True gives the merged frame a fresh, unique 0..n-1 index.
# Without it both halves keep their own 0-based labels, so every label in the
# shorter frame appears twice and label-based lookups become ambiguous.
df_merged = pd.concat([df_false, df_true], ignore_index=True)

# --- Settings consumed by the tf-idf vectorization step ---

# Smallest and largest n-gram size (both ends inclusive) produced by the
# tokenizer: (1, 2) means unigrams and bigrams.
NGRAM_RANGE = (1, 2)

# Upper bound on how many features are kept after feature selection.
TOP_K = 20000

# Tokenization unit: "word" for word n-grams, "char" for character n-grams.
TOKEN_MODE = "word"

# Tokens seen in fewer documents than this are dropped from the vocabulary.
MIN_DOCUMENT_FREQUENCY = 2

In [76]:
def preprocess_features(df):
    """Return the "tweet_text" column of `df` as a plain Python list.

    # Arguments
        df: pandas DataFrame containing a "tweet_text" column.

    # Returns
        list with one entry per row of `df`, in row order.
    """
    return df["tweet_text"].tolist()

def preprocess_targets(df):
    """Turn the "label" column of `df` into a list of 0.0 / 1.0 floats.

    A row maps to 1.0 when its label equals the string "True" exactly and
    to 0.0 for any other value.

    # Arguments
        df: pandas DataFrame containing a "label" column.

    # Returns
        list of floats, one per row of `df`, in row order.
    """
    is_positive = df["label"] == "True"
    return is_positive.astype(float).tolist()
    

    
    

In [77]:
# making the training sets
TRAIN_SIZE = 100        # number of shuffled rows used for training
VALIDATION_SIZE = 50    # number of shuffled rows held out for validation
SPLIT_SEED = 42         # fixed seed so a fresh kernel reproduces the same split

# Shuffle all rows; random_state makes the permutation deterministic.
shuffeled = df_merged.sample(frac=1, random_state=SPLIT_SEED)

train_rows = shuffeled.head(TRAIN_SIZE)
# Draw the validation rows only from what remains after the training slice.
# With at least TRAIN_SIZE + VALIDATION_SIZE rows this is the same as
# shuffeled.tail(VALIDATION_SIZE); with fewer rows it avoids training rows
# leaking into the validation set.
validation_rows = shuffeled.iloc[TRAIN_SIZE:].tail(VALIDATION_SIZE)

training_examples = preprocess_features(train_rows)
training_targets = preprocess_targets(train_rows)

validation_examples = preprocess_features(validation_rows)
validation_targets = preprocess_targets(validation_rows)





In [78]:
# Print a heading and then render each split: examples first, then targets,
# training before validation in both cases.
for heading, payload in (
        ("Training examples summary:", training_examples),
        ("Validation examples summary:", validation_examples),
        ("Training targets summary:", training_targets),
        ("Validation targets summary:", validation_targets)):
    print(heading)
    display.display(payload)

Training examples summary:


[u'Jurgen Klopp believes Liverpool stars will welcome new arrivals #epl : Midfield trio Fabinho, Naby Keita and Xherdan Shaqiri have arrived since the start of the transfer window and Klopp looks set to land goalkeeper Alisson... https:// eplfeeds.com/article/jurgen -klopp-believes-liverpool-stars-will-welcome-new-arrivals \u2026',
 u'Manchester United transfer news LIVE Man Utd squad confirmed ahead of Spurs fixture https:// ift.tt/2LtOybM',
 u'NEWS : Leeds United in talks with Premier League club over late transfer #LUFC https:// footballleagueworld.co.uk/leeds-united-i n-talks-with-premier-league-club-over-late-transfer/ \u2026',
 u'Report: Alireza Jahanbakhsh flown in for medical ahead of \xa319.4m Leicester move #leicestercityfc #premierleague http://www. hitc.com/en-gb/2018/07/ 13/report-alireza-jahanbakhsh-undergoes-medical-ahead-of-194m-leice/?utm_medium=share%20service&utm_campaign=social%20media&utm_source=twitter&utm_content=HITCdeadlineday \u2026',
 u'Report: Nottingham For

Validation examples summary:


[u"Tottenham Hotspur transfer news : Rennes in talks to sign Georges-Kevin N'Koudou",
 u'Richarlison heading for Everton medical ahead of a transfer from Watford - fee in excess of \xa340m',
 u'Report: Yannick Bolasie travels for Middlesbrough medical ahead of Everton exit #efc #boro #evertonfc #premierleague http://www. hitc.com/en-gb/2018/08/ 20/report-yannick-bolasie-travels-for-middlesbrough-medical-ahead-o/?utm_medium=share%20service&utm_campaign=social%20media&utm_source=twitter&utm_content=HITCdeadlineday \u2026',
 u'Arsenal in pole to land ex-City star as agent confirms London medical #epl : The agent of Yaya Toure has claimed that the former Manchester City midfielder has had a medical in London ahead of a move. The Ivorian ended his e... https:// eplfeeds.com/article/arsena l-in-pole-to-land-ex-city-star-as-agent-confirms-london-medical \u2026',
 u'Loaned him out again... ? RT @DeadlineDayLive : DEAL DONE : Fulham have signed Timothy Fosu-Mensah from Manchester United on a se

Training targets summary:


[1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 1.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 1.0,
 1.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 1.0,
 1.0,
 0.0,
 0.0,
 1.0,
 0.0,
 1.0,
 0.0,
 0.0,
 1.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 1.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 1.0,
 0.0,
 0.0,
 0.0,
 1.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 1.0,
 0.0,
 1.0,
 1.0,
 0.0,
 1.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 1.0,
 0.0]

Validation targets summary:


[0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 1.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 1.0,
 1.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 1.0,
 1.0,
 0.0,
 0.0,
 0.0,
 1.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0]

In [79]:
def ngram_vectorize(train_texts, train_labels, val_texts):
    """Vectorizes texts as n-gram vectors.

    1 text = 1 tf-idf vector the length of vocabulary of unigrams + bigrams.

    # Arguments
        train_texts: list, training text strings.
        train_labels: np.ndarray, training labels.
        val_texts: list, validation text strings.

    # Returns
        x_train, x_val: vectorized training and validation texts, reduced to
            the selected features and cast to float32.
    """
    # Create keyword arguments to pass to the 'tf-idf' vectorizer.
    kwargs = {
            'ngram_range': NGRAM_RANGE,  # Use 1-grams + 2-grams.
            # tf-idf weights are fractional, so the dtype must be a float type.
            # An integer dtype is not honoured: scikit-learn converts it to
            # float64 anyway (and newer releases warn about it), so request
            # float64 explicitly.
            'dtype': np.float64,
            'strip_accents': 'unicode',
            'decode_error': 'replace',
            'analyzer': TOKEN_MODE,  # Split text into word tokens.
            'min_df': MIN_DOCUMENT_FREQUENCY,
    }
    vectorizer = TfidfVectorizer(**kwargs)

    # Learn vocabulary from training texts and vectorize training texts.
    x_train = vectorizer.fit_transform(train_texts)

    # Vectorize validation texts with the vocabulary learned above, so that
    # both matrices share the same columns.
    x_val = vectorizer.transform(val_texts)

    # Select top 'k' of the vectorized features. k is capped at the vocabulary
    # size because SelectKBest rejects k larger than the number of features.
    selector = SelectKBest(f_classif, k=min(TOP_K, x_train.shape[1]))
    # The selector is fitted on training data only; validation data is merely
    # projected onto the chosen columns.
    selector.fit(x_train, train_labels)
    x_train = selector.transform(x_train).astype('float32')
    x_val = selector.transform(x_val).astype('float32')
    return x_train, x_val

In [80]:
# Hand the labels to the vectorizer as a numpy array, then inspect the
# vectorized validation matrix.
ndarr = np.asarray(training_targets)

x_train, x_val = ngram_vectorize(
    train_texts=training_examples,
    train_labels=ndarr,
    val_texts=validation_examples)
print(x_val)

  (0, 214)	0.27870458
  (0, 215)	0.27870458
  (0, 251)	0.27870458
  (0, 252)	0.28923038
  (0, 264)	0.12392044
  (0, 266)	0.14799255
  (0, 291)	0.27870458
  (0, 292)	0.27870458
  (0, 296)	0.27870458
  (0, 376)	0.17995387
  (0, 474)	0.19896379
  (0, 516)	0.14613773
  (0, 519)	0.24043378
  (0, 551)	0.11857423
  (0, 561)	0.21478193
  (0, 565)	0.23459212
  (0, 566)	0.27870458
  (0, 571)	0.16236429
  (0, 573)	0.20647675
  (1, 39)	0.24746309
  (1, 40)	0.24746309
  (1, 172)	0.31667438
  (1, 173)	0.3343153
  (1, 180)	0.29007414
  (1, 186)	0.17502454
  :	:
  (48, 520)	0.19462802
  (48, 551)	0.065122634
  (48, 631)	0.17353398
  (49, 39)	0.20010394
  (49, 40)	0.20010394
  (49, 113)	0.3634268
  (49, 124)	0.08044598
  (49, 241)	0.27033448
  (49, 242)	0.27033448
  (49, 272)	0.16135573
  (49, 275)	0.28872502
  (49, 307)	0.27033448
  (49, 347)	0.15067653
  (49, 348)	0.27033448
  (49, 357)	0.20566629
  (49, 366)	0.1557934
  (49, 371)	0.21849447
  (49, 389)	0.28872502
  (49, 393)	0.14593907
  (49, 419)	0

In [81]:
def _get_last_layer_units_and_activation(num_classes):
    """Gets the # units and activation function for the last network layer.

    # Arguments
        num_classes: int, number of classes.

    # Returns
        units, activation values.
    """
    if num_classes == 2:
        activation = 'sigmoid'
        units = 1
    else:
        activation = 'softmax'
        units = num_classes
    return units, activation

In [89]:
def mlp_model(layers, units, dropout_rate, input_shape, num_classes):
    """Builds a multi-layer perceptron as a Keras `Sequential` model.

    The stack is: input Dropout, then (layers - 1) pairs of
    Dense(relu) + Dropout, then the output Dense layer.

    # Arguments
        layers: int, number of `Dense` layers in the model (output included).
        units: int, output dimension of the hidden layers.
        dropout_rate: float, percentage of input to drop at Dropout layers.
        input_shape: tuple, shape of input to the model.
        num_classes: int, number of output classes.

    # Returns
        An MLP model instance (not compiled).
    """
    output_units, output_activation = _get_last_layer_units_and_activation(
        num_classes)

    network = models.Sequential()
    # Dropout is applied directly to the input features.
    network.add(Dropout(rate=dropout_rate, input_shape=input_shape))

    # The output layer counts as one of `layers`, hence one fewer hidden block.
    hidden_block_count = layers - 1
    for _ in range(hidden_block_count):
        hidden_block = (Dense(units=units, activation='relu'),
                        Dropout(rate=dropout_rate))
        for layer in hidden_block:
            network.add(layer)

    network.add(Dense(units=output_units, activation=output_activation))
    return network


In [94]:
def train_ngram_model(data,
                      learning_rate=1e-3,
                      epochs=1000,
                      batch_size=128,
                      layers=2,
                      units=64,
                      dropout_rate=0.2):
    """Trains n-gram model on the given dataset.

    The texts are tf-idf vectorized, an MLP is fitted with early stopping on
    the validation loss, and the trained model is written to
    'twitter_model.h5' in the current working directory.

    # Arguments
        data: tuples of training and test texts and labels, laid out as
            ((train_texts, train_labels), (val_texts, val_labels)).
        learning_rate: float, learning rate for training model.
        epochs: int, number of epochs.
        batch_size: int, number of samples per batch.
        layers: int, number of `Dense` layers in the model.
        units: int, output dimension of Dense layers in the model.
        dropout_rate: float: percentage of input to drop at Dropout layers.

    # Returns
        (val_acc, val_loss) tuple taken from the last epoch that was run.

    # Raises
        ValueError: If validation data has label values outside
            range(num_classes).
    """
    # Get the data.
    (train_texts, train_labels), (val_texts, val_labels) = data

    # The number of classes is fixed to 2 here (labels are expected to be
    # 0 and 1).
    num_classes = 2

    # Verify that validation labels are in the same range as training labels.
    unexpected_labels = [v for v in val_labels if v not in range(num_classes)]
    if unexpected_labels:
        raise ValueError('Unexpected label values found in the validation set:'
                         ' {unexpected_labels}. Please make sure that the '
                         'labels in the validation set are in the same range '
                         'as training labels.'.format(
                             unexpected_labels=unexpected_labels))

    # scikit-learn and Keras both work on arrays; callers may pass plain lists.
    train_labels = np.asarray(train_labels)
    val_labels = np.asarray(val_labels)

    # Vectorize texts.
    x_train, x_val = ngram_vectorize(
        train_texts, train_labels, val_texts)

    # Create model instance. shape[1:] drops the sample axis, leaving the
    # per-example feature shape.
    model = mlp_model(layers=layers,
                      units=units,
                      dropout_rate=dropout_rate,
                      input_shape=x_train.shape[1:],
                      num_classes=num_classes)

    # Compile model with learning parameters.
    if num_classes == 2:
        loss = 'binary_crossentropy'
    else:
        loss = 'sparse_categorical_crossentropy'
    optimizer = tf.keras.optimizers.Adam(lr=learning_rate)
    model.compile(optimizer=optimizer, loss=loss, metrics=['acc'])

    # Create callback for early stopping on validation loss. If the loss does
    # not decrease in two consecutive tries, stop training.
    callbacks = [tf.keras.callbacks.EarlyStopping(
        monitor='val_loss', patience=2)]

    # Train and validate model.
    history = model.fit(
            x_train,
            train_labels,
            epochs=epochs,
            callbacks=callbacks,
            validation_data=(x_val, val_labels),
            verbose=2,  # Logs once per epoch.
            batch_size=batch_size)

    # Print results.
    history = history.history
    print('Validation accuracy: {acc}, loss: {loss}'.format(
            acc=history['val_acc'][-1], loss=history['val_loss'][-1]))

    # Save model.
    model.save('twitter_model.h5')
    return history['val_acc'][-1], history['val_loss'][-1]

In [97]:
# Bundle the splits in the ((train_texts, train_labels), (val_texts, val_labels))
# layout that train_ngram_model unpacks, then train with explicit hyperparameters.
data = (
    (training_examples, training_targets),
    (validation_examples, validation_targets),
)
train_ngram_model(
    data,
    learning_rate=1e-3,
    epochs=1000,
    batch_size=128,
    layers=2,
    units=64,
    dropout_rate=0.2,
)

Train on 100 samples, validate on 50 samples
Epoch 1/1000
 - 0s - loss: 0.6887 - acc: 0.6000 - val_loss: 0.6866 - val_acc: 0.6000
Epoch 2/1000
 - 0s - loss: 0.6831 - acc: 0.6500 - val_loss: 0.6818 - val_acc: 0.6800
Epoch 3/1000
 - 0s - loss: 0.6767 - acc: 0.7200 - val_loss: 0.6770 - val_acc: 0.7000
Epoch 4/1000
 - 0s - loss: 0.6671 - acc: 0.8000 - val_loss: 0.6721 - val_acc: 0.7000
Epoch 5/1000
 - 0s - loss: 0.6617 - acc: 0.7800 - val_loss: 0.6673 - val_acc: 0.7000
Epoch 6/1000
 - 0s - loss: 0.6568 - acc: 0.7900 - val_loss: 0.6625 - val_acc: 0.6800
Epoch 7/1000
 - 0s - loss: 0.6454 - acc: 0.7900 - val_loss: 0.6577 - val_acc: 0.6800
Epoch 8/1000
 - 0s - loss: 0.6416 - acc: 0.7700 - val_loss: 0.6528 - val_acc: 0.6800
Epoch 9/1000
 - 0s - loss: 0.6297 - acc: 0.8800 - val_loss: 0.6479 - val_acc: 0.6800
Epoch 10/1000
 - 0s - loss: 0.6243 - acc: 0.8100 - val_loss: 0.6429 - val_acc: 0.6800
Epoch 11/1000
 - 0s - loss: 0.6171 - acc: 0.7900 - val_loss: 0.6379 - val_acc: 0.7000
Epoch 12/1000
 - 0

Epoch 96/1000
 - 0s - loss: 0.1138 - acc: 0.9900 - val_loss: 0.3672 - val_acc: 0.8400
Epoch 97/1000
 - 0s - loss: 0.1038 - acc: 0.9900 - val_loss: 0.3667 - val_acc: 0.8400
Epoch 98/1000
 - 0s - loss: 0.1046 - acc: 0.9800 - val_loss: 0.3661 - val_acc: 0.8400
Epoch 99/1000
 - 0s - loss: 0.0932 - acc: 0.9900 - val_loss: 0.3657 - val_acc: 0.8400
Epoch 100/1000
 - 0s - loss: 0.0963 - acc: 1.0000 - val_loss: 0.3653 - val_acc: 0.8400
Epoch 101/1000
 - 0s - loss: 0.0922 - acc: 0.9900 - val_loss: 0.3650 - val_acc: 0.8400
Epoch 102/1000
 - 0s - loss: 0.0890 - acc: 1.0000 - val_loss: 0.3648 - val_acc: 0.8400
Epoch 103/1000
 - 0s - loss: 0.0916 - acc: 1.0000 - val_loss: 0.3646 - val_acc: 0.8400
Epoch 104/1000
 - 0s - loss: 0.0824 - acc: 1.0000 - val_loss: 0.3645 - val_acc: 0.8400
Epoch 105/1000
 - 0s - loss: 0.0828 - acc: 1.0000 - val_loss: 0.3643 - val_acc: 0.8400
Epoch 106/1000
 - 0s - loss: 0.0887 - acc: 1.0000 - val_loss: 0.3642 - val_acc: 0.8400
Epoch 107/1000
 - 0s - loss: 0.0827 - acc: 1.00

(0.84, 0.3634070158004761)

Train on 100 samples, validate on 50 samples
Epoch 1/1000
 - 0s - loss: 0.6853 - acc: 0.6500 - val_loss: 0.6705 - val_acc: 0.6800
Epoch 2/1000
 - 0s - loss: 0.6696 - acc: 0.6700 - val_loss: 0.6653 - val_acc: 0.6600
Epoch 3/1000
 - 0s - loss: 0.6623 - acc: 0.6900 - val_loss: 0.6602 - val_acc: 0.6600
Epoch 4/1000
 - 0s - loss: 0.6578 - acc: 0.7200 - val_loss: 0.6550 - val_acc: 0.6600
Epoch 5/1000
 - 0s - loss: 0.6433 - acc: 0.7400 - val_loss: 0.6499 - val_acc: 0.6600
Epoch 6/1000
 - 0s - loss: 0.6390 - acc: 0.7000 - val_loss: 0.6449 - val_acc: 0.6600
Epoch 7/1000
 - 0s - loss: 0.6317 - acc: 0.7200 - val_loss: 0.6399 - val_acc: 0.6600
Epoch 8/1000
 - 0s - loss: 0.6209 - acc: 0.7500 - val_loss: 0.6349 - val_acc: 0.6600
Epoch 9/1000
 - 0s - loss: 0.6135 - acc: 0.7200 - val_loss: 0.6298 - val_acc: 0.6600
Epoch 10/1000
 - 0s - loss: 0.6018 - acc: 0.7400 - val_loss: 0.6248 - val_acc: 0.6600
Epoch 11/1000
 - 0s - loss: 0.5961 - acc: 0.7500 - val_loss: 0.6196 - val_acc: 0.6600
Epoch 12/1000
 - 0

Epoch 96/1000
 - 0s - loss: 0.0943 - acc: 1.0000 - val_loss: 0.3699 - val_acc: 0.8400
Epoch 97/1000
 - 0s - loss: 0.0952 - acc: 1.0000 - val_loss: 0.3695 - val_acc: 0.8400
Epoch 98/1000
 - 0s - loss: 0.0853 - acc: 1.0000 - val_loss: 0.3692 - val_acc: 0.8400
Epoch 99/1000
 - 0s - loss: 0.0858 - acc: 0.9900 - val_loss: 0.3688 - val_acc: 0.8400
Epoch 100/1000
 - 0s - loss: 0.0859 - acc: 1.0000 - val_loss: 0.3684 - val_acc: 0.8400
Epoch 101/1000
 - 0s - loss: 0.0791 - acc: 1.0000 - val_loss: 0.3680 - val_acc: 0.8400
Epoch 102/1000
 - 0s - loss: 0.0793 - acc: 1.0000 - val_loss: 0.3676 - val_acc: 0.8400
Epoch 103/1000
 - 0s - loss: 0.0879 - acc: 0.9900 - val_loss: 0.3672 - val_acc: 0.8400
Epoch 104/1000
 - 0s - loss: 0.0809 - acc: 1.0000 - val_loss: 0.3670 - val_acc: 0.8400
Epoch 105/1000
 - 0s - loss: 0.0783 - acc: 1.0000 - val_loss: 0.3667 - val_acc: 0.8400
Epoch 106/1000
 - 0s - loss: 0.0763 - acc: 0.9900 - val_loss: 0.3665 - val_acc: 0.8400
Epoch 107/1000
 - 0s - loss: 0.0844 - acc: 1.00

(0.84, 0.3646441400051117)