# Predicting if users are spammers or not

In [32]:
# Import the necessary libraries
from __future__ import absolute_import, division, print_function, unicode_literals
import neural_structured_learning as nsl
import tensorflow as tf

import csv
import pandas as pd
import matplotlib.pyplot as plt
pd.set_option('display.expand_frame_repr', False)

In [None]:
# Node table: tab-separated with no header row, so pandas assigns integer
# column labels.
# NOTE(review): this path spells the folder 'social spammer dataset' (spaces)
# while the tar cell uses 'social_spammer_dataset' (underscores) -- confirm
# which directory actually exists.
userdata = pd.read_csv(
    '/Volumes/Samsung_T5/deep_learning_data/social spammer dataset/preprocessed_data/userdata.nodes',
    sep="\t",
    header=None,
)

In [5]:
# Edge list: tab-separated with no header row; each row holds two node ids.
# NOTE(review): this path spells the folder 'social spammer dataset' (spaces)
# while the tar cell uses 'social_spammer_dataset' (underscores) -- confirm
# which directory actually exists.
relations = pd.read_csv(
    '/Volumes/Samsung_T5/deep_learning_data/social spammer dataset/preprocessed_data/relations.edges',
    sep="\t",
    header=None,
)

In [6]:
# Show the edge list; the rendered output is truncated for long frames.
relations

Unnamed: 0,0,1
0,3185072,1528455
1,5148962,3748312
2,4463760,64388
3,4884226,75811
4,625,564072
...,...,...
4995,1863471,5333852
4996,178627,502784
4997,2038748,555573
4998,28562,1344471


## Getting the data

In [33]:
# Unpack the dataset archive into /tmp; the recorded output lists
# relations.edges and userdata.nodes as the extracted files.
# NOTE(review): this path uses 'social_spammer_dataset' (underscores) while the
# read_csv cells use 'social spammer dataset' (spaces) -- confirm which exists.
!tar -C /tmp -xvzf /Volumes/Samsung_T5/deep_learning_data/social_spammer_dataset/preprocessed_data/social_spammer.tgz

x relations.edges
x userdata.nodes


In [35]:
# Join node features with graph edges and write the train/test TFRecord files
# that the rest of the notebook reads from /tmp.
# Slow step: the recorded run below reports a total running time of 12.95 minutes.
# Up to 5 neighbors are requested here (--max_nbrs=5); HParams.num_neighbors
# controls how many of them are actually parsed later.
!python preprocessing_spammer_dataset.py \
--input_content=/tmp/userdata.nodes \
--input_graph=/tmp/relations.edges \
--max_nbrs=5 \
--output_train_data=/tmp/train_merged_examples.tfr \
--output_test_data=/tmp/test_examples.tfr

  with open(in_file, 'rU') as cora_content:
Reading graph file: /tmp/relations.edges...
Done reading 5000 edges from: /tmp/relations.edges (0.02 seconds).
Making all edges bi-directional...
Done (0.01 seconds). Total graph nodes: 9882
Joining seed and neighbor tf.train.Examples with graph edges...
Done creating and writing 4486930 merged tf.train.Examples (369.93 seconds).
Out-degree histogram: [(0, 4479000), (1, 7890), (2, 40)]
Output training data written to TFRecord file: /tmp/train_merged_examples.tfr.
Output test data written to TFRecord file: /tmp/test_examples.tfr.
Total running time: 12.95 minutes.


In [36]:
### Experiment dataset
# TFRecord files written by the preprocessing command
# (--output_train_data / --output_test_data).
TRAIN_DATA_PATH = '/tmp/train_merged_examples.tfr'
TEST_DATA_PATH = '/tmp/test_examples.tfr'

### Constants used to identify neighbor features in the input.
# parse_example builds the keys 'NL_nbr_<i>_words' and 'NL_nbr_<i>_weight'
# from these two pieces.
NBR_FEATURE_PREFIX = 'NL_nbr_'
NBR_WEIGHT_SUFFIX = '_weight'

## Setting hyperparameters

In [101]:
class HParams(object):
  """Hyperparameters used for training.

  Every value is fixed in `__init__`; the notebook keeps a single shared
  instance in the module-level `HPARAMS`, which the parsing, dataset and
  model-building code reads from.
  """
  def __init__(self):
    ### dataset parameters
    # Number of output classes; sizes the final softmax layer of the MLP.
    self.num_classes = 2
    # Length of the int64 'words' vector parsed from every example and the
    # width of the model's input layer.
    self.max_seq_length = 89 # distinct features
    ### neural graph learning parameters
    # Forwarded to nsl.configs.make_graph_reg_config as `distance_type` and
    # `multiplier` respectively.
    self.distance_type = nsl.configs.DistanceType.L2
    self.graph_regularization_multiplier = 0.1
    # Number of neighbor feature sets parsed per example and the value passed
    # as `max_neighbors` to the graph regularization config. The preprocessing
    # command in this notebook requests up to 5 neighbors; only this many are
    # read back.
    self.num_neighbors = 1
    ### model architecture
    # One hidden Dense layer is created per entry, with that many units.
    self.num_fc_units = [50,50]
    ### training parameters
    # Passed as `epochs` to fit().
    self.train_epochs = 5
    # Batch size applied by make_dataset().
    self.batch_size = 150
    # Rate used by each Dropout layer in the MLP.
    self.dropout_rate = 0.5
    ### eval parameters
    # Not referenced by the cells shown in this notebook; None is intended to
    # mean that the whole test set is evaluated.
    self.eval_steps = None  # All instances in the test set are evaluated.

HPARAMS = HParams()

## Load train and test data in the format expected by NSL

In [90]:
def parse_example(example_proto):
  """Parses one serialized example into a `(features, label)` pair.

  Args:
    example_proto: A serialized `tf.train.Example`.

  Returns:
    A tuple `(features, label)`. `features` is a dict holding the sample's
    'words' vector plus, for each of the first `HPARAMS.num_neighbors`
    neighbors, that neighbor's 'words' vector and its weight. `label` is the
    scalar int64 ground-truth label, which has been removed from `features`.
  """
  vector_length = HPARAMS.max_seq_length

  def zero_filled_vector_feature():
    # Fixed-length int64 vector spec; an example that lacks the feature is
    # parsed as an all-zero vector instead of raising.
    return tf.io.FixedLenFeature(
        [vector_length],
        tf.int64,
        default_value=tf.constant(0, dtype=tf.int64, shape=[vector_length]))

  feature_spec = {
      'words': zero_filled_vector_feature(),
      # A missing label is parsed as -1.
      'label': tf.io.FixedLenFeature((), tf.int64, default_value=-1),
  }

  # Neighbor features follow the same layout as the sample's own features,
  # under keys built from the neighbor prefix and the neighbor index.
  for nbr_index in range(HPARAMS.num_neighbors):
    words_key = '{}{}_{}'.format(NBR_FEATURE_PREFIX, nbr_index, 'words')
    weight_key = '{}{}{}'.format(
        NBR_FEATURE_PREFIX, nbr_index, NBR_WEIGHT_SUFFIX)

    feature_spec[words_key] = zero_filled_vector_feature()

    # A weight of 0.0 for an absent neighbor means it contributes nothing to
    # graph regularization, so each sample is regularized only against the
    # neighbors it really has.
    feature_spec[weight_key] = tf.io.FixedLenFeature(
        [1], tf.float32, default_value=tf.constant([0.0]))

  parsed = tf.io.parse_single_example(example_proto, feature_spec)
  label = parsed.pop('label')
  return parsed, label


def make_dataset(file_path, training=False, shuffle_buffer_size=10000):
  """Builds a batched `tf.data` pipeline from a TFRecord file.

  Records are read from `file_path`, optionally shuffled, parsed with
  `parse_example` and grouped into batches of `HPARAMS.batch_size`.

  Args:
    file_path: Name of the file in the `.tfrecord` format containing
      `tf.train.Example` objects.
    training: Boolean indicating if we are in training mode. Records are
      shuffled only when this is true.
    shuffle_buffer_size: Number of records held in the shuffle buffer when
      `training` is true. Defaults to 10000, the value this function
      previously hard-coded. The preprocessing run recorded in this notebook
      wrote several million training examples, so a larger buffer may be
      worth trying if memory allows.

  Returns:
    A `tf.data.Dataset` yielding batches of `(features, labels)` as produced
    by `parse_example`.
  """
  dataset = tf.data.TFRecordDataset([file_path])
  if training:
    # Shuffle the serialized records before parsing.
    dataset = dataset.shuffle(shuffle_buffer_size)
  dataset = dataset.map(parse_example)
  dataset = dataset.batch(HPARAMS.batch_size)
  return dataset

In [91]:
# Only the training pipeline is shuffled; the test pipeline keeps file order.
train_dataset = make_dataset(TRAIN_DATA_PATH, training=True)
test_dataset = make_dataset(TEST_DATA_PATH)

## Creating the functional Keras model

In [92]:
def make_mlp_functional_model(hparams):
    """Builds a multi-layer perceptron with the Keras functional API.

    Args:
        hparams: Object providing `max_seq_length`, `num_fc_units`,
            `dropout_rate` and `num_classes`.

    Returns:
        A `tf.keras.Model` that maps the int64 input named 'words' to a
        softmax distribution over `hparams.num_classes` classes.
    """
    words_input = tf.keras.Input(
        shape=(hparams.max_seq_length,), dtype='int64', name='words')

    # Cast the integer feature vector to float32 before the dense layers.
    to_float = tf.keras.layers.Lambda(
        lambda tensor: tf.keras.backend.cast(tensor, tf.float32))
    hidden = to_float(words_input)

    # One Dense -> Dropout -> BatchNormalization group per configured width.
    for width in hparams.num_fc_units:
        hidden = tf.keras.layers.Dense(width, activation='relu')(hidden)
        hidden = tf.keras.layers.Dropout(hparams.dropout_rate)(hidden)
        hidden = tf.keras.layers.BatchNormalization()(hidden)

    class_probabilities = tf.keras.layers.Dense(
        hparams.num_classes, activation='softmax')(hidden)

    return tf.keras.Model(words_input, outputs=class_probabilities)

In [97]:
# Create a base MLP model using the functional API and print its layers.
base_model = make_mlp_functional_model(HPARAMS)
base_model.summary()

Model: "model_10"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
words (InputLayer)           [(None, 89)]              0         
_________________________________________________________________
lambda_10 (Lambda)           (None, 89)                0         
_________________________________________________________________
dense_30 (Dense)             (None, 50)                4500      
_________________________________________________________________
dropout_20 (Dropout)         (None, 50)                0         
_________________________________________________________________
batch_normalization_20 (Batc (None, 50)                200       
_________________________________________________________________
dense_31 (Dense)             (None, 50)                2550      
_________________________________________________________________
dropout_21 (Dropout)         (None, 50)                0  

## Evaluation function

In [98]:
# Helper function to print evaluation metrics.
def print_metrics(model_desc, eval_metrics):
    """Prints accuracy, loss and (if present) graph loss for one model.

    Args:
        model_desc: Human-readable description of the model, used in each
            printed line.
        eval_metrics: Mapping from metric name to value. It must contain
            'loss' and an accuracy entry named either 'accuracy' or
            'binary_accuracy' (the name depends on the metric string passed
            to `compile`). 'graph_loss' is printed only when it is present.

    Raises:
        KeyError: If 'loss' or both accuracy keys are missing.
    """
    print('\n')
    # Accept either accuracy name so that a model compiled with
    # metrics=['binary_accuracy'] does not fail the lookup.
    for accuracy_key in ('accuracy', 'binary_accuracy'):
        if accuracy_key in eval_metrics:
            accuracy = eval_metrics[accuracy_key]
            break
    else:
        raise KeyError(
            "eval_metrics has neither 'accuracy' nor 'binary_accuracy'")
    print('Eval accuracy for ', model_desc, ': ', accuracy)
    print('Eval loss for ', model_desc, ': ', eval_metrics['loss'])
    if 'graph_loss' in eval_metrics:
        print('Eval graph loss for ', model_desc, ': ', eval_metrics['graph_loss'])

## Training the base model

In [99]:
# Compile and train the base MLP model.
# The model ends in a softmax over HPARAMS.num_classes units and the parsed
# labels are scalar integer class ids, so the matching loss is sparse
# categorical cross-entropy. Binary cross-entropy expects one probability per
# label and does not give a label-dependent training signal for this pairing.
# The metric is registered as 'accuracy', the key print_metrics() reads.
base_model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'])
base_model_history = base_model.fit(
    train_dataset,
    epochs=HPARAMS.train_epochs,
    verbose=1,
    validation_data=test_dataset)

Epoch 1/50
   9735/Unknown - 169s 17ms/step - loss: 0.6952 - accuracy: 0.6359

KeyboardInterrupt: 

# Training the graph-regularized model

In [109]:
# Start from a freshly initialized MLP so the graph-regularized run does not
# reuse the weights of the base model.
base_reg_model = make_mlp_functional_model(HPARAMS)

In [110]:
# Wrap the base MLP model with graph regularization.
graph_reg_config = nsl.configs.make_graph_reg_config(
    max_neighbors=HPARAMS.num_neighbors,
    multiplier=HPARAMS.graph_regularization_multiplier,
    distance_type=HPARAMS.distance_type,
    sum_over_axis=-1)
graph_reg_model = nsl.keras.GraphRegularization(base_reg_model,
                                                graph_reg_config)
# The wrapped model ends in a softmax over HPARAMS.num_classes units and the
# parsed labels are scalar integer class ids, so the supervised loss must be
# sparse categorical cross-entropy rather than binary cross-entropy.
# The metric is registered as 'accuracy', the key print_metrics() reads.
graph_reg_model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'])
graph_reg_history = graph_reg_model.fit(
    train_dataset,
    epochs=HPARAMS.train_epochs,
    verbose=1,
    validation_data=test_dataset)

Epoch 1/5


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
