# Deep Learning Project

Developed by: Mandy Sack
August 2019


In [1]:
import os
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import scipy
import sklearn
import sys
import tensorflow as tf
from sklearn.model_selection import train_test_split

In [2]:
# import the data
# first, verify you have the files in the correct location
print(os.path.isdir("data"))
dataDir = "data/"

print(os.path.isfile("data/content_polluters_classified.csv"))
print(os.path.isfile("data/legitimate_users_classified.csv"))

bDcolnames=['UserID','CreatedAt','CollectedAt', 'NumberOfFollowings', 'NumberOfFollowers', 'NumberOfTweets','LengthOfScreenName','LengthOfDescriptionInUserProfile','BadUser'] 
badData = pd.read_csv(dataDir+"content_polluters_classified.csv", names=bDcolnames)

nDcolnames=['UserID','CreatedAt','CollectedAt', 'NumberOfFollowings', 'NumberOfFollowers', 'NumberOfTweets','LengthOfScreenName','LengthOfDescriptionInUserProfile','BadUser']
normalData = pd.read_csv(dataDir+"legitimate_users_classified.csv", names=nDcolnames)

#TODO: make this configurable to enable your own data to be passed in 


True
True
True


In [3]:
print(badData.head(3))
print(normalData.head(3))

   UserID         CreatedAt      CollectedAt  NumberOfFollowings  \
0    6301    9/18/2006 1:07  1/17/2010 20:38                3269   
1   10836  10/27/2006 14:38   6/18/2010 3:35                1949   
2   10997   10/29/2006 9:50   4/24/2010 1:12                1119   

   NumberOfFollowers  NumberOfTweets  LengthOfScreenName  \
0               3071             861                   8   
1                793             226                   9   
2               9644           38674                  12   

   LengthOfDescriptionInUserProfile  BadUser  
0                               132        1  
1                               134        1  
2                               158        1  
   UserID        CreatedAt       CollectedAt  NumberOfFollowings  \
0     614  7/13/2006 15:30  11/20/2009 23:56                 510   
1    1038  7/15/2006 16:12   11/16/2009 5:12                 304   
2    1437  7/16/2006 12:29  11/16/2009 16:25                  45   

   NumberOfFollowers  Num

In [4]:
# merge the data together based on CollectedAt colum & save to file
mergedData = pd.concat([badData,normalData])
mergedData.sort_values(by=['CollectedAt'], inplace=True)
mergedData.to_csv(dataDir+"mergedData_classified.csv")
mergedData

Unnamed: 0,UserID,CreatedAt,CollectedAt,NumberOfFollowings,NumberOfFollowers,NumberOfTweets,LengthOfScreenName,LengthOfDescriptionInUserProfile,BadUser
5857,61932874,7/31/2009 20:59,1/1/2010 0:22,1351,767,87,15,160,1
4994,53389291,7/3/2009 8:50,1/1/2010 0:22,1413,1082,466,13,149,1
7865,80910809,10/8/2009 13:32,1/1/2010 0:22,1852,1123,231,11,145,1
4733,49911277,6/23/2009 2:01,1/1/2010 0:22,866,1485,74,8,0,1
7475,78144979,9/28/2009 17:47,1/1/2010 0:23,3353,3347,798,14,70,1
11455,100910119,12/31/2009 23:23,1/1/2010 0:23,67,1,2,13,0,1
2967,30323406,4/10/2009 16:57,1/1/2010 0:33,1936,855,127,15,55,1
6388,67007483,8/19/2009 9:03,1/1/2010 0:33,18316,19548,3362,14,125,1
8726,87631976,11/5/2009 0:41,1/1/2010 0:33,1555,1161,396,11,91,1
6544,68790423,8/25/2009 14:54,1/1/2010 0:46,3434,3296,865,9,80,1


In [5]:
# Remove nonnumeric columns
mergedData.drop("CreatedAt", axis=1, inplace=True)
mergedData.drop("CollectedAt", axis=1, inplace=True)
mergedData.drop("UserID", axis=1, inplace=True)
mergedData.head(3)

Unnamed: 0,NumberOfFollowings,NumberOfFollowers,NumberOfTweets,LengthOfScreenName,LengthOfDescriptionInUserProfile,BadUser
5857,1351,767,87,15,160,1
4994,1413,1082,466,13,149,1
7865,1852,1123,231,11,145,1


In [6]:
column_names = list(mergedData.columns)
# features are the attributes of the data 
features = column_names[:-1]
# label is to be predicted
labels = column_names[-1] 

In [7]:
# The data needs to be split into a training set and a test set
# To use 80/20, set the training size to .8
training_set_size_portion = .8

# Keep track of the accuracy score
accuracy_score = 0

# The DNN has hidden units, set the spec for them here
hidden_units_spec = [10,20,10]
n_classes_spec = 2

# Define the temp directory for keeping the model and checkpoints
tmp_dir_spec = "data/model"

# The number of training steps
steps_spec = 2000

# The number of epochs
epochs_spec = 500
#1500 got essentially the same amount but took exponentially longer

In [8]:
# Randomize dataset & save to file
randomized_data = mergedData.sample(frac=1)
mergedData.to_csv(dataDir+"trainingData_classified.csv")

In [9]:
total_records = len(randomized_data)
training_set_size = int(total_records * training_set_size_portion)
test_set_size = total_records = training_set_size

In [10]:
# Build the training features and labels
training_features = randomized_data.head(training_set_size)[features].copy()
training_labels = randomized_data.head(training_set_size)[labels].copy()
print(training_features.head())
print(training_labels.head())

       NumberOfFollowings  NumberOfFollowers  NumberOfTweets  \
2341                  149                228            5014   
13452                 112                 57              38   
15533                  36                 17             112   
465                  1274                854              46   
19177                 121                 21             102   

       LengthOfScreenName  LengthOfDescriptionInUserProfile  
2341                   10                                20  
13452                  15                               142  
15533                   8                                51  
465                     7                                39  
19177                  12                                 0  
2341     0
13452    0
15533    0
465      1
19177    0
Name: BadUser, dtype: int64


In [11]:
# Build the testing features and labels
testing_features = randomized_data.tail(test_set_size)[features].copy()
testing_labels = randomized_data.tail(test_set_size)[labels].copy()

In [12]:
feature_columns = [tf.feature_column.numeric_column(key) for key in features]

In [13]:
classifier = tf.estimator.DNNClassifier( feature_columns=feature_columns, hidden_units=hidden_units_spec,  n_classes=n_classes_spec, model_dir=tmp_dir_spec)

In [14]:
# Define the training input function
train_input_fn = tf.estimator.inputs.pandas_input_fn( x=training_features, y=training_labels, num_epochs=epochs_spec, shuffle=True)

In [15]:
# Train the model using the classifer.
classifier.train(input_fn=train_input_fn, steps=steps_spec)

W0826 23:54:13.271922 20140 deprecation.py:323] From C:\Users\user\Anaconda3\lib\site-packages\tensorflow\python\training\training_util.py:236: Variable.initialized_value (from tensorflow.python.ops.variables) is deprecated and will be removed in a future version.
Instructions for updating:
Use Variable.read_value. Variables in 2.X are initialized automatically both in eager and graph (inside tf.defun) contexts.
W0826 23:54:13.305154 20140 deprecation.py:323] From C:\Users\user\Anaconda3\lib\site-packages\tensorflow_estimator\python\estimator\inputs\queues\feeding_queue_runner.py:62: QueueRunner.__init__ (from tensorflow.python.training.queue_runner_impl) is deprecated and will be removed in a future version.
Instructions for updating:
To construct input pipelines, use the `tf.data` module.
W0826 23:54:13.308626 20140 deprecation.py:323] From C:\Users\user\Anaconda3\lib\site-packages\tensorflow_estimator\python\estimator\inputs\queues\feeding_functions.py:500: add_queue_runner (from te

<tensorflow_estimator.python.estimator.canned.dnn.DNNClassifier at 0x1eefd4d5128>

In [16]:
# Define the test input function
test_input_fn = tf.estimator.inputs.pandas_input_fn( x=testing_features, y=testing_labels, num_epochs=epochs_spec, shuffle=False)

In [17]:
# Evaluate accuracy
accuracy_score = classifier.evaluate(input_fn=test_input_fn)["accuracy"]
print("Accuracy = {}".format(accuracy_score))

W0826 23:54:39.020307 20140 deprecation.py:323] From C:\Users\user\Anaconda3\lib\site-packages\tensorflow\python\ops\metrics_impl.py:2027: div (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
W0826 23:54:39.813450 20140 metrics_impl.py:804] Trapezoidal rule is known to produce incorrect PR-AUCs; please switch to "careful_interpolation" instead.
W0826 23:54:39.893265 20140 metrics_impl.py:804] Trapezoidal rule is known to produce incorrect PR-AUCs; please switch to "careful_interpolation" instead.


Accuracy = 0.8912015557289124


In [18]:
# Create a prediction set --
# this is a list of input features that you want to classify
# Using a "known" Bad User to see if it classifies correctly
prediction_set = pd.DataFrame({'NumberOfFollowings':[12848], 'NumberOfFollowers':[12933], 'NumberOfTweets':[2315],'LengthOfScreenName':[5],'LengthOfDescriptionInUserProfile':[63]})

In [19]:
predict_input_fn = tf.estimator.inputs.pandas_input_fn( x=prediction_set, num_epochs=1, shuffle=False)

In [20]:
# Get a list of the predictions
predictions = list(classifier.predict(input_fn=predict_input_fn))

In [21]:
predicted_classes = [p["classes"] for p in predictions] 
results=np.concatenate(predicted_classes) 
print(results)

[b'1']
