In [2]:
#libraries
import glob
from preprocessor import api as tweet_preprocessor

import numpy as np
import math
import random
import os
import pandas as pd

from pyspark.sql import SparkSession
from pyspark.conf import SparkConf
from pyspark.context import SparkContext

from pyspark.sql.functions import col, udf, to_timestamp, lit, to_timestamp, when, rand
from pyspark.sql.types import IntegerType, LongType, DoubleType, StringType, ArrayType
from pyspark.ml.feature import Normalizer, StandardScaler, MinMaxScaler, VectorAssembler

import tensorflow as tf
from tensorflow.keras import Input
from tensorflow.keras.layers import Embedding, Dense, LSTM, Dense, Input, concatenate, Dropout, Activation, BatchNormalization
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.utils import plot_model
from tensorflow.keras.optimizers import SGD, Adam

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.losses import MeanSquaredError, CategoricalCrossentropy

In [3]:
import findspark
findspark.init()

In [4]:
#load dataset
bot_tweets_dataset_path = 'F://TwitterBotDataset//tweet_dataset_small//bot_tweets//'
genuine_tweets_dataset_path = 'F://TwitterBotDataset//tweet_dataset_small//genuine_tweets//'
COLUMN_NAMES = ['text', 'retweet_count', 'reply_count', 'favorite_count',
                'num_hashtags', 'num_urls', 'num_mentions']


In [6]:
conf = SparkConf()
conf.setMaster("local[8]").setAppName("ml_account_ base_session")
sc = SparkContext(conf=conf)

# init spark
#spark = SparkSession.builder.appName('ml_account_ base_session').getOrCreate()
spark = SparkSession.builder.getOrCreate()

In [7]:
spark

In [6]:
# read dataset from csv
def read_dataset():
    bot_tweets = spark.read.csv(bot_tweets_dataset_path, header = True, inferSchema = True)
    genuine_tweets = spark.read.csv(genuine_tweets_dataset_path, header = True, inferSchema = True)
    return bot_tweets, genuine_tweets

bot_tweets_df, genuine_tweets_df = read_dataset()

print(bot_tweets_df.count(), genuine_tweets_df.count())


99999 199999


In [7]:
# column name of dataframe
column_name = [cname for cname, tp in bot_tweets_df.dtypes]
len(column_name), len(genuine_tweets_df.dtypes)
#column_name

#Number of column is diffrent for bot and genuine tweets data

#genuine_tweets_df = genuine_tweets_df.toDF(*column_name)
genuine_tweets_df = genuine_tweets_df.toDF('id','text','source','user_id','truncated','in_reply_to_status_id', 
                                           'in_reply_to_user_id','in_reply_to_screen_name', 'retweeted_status_id',
                                           'geo','place','contributors','retweet_count', 'reply_count','favorite_count',
                                           'favorited', 'retweeted','possibly_sensitive','num_hashtags','num_urls',
                                           'num_mentions','created_at','timestamp','crawled_at', 'updated', "last_one")
genuine_tweets_df = genuine_tweets_df.drop('created_at') # remove 5th column from end

#update column name according to 
genuine_tweets_df = genuine_tweets_df.toDF('id','text','source','user_id','truncated','in_reply_to_status_id', 
                                           'in_reply_to_user_id','in_reply_to_screen_name', 'retweeted_status_id',
                                           'geo','place','contributors','retweet_count', 'reply_count','favorite_count',
                                           'favorited', 'retweeted','possibly_sensitive','num_hashtags','num_urls',
                                           'num_mentions','created_at','timestamp','crawled_at', 'updated')


In [8]:
# Same column has diffrent data type. So make data type same for every column
genuine_tweets_df = genuine_tweets_df.withColumn("id",col("id").cast(IntegerType())) \
                                .withColumn("favorite_count",col("favorite_count").cast(LongType())) \
                                .withColumn("favorited",col("favorited").cast(IntegerType()))

In [9]:
print(bot_tweets_df.count(), bot_tweets_df.count())

99999 99999


In [10]:
## only keep the required column from the dataframe
bot_tweets_df = bot_tweets_df.select(*COLUMN_NAMES)
genuine_tweets_df = genuine_tweets_df.select(*COLUMN_NAMES)

## add BotOrNot column
bot_tweets_df = bot_tweets_df.withColumn('BotOrNot', lit(1))
genuine_tweets_df = genuine_tweets_df.withColumn('BotOrNot', lit(0))

#combine clean and bot accounts data togather
tweets_df = bot_tweets_df.union(genuine_tweets_df)

# shuffle dataset
tweets_df = tweets_df.orderBy(rand())

In [11]:
print(tweets_df.count())

299998


In [12]:
tweets_df.dtypes

[('text', 'string'),
 ('retweet_count', 'string'),
 ('reply_count', 'string'),
 ('favorite_count', 'bigint'),
 ('num_hashtags', 'string'),
 ('num_urls', 'string'),
 ('num_mentions', 'string'),
 ('BotOrNot', 'int')]

In [13]:
tweets_df.head(5)

[Row(text='So what is Bitcoin and the Bitcoin block chain exactly? http://t.co/OmjqcnvPdP', retweet_count='2', reply_count='0', favorite_count=0, num_hashtags='0', num_urls='1', num_mentions='0', BotOrNot=0),
 Row(text='Check out these alternatives to traditional beef hamburgers!  http://t.co/HlLcXvUpYA', retweet_count='0', reply_count='0', favorite_count=0, num_hashtags='0', num_urls='1', num_mentions='0', BotOrNot=0),
 Row(text="Every homie ain't ya homie boy I thought you knew ? http://t.co/ZsJr5BWvmk", retweet_count='0', reply_count='0', favorite_count=0, num_hashtags='0', num_urls='1', num_mentions='0', BotOrNot=1),
 Row(text='It really rains on your parade when you get nachos and the cheese is cold.', retweet_count='0', reply_count='0', favorite_count=0, num_hashtags='0', num_urls='0', num_mentions='0', BotOrNot=1),
 Row(text='Where there is an open mind, there will always be a frontier. - Charles F. Kettering', retweet_count='0', reply_count='0', favorite_count=0, num_hashtags='

In [14]:
#process each partition of data in each worker node

# def preocessTextColumn(df, column_name, new_column_name):
#     values = df.select(column_name).collect()
#     values = [tweet_preprocessor.tokenize(row_val.text) for row_val in values]
#     print(type(values))
#     print(values[:5])
#     return df

text_process_udf = udf(lambda x : tweet_preprocessor.tokenize(x), StringType())

def preprocess_data(df):
    df = df.withColumn('text', text_process_udf(df.text))
    df = df.withColumn("retweet_count",col("retweet_count").cast(DoubleType()))
    df = df.withColumn("reply_count",col("reply_count").cast(DoubleType()))
    df = df.withColumn("favorite_count",col("favorite_count").cast(DoubleType()))
    df = df.withColumn("num_hashtags",col("num_hashtags").cast(DoubleType()))
    df = df.withColumn("num_urls",col("num_urls").cast(DoubleType()))
    df = df.withColumn("num_mentions",col("num_mentions").cast(DoubleType()))
    
    return df

# def preprocess_data2(df):
#         df_rdd = df.rdd.map(lambda x: (tweet_preprocessor.tokenize(x.text), x.retweet_count, x.reply_count, x.favorite_count,
#                                        x.num_hashtags, x.num_urls, x.num_mentions, x.BotOrNot))
#         df = df_rdd.toDF()
#         return df
    

In [15]:
tweets_df = preprocess_data(tweets_df)

In [16]:
print("Total Tweets: {} \nType: {} \ndataschema: {}".format(tweets_df.count(), type(tweets_df), tweets_df.printSchema()))

root
 |-- text: string (nullable = true)
 |-- retweet_count: double (nullable = true)
 |-- reply_count: double (nullable = true)
 |-- favorite_count: double (nullable = true)
 |-- num_hashtags: double (nullable = true)
 |-- num_urls: double (nullable = true)
 |-- num_mentions: double (nullable = true)
 |-- BotOrNot: integer (nullable = false)

Total Tweets: 299998 
Type: <class 'pyspark.sql.dataframe.DataFrame'> 
dataschema: None


In [17]:
print(tweets_df.columns)

['text', 'retweet_count', 'reply_count', 'favorite_count', 'num_hashtags', 'num_urls', 'num_mentions', 'BotOrNot']


In [18]:
tweets_df.head(5)

[Row(text='So what is Bitcoin and the Bitcoin block chain exactly? <URL>', retweet_count=2.0, reply_count=0.0, favorite_count=0.0, num_hashtags=0.0, num_urls=1.0, num_mentions=0.0, BotOrNot=0),
 Row(text='Check out these alternatives to traditional beef hamburgers! <URL>', retweet_count=0.0, reply_count=0.0, favorite_count=0.0, num_hashtags=0.0, num_urls=1.0, num_mentions=0.0, BotOrNot=0),
 Row(text="Every homie ain't ya homie boy I thought you knew ? <URL>", retweet_count=0.0, reply_count=0.0, favorite_count=0.0, num_hashtags=0.0, num_urls=1.0, num_mentions=0.0, BotOrNot=1),
 Row(text='It really rains on your parade when you get nachos and the cheese is cold.', retweet_count=0.0, reply_count=0.0, favorite_count=0.0, num_hashtags=0.0, num_urls=0.0, num_mentions=0.0, BotOrNot=1),
 Row(text='Where there is an open mind, there will always be a frontier. - Charles F. Kettering', retweet_count=0.0, reply_count=0.0, favorite_count=0.0, num_hashtags=0.0, num_urls=0.0, num_mentions=0.0, BotOrN

In [19]:
#turn a line of text into d dimentional vector 
GLOVE_DIR = "C://Users//USER//projects//"

def makeGloveWordEmbedder(glove_path):
    embedding_dict = {}
    with open(glove_path, 'r', encoding="utf-8") as glove_file:
        for line in glove_file:
            values = line.split()
            word = values[0]
            vector = np.asarray(values[1:], "float32")
            embedding_dict[word] = vector
            
    return embedding_dict        

In [20]:
#create word embedding GLoVE model dictionary. Use pre trained model

text_feature_dimention = 25
glove_word2vec_embedder = makeGloveWordEmbedder(GLOVE_DIR + "glove.twitter.27B.25d.txt")

In [21]:
glove_word2vec_embedder["google"]

array([ 1.5541   ,  0.055494 ,  0.0036707, -0.14801  ,  0.86527  ,
       -0.44493  ,  0.19887  ,  0.60156  ,  0.71646  ,  0.16747  ,
        0.86978  , -0.53673  , -3.2188   , -0.97591  ,  0.020251 ,
        0.31074  ,  0.22997  ,  0.65166  , -0.19235  , -0.61838  ,
       -0.17933  , -1.7447   , -0.56918  , -0.4337   , -0.47025  ],
      dtype=float32)

In [22]:
#Give a word and get that word representing feature vector of dimention 25 from embedding dictionary
def word2vec(word_dict=None, word=None, dim=25):
    default_vector = np.zeros(dim)
    
    if word_dict is None or word is None:
        return default_vector
    
    word_vector = word_dict.get(word)
    
    if word_vector is None:
        return default_vector
    return word_vector

In [23]:
# test a word representing feature vector
word_vector = word2vec(glove_word2vec_embedder, "tweet", text_feature_dimention)
print(type(word_vector), word_vector)

<class 'numpy.ndarray'> [ 1.3429    0.32133   0.24542   0.070143  0.73769  -0.1736    0.63438
  1.571    -0.88553   0.326    -0.31173  -0.067133 -3.6154   -0.38971
 -0.31431   1.3033    0.31316  -0.90273  -1.9166   -0.5154    0.51635
 -0.54521  -0.3446    0.45339  -1.0241  ]


In [24]:
'''
# LSTM + Dense layer model
def lstm_model(input_dim, output_dim):
    model = tf.keras.Sequential()
    model.add(LSTM(64, input_shape=(input_dim,1)))
    model.add(BatchNormalization())
    model.add(Dense(output_dim))
    return model

'''

'''
def lstm_model(input_dim, output_dim):
    lstm_model = LSTM(32, input_shape=(input_dim,1), return_sequences=True, return_state=True)
    return lstm_model
'''

'''
def reset_weights(model):
    session = tf.keras.get_session()
    for layer in model.layers: 
        if hasattr(layer, 'kernel_initializer'):
            layer.kernel.initializer.run(session=session)
'''

#----------------LSTM Model----------------------

#create a 1 layer LSTM model
#input dimention: 3d (1,7,25) input_sample = [[[1,...,25]],..,[1,...,25]]
#output dimention: 1d (1,32) output_sample [[1,2,3,,,,32]]
def lstm_model(output_dim):
    model = LSTM(output_dim, return_sequences=False, return_state=False)
    return model

def reset_lstm_model(model):
    model.resate_states() # stateful=True is required for reset states



In [25]:
# lstm = lstm_model(input_dim=25, output_dim=32)
# lstm.summary()

# lstm.compile(
#     loss=CategoricalCrossentropy(),
#     optimizer="adam",
#     metrics=["accuracy"])


In [26]:
# create LSTM model of output dimention 32. Model output feature vector will be of 32 dimention vector
lstm = lstm_model(32) 

In [27]:
# convver a sentence to a feature vector using LSTM(RNN) model
def sent2vec(sent):
    words = sent.split(' ')
    word_vectors = np.array([])
    count = 0;
    for word in words:
        word_vector = word2vec(glove_word2vec_embedder, word)
        print("word dim: {}".format(len(word_vector)))
        if word_vectors.size == 0:
            word_vectors = np.array([word_vector])
        else:
            word_vectors = np.vstack([word_vectors, word_vector])
        count = count + 1
    
    print("Input feature vector shape before reshape(2D): {}".format(word_vectors.shape))
        
    input_feature_vectors = np.reshape(word_vectors, (1, count, text_feature_dimention))
    print("Input feature vector shape after reshape(3d): {}".format(input_feature_vectors.shape))
    print("LSTM requirs 3d shape inputs [batch, timesteps, feature]")
    output_vector = lstm(input_feature_vectors)
#     lstm.reset_states() # stateful = True is required for reset

    print("result vector shape: {}".format(output_vector.shape))
    print("Last input was: {}".format(input_feature_vectors[0][-1]))
    print("output result: {}".format(output_vector))
    
    # (tensore --> numpy 0bject --> numpy.array --> array/list/ArrayType)
    return output_vector.numpy()[0].tolist() 
    
 

In [28]:
#-----------------------------
# Save code for reuse
# input_feature_vectors = np.reshape(input_feature_vectors, (1, count, 25))
# print(input_feature_vectors)
# lstm = LSTM(32, return_sequences=True, return_state=True, stateful=True)
# whole_seq_output, final_memory_state, final_carry_state = lstm(input_feature_vectors)
##-----------------------


In [29]:
sent = "Twitter is a large social media network"

res_vector = sent2vec(sent)
type(res_vector), res_vector

word dim: 25
word dim: 25
word dim: 25
word dim: 25
word dim: 25
word dim: 25
word dim: 25
Input feature vector shape before reshape(2D): (7, 25)
Input feature vector shape after reshape(3d): (1, 7, 25)
LSTM requirs 3d shape inputs [batch, timesteps, feature]
result vector shape: (1, 32)
Last input was: [ 0.52543998  0.81762999 -0.33526    -1.08469999  1.34399998 -0.76054001
  0.35901001 -0.14561     1.23880005  0.063683   -0.085714   -0.1045
 -3.53110003  0.0064582   0.44380999  0.58358997 -0.050973    0.25244999
  0.57172     0.22875001 -0.27950999 -1.24839997 -0.36148    -0.60921001
 -0.61317998]
output result: [[ 0.45467544  0.27176714 -0.09925152 -0.25991294  0.1314513   0.06059496
   0.07321241 -0.37092263 -0.13981614  0.18063284  0.22067295  0.06147751
   0.0376804  -0.07965042  0.06123774 -0.16793038  0.24811278 -0.17758675
   0.15323628 -0.01909693 -0.15135057  0.24262711 -0.31437188 -0.13134502
  -0.2592779   0.3224458  -0.24764483 -0.4268025  -0.42393532  0.261188
   0.25213

(list,
 [0.45467543601989746,
  0.27176713943481445,
  -0.09925151616334915,
  -0.2599129378795624,
  0.1314512938261032,
  0.060594964772462845,
  0.07321241497993469,
  -0.37092262506484985,
  -0.13981613516807556,
  0.1806328445672989,
  0.22067295014858246,
  0.06147750839591026,
  0.03768039867281914,
  -0.07965042442083359,
  0.061237744987010956,
  -0.16793037950992584,
  0.2481127828359604,
  -0.17758674919605255,
  0.1532362848520279,
  -0.019096925854682922,
  -0.15135057270526886,
  0.2426271140575409,
  -0.31437188386917114,
  -0.1313450187444687,
  -0.25927790999412537,
  0.322445809841156,
  -0.2476448267698288,
  -0.42680248618125916,
  -0.42393532395362854,
  0.26118800044059753,
  0.25213882327079773,
  0.28772708773612976])

In [30]:
# text string --> vector 32 dimention
sent_to_vector_udf = udf(lambda x : sent2vec(x), ArrayType(DoubleType()))
def processTextColumn(df, column_name, new_column_name):
    df = df.withColumn(new_column_name, sent_to_vector_udf(col(column_name)))
    return df

In [31]:
text_updated_column = 'text_features'
tweets_df = processTextColumn(tweets_df, "text", text_updated_column)

In [32]:
print(tweets_df.count(), type(tweets_df), tweets_df.printSchema())

root
 |-- text: string (nullable = true)
 |-- retweet_count: double (nullable = true)
 |-- reply_count: double (nullable = true)
 |-- favorite_count: double (nullable = true)
 |-- num_hashtags: double (nullable = true)
 |-- num_urls: double (nullable = true)
 |-- num_mentions: double (nullable = true)
 |-- BotOrNot: integer (nullable = false)
 |-- text_features: array (nullable = true)
 |    |-- element: double (containsNull = true)

299998 <class 'pyspark.sql.dataframe.DataFrame'> None


In [33]:
tweets_df.head(2)

[Row(text='So what is Bitcoin and the Bitcoin block chain exactly? <URL>', retweet_count=2.0, reply_count=0.0, favorite_count=0.0, num_hashtags=0.0, num_urls=1.0, num_mentions=0.0, BotOrNot=0, text_features=[0.28481611609458923, 0.21882453560829163, 0.06194785609841347, -0.3682086169719696, 0.262920618057251, 0.21629296243190765, 0.015426518395543098, -0.2468055635690689, -0.14578208327293396, 0.2106577306985855, 0.24197855591773987, -0.08361523598432541, 0.09170368313789368, 0.08617748320102692, 0.026449576020240784, -0.05779039487242699, 0.2882993221282959, -0.08279384672641754, 0.24426540732383728, -0.150812566280365, -0.14580199122428894, 0.07274134457111359, -0.2563191056251526, -0.2306952327489853, -0.18770168721675873, 0.13491402566432953, 0.05788596719503403, -0.22538118064403534, -0.1378612518310547, 0.13252323865890503, 0.1744457334280014, 0.1887575387954712]),
 Row(text='Check out these alternatives to traditional beef hamburgers! <URL>', retweet_count=0.0, reply_count=0.0, 

In [34]:
tweets_df.printSchema()

root
 |-- text: string (nullable = true)
 |-- retweet_count: double (nullable = true)
 |-- reply_count: double (nullable = true)
 |-- favorite_count: double (nullable = true)
 |-- num_hashtags: double (nullable = true)
 |-- num_urls: double (nullable = true)
 |-- num_mentions: double (nullable = true)
 |-- BotOrNot: integer (nullable = false)
 |-- text_features: array (nullable = true)
 |    |-- element: double (containsNull = true)



In [35]:
#
columns = ['retweet_count', 'reply_count', 'favorite_count',
           'num_hashtags' ,'num_urls', 'num_mentions', 'BotOrNot']

tweets_df = tweets_df.select(*columns, 
                      tweets_df.text_features[0], tweets_df.text_features[1], tweets_df.text_features[2],tweets_df.text_features[3], tweets_df.text_features[4],
                      tweets_df.text_features[5], tweets_df.text_features[6], tweets_df.text_features[7],tweets_df.text_features[8], tweets_df.text_features[9], 
                      tweets_df.text_features[10], tweets_df.text_features[11], tweets_df.text_features[12],tweets_df.text_features[13], tweets_df.text_features[14],
                      tweets_df.text_features[15], tweets_df.text_features[16], tweets_df.text_features[17],tweets_df.text_features[18], tweets_df.text_features[19],
                      tweets_df.text_features[20], tweets_df.text_features[21], tweets_df.text_features[22],tweets_df.text_features[23], tweets_df.text_features[24])

In [36]:
print(tweets_df.columns, len(tweets_df.collect()), tweets_df.printSchema())

root
 |-- retweet_count: double (nullable = true)
 |-- reply_count: double (nullable = true)
 |-- favorite_count: double (nullable = true)
 |-- num_hashtags: double (nullable = true)
 |-- num_urls: double (nullable = true)
 |-- num_mentions: double (nullable = true)
 |-- BotOrNot: integer (nullable = false)
 |-- text_features[0]: double (nullable = true)
 |-- text_features[1]: double (nullable = true)
 |-- text_features[2]: double (nullable = true)
 |-- text_features[3]: double (nullable = true)
 |-- text_features[4]: double (nullable = true)
 |-- text_features[5]: double (nullable = true)
 |-- text_features[6]: double (nullable = true)
 |-- text_features[7]: double (nullable = true)
 |-- text_features[8]: double (nullable = true)
 |-- text_features[9]: double (nullable = true)
 |-- text_features[10]: double (nullable = true)
 |-- text_features[11]: double (nullable = true)
 |-- text_features[12]: double (nullable = true)
 |-- text_features[13]: double (nullable = true)
 |-- text_featu

In [37]:
# prepare inpute feature vectors for DenseModel
# tweets_df = tweets_df.drop('text')
# tweets_df.printSchema()

In [38]:
#remove 

feature_columns = ['retweet_count','reply_count','favorite_count','num_hashtags','num_urls','num_mentions',
                   'text_features[0]','text_features[1]', 'text_features[2]','text_features[3]','text_features[4]',
                   'text_features[5]','text_features[6]','text_features[7]', 'text_features[8]','text_features[9]',
                   'text_features[10]','text_features[11]','text_features[12]','text_features[13]','text_features[14]',
                   'text_features[15]','text_features[16]','text_features[17]','text_features[18]','text_features[19]',
                   'text_features[20]','text_features[21]','text_features[22]', 'text_features[23]', 'text_features[24]']

tweets_df = tweets_df.na.fill(value=0.0 ,subset= feature_columns)
feature_assembler = VectorAssembler(inputCols = feature_columns, outputCol = 'independent_features')

tweets_updated_df = feature_assembler.transform(tweets_df)

In [39]:
#check
num = len(tweets_updated_df.collect())
print(num, type(tweets_updated_df), tweets_updated_df.printSchema())

root
 |-- retweet_count: double (nullable = false)
 |-- reply_count: double (nullable = false)
 |-- favorite_count: double (nullable = false)
 |-- num_hashtags: double (nullable = false)
 |-- num_urls: double (nullable = false)
 |-- num_mentions: double (nullable = false)
 |-- BotOrNot: integer (nullable = false)
 |-- text_features[0]: double (nullable = false)
 |-- text_features[1]: double (nullable = false)
 |-- text_features[2]: double (nullable = false)
 |-- text_features[3]: double (nullable = false)
 |-- text_features[4]: double (nullable = false)
 |-- text_features[5]: double (nullable = false)
 |-- text_features[6]: double (nullable = false)
 |-- text_features[7]: double (nullable = false)
 |-- text_features[8]: double (nullable = false)
 |-- text_features[9]: double (nullable = false)
 |-- text_features[10]: double (nullable = false)
 |-- text_features[11]: double (nullable = false)
 |-- text_features[12]: double (nullable = false)
 |-- text_features[13]: double (nullable = fa

In [40]:
tweets_updated_df.show(5)

+-------------+-----------+--------------+------------+--------+------------+--------+-------------------+-------------------+--------------------+-------------------+-------------------+-------------------+--------------------+--------------------+--------------------+-------------------+-------------------+--------------------+-------------------+--------------------+--------------------+--------------------+-------------------+--------------------+-------------------+--------------------+--------------------+-------------------+--------------------+--------------------+--------------------+--------------------+
|retweet_count|reply_count|favorite_count|num_hashtags|num_urls|num_mentions|BotOrNot|   text_features[0]|   text_features[1]|    text_features[2]|   text_features[3]|   text_features[4]|   text_features[5]|    text_features[6]|    text_features[7]|    text_features[8]|   text_features[9]|  text_features[10]|   text_features[11]|  text_features[12]|   text_features[13]|   tex

In [41]:
#remove unnecessary columns
tweets_updated_df = tweets_updated_df.drop(*feature_columns)

In [42]:
print(len(tweets_updated_df.collect()), tweets_updated_df.columns)

299998 ['BotOrNot', 'independent_features']


In [43]:
## create model
def get2DenseLayeredModel(input_dim):
    model = Sequential()
    model.add(Dense(500, input_dim=input_dim))
    model.add(Activation('relu'))
    model.add(Dropout(0.2))
    model.add(Dense(200))
    model.add(Activation('relu'))
    model.add(Dropout(0.2))
    model.add(Dense(1))
    model.add(Activation('sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer=Adam(), metrics=['accuracy'])
    
    return model

In [44]:
train_df, test_df = tweets_updated_df.randomSplit([0.80, 0.20])

In [45]:
print(train_df.count(), test_df.count())

240077 59959


In [46]:
print(train_df.columns, test_df.columns)

['BotOrNot', 'independent_features'] ['BotOrNot', 'independent_features']


In [47]:
# features --> 'BotOrNot'
X_train = train_df.drop('BotOrNot')
y_train = train_df.select('BotOrNot')
X_test = test_df.drop('BotOrNot')
y_test = test_df.select('BotOrNot')

In [None]:
#checkpoint
print(len(X_train.collect()), len(y_train.collect()))
print(len(X_test.collect()), len(y_test.collect()))

In [None]:
def to_nparray_list(df, column_name):
    rows = df.select(column_name).collect()
    lists = [x[column_name] for x in rows]
    nparr = np.array(lists)
    
    return nparr

X_train = to_nparray_list(X_train, 'independent_features')
y_train = to_nparray_list(y_train, 'BotOrNot')
X_test = to_nparray_list(X_test, 'independent_features')
y_test = to_nparray_list(y_test, 'BotOrNot')

In [None]:
# ml model train and validation
model = get2DenseLayeredModel(31)
model.fit(X_train, y_train,
          batch_size=64,
          epochs=10,
          validation_data=(X_test, y_test))
score, acc = model.evaluate(X_test, y_test, verbose=0)
print('Test accuracy:', acc)