In [199]:
import pandas as pd
import numpy as np
import math

from pyspark.sql import SparkSession
from pyspark.sql.functions import to_timestamp
from pyspark.sql.functions import col, udf, to_timestamp, lit
from pyspark.sql.types import IntegerType
from pyspark.sql.functions import when, rand
from pyspark.ml.feature import Normalizer, StandardScaler, MinMaxScaler, VectorAssembler

from tensorflow.keras.layers import Embedding, Dense, LSTM, Dense, Input, concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.utils import plot_model

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [137]:
spark = SparkSession.builder.appName('ml_account_base_session').getOrCreate()
spark

In [138]:
# read dataset

requiredColumns = requiredColumns = ['screen_name', 'created_at', 'updated', 'location', 'verified', 'statuses_count', 'friends_count','followers_count', 'favourites_count', 'default_profile_image', 'profile_use_background_image', 'protected', 'default_profile']

bot_accounts1 = spark.read.csv('data/social_spambots_1.csv', header = True, inferSchema = True).select(requiredColumns)
bot_accounts2 = spark.read.csv('data/social_spambots_2.csv', header = True, inferSchema = True).select(requiredColumns)
bot_accounts3 = spark.read.csv('data/social_spambots_3.csv', header = True, inferSchema = True).select(requiredColumns)

bot_accounts = bot_accounts1.union(bot_accounts2.union(bot_accounts3))
clean_accounts = spark.read.csv('data/geniune_accounts.csv', header = True, inferSchema = True).select(requiredColumns)

In [139]:
bot_accounts1.count(), bot_accounts2.count(), bot_accounts3.count(), bot_accounts.count(), clean_accounts.count()

(991, 3457, 464, 4912, 3474)

In [141]:
bot_accounts.printSchema()
clean_accounts.printSchema()

root
 |-- screen_name: string (nullable = true)
 |-- created_at: string (nullable = true)
 |-- updated: timestamp (nullable = true)
 |-- location: string (nullable = true)
 |-- verified: string (nullable = true)
 |-- statuses_count: integer (nullable = true)
 |-- friends_count: integer (nullable = true)
 |-- followers_count: integer (nullable = true)
 |-- favourites_count: integer (nullable = true)
 |-- default_profile_image: string (nullable = true)
 |-- profile_use_background_image: integer (nullable = true)
 |-- protected: string (nullable = true)
 |-- default_profile: integer (nullable = true)

root
 |-- screen_name: string (nullable = true)
 |-- created_at: string (nullable = true)
 |-- updated: timestamp (nullable = true)
 |-- location: string (nullable = true)
 |-- verified: integer (nullable = true)
 |-- statuses_count: integer (nullable = true)
 |-- friends_count: integer (nullable = true)
 |-- followers_count: integer (nullable = true)
 |-- favourites_count: integer (nullable

In [148]:
bot_accounts.limit(3).toPandas()

Unnamed: 0,screen_name,created_at,updated,location,verified,statuses_count,friends_count,followers_count,favourites_count,default_profile_image,profile_use_background_image,protected,default_profile
0,davideb66,Tue Mar 17 08:51:12 +0000 2009,2016-03-15 14:12:22,,,1299,40,22,1,1.0,1,,1.0
1,ElisaDospina,Sun Apr 19 14:38:04 +0000 2009,2016-03-15 14:17:13,Italy,,18665,3442,12561,16358,,1,,
2,Vladimir65,Wed May 13 15:34:41 +0000 2009,2016-03-15 14:16:44,"iPhone: 45.471680,9.192429",,22987,755,600,14,,1,,


In [149]:
clean_accounts.limit(3).toPandas()

Unnamed: 0,screen_name,created_at,updated,location,verified,statuses_count,friends_count,followers_count,favourites_count,default_profile_image,profile_use_background_image,protected,default_profile
0,0918Bask,Tue Jun 11 11:20:35 +0000 2013,2016-03-15 15:53:47,Tokyo .Japan .,,2177,332,208,265,,,,
1,1120Roll,Tue May 13 10:37:57 +0000 2014,2016-03-15 15:53:48,神奈川県横浜市,,2660,485,330,3972,,1.0,,1.0
2,14KBBrown,Wed May 04 23:30:37 +0000 2011,2016-03-15 15:53:48,,,1254,177,166,1185,,1.0,,


In [187]:
def printFeatureDomain(df):
    columns = df.columns
    for col in columns:
        col_domain = df.select(col).distinct().collect()
        value_domain = [item[0] for item in col_domain]
        print("{}({}): {}\n".format(col,len(value_domain), value_domain[:5]))
        
printFeatureDomain(bot_accounts)
printFeatureDomain(clean_accounts)

screen_name(4912): ['MarcoMurante', 'CarolaParnasse', 'RobertoBusca', 'CarlaBipolare', 'EmanuelaDuccio']

created_at(4891): ['Mon Jan 16 07:46:29 +0000 2012', 'Mon Jan 16 08:41:48 +0000 2012', 'Tue Jan 17 10:51:05 +0000 2012', 'Tue Jan 17 11:29:19 +0000 2012', 'Wed Jan 18 04:59:52 +0000 2012']

updated(1687): [datetime.datetime(2016, 3, 15, 14, 12, 44), datetime.datetime(2016, 3, 15, 14, 14, 42), datetime.datetime(2016, 3, 15, 14, 17, 15), datetime.datetime(2016, 3, 15, 14, 20, 39), datetime.datetime(2016, 3, 15, 14, 16, 46)]

location(207): ['Palermo', 'Cave (RM)', 'Forte dei Marmi', 'Firenze', 'Phoenix']

verified(1): [None]

statuses_count(948): [471, 148, 26755, 540, 31]

friends_count(953): [6466, 5300, 4935, 5518, 31]

followers_count(929): [3997, 243, 4190, 4161, 1522]

favourites_count(72): [874, 76, 103, 12, 601]

default_profile_image(2): [None, '1']

profile_use_background_image(2): [None, 1]

protected(1): [None]

default_profile(2): [None, 1]

screen_name(3474): ['AddisonH

In [63]:
# def clean_df(df):
#     type(df)
#     df['created_at'] = pd.to_datetime(df['created_at']).dt.tz_localize(None)
#     df['updated'] = pd.to_datetime(df['updated']).dt.tz_localize(None)
#     df['age'] = (df['updated'] - df['created_at']).astype('timedelta64[D]').astype(int)
#     df['has_location'] = df['location'].apply(lambda x: 0 if x==x else 1)
#     df['has_avatar'] = df['default_profile_image'].apply(lambda x: 1 if x==x else 0)
#     df['has_background'] = df['profile_use_background_image'].apply(lambda x: 1 if x==x else 0)
#     df['is_verified']=df['verified'].apply(lambda x: 1 if x==x else 0)
#     df['is_protected']=df['protected'].apply(lambda x: 1 if x==x else 0)
#     df['profile_modified'] = df['default_profile'].apply(lambda x: 0 if x==x else 1)
#     df = df.rename(index=str, columns={"screen_name": "username", "statuses_count": "total_tweets", "friends_count": "total_following", "followers_count": "total_followers", "favourites_count": "total_likes"})
#     return df[['username', 'age', 'has_location', 'is_verified', 'total_tweets', 'total_following', 'total_followers', 'total_likes', 'has_avatar', 'has_background', 'is_protected', 'profile_modified']]


In [188]:

def cleanData(df):
    df = df.withColumn('age', lit(0)) # need to calculate from 'updated' -'created_at'
    df = df.withColumn('has_location', when((df['location'] != None), 1).otherwise(0))
    df = df.withColumn('has_avatar', when((df['default_profile_image'] != None), 1).otherwise(0))
    df = df.withColumn('has_background', when((df['profile_use_background_image'] != None), 1).otherwise(0))
    df = df.withColumn('is_verified', when((df['verified'] != None), 1).otherwise(0))
    df = df.withColumn('is_protected', when((df['protected'] != None), 1).otherwise(0))
    df = df.withColumn('profile_modified', when((df['default_profile'] != None), 1).otherwise(0))
    df = df.withColumnRenamed("screen_name", "username")
    df = df.withColumnRenamed("statuses_count", "total_tweets")
    df = df.withColumnRenamed("friends_count", "total_following")
    df = df.withColumnRenamed("followers_count", "total_followers")
    df = df.withColumnRenamed("favourites_count", "total_likes")
    
    return df.select('username', 'age', 'has_location', 'is_verified', 'total_tweets', 'total_following', 'total_followers', 'total_likes', 'has_avatar', 'has_background', 'is_protected', 'profile_modified')
    

In [189]:
bot_accounts = cleanData(bot_accounts)
clean_accounts = cleanData(clean_accounts)

In [190]:
bot_accounts.printSchema()

root
 |-- username: string (nullable = true)
 |-- age: integer (nullable = false)
 |-- has_location: integer (nullable = false)
 |-- is_verified: integer (nullable = false)
 |-- total_tweets: integer (nullable = true)
 |-- total_following: integer (nullable = true)
 |-- total_followers: integer (nullable = true)
 |-- total_likes: integer (nullable = true)
 |-- has_avatar: integer (nullable = false)
 |-- has_background: integer (nullable = false)
 |-- is_protected: integer (nullable = false)
 |-- profile_modified: integer (nullable = false)



In [191]:
clean_accounts.printSchema()

root
 |-- username: string (nullable = true)
 |-- age: integer (nullable = false)
 |-- has_location: integer (nullable = false)
 |-- is_verified: integer (nullable = false)
 |-- total_tweets: integer (nullable = true)
 |-- total_following: integer (nullable = true)
 |-- total_followers: integer (nullable = true)
 |-- total_likes: integer (nullable = true)
 |-- has_avatar: integer (nullable = false)
 |-- has_background: integer (nullable = false)
 |-- is_protected: integer (nullable = false)
 |-- profile_modified: integer (nullable = false)



In [192]:
clean_accounts.limit(5).toPandas()

Unnamed: 0,username,age,has_location,is_verified,total_tweets,total_following,total_followers,total_likes,has_avatar,has_background,is_protected,profile_modified
0,0918Bask,0,0,0,2177,332,208,265,0,0,0,0
1,1120Roll,0,0,0,2660,485,330,3972,0,0,0,0
2,14KBBrown,0,0,0,1254,177,166,1185,0,0,0,0
3,wadespeters,0,0,0,202968,981,2248,60304,0,0,0,0
4,191a5bd05da04dc,0,0,0,82,79,21,5,0,0,0,0


In [193]:
bot_accounts.limit(5).toPandas()

Unnamed: 0,username,age,has_location,is_verified,total_tweets,total_following,total_followers,total_likes,has_avatar,has_background,is_protected,profile_modified
0,davideb66,0,0,0,1299,40,22,1,0,0,0,0
1,ElisaDospina,0,0,0,18665,3442,12561,16358,0,0,0,0
2,Vladimir65,0,0,0,22987,755,600,14,0,0,0,0
3,RafielaMorales,0,0,0,7975,350,398,11,0,0,0,0
4,FabrizioC_c,0,0,0,20218,405,413,162,0,0,0,0


In [194]:
## add BotOrNot column
bot_accounts = bot_accounts.withColumn('BotOrNot', lit(1))
clean_accounts = clean_accounts.withColumn('BotOrNot', lit(0))

In [195]:
combined_df = bot_accounts.union(clean_accounts)

new_df = combined_df.orderBy(rand())
new_df = new_df.drop('username')

In [196]:
new_df.printSchema()

root
 |-- age: integer (nullable = false)
 |-- has_location: integer (nullable = false)
 |-- is_verified: integer (nullable = false)
 |-- total_tweets: integer (nullable = true)
 |-- total_following: integer (nullable = true)
 |-- total_followers: integer (nullable = true)
 |-- total_likes: integer (nullable = true)
 |-- has_avatar: integer (nullable = false)
 |-- has_background: integer (nullable = false)
 |-- is_protected: integer (nullable = false)
 |-- profile_modified: integer (nullable = false)
 |-- BotOrNot: integer (nullable = false)



In [197]:
new_df.count()

8386

In [201]:
new_df.columns

['age',
 'has_location',
 'is_verified',
 'total_tweets',
 'total_following',
 'total_followers',
 'total_likes',
 'has_avatar',
 'has_background',
 'is_protected',
 'profile_modified',
 'BotOrNot']

In [203]:
## convert into feature vector
feature_columns = ['age', 'has_location', 'is_verified', 'total_tweets', 'total_following', 
                   'total_followers', 'total_likes', 'has_avatar', 'has_background', 
                   'is_protected', 'profile_modified']

feature_assembler = VectorAssembler(inputCols = feature_columns, outputCol = 'independent_features')
df_updated = feature_assembler.transform(new_df)
df_updated.limit(5).toPandas()

Unnamed: 0,age,has_location,is_verified,total_tweets,total_following,total_followers,total_likes,has_avatar,has_background,is_protected,profile_modified,BotOrNot,independent_features
0,0,0,0,24,0,0,0,0,0,0,0,1,"(0.0, 0.0, 0.0, 24.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
1,0,0,0,65,46,2,0,0,0,0,0,1,"(0.0, 0.0, 0.0, 65.0, 46.0, 2.0, 0.0, 0.0, 0.0..."
2,0,0,0,31040,672,453,28037,0,0,0,0,0,"(0.0, 0.0, 0.0, 31040.0, 672.0, 453.0, 28037.0..."
3,0,0,0,12029,1509,866,1,0,0,0,0,1,"(0.0, 0.0, 0.0, 12029.0, 1509.0, 866.0, 1.0, 0..."
4,0,0,0,366,4797,4778,0,0,0,0,0,1,"(0.0, 0.0, 0.0, 366.0, 4797.0, 4778.0, 0.0, 0...."


In [198]:
## Make data standar
# https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.ml.feature.MinMaxScaler.html
columns_to_standardize = ['total_tweets', 'total_following', 'total_followers', 'total_likes']
output_columns = ['s_total_tweets', 's_total_following', 's_total_followers', 's_total_likes']

def scaleColumn(df, inputColName, outputColName):
    min_max_scaler = MinMaxScaler(inputCol = inputColName, outputCol = outputColName)
    min_max_scaler.fit(df).transform(df)
    df = df.withColumn(inputColName, df[outputColName])
    df = df.drop(outputColName)
    return df
def scaling(df, in_cols, out_cols):
    for idx in range(len(in_cols)):
        df = scaleColumn(df, in_cols[idx], out_cols[idx])
    return df

new_df = scaling(new_df, columns_to_standardize, output_columns)


IllegalArgumentException: requirement failed: Column total_tweets must be of type class org.apache.spark.ml.linalg.VectorUDT:struct<type:tinyint,size:int,indices:array<int>,values:array<double>> but was actually class org.apache.spark.sql.types.IntegerType$:int.

In [127]:
train_df, test_df = new_df.randomSplit([0.80, 0.20])

In [128]:
train_df.count(), test_df.count()

(6730, 1656)

In [129]:
X_train = train_df.drop('BotOrNot')
y_train = train_df.select('BotOrNot')
X_test = test_df.drop('BotOrNot')
y_test = test_df.select('BotOrNot')

In [131]:
## create model

inp = Input(shape=[10])

another = Dense(500, activation='relu')(inp)
another = Dense(200, activation='relu')(another)
another = Dense(1, activation='sigmoid')(another)

mod = Model(inp, another)
mod.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [132]:
## training model
training = mod.fit(x=X_train, y=y_train, batch_size=64, epochs=20, validation_data=(X_test, y_test))

ValueError: Failed to find data adapter that can handle input: <class 'pyspark.sql.dataframe.DataFrame'>, <class 'pyspark.sql.dataframe.DataFrame'>