In [1]:
# required libraries
import pandas as pd
import numpy as np
import math

from pyspark.sql import SparkSession
from pyspark.sql.functions import to_timestamp
from pyspark.sql.functions import col, udf, to_timestamp, lit
from pyspark.sql.types import IntegerType
from pyspark.sql.functions import when, rand
from pyspark.ml.feature import Normalizer, StandardScaler, MinMaxScaler, VectorAssembler

from tensorflow.keras.layers import Embedding, Dense, LSTM, Dense, Input, concatenate, Dropout, Activation
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.utils import plot_model
from tensorflow.keras.optimizers import SGD, Adam

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# init spark
# getOrCreate() reuses an already running session when one exists.
spark = SparkSession.builder.appName('ml_account_base_session').getOrCreate()
# Bare expression so the notebook renders the session summary.
spark

In [None]:
# Dataset and result locations. The local values are active; the S3
# equivalents are kept in the trailing comments.
dataset_folder_s3 = 'data/' # 's3://bot-dataset/data/'
# Used as a prefix for output file names; '' means paths relative to the working directory.
result_path_s3 = '' # 's3://bot-dataset/result/'

In [None]:
# read dataset from csv

# Profile attributes kept from every accounts file.
requiredColumns = [
    'screen_name', 'created_at', 'updated', 'location', 'verified',
    'statuses_count', 'friends_count', 'followers_count', 'favourites_count',
    'default_profile_image', 'profile_use_background_image', 'protected',
    'default_profile',
]


def _read_accounts(file_name):
    """Load one accounts CSV from ``dataset_folder_s3``, keeping only ``requiredColumns``.

    The file is read with a header row and Spark's schema inference.
    """
    accounts = spark.read.csv(dataset_folder_s3 + file_name, header = True, inferSchema = True)
    return accounts.select(requiredColumns)


bot_accounts1 = _read_accounts('social_spambots_1.csv')
bot_accounts2 = _read_accounts('social_spambots_2.csv')
bot_accounts3 = _read_accounts('social_spambots_3.csv')

# combine multiple bot_account dataset (union matches columns by position,
# which is safe because all three frames share the same select list)
bot_accounts = bot_accounts1.union(bot_accounts2).union(bot_accounts3)

# NOTE(review): 'geniune' is kept exactly as written; it must match the file name on disk.
clean_accounts = _read_accounts('geniune_accounts.csv')

In [None]:
# check number of rows in each dataset
# Order: the three spambot files, their union, then the genuine accounts.
bot_accounts1.count(), bot_accounts2.count(), bot_accounts3.count(), bot_accounts.count(), clean_accounts.count()

In [None]:
# check structure of the dataframe
# Both frames were selected with the same column list; the types are whatever
# inferSchema derived for each file.
bot_accounts.printSchema()
clean_accounts.printSchema()

In [None]:
# check bot account data
# limit(3) keeps the pandas conversion on the driver small.
bot_accounts.limit(3).toPandas()

In [None]:
# check clean_account data
# limit(3) keeps the pandas conversion on the driver small.
clean_accounts.limit(3).toPandas()

In [None]:
# test feature value domain space for feature engineering
def printFeatureDomain(df):
    """Print a one-line summary of the distinct values found in each column of ``df``.

    For every column the line shows the column name, the number of distinct
    values, and the first five of them in the order ``collect()`` returned
    them. All distinct values of a column are pulled to the driver before
    the summary is built.
    """
    for column_name in df.columns:
        distinct_rows = df.select(column_name).distinct().collect()
        # Every collected row carries a single field: the column's value.
        distinct_values = list(map(lambda row: row[0], distinct_rows))
        summary = "{}({}): {}\n".format(column_name, len(distinct_values), distinct_values[:5])
        print(summary)
        
# Summarise both raw datasets. Each call collects the distinct values of
# every column to the driver, one column at a time.
printFeatureDomain(bot_accounts)
printFeatureDomain(clean_accounts)

In [None]:
# def clean_df(df):
#     type(df)
#     df['created_at'] = pd.to_datetime(df['created_at']).dt.tz_localize(None)
#     df['updated'] = pd.to_datetime(df['updated']).dt.tz_localize(None)
#     df['age'] = (df['updated'] - df['created_at']).astype('timedelta64[D]').astype(int)
#     df['has_location'] = df['location'].apply(lambda x: 0 if x==x else 1)
#     df['has_avatar'] = df['default_profile_image'].apply(lambda x: 1 if x==x else 0)
#     df['has_background'] = df['profile_use_background_image'].apply(lambda x: 1 if x==x else 0)
#     df['is_verified']=df['verified'].apply(lambda x: 1 if x==x else 0)
#     df['is_protected']=df['protected'].apply(lambda x: 1 if x==x else 0)
#     df['profile_modified'] = df['default_profile'].apply(lambda x: 0 if x==x else 1)
#     df = df.rename(index=str, columns={"screen_name": "username", "statuses_count": "total_tweets", "friends_count": "total_following", "followers_count": "total_followers", "favourites_count": "total_likes"})
#     return df[['username', 'age', 'has_location', 'is_verified', 'total_tweets', 'total_following', 'total_followers', 'total_likes', 'has_avatar', 'has_background', 'is_protected', 'profile_modified']]


In [None]:
# clean dataset
def cleanData(df):
    """Derive the model's account features from the raw profile columns.

    Adds one 0/1 indicator per optional profile attribute (1 when the source
    column is non-null, 0 when it is null), renames the raw counters to the
    feature names used downstream, and returns only the feature columns.

    Parameters
    ----------
    df : pyspark.sql.DataFrame
        Raw accounts frame. Must contain 'screen_name', 'location',
        'verified', 'protected', 'default_profile', 'default_profile_image',
        'profile_use_background_image', 'statuses_count', 'friends_count',
        'followers_count' and 'favourites_count'.

    Returns
    -------
    pyspark.sql.DataFrame
        Columns: 'username', 'age', 'has_location', 'is_verified',
        'total_tweets', 'total_following', 'total_followers', 'total_likes',
        'has_avatar', 'has_background', 'is_protected', 'profile_modified'.
    """
    # TODO: 'age' is still a constant placeholder; it needs to be calculated
    # from 'updated' - 'created_at'.
    df = df.withColumn('age', lit(0))

    # (indicator column, raw column it is derived from)
    # NOTE(review): the commented-out pandas version of this function used the
    # opposite polarity for 'has_location' and 'profile_modified' - confirm
    # which one is intended.
    presence_flags = (
        ('has_location', 'location'),
        ('has_avatar', 'default_profile_image'),
        ('has_background', 'profile_use_background_image'),
        ('is_verified', 'verified'),
        ('is_protected', 'protected'),
        ('profile_modified', 'default_profile'),
    )
    for flag_name, source_column in presence_flags:
        # `column != None` becomes a SQL comparison with NULL, which is never
        # true, so every flag used to come out as 0. isNotNull() is the
        # NULL-aware test.
        df = df.withColumn(flag_name, when(df[source_column].isNotNull(), 1).otherwise(0))

    # (raw column name, feature name used by the rest of the notebook)
    renamed_columns = (
        ("screen_name", "username"),
        ("statuses_count", "total_tweets"),
        ("friends_count", "total_following"),
        ("followers_count", "total_followers"),
        ("favourites_count", "total_likes"),
    )
    for raw_name, feature_name in renamed_columns:
        df = df.withColumnRenamed(raw_name, feature_name)

    return df.select('username', 'age', 'has_location', 'is_verified', 'total_tweets', 'total_following', 'total_followers', 'total_likes', 'has_avatar', 'has_background', 'is_protected', 'profile_modified')
    

In [None]:
# Replace the raw frames with their engineered-feature versions.
# NOTE: this cell is not re-runnable on its own. cleanData reads the raw
# columns, which no longer exist after the first execution; re-run the
# loading cell first.
bot_accounts = cleanData(bot_accounts)
clean_accounts = cleanData(clean_accounts)

In [None]:
# Confirm the engineered schema of the bot accounts.
bot_accounts.printSchema()

In [None]:
# Confirm the engineered schema of the genuine accounts.
clean_accounts.printSchema()

In [None]:
# Preview a few engineered rows of the genuine accounts.
clean_accounts.limit(5).toPandas()

In [None]:
# Preview a few engineered rows of the bot accounts.
bot_accounts.limit(5).toPandas()

In [None]:
## add BotOrNot column
# Class label used as the training target: 1 = bot account, 0 = genuine account.
bot_accounts = bot_accounts.withColumn('BotOrNot', lit(1))
clean_accounts = clean_accounts.withColumn('BotOrNot', lit(0))

In [None]:
# combine clean and bot accounts data together
combined_df = bot_accounts.union(clean_accounts)

# shuffle dataset; the fixed seed makes reruns draw the same random sort keys
new_df = combined_df.orderBy(rand(seed=42))

# remove the 'username' column from the dataset (an identifier, not a feature)
new_df = new_df.drop('username')

In [None]:
# Schema after merging, shuffling and dropping 'username'.
new_df.printSchema()

In [None]:
# Total number of labelled accounts (bots + genuine).
new_df.count()

In [None]:
# Expected: the 11 feature columns plus 'BotOrNot'.
new_df.columns

In [None]:
## convert into feature vector for ml model
# The order of this list fixes the position of each feature inside the
# assembled vector.
feature_columns = ['age', 'has_location', 'is_verified', 'total_tweets', 'total_following', 
                   'total_followers', 'total_likes', 'has_avatar', 'has_background', 
                   'is_protected', 'profile_modified']

# Pack the feature columns into the single vector column 'independent_features'.
feature_assembler = VectorAssembler(inputCols = feature_columns, outputCol = 'independent_features')
df_updated = feature_assembler.transform(new_df)
# Preview a few assembled rows.
df_updated.limit(5).toPandas()

In [None]:
# Sanity check: object type and column list after assembling.
type(df_updated), df_updated.columns

In [None]:
# keep only the assembled feature vector and the label; the individual
# feature columns are no longer needed
df_updated = df_updated.select('independent_features', 'BotOrNot')

In [None]:
# Preview the assembled vectors next to their labels.
df_updated.select("independent_features", 'BotOrNot').limit(5).toPandas()

In [None]:
## Standardise the features
# https://spark.apache.org/docs/latest/ml-features.html#standardscaler
# withStd=True scales every feature to unit standard deviation;
# withMean=False skips centering.
# NOTE(review): the scaler is fitted on the full dataset, before the
# train/test split further down, so test rows contribute to the scaling
# statistics.

scaler = StandardScaler(inputCol="independent_features", outputCol="scaled_features",
                        withStd=True, withMean=False)

# Compute summary statistics by fitting the StandardScaler
scalerModel = scaler.fit(df_updated)

# Normalize each feature to have unit standard deviation.
scaled_df = scalerModel.transform(df_updated)

In [None]:
# Compare raw and scaled feature vectors side by side.
scaled_df.select("independent_features", "scaled_features", 'BotOrNot').limit(5).toPandas()

In [None]:
# Column list now includes 'scaled_features' next to the raw vector and the label.
scaled_df.columns

In [None]:
# keep only the scaled feature vector and the label for the ml model;
# the unscaled vector is dropped
scaled_df = scaled_df.select('scaled_features', 'BotOrNot')

In [None]:
# split data for training and testing (80% / 20%);
# the fixed seed keeps the sampling repeatable between runs
train_df, test_df = scaled_df.randomSplit([0.80, 0.20], seed=42)

In [None]:
# Row counts of the train and test splits.
train_df.count(), test_df.count()

In [None]:
# Expected: 'scaled_features' and 'BotOrNot'.
train_df.columns

In [None]:
# features --> 'BotOrNot'
# Still Spark DataFrames at this point: X_* keep only 'scaled_features',
# y_* keep only the 'BotOrNot' label.
X_train = train_df.drop('BotOrNot')
y_train = train_df.select('BotOrNot')
X_test = test_df.drop('BotOrNot')
y_test = test_df.select('BotOrNot')

In [None]:
# Column names of the feature frame and the label frame.
X_train.columns, y_train.columns

In [None]:
# printSchema() prints and returns None, so the cell output ends with (None, None).
X_train.printSchema(), y_train.printSchema()

In [None]:
# Both are still Spark DataFrames here; they become numpy arrays further down.
type(X_train), type(y_train)

In [None]:
# Preview a few scaled feature vectors from the training split.
X_train.limit(5).toPandas()

In [None]:
## create model

# Earlier functional-API variant (no dropout), kept for reference:
# inp = Input(shape=[11])

# another = Dense(500, activation='relu')(inp)
# another = Dense(200, activation='relu')(another)
# another = Dense(1, activation='sigmoid')(another)

# mod = Model(inp, another)
# mod.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])


# The input width follows the assembled feature list instead of a hard-coded
# 11, so adding or removing a feature does not silently break the model.
n_features = len(feature_columns)

# Fully connected binary classifier: two hidden ReLU layers with dropout,
# followed by a single sigmoid unit that outputs the bot probability.
model = Sequential()
# An explicit Input layer replaces the `input_dim` argument on the first Dense layer.
model.add(Input(shape=(n_features,)))
model.add(Dense(500))
model.add(Activation('relu'))
model.add(Dropout(0.2))
model.add(Dense(200))
model.add(Activation('relu'))
model.add(Dropout(0.2))
model.add(Dense(1))
model.add(Activation('sigmoid'))
model.compile(loss='binary_crossentropy', optimizer=Adam(), metrics=['accuracy'])

model.summary()

In [None]:
# Re-check the schemas before converting to numpy (same check as in an earlier cell).
X_train.printSchema(), y_train.printSchema()

In [None]:
# convert DataFrame column into nparray
# nparray required for model training, validation

def to_nparray_list(df, column_name):
    """Collect one column of ``df`` to the driver and return it as a numpy array.

    Parameters
    ----------
    df : DataFrame exposing ``select(...).collect()``
    column_name : str
        Name of the column to extract.

    Returns
    -------
    numpy.ndarray
        One entry per collected row, in the order ``collect()`` returned them.
    """
    values = []
    for record in df.select(column_name).collect():
        values.append(record[column_name])
    return np.array(values)

In [None]:
# DataFrame(column) --> nparray
# Features and labels of a split are collected together in one Spark job so
# that every feature row stays paired with its own label. Two separate
# collect() calls only line up if Spark happens to return the rows in the
# same order both times. Reading from train_df / test_df (instead of
# overwriting the X_*/y_* DataFrames in place) also keeps this cell re-runnable.
def _collect_features_and_labels(split_df):
    """Return ``(features, labels)`` numpy arrays for one split, collected in a single pass."""
    rows = split_df.select('scaled_features', 'BotOrNot').collect()
    features = np.array([row['scaled_features'] for row in rows])
    labels = np.array([row['BotOrNot'] for row in rows])
    return features, labels


X_train, y_train = _collect_features_and_labels(train_df)
X_test, y_test = _collect_features_and_labels(test_df)

In [None]:
# First five feature rows, now as a numpy array.
X_train[:5]

In [None]:
# ml model train and validation
# NOTE(review): the test split is also passed as validation_data, so the
# accuracy printed below is measured on the same rows that were used for
# per-epoch validation during training.

model.fit(X_train, y_train,
          batch_size=64,
          epochs=20,
          validation_data=(X_test, y_test))
# evaluate() returns the loss followed by the compiled metric (accuracy).
score, acc = model.evaluate(X_test, y_test, verbose=0)
print('Test accuracy:', acc)

In [None]:
# Persist the trained model as an .h5 file under result_path_s3.
model.save(result_path_s3 + 'my_model.h5')

In [None]:
#model cross validation
from sklearn.model_selection import KFold

In [None]:
# Re-join the train and test arrays into one dataset for cross validation.
X = np.concatenate([X_train, X_test])
Y = np.concatenate([y_train, y_test])

# Both lengths should match: one label per feature row.
len(X), len(Y)

In [None]:
# def splitDataset(n_split, X, Y):
#     for train_index,test_index in KFold(n_split).split(X):

#         x_train, x_test=X[train_index],X[test_index]
#         #y_train, y_test=Y[train_index],Y[t est_index]
#         #print( "train: {},{} test: {},{}".format(len(x_train), len(y_train), len(x_test), len(y_test)))
#         print( "train: {},{} test: {},{}".format(len(x_train), len(y_train), len(x_test)))
# splitDataset(5, X, Y)

In [None]:
def distributedTrainingGradients(df, feature_column, target_column, n_splits):
    """Split ``df`` into ``n_splits`` equally sized parts and print their sizes.

    Prints the total row count first, then the row count of every part. Each
    part holds ``total // n_splits`` rows; rows left over by the integer
    division are not assigned to any part.

    Parameters
    ----------
    df : DataFrame exposing ``count()``, ``limit(n)`` and ``exceptAll(other)``
    feature_column, target_column :
        Currently unused; kept so existing calls keep working.
    n_splits : int
        Number of parts, must be at least 1.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If ``n_splits`` is smaller than 1.
    """
    if n_splits < 1:
        raise ValueError("n_splits must be a positive integer, got {!r}".format(n_splits))

    # Count once and reuse the result; every count() is a separate job.
    total_rows = df.count()
    print(total_rows)
    each_len = total_rows // n_splits

    ##split dataset into 'n_splits' part
    remaining_df = df
    for _ in range(n_splits):
        part_df = remaining_df.limit(each_len)
        # exceptAll removes exactly one copy per row taken. subtract() is
        # EXCEPT DISTINCT: it de-duplicates the remainder and drops every
        # copy of a matching row, which shrinks the later parts.
        remaining_df = remaining_df.exceptAll(part_df)

        print(part_df.count())

# distributedTrainingGradients(scaled_df, "", "", 5)

In [None]:
# Schema of the scaled dataset that is passed to the split helper below.
scaled_df.printSchema()

In [None]:
# Total rows available for splitting.
scaled_df.count()

In [None]:
# Split the scaled dataset into 5 parts and print their sizes. The two
# column-name arguments are not used by the function, so empty strings are passed.
distributedTrainingGradients(scaled_df, '', '', 5)