In [1]:
import pandas as pd
import numpy as np
import math

from tensorflow.keras.layers import Embedding, Dense, LSTM, Dense, Input, concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.utils import plot_model

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
bot_accounts = pd.concat([pd.read_csv('data/social_spambots_1.csv'), pd.read_csv('data/social_spambots_2.csv'), pd.read_csv('data/social_spambots_3.csv')]).reset_index(drop=True)
clean_accounts = pd.read_csv('data/geniune_accounts.csv')

requiredColumns = ['screen_name', 'created_at', 'updated', 'location', 'verified', 'statuses_count', 'friends_count','followers_count', 'favourites_count', 'default_profile_image', 'profile_use_background_image', 'protected', 'default_profile']
bot_accounts = bot_accounts[requiredColumns]
clean_accounts = clean_accounts[requiredColumns]

In [3]:
print("clear_accounts num: {}, bot_account_num: {}".format(clean_accounts.size, bot_accounts.size))
bot_accounts.dtypes

clear_accounts num: 45162, bot_account_num: 63856


screen_name                      object
created_at                       object
updated                          object
location                         object
verified                        float64
statuses_count                    int64
friends_count                     int64
followers_count                   int64
favourites_count                  int64
default_profile_image           float64
profile_use_background_image    float64
protected                       float64
default_profile                 float64
dtype: object

In [4]:
def clean_df(df):
    type(df)
    df['created_at'] = pd.to_datetime(df['created_at']).dt.tz_localize(None)
    df['updated'] = pd.to_datetime(df['updated']).dt.tz_localize(None)
    df['age'] = (df['updated'] - df['created_at']).astype('timedelta64[D]').astype(int)
    df['has_location'] = df['location'].apply(lambda x: 0 if x==x else 1)
    df['has_avatar'] = df['default_profile_image'].apply(lambda x: 1 if x==x else 0)
    df['has_background'] = df['profile_use_background_image'].apply(lambda x: 1 if x==x else 0)
    df['is_verified']=df['verified'].apply(lambda x: 1 if x==x else 0)
    df['is_protected']=df['protected'].apply(lambda x: 1 if x==x else 0)
    df['profile_modified'] = df['default_profile'].apply(lambda x: 0 if x==x else 1)
    df = df.rename(index=str, columns={"screen_name": "username", "statuses_count": "total_tweets", "friends_count": "total_following", "followers_count": "total_followers", "favourites_count": "total_likes"})
    return df[['username', 'age', 'has_location', 'is_verified', 'total_tweets', 'total_following', 'total_followers', 'total_likes', 'has_avatar', 'has_background', 'is_protected', 'profile_modified']]

bot_accounts = clean_df(bot_accounts)
clean_accounts = clean_df(clean_accounts)

In [5]:
bot_accounts.dtypes

username            object
age                  int32
has_location         int64
is_verified          int64
total_tweets         int64
total_following      int64
total_followers      int64
total_likes          int64
has_avatar           int64
has_background       int64
is_protected         int64
profile_modified     int64
dtype: object

In [6]:
bot_accounts.head()

Unnamed: 0,username,age,has_location,is_verified,total_tweets,total_following,total_followers,total_likes,has_avatar,has_background,is_protected,profile_modified
0,davideb66,2555,1,0,1299,40,22,1,1,1,0,0
1,ElisaDospina,2521,0,0,18665,3442,12561,16358,0,1,0,1
2,Vladimir65,2497,0,0,22987,755,600,14,0,1,0,1
3,RafielaMorales,2435,0,0,7975,350,398,11,0,1,0,1
4,FabrizioC_c,2413,0,0,20218,405,413,162,0,1,0,1


In [7]:
clean_accounts.dtypes

username            object
age                  int32
has_location         int64
is_verified          int64
total_tweets         int64
total_following      int64
total_followers      int64
total_likes          int64
has_avatar           int64
has_background       int64
is_protected         int64
profile_modified     int64
dtype: object

In [8]:
clean_accounts.head()

Unnamed: 0,username,age,has_location,is_verified,total_tweets,total_following,total_followers,total_likes,has_avatar,has_background,is_protected,profile_modified
0,0918Bask,1008,0,0,2177,332,208,265,0,0,0,1
1,1120Roll,672,0,0,2660,485,330,3972,0,1,0,0
2,14KBBrown,1776,1,0,1254,177,166,1185,0,1,0,1
3,wadespeters,2006,0,0,202968,981,2248,60304,0,1,0,1
4,191a5bd05da04dc,403,0,0,82,79,21,5,0,1,0,0


In [9]:
bot_accounts['BotOrNot'] = 1
clean_accounts['BotOrNot'] = 0

combined_df = pd.concat([bot_accounts, clean_accounts])

new_df = combined_df.sample(frac=1).reset_index(drop=True)

In [10]:
training_df = new_df.drop('username', axis=1)[:int(combined_df.shape[0] * 0.8)]
test_df = new_df.drop('username', axis=1)[int(combined_df.shape[0] * 0.8):]

columns_to_standardize = ['age', 'total_tweets', 'total_following', 'total_followers', 'total_likes']

training_df_mean = training_df[columns_to_standardize].mean()
training_df_std = training_df[columns_to_standardize].std()

training_df[columns_to_standardize] = (training_df[columns_to_standardize] - training_df_mean)/training_df_std
test_df[columns_to_standardize] = (test_df[columns_to_standardize] - training_df_mean)/training_df_std

# training_df_mean = training_df.mean()
# training_df_std = training_df.std()

# training_df = (training_df - training_df_mean)/training_df_std
# test_df = (test_df - training_df_mean)/training_df_std

# max_vals = training_df.max()

# training_df = training_df/max_vals
# test_df = test_df/max_vals

In [11]:
X_train = training_df.drop(['BotOrNot', 'is_protected'], axis=1).values
y_train = training_df['BotOrNot'].values.reshape(-1,1)

X_test = test_df.drop(['BotOrNot', 'is_protected'], axis=1).values
y_test = test_df['BotOrNot'].values.reshape(-1,1)

In [12]:
inp = Input(shape=[10])

another = Dense(500, activation='relu')(inp)
another = Dense(200, activation='relu')(another)
another = Dense(1, activation='sigmoid')(another)

mod = Model(inp, another)
mod.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [13]:
training = mod.fit(x=X_train, y=y_train, batch_size=64, epochs=20, validation_data=(X_test, y_test))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [14]:
mod.save('my_model.h5')

In [15]:
# evaluate model
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score

pred_result = mod.predict(X_test)
for val in pred_result:
    val[0] = val[0] / 0.5
    val[0] = int(val[0])
    

acu = accuracy_score(y_test, pred_result)
abs_error = mean_absolute_error(y_test, pred_result)
squared_error = mean_squared_error(y_test, pred_result)

print("accuracy: {}, abs_error: {}, squared error: {}".format(acu, abs_error, squared_error))

accuracy: 0.9678188319427891, abs_error: 0.03218116805721097, squared error: 0.03218116805721097
