# Features extractor

## Read datasets

In [68]:
import pandas as pd
import numpy as np
import datetime
from collections import OrderedDict

In [87]:
datasets_fake = ["FSF", "INT", "TWT"]
datasets_hum = ["TFP", "E13"]
datasets = datasets_hum + datasets_fake
folder_datasets = "./datasets/"
file_users = "users.csv"
file_tweets = "tweets.csv"
file_friends = "friends.csv"
file_followers = "followers.csv"
features_file = "features.json"
features_name_file = "features_name.json"
target_file = "target.json"

In [73]:
users = pd.read_csv(folder_datasets + datasets[0] + '/' + file_users)
tweets = pd.read_csv(folder_datasets + datasets[0] + '/' + file_tweets)
friends = pd.read_csv(folder_datasets + datasets[0] + '/' + file_friends)
followers = pd.read_csv(folder_datasets + datasets[0] + '/' + file_followers)
for dataset in datasets[1:]:
    users = pd.concat([users, pd.read_csv(folder_datasets + dataset + '/' + file_users)])
    tweets = pd.concat([tweets, pd.read_csv(folder_datasets + dataset + '/' + file_tweets)])
    friends = pd.concat([friends, pd.read_csv(folder_datasets + dataset + '/' + file_friends)])
    followers = pd.concat([followers, pd.read_csv(folder_datasets + dataset + '/' + file_followers)])
    
    

  interactivity=interactivity, compiler=compiler, result=result)


## Compute features

In [81]:
SAMPLE_SIZE = 10
X = list()
y = list()
features_name = list()
nb_fake_acc =0
# Compute features for each Twitter account
for index, user in users.iterrows():
    acc_feat = OrderedDict()
    #if index > SAMPLE_SIZE:
    #    break
    
    # Class A (Profile)
    #===================
    acc_feat['has_name'] = int(user['name'] not in ['NULL', 'NaN', '', ' ', pd.np.nan])
    acc_feat['has_image'] = int(user['default_profile_image'] != 1)
    acc_feat['has_address'] = int(user['location'] not in ['NULL', 'NaN', '', ' ', pd.np.nan])
    acc_feat['has_biography'] = int(user['description'] not in ['NULL', 'NaN', '', ' ', pd.np.nan])
    acc_feat['followers_ge_30'] = int(user['followers_count'] >= 30)
    acc_feat['belongs_to_a_list'] = int(user['listed_count'] > 0)
    acc_feat['nb_tweets_ge_50'] = int(tweets.loc[tweets['user_id'] == user['id']].size >= 50)
    acc_feat['url_in_profile'] = int(user['url'] not in ['NULL', 'NaN', '', ' ', pd.np.nan])
    acc_feat['followers_2_times_ge_friends'] = int(2 * user['followers_count'] >= user['friends_count'])
    
    acc_feat['bot_in_biography'] = int(type(user['description']) is str and 'bot' in user['description'].lower())
    acc_feat['ratio_friends_followers_around_100'] = int(user['followers_count'] > 0 and 80.0 <= float(user['friends_count']) / user['followers_count'] >= 120.0)
    acc_feat['duplicate_profile_picture'] = int(users.loc[users['default_profile_image'] == user['default_profile_image']].size > 1)
    
    acc_feat['ratio_friends_followers_ge_50'] = int(user['followers_count'] > 0 and float(user['friends_count']) / user['followers_count'] >= 50)
    acc_feat['default_image_after_2_month'] = int(user['default_profile_image'] == 1 and (datetime.datetime.now() - datetime.datetime.strptime(user['created_at'],'%a %b %d %H:%M:%S +0000 %Y')) > datetime.timedelta(weeks=4)) 
    acc_feat['friends_ge_100'] = int(user['friends_count'] >= 100)
    acc_feat['no_bio'] = int(user['description'] in ['NULL', 'NaN', '', ' ', pd.np.nan])
    acc_feat['no_location'] = int(user['location'] in ['NULL', 'NaN', '', ' ', pd.np.nan])
    acc_feat['no_tweets'] = int(tweets.loc[tweets['user_id'] == user['id']].size == 0)
    
    acc_feat['nb_friends'] = int(user['friends_count'])
    acc_feat['nb_tweets'] = int(tweets.loc[tweets['user_id'] == user['id']].size)
    acc_feat['ratio_friends_followers_square'] = float(user['friends_count']) / pow(user['followers_count'], 2) if user['followers_count'] > 0 else 0
    
    acc_feat['age'] = (datetime.datetime.now() - datetime.datetime.strptime(user['created_at'],'%a %b %d %H:%M:%S +0000 %Y')).total_seconds()
    acc_feat['following_rate'] = float(user['friends_count']) / age
    
    target = 1 if user['dataset'] in datasets_fake else 0
    nb_fake_acc = nb_fake_acc + target
    
    y.append(target)
    X.append(list(acc_feat.values()))
    features_name = list(acc_feat.keys())
nb_hum_acc = abs(len(y) - nb_fake_acc)

In [82]:
print("TOTAL accounts: " + str(users.size))
#print("Features name: " + str(list(features_name)))
print(X[0])
print("# fake accounts: " + str(nb_fake_acc))
print("# human accounts: " + str(nb_hum_acc))

TOTAL accounts: 180234
[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1466, 67089, 5.7370898357435176e-05, 350123444.107283, 7.116335562518837e-06]
# fake accounts: 3351
# human accounts: 1950


In [83]:
# Shuffle & balance the whole dataset (50-50 human/fake accounts)
from sklearn.utils import shuffle
X, y = shuffle(X, y)

max_sample = min(nb_fake_acc, nb_hum_acc) # max_sample = 1950 in our case
X, y = X[:max_sample], y[:max_sample]
print(len(y))

In [88]:
# Store features on disk
import json
with open(features_file, 'w') as dstfile :
    json.dump(X, dstfile)
with open(target_file, 'w') as dstfile :
    json.dump(y, dstfile)
with open(features_name_file, 'w') as dstfile :
    json.dump(features_name, dstfile)

## Statistics on dataset

In [8]:
users.isnull().sum()

id                                      0
name                                    0
screen_name                             0
statuses_count                          0
followers_count                         0
friends_count                           0
favourites_count                        0
listed_count                            0
created_at                              0
url                                   235
lang                                    0
time_zone                             101
location                              145
default_profile                       314
default_profile_image                 454
geo_enabled                           230
profile_image_url                       0
profile_banner_url                    249
profile_use_background_image           26
profile_background_image_url_https      0
profile_text_color                      0
profile_image_url_https                 0
profile_sidebar_border_color            0
profile_background_tile           