# Features extractor

## Read datasets

In [1]:
import pandas as pd
import numpy as np
import datetime
import config as c
from collections import OrderedDict

In [2]:
compute_only_class_a = True
datasets_fake = ["FSF", "INT", "TWT"]
datasets_hum = ["TFP", "E13"]
datasets = datasets_hum + datasets_fake

In [None]:
users = pd.read_csv(c.folder_datasets + datasets[0] + '/' + c.file_users)
tweets = pd.read_csv(c.folder_datasets + datasets[0] + '/' + c.file_tweets, dtype={"geo": str})
friends = pd.read_csv(c.folder_datasets + datasets[0] + '/' + c.file_friends)
followers = pd.read_csv(c.folder_datasets + datasets[0] + '/' + c.file_followers)
for dataset in datasets[1:]:
    users = pd.concat([users, pd.read_csv(c.folder_datasets + dataset + '/' + c.file_users)])
    tweets = pd.concat([tweets, pd.read_csv(c.folder_datasets + dataset + '/' + c.file_tweets, dtype={"geo": str})])
    friends = pd.concat([friends, pd.read_csv(c.folder_datasets + dataset + '/' + c.file_friends)])
    followers = pd.concat([followers, pd.read_csv(c.folder_datasets + dataset + '/' + c.file_followers)])  

## Compute features

In [6]:
def is_from_web():
    for n in tweets.source.unique():
        for src in ["iphone", "android", "foursquare", "instagram", "web"]:
            if src in n.lower():
                return 1
    return 0

import string
def use_ponctuation(user_id):
    # If one tweets use ponctuation return 1
    user_tweets = tweets.loc[tweets['user_id'] == user['id'], 'text']
    for tweet in user_tweets:
        for c in tweet:
            # See if the char is punctuation.
            if c in string.punctuation:
                return 1
        return 0


In [31]:
import time
import datetime
start_time = time.time()
SAMPLE_SIZE = 100
X = list()
y = list()
features_name = list()
nb_fake_acc =0
# Compute features for each Twitter account
for index, user in users.iterrows():
    acc_feat = OrderedDict()
    #if index > SAMPLE_SIZE:
    #    break
    
    # Class A (Profile)
    #===================
    acc_feat['has_name'] = int(user['name'] not in ['NULL', 'NaN', '', ' ', pd.np.nan])
    acc_feat['has_image'] = int(user['default_profile_image'] != 1)
    acc_feat['has_address'] = int(user['location'] not in ['NULL', 'NaN', '', ' ', pd.np.nan])
    acc_feat['has_biography'] = int(user['description'] not in ['NULL', 'NaN', '', ' ', pd.np.nan])
    acc_feat['followers_ge_30'] = int(user['followers_count'] >= 30)
    acc_feat['belongs_to_a_list'] = int(user['listed_count'] > 0)
    acc_feat['nb_tweets_ge_50'] = int(tweets.loc[tweets['user_id'] == user['id']].size >= 50)
    acc_feat['url_in_profile'] = int(user['url'] not in ['NULL', 'NaN', '', ' ', pd.np.nan])
    acc_feat['followers_2_times_ge_friends'] = int(2 * user['followers_count'] >= user['friends_count'])
    
    acc_feat['bot_in_biography'] = int(type(user['description']) is str and 'bot' in user['description'].lower())
    acc_feat['ratio_friends_followers_around_100'] = int(user['followers_count'] > 0 and 80.0 <= float(user['friends_count']) / user['followers_count'] >= 120.0)
    acc_feat['duplicate_profile_picture'] = int(users.loc[users['default_profile_image'] == user['default_profile_image']].size > 1)
    
    acc_feat['ratio_friends_followers_ge_50'] = int(user['followers_count'] > 0 and float(user['friends_count']) / user['followers_count'] >= 50)
    acc_feat['default_image_after_2_month'] = int(user['default_profile_image'] == 1 and (datetime.datetime.now() - datetime.datetime.strptime(user['created_at'],'%a %b %d %H:%M:%S +0000 %Y')) > datetime.timedelta(weeks=4)) 
    acc_feat['friends_ge_100'] = int(user['friends_count'] >= 100)
    acc_feat['no_bio'] = int(user['description'] in ['NULL', 'NaN', '', ' ', pd.np.nan])
    acc_feat['no_location'] = int(user['location'] in ['NULL', 'NaN', '', ' ', pd.np.nan])
    acc_feat['no_tweets'] = int(tweets.loc[tweets['user_id'] == user['id']].size == 0)
    
    acc_feat['nb_friends'] = int(user['friends_count'])
    acc_feat['nb_tweets'] = int(tweets.loc[tweets['user_id'] == user['id']].size)
    acc_feat['ratio_friends_followers_square'] = float(user['friends_count']) / pow(user['followers_count'], 2) if user['followers_count'] > 0 else 0
    
    acc_feat['age'] = (datetime.datetime.now() - datetime.datetime.strptime(user['created_at'],'%a %b %d %H:%M:%S +0000 %Y')).total_seconds()
    acc_feat['following_rate'] = float(user['friends_count']) / acc_feat['age']
    
    # Class B (Timeline)
    #===================
    if not compute_only_class_a:
        #acc_feat['geo_localized'] = int(tweets.loc[tweets['user_id'] == user['id'] & (tweets['geo'] != '')].size)
        #acc_feat['is_favorite'] = int(tweets.loc[tweets['user_id'] == user['id'] & (tweets['favorite_count'] > 0)].size)
        #acc_feat['from_web'] = is_from_web()
        #acc_feat['use_ponctuation'] = use_ponctuation(user['id'])
        
        #acc_feat['same_sentence']
        acc_feat['from_API'] = tweets.loc[tweets['user_id'] == user['id'] & (tweets['source'] != '<a href="http://twitter.com/tweetbutton" rel="nofollow">Tweet Button</a>')].size > 0
        
    
    target = 1 if user['dataset'] in datasets_fake else 0
    nb_fake_acc = nb_fake_acc + target
    
    y.append(target)
    X.append(list(acc_feat.values()))
    features_name = list(acc_feat.keys())
nb_hum_acc = abs(len(y) - nb_fake_acc)
total_time = datetime.timedelta(seconds=time.time() - start_time)
print("Feature computation time : " + str(total_time))

Feature computation time : 0:01:35.151358


In [8]:
print("TOTAL accounts: " + str(users.size))
print("# fake accounts: " + str(nb_fake_acc))
print("# human accounts: " + str(nb_hum_acc))
#print("Features name: " + str(list(features_name)))
#print(X[0])

TOTAL accounts: 180234
# fake accounts: 3351
# human accounts: 1950


In [9]:
# Shuffle & balance the whole dataset (50-50 human/fake accounts)
from sklearn.utils import shuffle
X, y = shuffle(X, y)

max_sample = min(nb_fake_acc, nb_hum_acc) # max_sample = 1950 in our case
X, y = X[:max_sample], y[:max_sample]
print(len(y))

1950


In [47]:
# Store features on disk
import json
import os

def save_features(folder_features):
    with open(folder_features + c.file_features, 'w') as dstfile :
        json.dump(X, dstfile)
    with open(folder_features + c.file_target, 'w') as dstfile :
        json.dump(y, dstfile)
    with open(folder_features + c.file_features_name, 'w') as dstfile :
        json.dump(features_name, dstfile)
        
if compute_only_class_a:
    try:
        os.mkdir(c.folder_class_a)
    except FileExistsError :
        pass
    save_features(c.folder_class_a)
else:
    try:
        os.mkdir(c.folder_class_a_b_c)
    except FileExistsError :
        pass
    save_features(c.folder_class_a_b_c)

## Statistics on dataset

In [33]:
from prettytable import PrettyTable
# Count unique values of last feature
# Useful to test the computation of the feature
from collections import Counter
def extract_col(matrix, i):
    col = list()
    for columns in matrix:
        col.append(columns[i])
    return col

table = PrettyTable(['Feature name', '# val1', '# val2', '# val3', '# val4', '# val5'])
for i in range(len(X[0])):
    stats = extract_col(X, i)
    #print(features_name[i])
    values = list()
    for k, v in Counter(stats).most_common(5):
        values.append(round(v, 3))
        #values.append(str(round(k,3)) + ":"+ str(round(v, 3)))
        #print(str(k) + ": " + str(v))
    values = values + [""] * (5-len(Counter(stats).most_common(5)))
    table.add_row([features_name[i], values[0], values[1], values[2], values[3], values[4]])
print(table)

+------------------------------------+--------+--------+--------+--------+--------+
|            Feature name            | # val1 | # val2 | # val3 | # val4 | # val5 |
+------------------------------------+--------+--------+--------+--------+--------+
|              has_name              |  5301  |        |        |        |        |
|             has_image              |  5276  |   25   |        |        |        |
|            has_address             |  4089  |  1212  |        |        |        |
|           has_biography            |  4084  |  1217  |        |        |        |
|          followers_ge_30           |  3547  |  1754  |        |        |        |
|         belongs_to_a_list          |  4150  |  1151  |        |        |        |
|          nb_tweets_ge_50           |  4934  |  367   |        |        |        |
|           url_in_profile           |  3166  |  2135  |        |        |        |
|    followers_2_times_ge_friends    |  4414  |  887   |        |        |  

In [49]:
from prettytable import PrettyTable
# Count unique values of last feature
# Useful to test the computation of the feature
from collections import Counter
def extract_col(matrix, i):
    col = list()
    for columns in matrix:
        col.append(columns[i])
    return col

table = PrettyTable(['Feature name', '# val1', '# val2'])#, '# val3', '# val4', '# val5'])
for i in range(len(X[0])):
    if i > 17 or i == 0:
        continue
    stats = extract_col(X, i)
    #print(features_name[i])
    values = list()
    for k, v in Counter(stats).most_common(5):
        values.append(round(v, 3))
        #values.append(str(round(k,3)) + ":"+ str(round(v, 3)))
        #print(str(k) + ": " + str(v))
    values = values + [""] * (3-len(Counter(stats).most_common(5)))
    table.add_row([features_name[i], values[0], values[1]])#, values[2], values[3], values[4]])
print(table)

+------------------------------------+--------+--------+
|            Feature name            | # val1 | # val2 |
+------------------------------------+--------+--------+
|             has_image              |  5276  |   25   |
|            has_address             |  4089  |  1212  |
|           has_biography            |  4084  |  1217  |
|          followers_ge_30           |  3547  |  1754  |
|         belongs_to_a_list          |  4150  |  1151  |
|          nb_tweets_ge_50           |  4934  |  367   |
|           url_in_profile           |  3166  |  2135  |
|    followers_2_times_ge_friends    |  4414  |  887   |
|          bot_in_biography          |  5283  |   18   |
| ratio_friends_followers_around_100 |  5067  |  234   |
|     duplicate_profile_picture      |  5276  |   25   |
|   ratio_friends_followers_ge_50    |  4750  |  551   |
|    default_image_after_2_month     |  5276  |   25   |
|           friends_ge_100           |  4679  |  622   |
|               no_bio         

In [60]:
# Count the # of same values by feature and by class (Bot & non-Bot)
from prettytable import PrettyTable

stats = dict()
for i in range(len(X)):
    _class = y[i]
    for j in range(len(X[i])):
        f_name = features_name[j]
        if j > 17 or j == 0:
            continue
        if f_name not in stats:
            stats[f_name] = dict()
            stats[f_name]["0"] = dict()
            stats[f_name]["1"] = dict()
            stats[f_name]["0"]["0"] = 0
            stats[f_name]["0"]["1"] = 0
            stats[f_name]["1"]["0"] = 0
            stats[f_name]["1"]["1"] = 0
            
        stats[f_name][str(X[i][j])][str(_class)] += 1 
    
# Print !    

# class : 1 = fake
table = PrettyTable(['Feature name', 'Bot(1)', 'Real(1)', 'Bot(0)', 'Real(0)'])#, '# val3', '# val4', '# val5'])
for feature, v in stats.items():
    table.add_row([feature, v["1"]["1"], v["1"]["0"], v["0"]["1"], v["0"]["0"]])
print(table)

+------------------------------------+--------+---------+--------+---------+
|            Feature name            | Bot(1) | Real(1) | Bot(0) | Real(0) |
+------------------------------------+--------+---------+--------+---------+
|             has_image              |  3345  |   1931  |   6    |    19   |
|            has_address             |  2776  |   1313  |  575   |   637   |
|           has_biography            |  2278  |   1806  |  1073  |   144   |
|          followers_ge_30           |  172   |   1582  |  3179  |   368   |
|         belongs_to_a_list          |   99   |   1052  |  3252  |   898   |
|          nb_tweets_ge_50           |  2998  |   1936  |  353   |    14   |
|           url_in_profile           |  1438  |   697   |  1913  |   1253  |
|    followers_2_times_ge_friends    |   13   |   874   |  3338  |   1076  |
|          bot_in_biography          |   12   |    6    |  3339  |   1944  |
| ratio_friends_followers_around_100 |  234   |    0    |  3117  |   1950  |

In [64]:
print(tweets.geo.unique())
for n in users.name.unique():
    if len(n) < 1:
        print(n)

for n in tweets.source.unique():
    for src in ["iphone", "android", "foursquare", "instagram", "web"]:
        if src in n.lower():
            print(n)
    
    #if len(n) < 50:
    #    print(n)
        
#iphone, android, foursquare, instagram and web

[nan]
web
<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>
<a href="http://twitter.com/download/android" rel="nofollow">Twitter for Android</a>
<a href="http://foursquare.com" rel="nofollow">foursquare</a>
<a href="http://instagram.com" rel="nofollow">Instagram</a>
<a href="http://app.ft.com/" rel="nofollow">FT Web App</a>
<a href="http://mobile.twitter.com" rel="nofollow">Mobile Web</a>
<a href="http://mobile.twitter.com" rel="nofollow">mobile web</a>
<a href="http://m.twitter.com/" rel="nofollow">mobile web</a>
<a href="http://mobile.twitter.com" rel="nofollow">Twitter for Android</a>
<a href="https://mobile.twitter.com" rel="nofollow">Mobile Web (M5)</a>
<a href="http://bluefinapps.com" rel="nofollow">RunHelper for iPhone</a>
<a href="http://itunes.com/apps/shazam" rel="nofollow">Shazam on iPhone</a>
<a href="http://itunes.apple.com/app/twitter/id333903271?mt=8" rel="nofollow">Twitter for iPhone</a>
<a href="http://itunes.apple.com/us/app/instagram/

In [8]:
users.isnull().sum()

id                                      0
name                                    0
screen_name                             0
statuses_count                          0
followers_count                         0
friends_count                           0
favourites_count                        0
listed_count                            0
created_at                              0
url                                   235
lang                                    0
time_zone                             101
location                              145
default_profile                       314
default_profile_image                 454
geo_enabled                           230
profile_image_url                       0
profile_banner_url                    249
profile_use_background_image           26
profile_background_image_url_https      0
profile_text_color                      0
profile_image_url_https                 0
profile_sidebar_border_color            0
profile_background_tile           