In [1]:
import pandas as pd
from datetime import datetime
import numpy as np
import math

twitter_human_bots_dataset_path = "twitter_human_bots_dataset/twitter_human_bots_dataset.csv"
tweets_dataset_path = "twitter_human_bots_dataset/tweets140522.csv"

twitter_human_bots_dataset = pd.read_csv(twitter_human_bots_dataset_path, index_col=0)
tweets_dataset = pd.read_csv(tweets_dataset_path, index_col=0)

# Posts Likes Count

In [2]:
posts_likes_count = {}

for user_id in twitter_human_bots_dataset.id:
    current_user_tweets = tweets_dataset.loc[tweets_dataset["account_id"]==user_id]
    posts_likes_count[user_id] = (current_user_tweets.shape[0], 
                                  int(current_user_tweets.like_count.sum()))

# Posts Created At Calcs

In [34]:
CREATED_AT_TIME_FORMAT = '%Y-%m-%dT%H:%M:%S.%fZ'
TEN_MINUTES_IN_SEC = 10*60
MINUTE_IN_SEC = 60


ACCOUNT_ID_COL = 'account_id'
POSTS_TIMES_COL = 'posts_times'
POSTS_DELTAS_COL = 'posts_deltas'
POSTS_HAVE_CONSTANT_DELTAS_COL = 'posts_constant_deltas'
LESS_THAN_1MIN_DELTAS_COL = 'less_than_one_min_deltas_count'


posts_created_at_calcs_df = pd.DataFrame(columns=[ACCOUNT_ID_COL, POSTS_TIMES_COL, POSTS_DELTAS_COL, 
                                                  POSTS_HAVE_CONSTANT_DELTAS_COL, LESS_THAN_1MIN_DELTAS_COL])

for user_id in twitter_human_bots_dataset.id:
    current_user_tweets = tweets_dataset.loc[tweets_dataset["account_id"]==user_id]
    current_user_tweets_ca_str = current_user_tweets.created_at.dropna().astype(str).tolist()
    current_user_tweets_ca = [datetime.strptime(tweet_created_at, CREATED_AT_TIME_FORMAT) 
                              for tweet_created_at in current_user_tweets_ca_str]
    
    current_user_deltas = []
    for i in range(len(current_user_tweets_ca)-1):
        current_user_deltas.append(current_user_tweets_ca[i]-current_user_tweets_ca[i+1])
        
    # We want to check if a user posts tweets every constant time (means using some automated script probably)
    # We decided to measure this constant time by the creation time deltas variation. 
    # If all posts are with deltas with a difference of max 10 minutes
    total_seconds_deltas = [d.total_seconds() for d in current_user_deltas]
    if len(total_seconds_deltas) > 2:
        user_deltas_avg = sum(total_seconds_deltas) / len(total_seconds_deltas)
        posts_have_constant_deltas = all([abs(tsd-user_deltas_avg)<=TEN_MINUTES_IN_SEC 
                                          for tsd in total_seconds_deltas])
    else:
        posts_have_constant_deltas = None
    
    # We want to check if a user posts tweets in a difference of less than a minute.
    # This data will be represented simply by the count of deltas <= 1min
    
    less_than_1min_deltas = len([t for t in total_seconds_deltas if t<=MINUTE_IN_SEC])
    
    
    posts_created_at_calcs_df = posts_created_at_calcs_df.append(
        {ACCOUNT_ID_COL: user_id,
         POSTS_TIMES_COL: current_user_tweets_ca,
         POSTS_DELTAS_COL: total_seconds_deltas,
         POSTS_HAVE_CONSTANT_DELTAS_COL: posts_have_constant_deltas,
         LESS_THAN_1MIN_DELTAS_COL: less_than_1min_deltas}, ignore_index=True)
    

In [35]:
posts_created_at_calcs_df.loc[posts_created_at_calcs_df.posts_constant_deltas==True].account_id

23                313116175
48       772315537992916992
59               2157748434
64       818494659873951745
110              3304160639
                ...        
37266              46745190
37317             169696352
37349    878072807107600384
37369              87343277
37430            4588875447
Name: account_id, Length: 1743, dtype: object

In [41]:
constant_account_ids = posts_created_at_calcs_df.loc[posts_created_at_calcs_df.posts_constant_deltas==True].account_id.tolist()
twitter_human_bots_dataset.query("id in @constant_account_ids").account_type.value_counts()

bot      1050
human     693
Name: account_type, dtype: int64

In [None]:
# We want to check if a user posts tweets every constant time (means using some automated script probably)
# We decided to measure this constant time by the creation time deltas variation. 
# If all posts are with deltas with a difference of max 10 minutes

# TEN_MINUTES_IN_SEC = 10*60
# posts_constant_deltas = {}

# for user_id, user_deltas in posts_deltas.items():
#     total_seconds_deltas = [d.total_seconds() for d in user_deltas]
#     user_deltas_avg = sum(total_seconds_deltas) / len(total_seconds_deltas)
    
#     posts_constant_deltas[user_id] = all([abs(tsd-user_deltas_avg)<=TEN_MINUTES_IN_SEC 
#                                           for tsd in total_seconds_deltas])

In [None]:
# We want to check if a user posts tweets in a difference of less than a minute.
# We decided to measure it by dividing

TEN_MINUTES_IN_SEC = 10*60
posts_constant_deltas = {}

for user_id, user_deltas in posts_deltas.items():
    total_seconds_deltas = [d.total_seconds() for d in user_deltas]
    user_deltas_avg = sum(total_seconds_deltas) / len(total_seconds_deltas)
    
    posts_constant_deltas[user_id] = all([abs(tsd-user_deltas_avg)<=TEN_MINUTES_IN_SEC 
                                          for tsd in total_seconds_deltas])

In [66]:
posts_deltas

{53779179: [datetime.timedelta(seconds=6651),
  datetime.timedelta(days=9, seconds=53243),
  datetime.timedelta(seconds=75964),
  datetime.timedelta(seconds=257),
  datetime.timedelta(seconds=10),
  datetime.timedelta(days=11, seconds=14633),
  datetime.timedelta(seconds=65210),
  datetime.timedelta(seconds=6309),
  datetime.timedelta(seconds=31277)],
 105916557: [datetime.timedelta(days=16, seconds=5867),
  datetime.timedelta(seconds=2949),
  datetime.timedelta(days=3, seconds=73259),
  datetime.timedelta(days=241, seconds=10771),
  datetime.timedelta(days=105, seconds=54312),
  datetime.timedelta(seconds=27),
  datetime.timedelta(seconds=99),
  datetime.timedelta(days=5, seconds=24751),
  datetime.timedelta(days=1, seconds=80028)],
 509788597: [datetime.timedelta(days=1, seconds=9549),
  datetime.timedelta(days=7, seconds=50946),
  datetime.timedelta(seconds=103),
  datetime.timedelta(days=3, seconds=11546),
  datetime.timedelta(days=1, seconds=36383),
  datetime.timedelta(days=11, s

In [78]:
posts_deltas1 = {}
for user_id in posts_deltas:
    posts_deltas1[f"{twitter_human_bots_dataset.loc[twitter_human_bots_dataset.id==user_id].account_type.iloc[0]}-{user_id}"] = posts_deltas[user_id]
    
    
    
    

In [89]:
(posts_deltas1['bot-53779179'][0]-posts_deltas1['bot-53779179'][1]).total_seconds()

-824192.0

In [86]:
posts_deltas1['bot-53779179'][1].total_seconds()

830843.0

In [90]:
posts_deltas1

{'bot-53779179': [datetime.timedelta(seconds=6651),
  datetime.timedelta(days=9, seconds=53243),
  datetime.timedelta(seconds=75964),
  datetime.timedelta(seconds=257),
  datetime.timedelta(seconds=10),
  datetime.timedelta(days=11, seconds=14633),
  datetime.timedelta(seconds=65210),
  datetime.timedelta(seconds=6309),
  datetime.timedelta(seconds=31277)],
 'human-105916557': [datetime.timedelta(days=16, seconds=5867),
  datetime.timedelta(seconds=2949),
  datetime.timedelta(days=3, seconds=73259),
  datetime.timedelta(days=241, seconds=10771),
  datetime.timedelta(days=105, seconds=54312),
  datetime.timedelta(seconds=27),
  datetime.timedelta(seconds=99),
  datetime.timedelta(days=5, seconds=24751),
  datetime.timedelta(days=1, seconds=80028)],
 'human-509788597': [datetime.timedelta(days=1, seconds=9549),
  datetime.timedelta(days=7, seconds=50946),
  datetime.timedelta(seconds=103),
  datetime.timedelta(days=3, seconds=11546),
  datetime.timedelta(days=1, seconds=36383),
  datetim

In [10]:
tweets_dataset.loc[tweets_dataset["account_id"]==370890435]

Unnamed: 0,id,attachments,context_annotations,conversation_id,created_at,account_id,in_reply_to_user_id,lang,retweet_count,reply_count,like_count,quote_count,reply_settings,source,text,mentions,hashtags,urls
318164,8.986528e+17,,,8.986528e+17,2017-08-18T21:08:15.000Z,370890435,,en,8464.0,0.0,0.0,0.0,everyone,Twitter for Android,RT @STU_ACTOR: Devastated to hear the Legend t...,['STU_ACTOR'],[],[]
318165,8.986501e+17,,"[{'domain': {'id': '67', 'name': 'Interests an...",8.986501e+17,2017-08-18T20:57:42.000Z,370890435,,en,1067.0,0.0,0.0,0.0,everyone,Twitter for Android,RT @LiveFunAndDiy: How to Make a Fidget Spinne...,"['lifehacker', 'lifehackorg']",['LifeHacks'],['https://youtu.be/r0ZavqtlaHc']
318166,8.986474e+17,,"[{'domain': {'id': '67', 'name': 'Interests an...",8.986474e+17,2017-08-18T20:47:01.000Z,370890435,,en,1077.0,0.0,0.0,0.0,everyone,Twitter for Android,RT @LiveFunAndDiy: How to Make a Fidget Spinne...,"['vidme', 'VidmeClub']","['vidme', 'TUTORIAL', 'DIY']",['https://vid.me/88Ua8']
318167,8.986257e+17,,,8.986257e+17,2017-08-18T19:20:44.000Z,370890435,,und,114.0,0.0,0.0,0.0,everyone,Twitter for Android,RT @GriffyOnline: https://t.co/YVPTddKqJ8,['GriffyOnline'],[],['http://fb.me/3TdZw7vq4']
318168,8.986166e+17,{'media_keys': ['7_898506990661492736']},,8.986166e+17,2017-08-18T18:44:26.000Z,370890435,,ja,583.0,0.0,0.0,0.0,everyone,Twitter for Android,RT @yuchaso_prince: だまされた... https://t.co/299z...,['yuchaso_prince'],[],['https://twitter.com/yuchaso_prince/status/89...
318169,8.986085e+17,,,8.986085e+17,2017-08-18T18:12:27.000Z,370890435,,en,1492.0,0.0,0.0,0.0,everyone,Twitter for Android,RT @yashqaraah: Just upload it boosting testos...,['yashqaraah'],[],['http://7stage.com']


In [122]:
tweets_dataset.loc[tweets_dataset.id.duplicated()==True & tweets_dataset.id.notna()].id.tolist()

[nan,
 9.008736941296558e+17,
 9.008798058219232e+17,
 9.132059093204582e+17,
 9.155896056209777e+17,
 9.13209194660094e+17,
 1.5242131162710958e+18,
 1.5241623185604813e+18,
 1.5240927956731863e+18,
 1.523406809070854e+18,
 1.523406793006719e+18,
 1.5233880541525238e+18,
 1.5244732489933414e+18,
 1.5237785974245335e+18,
 1.5219099927391314e+18,
 1.5246231223008174e+18,
 1.5246229683824968e+18,
 1.5246227021784556e+18,
 1.090098464942248e+18,
 1.524422067147391e+18,
 1.5245424434539643e+18,
 1.5244647353784607e+18,
 1.5241146251759045e+18,
 1.5237500248778015e+18,
 1.4208873236848353e+18,
 1.302777144519557e+18,
 1.2838670053686395e+18,
 1.5220004835394724e+18,
 1.5238978376202404e+18,
 1.5245352665011036e+18,
 1.524531015183929e+18,
 1.5245303890582118e+18,
 1.52346304635349e+18,
 1.524608902528123e+18,
 1.5245990933933384e+18,
 1.524583918393082e+18,
 1.5245684685610516e+18,
 1.5219689797990728e+18,
 1.521616786508206e+18,
 1.4865361186025062e+18,
 1.5178318339620946e+18,
 1.52303910

In [123]:
tweets_dataset.loc[tweets_dataset["id"].astype(str)=='9.008736941296558e+17']

Unnamed: 0,id,attachments,context_annotations,conversation_id,created_at,account_id,in_reply_to_user_id,lang,retweet_count,reply_count,like_count,quote_count,reply_settings,source,text,mentions,hashtags,urls
20086,9.008737e+17,,"[{'domain': {'id': '45', 'name': 'Brand Vertic...",9.008737e+17,2017-08-25T00:13:20.000Z,2283108739,,en,400.0,0.0,0.0,0.0,everyone,Twitter for Android,RT @every1bets: Giovannis Gems Slots Is LIVE W...,['every1bets'],"['free', 'casino', 'slots', 'games', 'xbox', '...",['http://bit.ly/blackdiamond7']
51620,9.008737e+17,,"[{'domain': {'id': '13', 'name': 'Place', 'des...",9.008737e+17,2017-08-25T00:13:20.000Z,4928186374,,en,1230.0,0.0,0.0,0.0,everyone,Twitter for Android,RT @mohitfreedom: High Commissioner of #India ...,"['RuchiraKamboj', 'SAPresident']","['India', 'SouthAfrica']",[]


In [1]:
from twitter_account_enrichment import *
tweets_per_account = get_accounts_tweets([370890435])

200


In [2]:
tweets_per_account[370890435]

[{'created_at': '2017-08-18T21:08:15.000Z',
  'source': 'Twitter for Android',
  'lang': 'en',
  'public_metrics': {'retweet_count': 8460,
   'reply_count': 0,
   'like_count': 0,
   'quote_count': 0},
  'text': 'RT @STU_ACTOR: Devastated to hear the Legend that is Sir Bruce Forsyth has sadly passed away. A true gent and a talent beyond compare #RIPB…',
  'id': '898652788011352064',
  'entities': {'mentions': [{'start': 3,
     'end': 13,
     'username': 'STU_ACTOR',
     'id': '351541320'}],
   'annotations': [{'start': 53,
     'end': 69,
     'probability': 0.856,
     'type': 'Person',
     'normalized_text': 'Sir Bruce Forsyth'}]},
  'conversation_id': '898652788011352064',
  'reply_settings': 'everyone',
  'referenced_tweets': [{'type': 'retweeted', 'id': '898571054922289152'}]},
 {'created_at': '2017-08-18T20:57:42.000Z',
  'entities': {'urls': [{'start': 70,
     'end': 93,
     'url': 'https://t.co/kI8w7nnOyv',
     'expanded_url': 'https://youtu.be/r0ZavqtlaHc',
     'displa

In [124]:
tweets_dataset.to_csv(f"twitter_human_bots_dataset/tweets140522.csv", index=True)

In [100]:
total_seconds_deltas = [123,60,80,34,1,2,7,77,48,9]
less_than_1min_deltas = filter(lambda s: s<=60, total_seconds_deltas)
len([t for t in total_seconds_deltas if t<=60])

7