In [1]:
# Project configuration module and its helpers
import config
from config import dump_ml_data, load_networkx_friends, load_users_dataframe

# Third-party libraries
import networkx as nx
import numpy as np
import pandas as pd
from tqdm import tqdm

# Inputs shared by the cells below: the users table and the friends graph
unique_users = load_users_dataframe()
network_friends = load_networkx_friends()

{'calculate': {'analysis': True,
               'friends': False,
               'network': True,
               'uniquetweets': True,
               'uniqueusers': True},
 'data': {'dates': ['2018-03-11', '2018-03-12', '2018-03-13'],
          'eventname': "Givenchy's Death",
          'phrases': ['givenchy%20death', 'givenchy%20passed%20away'],
          'starttime': 'Mar 12 08:20:00 -0500 2018'},
 'path': {'cwd': '/Users/lzhou/git/github/uclresearchanalysis/data/givenchy',
          'ml': '/Users/lzhou/git/github/uclresearchanalysis/data/givenchy/pickle',
          'networkx': {'all': '/Users/lzhou/git/github/uclresearchanalysis/data/givenchy/pickle/networkx_all.dat',
                       'friends': '/Users/lzhou/git/github/uclresearchanalysis/data/givenchy/pickle/networkx_friends.dat',
                       'potential': '/Users/lzhou/git/github/uclresearchanalysis/data/givenchy/pickle/networkx_potential.dat'},
          'newcrawl': '/Users/lzhou/git/github/uclresearchanalysis/ot

In [3]:
# Show which columns the loaded users dataframe provides
unique_users.columns

Index(['user', 'user_id', 'time_lapsed', 'followers_count', 'friends_count',
       'user_created_days', 'user_statuses_count', 'user_listed_count',
       'user_favourites_count', 'normalized_user_statuses_count',
       'normalized_user_followers_count', 'normalized_user_favourites_count',
       'normalized_user_listed_count', 'normalized_user_friends_count',
       'mention_and_reply', 'source_candidates', 'source_index', 'seed_index',
       'generation', 'time_since_seed'],
      dtype='object')

In [4]:
# Settings
# Bar widths that process_data is run for; only 30 is currently active.
# Alternative (commented out): intervals = [60, 30, 15, 7]
intervals = [30]

# Column list kept for reference only; it is not used by the active code.
# parametersVector = ["tCurrent", "kIn", "kOut0", "t0", "kOut-1", "t-1",
#                     "kOutMax", "kOutMin", "kOutAverage", "tAverage",
#                     "nNodes", "deltaDays", "statusesCount", "followersCount",
#                     "favouritesCount", "friendsCount", "listedCount", "label"]

# Per-node out-degree and in-degree lookups on the friends network
nodeOutDegreeDict = network_friends.out_degree()
nodeInDegreeDict = network_friends.in_degree()

# Arithmetic mean that does not fail on an empty sequence
def mean(numbers):
    """Return the average of ``numbers`` as a float.

    An empty input yields ``0.0`` because the divisor is never allowed to
    drop below 1.
    """
    total = float(sum(numbers))
    count = len(numbers)
    divisor = count if count > 1 else 1
    return total / divisor

In [5]:
def convert_dictionary_to_sorted_list(x):
    """Return the values of mapping ``x`` ordered by ascending key.

    Written as a ``def`` rather than a lambda bound to a name so the function
    carries its own name in tracebacks and can hold this docstring.

    Parameters
    ----------
    x : mapping
        Any mapping whose keys can be sorted against each other.

    Returns
    -------
    list
        ``x[key]`` for every key of ``x``, in sorted key order.
    """
    return [x[a] for a in sorted(x)]

def _metric_as_sorted_list(metric_function):
    """Evaluate a networkx node metric on ``network_friends`` and return its
    values as a list ordered by sorted node key."""
    per_node_values = metric_function(network_friends)
    return convert_dictionary_to_sorted_list(per_node_values)


# Assortativity
# https://networkx.github.io/documentation/networkx-1.9.1/reference/algorithms.assortativity.html
average_neighbor_degree = _metric_as_sorted_list(nx.average_neighbor_degree)

# Centrality
# https://networkx.github.io/documentation/networkx-1.9.1/reference/algorithms.centrality.html
degree_centrality = _metric_as_sorted_list(nx.degree_centrality)
in_degree_centrality = _metric_as_sorted_list(nx.in_degree_centrality)
out_degree_centrality = _metric_as_sorted_list(nx.out_degree_centrality)
closeness_centrality = _metric_as_sorted_list(nx.closeness_centrality)
betweenness_centrality = _metric_as_sorted_list(nx.betweenness_centrality)
eigenvector_centrality = _metric_as_sorted_list(nx.eigenvector_centrality)

In [6]:
# (output-name suffix, ``unique_users`` column) pairs, in output column order.
_USER_METADATA_COLUMNS = (
    ('deltaDays', 'user_created_days'),
    ('statusesCount', 'user_statuses_count'),
    ('followersCount', 'followers_count'),
    ('favouritesCount', 'user_favourites_count'),
    ('friendsCount', 'friends_count'),
    ('listedCount', 'user_listed_count'),
    ('normalizedUserStatusesCount', 'normalized_user_statuses_count'),
    ('normalizedUserFollowersCount', 'normalized_user_followers_count'),
    ('normalizedUserFavouritesCount', 'normalized_user_favourites_count'),
    ('normalizedUserListedCount', 'normalized_user_listed_count'),
    ('normalizedUserFriendsCount', 'normalized_user_friends_count'),
)


def _user_feature_frame(index, user_row, interval, network_metrics):
    """Return the feature frame (one row per time bar) for a single user.

    Parameters
    ----------
    index
        Index label of ``user_row`` in ``unique_users``. It is also used as a
        position into the metric lists.
        NOTE(review): this assumes ``unique_users`` has a 0..N-1 index that
        lines up with the sorted node order of the graph -- TODO confirm.
    user_row : pandas.Series
        The row of ``unique_users`` being described.
    interval : number
        Width of one time bar, in the same unit as ``time_lapsed``.
    network_metrics : sequence of (name, list)
        Node metric lists, indexed by the same positions as ``unique_users``.
    """
    source_candidates = user_row['source_candidates']
    source_first = source_candidates[0]
    source_first_row = unique_users.iloc[source_first]

    # Bars start at the first multiple of ``interval`` strictly after the first
    # candidate's time_lapsed and stop before 24 * 60 (presumably the number of
    # minutes in one day -- TODO confirm the unit of ``time_lapsed``).
    start_bar = int(source_first_row.time_lapsed / interval) + 1
    bars = list(np.arange(start_bar * interval, 24 * 60, interval))

    # Loop-invariant: look up every candidate's time once instead of once per bar.
    candidate_times = [unique_users.iloc[x].time_lapsed for x in source_candidates]

    label, t0, t_1, n_nodes = [], [], [], []
    k_out_average, t_average, k_out_max, k_out_min = [], [], [], []
    last_metadata = dict((name, []) for name, _ in _USER_METADATA_COLUMNS)
    mean_metadata = dict((name, []) for name, _ in _USER_METADATA_COLUMNS)
    last_network = dict((name, []) for name, _ in network_metrics)

    for current_time in bars:
        # Candidates whose time_lapsed is not later than the current bar.
        # The first candidate always qualifies because of how start_bar is chosen.
        active = [(x, t) for x, t in zip(source_candidates, candidate_times)
                  if t <= current_time]
        sources = [x for x, _ in active]
        sources_dataframe = unique_users.iloc[sources]

        degree_list = [nodeOutDegreeDict[x] for x in sources]
        time_list = [current_time - t for _, t in active]

        last_source_index = sources[-1]
        last_source_row = unique_users.iloc[last_source_index]

        # 1 once the bar has reached the user's own time_lapsed, else 0.
        label.append(int(current_time >= user_row['time_lapsed']))
        t0.append(round(time_list[0], 1))
        t_1.append(round(time_list[-1], 1))
        n_nodes.append(len(sources))

        for name, values in network_metrics:
            last_network[name].append(values[last_source_index])

        for name, column in _USER_METADATA_COLUMNS:
            last_metadata[name].append(last_source_row[column])
            mean_metadata[name].append(sources_dataframe[column].mean())

        k_out_average.append(round(mean(degree_list), 1))
        t_average.append(round(mean(time_list), 1))
        k_out_max.append(max(degree_list))
        k_out_min.append(min(degree_list))

    # Column prefixes:
    # UsM: User metadata
    # TwM: Tweet metadata
    # Nw: Network
    # Stat: Statistical
    # Suffix "0" is the first source candidate, "-1" the latest active source.
    # Nw_kIn / Nw_kOut0 / Nw_kOut-1 are not part of the output, so they are
    # not computed.
    features = {'label': label}

    # UsM
    for name, column in _USER_METADATA_COLUMNS:
        features['UsM_' + name] = user_row[column]
    for name, column in _USER_METADATA_COLUMNS:
        features['UsM_' + name + '0'] = source_first_row[column]
    for name, _ in _USER_METADATA_COLUMNS:
        features['UsM_' + name + '-1'] = last_metadata[name]

    # TwM
    features['TwM_t0'] = t0
    features['TwM_t-1'] = t_1
    features['TwM_tCurrent'] = bars

    # Nw
    features['Nw_nNodes'] = n_nodes
    for name, values in network_metrics:
        features['Nw_' + name] = values[index]
    for name, values in network_metrics:
        features['Nw_' + name + '0'] = values[source_first]
    for name, _ in network_metrics:
        features['Nw_' + name + '-1'] = last_network[name]

    # Stat
    features['Stat_average_kOut'] = k_out_average
    features['Stat_average_t'] = t_average
    for name, _ in _USER_METADATA_COLUMNS:
        features['Stat_average_' + name] = mean_metadata[name]
    features['Stat_max_kOut'] = k_out_max
    features['Stat_min_kOut'] = k_out_min

    return pd.DataFrame(features)


def process_data(interval):
    """Build the per-(user, time bar) feature table for one bar width.

    Every row of the module-level ``unique_users`` whose ``source_index`` is
    not ``None`` contributes one output row per time bar. Each output row
    combines the user's own metadata and network metrics with those of the
    first source candidate (suffix ``0``), the latest source active at that
    bar (suffix ``-1``) and averages over all active sources (``Stat_``).

    The per-user frames are collected in a list and concatenated once. This
    avoids ``DataFrame.append`` (removed in pandas 2.0) and the quadratic
    copying that growing a frame inside the loop causes.

    Relies on module-level names defined in earlier cells: ``unique_users``,
    ``nodeOutDegreeDict``, ``mean`` and the seven node metric lists.

    Parameters
    ----------
    interval : number
        Width of one time bar, in the same unit as ``time_lapsed``.

    Returns
    -------
    pandas.DataFrame
        All per-user frames stacked in iteration order. Each user's rows keep
        their own 0..n-1 index, so index values repeat across users. An empty
        frame is returned when no user qualifies.
    """
    # Evaluated at call time so the function sees the current metric lists.
    network_metrics = (
        ('averageNeighborDegree', average_neighbor_degree),
        ('degreeCentrality', degree_centrality),
        ('inDegreeCentrality', in_degree_centrality),
        ('outDegreeCentrality', out_degree_centrality),
        ('closenessCentrality', closeness_centrality),
        ('betweennessCentrality', betweenness_centrality),
        ('eigenvectorCentrality', eigenvector_centrality),
    )

    frames = []
    # len(unique_users) is the row count; no need to materialise iterrows().
    with tqdm(total=len(unique_users)) as pbar:
        for index, user_row in unique_users.iterrows():
            # NOTE(review): only ``None`` marks a missing source here; a NaN
            # would pass this check -- confirm how unique_users encodes it.
            if user_row['source_index'] is not None:
                frames.append(
                    _user_feature_frame(index, user_row, interval, network_metrics))
            pbar.update(1)

    if not frames:
        # pd.concat raises on an empty list; match the original empty result.
        return pd.DataFrame()
    return pd.concat(frames)

# Build, preview and hand off one feature table per configured interval.
for interval in intervals:
    df = process_data(interval)
    # Print the first rows as a quick check of the generated columns
    print(df.head())
    # Pass the table to config.dump_ml_data together with its interval
    # (presumably persists it under the configured 'ml' path -- confirm in config)
    dump_ml_data(df, interval)

100%|██████████| 5011/5011 [26:04<00:00,  2.03it/s]


   Nw_averageNeighborDegree  Nw_averageNeighborDegree-1  \
0                       6.5                     4.42623   
1                       6.5                     4.42623   
2                       6.5                     4.42623   
3                       6.5                     4.42623   
4                       6.5                     4.42623   

   Nw_averageNeighborDegree0  Nw_betweennessCentrality  \
0                    4.42623                       0.0   
1                    4.42623                       0.0   
2                    4.42623                       0.0   
3                    4.42623                       0.0   
4                    4.42623                       0.0   

   Nw_betweennessCentrality-1  Nw_betweennessCentrality0  \
0                    0.012358                   0.012358   
1                    0.012358                   0.012358   
2                    0.012358                   0.012358   
3                    0.012358                   0.012358