In [1]:
import numpy as np
import pandas as pd
import csv
import pickle
import time
import datetime
import re
import os

# Prep

In [2]:
# For each file, read in dataframe, correct columns, grouped by reduction, file write.

In [4]:
file_list = []
file_names = []
for f in os.listdir('./Brexit_Data/'):
    if f[-4:] == '.csv':
        file_list.append(f)
        file_names.append(re.findall(r'[0-9]+[a-z]*_[0-9].*.csv', f)[0][:-4])
print(file_list)
print(file_names)

['huge_query-20160713-edgesBrexit_10_16jun.csv', 'huge_query-20160713-edgesBrexit_24_30jun.csv', 'huge_query-20160713-edgesBrexit_27may_2jun.csv', 'huge_query-20160713-edgesBrexit_3_9jun.csv', 'huge_query-20160713-edgesBrexit_17_23jun.csv']
['10_16jun', '24_30jun', '27may_2jun', '3_9jun', '17_23jun']


In [5]:
for i in range(len(file_list)):
    print('Processing file %s ' % file_names[i])
    
    # Read in the edges file
    df_edges = pd.read_csv('./Brexit_Data/%s' % file_list[i])
    
    # Remove space from beginning of some of the columns for ease of use later
    columns = df_edges.columns
    renaming_dict = {}
    for c in columns:
        if c[0] == ' ':
            renaming_dict[c] = c[1:]
    df_edges = df_edges.rename(index=str, columns=renaming_dict)
    
    # Do grouped by first, save to file (reduces size) then read in new data frame
    df_edges = df_edges[['source_tweet_created_at', 'source', 'target', 'Weight', 'Red']]
    grouped = df_edges.groupby(['source_tweet_created_at','source', 'target', 'Red']).mean() # NOTE need to change 
                                                                            # here for different weightings

    file_path = './edge_list_%s.csv' % file_names[i]
    grouped.to_csv(file_path)

    with open(file_path, 'r') as f:
        k = 0
        for line in f:
            print(line)
            k += 1
            if k > 2:
                break
    
    print('Finished with %s' % file_names[i])

Processing file 10_16jun 
source_tweet_created_at,source,target,Red,Weight

2016-06-10 05:01:01,daily_biz_news,business,Retweet,1

2016-06-10 05:01:24,ahmdabdallah12,business,Reply,1

Finished with 10_16jun
Processing file 24_30jun 
source_tweet_created_at,source,target,Red,Weight

2016-06-24 05:00:05,trendinaliaMX,trendinaliaMX,Reply,1

2016-06-24 05:00:12,JoseGRojasZ,bolsamania,Retweet,1

Finished with 24_30jun
Processing file 27may_2jun 
source_tweet_created_at,source,target,Red,Weight

2016-05-27 05:01:54,maca_13_9,BlueEyedSoulMan,Retweet,1

2016-05-27 05:02:07,maca_13_9,yogs1961,Retweet,1

Finished with 27may_2jun
Processing file 3_9jun 
source_tweet_created_at,source,target,Red,Weight

2016-06-03 05:01:13,bruce_bwkm,Stop_The_EU,Retweet,1

2016-06-03 05:01:49,mistamark,Stop_The_EU,Retweet,1

Finished with 3_9jun
Processing file 17_23jun 
source_tweet_created_at,source,target,Red,Weight

2016-06-17 05:02:05,shattered_c,bartle_booth,Retweet,1

2016-06-17 05:02:52,REPORTER_47,DailyAg

In [7]:
# Put the may 27 - jun 2 file name first
fn = file_names[2] 
file_names[2] = file_names[0]
file_names[0] = fn
print(file_names)

['27may_2jun', '24_30jun', '10_16jun', '3_9jun', '17_23jun']


# Main Calculations

In [9]:
# Get names and node id transformation, along with transformation of date_time to int time 
names = []
_format= "%Y-%m-%d %H:%M:%S"  # format for transforming date strings to integer times

for i in range(len(file_names)):
    file_path = './edge_list_%s.csv' % file_names[i]
    df_edges = pd.read_csv(file_path)
    
    # add the unique names from this data frame
    names_i = list(np.union1d(df_edges.target.unique(), df_edges.source.unique()))
    names = list(np.union1d(names, names_i))
    
    # sort by times
    edge_list = df_edges[['source_tweet_created_at', 'source', 'target', 'Weight', 'Red']]
    edge_list_sort = edge_list.sort_values(by=['source_tweet_created_at'])
    print(edge_list_sort.head())
    
    # get the times column for transforming
    num_times = edge_list_sort['source_tweet_created_at']
    
    # find min time (in 27 may - 2 jun)
    if file_names[i] == '27may_2jun':
        print('finding first time:')
        ft = num_times[0]
        print('\tFirst time is %s' % ft)
        first_time = int(time.mktime(datetime.datetime.strptime(ft, _format).timetuple()))
        
        # define function to transform all other date strings to ordered times
        minutes = lambda s : int(time.mktime(datetime.datetime.strptime(s, _format).timetuple())) - first_time
    
    new_num_times = num_times.apply(minutes)
    edge_list_sort['source_tweet_created_at'] = new_num_times # replace the source_tweet_created_at with int times calculated
    
    print(edge_list_sort.head())
    
    # write new files with new ordering, OVER THE OLD FILES. (for saving space, otherwise would've done in dictionaries)
    print('Overwriting %s' % file_path)
    edge_list_sort.to_csv(file_path, index=False, index_label=False)

  source_tweet_created_at          source           target  Weight      Red
0     2016-05-27 05:01:54       maca_13_9  BlueEyedSoulMan       1  Retweet
1     2016-05-27 05:02:07       maca_13_9         yogs1961       1  Retweet
2     2016-05-27 05:03:47  TheJackieBrook         bbc5live       1  Retweet
3     2016-05-27 05:03:49    BrentBicycle        TheFogeys       1  Retweet
4     2016-05-27 05:05:42      JcPhilipot     RTenfrancais       1  Retweet
finding first time:
	First time is 2016-05-27 05:01:54
   source_tweet_created_at          source           target  Weight      Red
0                        0       maca_13_9  BlueEyedSoulMan       1  Retweet
1                       13       maca_13_9         yogs1961       1  Retweet
2                      113  TheJackieBrook         bbc5live       1  Retweet
3                      115    BrentBicycle        TheFogeys       1  Retweet
4                      228      JcPhilipot     RTenfrancais       1  Retweet
Overwriting ./edge_list_27m

In [10]:
# Create the transformation from node_id to Name and vv
n_nodes = len(names)
node_id2Name = dict(zip(range(n_nodes), names))
name2Node_id = dict(zip(names, range(n_nodes)))  # for use in this script since will use later on

with open('nodes.txt', 'w') as f:
    f.write('node_id,name\n')
    for i in range(n_nodes):
        line = ','.join([str(i), str(names[i])])
        f.write(line)
        f.write("\n")

print('Done writing file nodes.txt')

Done writing file nodes.txt


In [26]:
# Transform Twitter names to node id's as determined by above 
for i in range(len(file_names)):
    # read in data frame
    file_path = './edge_list_%s.csv' % file_names[i]
    df_edges = pd.read_csv(file_path)
    
    # source id's
    source_col_ids = df_edges.source.apply(lambda x: name2Node_id[x])
    df_edges['source_id'] = source_col_ids
    
    # target id's 
    target_col_ids = df_edges.target.apply(lambda x: name2Node_id[x])
    df_edges['target_id'] = target_col_ids
    
    # drop old columns, 
    df_edges = df_edges.drop(columns = ['source', 'target'], axis=1)
    df_edges = df_edges.reindex(columns = ['source_tweet_created_at', 'source_id', 'target_id', 'Weight', 'Red'])
    
    # write new files with new ordering, OVER THE OLD FILES. (for saving space, otherwise would've done in dictionaries)
    print('Overwriting %s' % file_path)
    df_edges.to_csv(file_path, index=False, index_label=False)

   source_tweet_created_at  Weight      Red  source_id  target_id
0                        0       1  Retweet    1045153      85824
1                       13       1  Retweet    1045153    1314918
2                      113       1  Retweet     600269     735667
3                      115       1  Retweet      91437     599474
4                      228       1  Retweet     296748     500483
   source_tweet_created_at  source_id  target_id  Weight      Red
0                        0    1045153      85824       1  Retweet
1                       13    1045153    1314918       1  Retweet
2                      113     600269     735667       1  Retweet
3                      115      91437     599474       1  Retweet
4                      228     296748     500483       1  Retweet
Overwriting ./edge_list_27may_2jun.csv
   source_tweet_created_at  Weight      Red  source_id  target_id
0                  2419091       1    Reply    1277170    1277170
1                  2419098       1  R

# Check

In [27]:
fp = './edge_list_%s.csv' % file_names[2]
df = pd.read_csv(fp)
df.head()

Unnamed: 0,source_tweet_created_at,source_id,target_id,Weight,Red
0,1209547,805893,761043,1,Retweet
1,1209570,688888,761043,1,Reply
2,1209572,728896,761043,1,Retweet
3,1209593,1305127,761043,1,Retweet
4,1209600,388917,942100,1,Retweet


# Create the Edge Dictionary

In [44]:
# instantiate the dictionary that will hold all of the edge lists
full_edge_dict = {}

for k in range(len(file_names)):
    # read in data frame
    #file_path = './full_data/edge_list_%s.csv' % file_names[k]
    file_path = './full_data/edge_list_%s.csv' % file_names[k]
    print('Working on %s ' % file_path)
    df_edges = pd.read_csv(file_path)
    
    #count = 0
    tic =time.clock()
    for row in df_edges.iterrows():
        r = row[1]
        t, i, j, w = r.source_tweet_created_at, int(r.source_id), int(r.target_id), float(r.Weight)
        if (i,j) not in full_edge_dict.keys():
            full_edge_dict[(i,j)] = [(t,w)]
        else:
            full_edge_dict[(i,j)].append((t,w))
        #count += 1
        #if count > 5:
        #    break
    toc = time.clock()
    print('Data frame %s took %f seconds' % (file_names[k],(toc - tic)))

Working on ./full_data/edge_list_27may_2jun.csv 
Data frame 27may_2jun took 64.073657 seconds
Working on ./full_data/edge_list_24_30jun.csv 
Data frame 24_30jun took 74.299402 seconds
Working on ./full_data/edge_list_10_16jun.csv 
Data frame 10_16jun took 129.985792 seconds
Working on ./full_data/edge_list_3_9jun.csv 
Data frame 3_9jun took 99.577079 seconds
Working on ./full_data/edge_list_17_23jun.csv 
Data frame 17_23jun took 329.422978 seconds


In [45]:
# Write to pickle file
fpath = './full_data/full_edge_dict.pkl'
#fpath = './full_edge_dict.pkl'
f = open(fpath,"wb")
pickle.dump(full_edge_dict,f)
f.close()