In [1]:
### Tweet Data Munging  ###
### Programmer: Dan Qin ###
### Date: 27.05.2018    ###
# This notebook standardizes the tweet data filtered by GetTweetLoc.ipynb and appends the tweet counts to the GeoJSON file for later visualization.

# import libraries
import pandas as pd
import numpy as np
import json

In [2]:
### Tweet counts by borough population ###
# The per-day tweet files listed here are normalised by borough population
# in the cells that follow.
# The week is incomplete: Tuesday was lost to an operator error while
# retrieving the data, and Thursday is left out because its unix time is wrong.

# Tweet files to read, one per available day.
# NOTE: the name `input` shadows Python's built-in input() in this notebook.
input = [
    "data/TwtBorough_Sun15042018.csv",
    "data/TwtBorough_Mon16042018.csv",
    "data/TwtBorough_Wed18042018.csv",
    "data/TwtBorough_Fri20042018.csv",
    "data/TwtBorough_Sat21042018.csv",
]

# Count files to write; position k here corresponds to position k of `input`.
output = [
    "data/twtcount_15042018.csv",
    "data/twtcount_16042018.csv",
    "data/twtcount_18042018.csv",
    "data/twtcount_20042018.csv",
    "data/twtcount_21042018.csv",
]


In [3]:
# Load the borough population table; the code below uses its `borough` and
# `population_2017` columns.
borough_pop = pd.read_csv("data/london-borough-profiles.csv")

# `input` and `output` are parallel lists, so walk them in lock-step.
for twt_path, count_path in zip(input, output):
    twtdata = pd.read_csv(twt_path)

    # Number of tweets per borough -> two columns: `borough`, `counts`.
    twt_count = (
        twtdata['borough']
        .value_counts()
        .rename_axis('borough')
        .reset_index(name='counts')
    )

    # Inner join with the population table: a borough whose name has no match
    # on the other side is dropped from the result.
    wk_count = pd.merge(twt_count, borough_pop, on='borough')

    # Tweets per thousand residents.
    wk_count['countByPop'] = wk_count['counts'] / (wk_count['population_2017'] / 1000)

    # Save the day's table without the row index.
    wk_count.to_csv(count_path, index=False)

In [27]:
### Average tweet counts through week ###
# Every per-day file written earlier carries one population-normalised rate
# (`countByPop`) per borough. Those rates are averaged here into one weekday
# figure and one weekend figure.

# weekday average: entries 1..3 of `output` (Mon, Wed, Fri)
weekday_frames = [
    pd.read_csv(path).drop(columns=["counts", "population_2017"])
    for path in output[1:4]
]

# Joining the first two frames renames the clashing rate columns to
# countByPop_x / countByPop_y; the third join has no clash, so its column
# keeps the plain name countByPop.
wk_count = weekday_frames[0]
for day_frame in weekday_frames[1:]:
    wk_count = wk_count.merge(day_frame, on="borough")

wk_count['countAvr'] = (
    wk_count['countByPop_x'] + wk_count['countByPop_y'] + wk_count['countByPop']
) / 3

# weekend average: entries 0 and 4 of `output` (Sun, Sat)
weekend_frames = [
    pd.read_csv(output[k]).drop(columns=["counts", "population_2017"])
    for k in (0, 4)
]
wkend_count = weekend_frames[0].merge(weekend_frames[1], on="borough")
wkend_count['countAvr'] = (
    wkend_count['countByPop_x'] + wkend_count['countByPop_y']
) / 2
wkend_count.head()

Unnamed: 0,borough,countByPop_x,countByPop_y,countAvr
0,Westminster,3.333333,3.742255,3.537794
1,Lambeth,1.289146,1.313469,1.301307
2,Camden,1.616495,1.859794,1.738144
3,Southwark,1.244034,1.336303,1.290169
4,Wandsworth,1.074766,1.028037,1.051402


In [29]:
# Write the weekday and weekend borough averages to CSV (row index omitted).
for frame, target in (
    (wk_count, 'data/wkcount_borough.csv'),
    (wkend_count, 'data/wkendcount_borough.csv'),
):
    frame.to_csv(target, index=False)

In [30]:
# Prepare the GeoJSON layers that will receive the count values.
# The same file is parsed twice so that the weekday and weekend layers are
# two independent objects.
geojson_path = 'data/london_boroughs.geojson'

with open(geojson_path) as weekday_src:
    wkjson = json.load(weekday_src)

with open(geojson_path) as weekend_src:
    wkendjson = json.load(weekend_src)

In [31]:
# Attach the weekday average to each GeoJSON feature as `twt_count`.
# A borough -> value lookup replaces a scan of the whole table per feature;
# if a borough name were repeated, the last row would win, as before.
wk_lookup = dict(zip(wk_count['borough'], wk_count['countAvr']))

for feature in wkjson['features']:
    props = feature['properties']
    # Features whose name is not in the table are left untouched.
    if props['name'] in wk_lookup:
        # float() turns the numpy scalar into a plain Python float for JSON.
        props['twt_count'] = float(wk_lookup[props['name']])

In [32]:
# Attach the weekend average to each GeoJSON feature as `twt_count`.
# A borough -> value lookup replaces a scan of the whole table per feature;
# if a borough name were repeated, the last row would win, as before.
wkend_lookup = dict(zip(wkend_count['borough'], wkend_count['countAvr']))

for feature in wkendjson['features']:
    props = feature['properties']
    # Features whose name is not in the table are left untouched.
    if props['name'] in wkend_lookup:
        # float() turns the numpy scalar into a plain Python float for JSON.
        props['twt_count'] = float(wkend_lookup[props['name']])

In [33]:
# check: print the value attached to the first feature of each layer
# (weekday first, then weekend)
for layer in (wkjson, wkendjson):
    print(layer['features'][0]['properties']['twt_count'])

0.23923444976076555
0.3038277511961723


In [34]:
# Write each enriched layer to its own JSON file.
for layer, target in (
    (wkjson, 'data/london_wkcount.json'),
    (wkendjson, 'data/london_wkendcount.json'),
):
    with open(target, 'w') as sink:
        json.dump(layer, sink)

In [35]:
### Compare wkday and wkend tweet counts ###
# Join the two tables on borough. Column names that clash get the suffix
# `_x` for the weekday (left) side and `_y` for the weekend (right) side.
twtcount = wk_count.merge(wkend_count, on="borough")

In [36]:
# Despite its name, `variance` is a plain difference, not a statistical
# variance: weekend average (countAvr_y) minus weekday average (countAvr_x).
weekend_minus_weekday = twtcount['countAvr_y'] - twtcount['countAvr_x']
twtcount['variance'] = weekend_minus_weekday
twtcount.head()

Unnamed: 0,borough,countByPop_x_x,countByPop_y_x,countByPop,countAvr_x,countByPop_x_y,countByPop_y_y,countAvr_y,variance
0,Westminster,3.837257,3.568773,3.143329,3.516453,3.333333,3.742255,3.537794,0.021341
1,Lambeth,1.413804,1.270903,1.17969,1.288132,1.289146,1.313469,1.301307,0.013175
2,Southwark,1.38721,1.269488,1.164493,1.27373,1.244034,1.336303,1.290169,0.016439
3,Camden,1.707216,1.661856,1.781443,1.716838,1.616495,1.859794,1.738144,0.021306
4,Tower Hamlets,1.194079,0.927632,0.861842,0.994518,1.013158,1.049342,1.03125,0.036732


In [37]:
# Parse a fresh copy of the borough GeoJSON for the weekday/weekend difference layer.
with open('data/london_boroughs.geojson') as boroughs_src:
    varyjson = json.load(boroughs_src)

In [38]:
# Attach the weekend-minus-weekday difference to each feature as `count_var`.
# A borough -> value lookup replaces a scan of the whole table per feature;
# if a borough name were repeated, the last row would win, as before.
var_lookup = dict(zip(twtcount['borough'], twtcount['variance']))

for feature in varyjson['features']:
    props = feature['properties']
    # Features whose name is not in the table are left untouched.
    if props['name'] in var_lookup:
        # float() turns the numpy scalar into a plain Python float for JSON.
        props['count_var'] = float(var_lookup[props['name']])

In [39]:
# Write the difference layer to its own JSON file.
with open('data/london_count_var.json', 'w') as sink:
    json.dump(varyjson, sink)

In [40]:
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric
# column of the weekday/weekend comparison table.
twtcount.describe()

Unnamed: 0,countByPop_x_x,countByPop_y_x,countByPop,countAvr_x,countByPop_x_y,countByPop_y_y,countAvr_y,variance
count,33.0,33.0,33.0,33.0,33.0,33.0,33.0,33.0
mean,1.489971,1.331027,1.280239,1.367079,1.242041,1.384033,1.313037,-0.054042
std,3.505277,3.105281,2.96191,3.190235,2.272088,2.543998,2.406439,0.806861
min,0.244019,0.22488,0.248804,0.239234,0.277512,0.330144,0.303828,-4.507576
25%,0.545017,0.467951,0.452225,0.485047,0.528134,0.619985,0.552516,0.028114
50%,0.689852,0.586177,0.631229,0.648043,0.729761,0.672748,0.701586,0.064103
75%,1.194079,0.928225,0.912035,0.994518,1.089711,1.203062,1.109985,0.092324
max,20.681818,18.295455,17.5,18.825758,13.522727,15.113636,14.318182,0.553548


In [75]:
### Tweet counts variance over time ###
# For every day file, build a frame with one row per tweet: its `timestamp`
# converted to a datetime, plus a constant `count` of 1 used by the hourly
# aggregation that follows.
twt_datetime = []

# Columns that are not needed once the datetime has been extracted.
unused_cols = ['user_id','screen_name','created_at','amount_tweeted','language','location','primary_geo','borough','timestamp','text_tweet']

for twt_path in input:
    temp_twt = pd.read_csv(twt_path)

    # `timestamp` is interpreted as milliseconds since the Unix epoch.
    # Converting the whole column in one call replaces a per-row Python loop
    # (and a scratch list that was named `datetime`, like the stdlib module).
    temp_twt['datetime'] = pd.to_datetime(temp_twt['timestamp'], unit='ms')

    temp_twt = temp_twt.drop(columns=unused_cols)
    temp_twt['count'] = 1
    twt_datetime.append(temp_twt)


In [129]:
# Aggregate tweets by hour: one frame per day, one row per hourly bin.
twtByHour = []

for day_tweets in twt_datetime:
    hourly = day_tweets.set_index('datetime').resample('1H').count()
    # Label each bin with the hour of its start time, shifted by one.
    # NOTE(review): the +1 looks like a UTC -> local (BST) offset — confirm.
    hourly['hour'] = hourly.index.hour + 1
    # Drop the datetime index and keep at most the first 12 hourly rows.
    hourly = hourly.reset_index(drop=True).iloc[:12]
    twtByHour.append(hourly)

# 18:00-19:00 data was missed for Wednesday (entry 2).
# Fill it in with the mean of the first-hour counts of the other two weekdays,
# Monday (entry 1) and Friday (entry 3), and put that row at the top.
first_hour_fill = (twtByHour[1]['count'][0] + twtByHour[3]['count'][0]) / 2
missing_row = pd.DataFrame({'count': first_hour_fill, 'hour': 18}, index=[0])
twtByHour[2] = pd.concat([missing_row, twtByHour[2]]).reset_index(drop=True)

# Thursday is not among the input files. Its data was retrieved from two
# weeks, which caused errors in resampling; the row-dropping workaround that
# used to sit here is therefore no longer applied.

In [131]:
# Hourly weekday average: start from Monday (entry 1), add Wednesday and
# Friday (entries 2 and 3) row by row, then divide by the three days.
wk_avr = twtByHour[1].copy()
for day_hourly in twtByHour[2:4]:
    wk_avr['count'] = wk_avr['count'].add(day_hourly['count'])
wk_avr['count'] = wk_avr['count'].div(3)

# Hourly weekend average: Sunday (entry 0) and Saturday (entry 4).
wkend_avr = twtByHour[0].copy()
wkend_avr['count'] = wkend_avr['count'].add(twtByHour[4]['count']).div(2)

In [134]:
# Write the hourly weekday and weekend averages to CSV (row index omitted).
for frame, target in (
    (wk_avr, 'data/wkcount_time.csv'),
    (wkend_avr, 'data/wkendcount_time.csv'),
):
    frame.to_csv(target, index=False)