# Mapping Twitter Data
This notebook provides the basis for developing spatial representations of a variety of interesting data that is presented in the Ferguson Twitter dataset.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
% matplotlib inline
import json
import time
import pandas as pd
import cartopy.crs as ccrs
from ipywidgets import widgets
import matplotlib.cm as cm

Let's begin with the data. We have two datasets. The first is data from the two weeks following the initial shooting. The second is data from the two weeks following the indictment. Twitter has provided an extensive amount of data for us, and we have developed a cleaning mechanism which can be observed in a separate script. The data is placed in a data frame:

In [None]:
def ReadAndAggregate(linesAtATime, filepath, aggregationFunction):
    """Stream a JSON-lines file and fold it into a single aggregate, chunk by chunk.

    Each non-blank line of the file is parsed with ``json.loads``. Every
    ``linesAtATime`` parsed lines are turned into a DataFrame and handed to
    ``aggregationFunction``; any leftover lines at the end of the file are
    aggregated as one final, shorter chunk.

    Parameters
    ----------
    linesAtATime : int
        Number of parsed lines per chunk.
    filepath : str
        Path of the file to read (one JSON document per line).
    aggregationFunction : callable
        Called as ``aggregationFunction(df)`` for the first chunk and as
        ``aggregationFunction(df, previousAggregate)`` for every later one.
        It is never called with an empty DataFrame.

    Returns
    -------
    The value returned by the last ``aggregationFunction`` call, or ``None``
    when the file contains no tweets at all.
    """
    i = 0
    data = []
    agg = None
    # Tracked separately from `agg` so an aggregate that happens to be
    # None/empty is still passed on to the next call.
    hasAggregate = False

    def _aggregate(rows, previous, hasPrevious):
        df = pd.DataFrame(data=rows)
        if hasPrevious:
            return aggregationFunction(df, previous)
        return aggregationFunction(df)

    #Open and read in the file (the path comes from the parameter, not a global)
    with open(filepath) as cleanedTweets:
        for tweet in cleanedTweets:
            #a blank line (e.g. a trailing newline) is not a tweet
            if not tweet.strip():
                continue
            data.append(json.loads(tweet))
            i += 1
            #aggregate once we've read in the appropriate number of lines
            if i % linesAtATime == 0:
                print(i)
                agg = _aggregate(data, agg, hasAggregate)
                hasAggregate = True
                #reset the data
                data = []

    #Handle the last few tweets, if there are any left over
    if data:
        agg = _aggregate(data, agg, hasAggregate)

    #return the aggregation
    return agg

In [None]:
def createHashtagsDf(df, hashtagsDataFrame=None):
    """Explode one chunk of tweets into one row per hashtag and add it to the running result.

    Parameters
    ----------
    df : pandas.DataFrame
        Chunk of tweets with the columns ``created_at``, ``coordinates``,
        ``place``, ``entities_hashtags_text`` and ``id_str``. It is modified
        in place: the helper columns ``DoW``, ``Month``, ``DoM``, ``Hour``,
        ``Minute``, ``Coords``, ``x``, ``y``, ``P_Coords``, ``x_p`` and
        ``y_p`` are added to it.
    hashtagsDataFrame : pandas.DataFrame, optional
        Rows accumulated from earlier chunks. ``None`` (the default) starts
        from an empty frame.

    Returns
    -------
    pandas.DataFrame
        ``hashtagsDataFrame`` followed by one row per (tweet, hashtag) pair of
        ``df`` with the columns ``DoW``, ``Month``, ``DoM``, ``Hour``,
        ``Minute``, ``Hashtag`` (upper-cased), ``x``, ``y``, ``x_p``, ``y_p``
        and ``tweetId``. Row labels are not renumbered, so they repeat across
        chunks.
    """
    # A fresh frame per call: a frame built in the signature would be one
    # object shared by every call that relies on the default.
    if hashtagsDataFrame is None:
        hashtagsDataFrame = pd.DataFrame()

    # An empty chunk has no columns to recode and nothing to contribute.
    if len(df) == 0:
        return hashtagsDataFrame

    #time based recodes: fixed-position slices of the created_at text
    #(presumably Twitter's "Sun Aug 10 04:22:11 +0000 2014" layout - confirm)
    created = df['created_at'].map(str)
    df['DoW'] = created.str[0:3]
    df['Month'] = created.str[4:7]
    df['DoM'] = created.str[8:10]
    df['Hour'] = created.str[11:13].astype(int)
    df['Minute'] = created.str[14:16].astype(int)

    #let's grab the coordinates from the coordinates field, which is actually a dictionary
    #(iterate over the values so the frame's index labels do not matter)
    coords = [_pointCoords(location) for location in df['coordinates']]
    df['Coords'] = coords
    df['x'] = [point[0] for point in coords]
    df['y'] = [point[1] for point in coords]

    #now, let's grab the bounding box from place and reduce it to its centre
    mean_coords = [_placeCentroid(place) for place in df['place']]
    df['P_Coords'] = mean_coords
    df['x_p'] = [centre[0] for centre in mean_coords]
    df['y_p'] = [centre[1] for centre in mean_coords]

    sourceColumns = ['DoW', 'Month', 'DoM', 'Hour', 'Minute',
                     'x', 'y', 'x_p', 'y_p', 'id_str', 'entities_hashtags_text']
    rowsToAdd = []
    for (dow, month, dom, hour, minute,
         x, y, x_p, y_p, tweetId, hashtags) in zip(*[df[column] for column in sourceColumns]):
        # A missing hashtag field (None/NaN) means "no hashtags", not an error.
        if not isinstance(hashtags, (list, tuple)):
            continue
        for hashtag in hashtags:
            rowsToAdd.append({
                    'DoW': dow,
                    'Month': month,
                    'DoM': dom,
                    'Hour': hour,
                    'Minute': minute,
                    'Hashtag': hashtag.upper(),
                    'x': x,
                    'y': y,
                    'x_p': x_p,
                    'y_p': y_p,
                    'tweetId': tweetId
                })
    tempDf = pd.DataFrame(rowsToAdd)

    # DataFrame.append no longer exists in current pandas; concat is the
    # replacement. Empty frames are left out so they cannot affect the dtypes.
    frames = [frame for frame in (hashtagsDataFrame, tempDf) if len(frame) > 0]
    if not frames:
        return hashtagsDataFrame
    return pd.concat(frames)


def _pointCoords(location):
    """Return the 'coordinates' list of a tweet's coordinates dict, or [None, None] if unusable."""
    try:
        point = location.get('coordinates')
        if len(point) >= 2:
            return point
    except (AttributeError, TypeError):
        # location is None/NaN, or the dict carries no coordinate list
        pass
    return [None, None]


def _placeCentroid(place):
    """Return [x, y] averaged over the four bounding-box corners of a place dict, or [0.0, 0.0] if unusable."""
    try:
        corners = place.get('bounding_box').get('coordinates')[0]
        xs = [corners[k][0] for k in range(4)]
        ys = [corners[k][1] for k in range(4)]
        # 4.0 keeps this a true division even for integer corners on Python 2
        return [sum(xs) / 4.0, sum(ys) / 4.0]
    except (AttributeError, TypeError, IndexError, KeyError):
        # no place, no bounding box, or a malformed box: (0, 0) placeholder
        return [0.0, 0.0]

In [None]:
# Cleaned tweets from the shooting period, one JSON document per line.
filename = './data/cleanedShootingTweets.json'

# Read the file in chunks of 100000 lines, collecting one row per hashtag.
data = ReadAndAggregate(
    linesAtATime=100000,
    filepath=filename,
    aggregationFunction=createHashtagsDf,
)


In [None]:
# Where a tweet carries no exact point, fall back to the centre of its place
# bounding box. The result is assigned back to the column: calling fillna with
# inplace=True on a column selection is chained assignment, which pandas
# deprecates and which does not update `data` under Copy-on-Write.
data['x'] = data['x'].fillna(data['x_p'])
data['y'] = data['y'].fillna(data['y_p'])

In [None]:
# Show the aggregated one-row-per-hashtag frame as a visual check.
data

In [None]:
# Inspect the x column on its own.
data['x']

In [None]:
# Column dtypes and non-null counts of the aggregated frame.
data.info()

In [None]:
#the cartopy map, going simple outline for now
plt.figure(figsize=(15,15))
ax = plt.axes(projection=ccrs.PlateCarree())
ax.coastlines()

#straight-up coordinate data which we just finished recoding
plt.scatter(data.x,data.y,color='r')
plt.scatter(data.x_p,data.y_p,color='g')

plt.show()

Now, to add a temporal element, we add sliders driven by two of the time columns we extracted above: the day of the month and the hour.

In [None]:
# Keep only rows where neither coordinate is exactly 0; this removes the
# (0, 0) placeholder given to tweets without a usable location.
latLonPopulated = data.loc[data['x'].ne(0) & data['y'].ne(0)]

In [None]:
# Show the rows that survived the non-zero coordinate filter.
latLonPopulated

In [None]:
# Count rows per hashtag and order the hashtags from most to least frequent
# (the 'DoM' count is used as the sort key).
hashtagGrouped = (
    latLonPopulated
    .groupby('Hashtag')
    .count()
    .reset_index()
    .sort_values(by='DoM', ascending=False)
)
sortedHashtags = hashtagGrouped['Hashtag'].tolist()

# The ten most frequent hashtags are the ones drawn on the interactive map.
topHashtags = sortedHashtags[:10]

In [None]:
# Show the ten hashtags selected for plotting.
topHashtags

In [None]:
def plotHashtagLocation(DoM, Hour):
    """Draw a world map of where the top hashtags were tweeted in one hour of one day.

    Reads the notebook-level ``latLonPopulated`` frame and ``topHashtags`` list.

    Parameters
    ----------
    DoM : int
        Day of the month to show.
    Hour : int
        Hour of the day (0-23) to show.

    Each hashtag with at least one matching row gets its own colour from the
    jet colormap and an entry in the legend; hashtags without rows are skipped.
    """
    plt.figure(figsize=(15,15))
    ax = plt.axes(projection=ccrs.PlateCarree())
    ax.coastlines()

    # The 'DoM' column holds a two-character slice of created_at, so a plain
    # str(DoM) can never match days 1-9. Compare against the zero-padded form
    # (assumes created_at zero-pads the day - confirm against the data).
    dayLabel = '{:02d}'.format(int(DoM))

    # The day/hour filter is the same for every hashtag, so apply it once.
    inWindow = latLonPopulated[
        (latLonPopulated['DoM'] == dayLabel) &
        (latLonPopulated['Hour'] == Hour)]

    plottedAny = False
    for i,hashtag in enumerate(topHashtags):
        subset = inWindow[inWindow['Hashtag'] == hashtag]
        if (len(subset) == 0):
            continue

        ax.scatter(subset.x, subset.y, color=cm.jet(i/float(len(topHashtags))), label=hashtag)
        plottedAny = True

    # Fixed world extent so the view does not jump as the sliders move.
    ax.axis([-180, 180, -75, 75])

    # A legend with no entries only produces a matplotlib warning.
    if plottedAny:
        ax.legend()

In [None]:
# Slider for the day of the month (10 to 27), starting at the 17th.
DoM = widgets.IntSlider(value=17, min=10, max=27)
# Slider for the hour of the day (0 to 23), starting at 17.
Hour = widgets.IntSlider(value=17, min=0, max=23)
# Redraw the hashtag map whenever either slider moves.
widgets.interact(plotHashtagLocation, DoM=DoM, Hour=Hour)