# Project Result - Map & Histogram
# Team Hadochi 

## Michiel Voermans
## Jorn van der Ent

#### 2/2/2017

### <font color='red'>Make sure "Twitter Streamer.ipynb" is launched and running correctly before attempting to run this script.</font>


### Import libraries and initialize vincent

In [1]:
# Database Connection
import psycopg2

# Time
import datetime
import os, time

# Spatial operations
from osgeo import ogr
from osgeo import osr
from shapely.geometry import Point

# Vincent 
import vincent
from vincent import AxisProperties, PropertySet, ValueRef
vincent.core.initialize_notebook()

# Other
import re
import shutil
import folium
import geopandas as gpd

## Enter your database connection details:

In [2]:
# Name of the database passed to psycopg2.connect() in getGDF
dbname = "tweets"
# Credentials passed to psycopg2.connect() in getGDF
user = "user"
password = "user"
# Name of the tweet table
# NOTE(review): the query in getGDF spells out "rawtweets" itself and does not read this variable
table = "rawtweets"

## Calculation Functions

In [39]:
def count(alist):
    """Tally how often each element occurs in *alist*.

    Returns a dict that maps every distinct element to its number of
    occurrences; an empty input gives an empty dict.
    """
    tally = {}
    for item in alist:
        # Unseen elements start from zero
        tally[item] = tally.get(item, 0) + 1
    return tally

def point_transform(pointwkt):
    """Reproject a point from EPSG:4326 to EPSG:27700.

    Args:
        pointwkt: Text containing a WKT point such as
            ``POINT (-0.1277 51.5073)``. Text around the point (for
            example the rest of a pandas Series repr) is ignored.

    Returns:
        A ``(x, y)`` tuple of floats: the coordinates of the transformed
        point as reported by OGR. The first match in *pointwkt* is used.

    Raises:
        ValueError: If *pointwkt* holds no parsable ``POINT (...)``.
    """
    # Locate the WKT point inside the (possibly larger) input text
    found = re.search(r'POINT \([^()]*\)', pointwkt)
    if found is None:
        raise ValueError("no WKT POINT found in %r" % (pointwkt,))

    # Create source SRS
    source = osr.SpatialReference()
    source.ImportFromEPSG(4326)

    # Create target SRS
    target = osr.SpatialReference()
    target.ImportFromEPSG(27700)

    transform = osr.CoordinateTransformation(source, target)

    # Transform
    point = ogr.CreateGeometryFromWkt(found.group(0))
    if point is None:
        raise ValueError("could not parse WKT %r" % (found.group(0),))
    point.Transform(transform)

    # Read the coordinates from the geometry itself rather than re-parsing
    # its string form
    return (point.GetX(), point.GetY())
  
def json_histograms(tweet_id, hashcountdict):
    """Chart the hashtags that were counted at least twice and dump the chart to JSON.

    Args:
        tweet_id: Identifier used to name the output file.
        hashcountdict: Mapping of hashtag -> count.

    Returns:
        ``(chart, path)`` where *chart* is the vincent bar chart and *path*
        is ``data/<tweet_id>.json``, or ``(None, None)`` when no hashtag has
        a count of 2 or more (nothing is written in that case).
    """
    # Keep only the hashtags with a count of two or more
    repeated = {tag: total for tag, total in hashcountdict.items() if total >= 2}

    # Nothing to chart
    if not repeated:
        return (None, None)

    # The chart width grows with the number of bars (50 per hashtag)
    chart_width = len(repeated) * 50
    chart = vincent.Bar(repeated, width=chart_width, height=100)

    # Angle of the x-axis labels
    label_style = PropertySet(angle=ValueRef(value=8))
    chart.axes[0].properties = AxisProperties(labels=label_style)

    # Distance between the bars
    chart.scales['x'].padding = 0.2

    # Axis titles
    chart.axis_titles(x='HashTag', y='Count')

    # Write the chart to its JSON file
    json_path = 'data/%s.json' % tweet_id
    chart.to_json(json_path, pretty_print=False)
    return (chart, json_path)

def getlookbacktime(starttime, lookback):
    """Return the UTC clock time *lookback* minutes before *starttime*.

    Args:
        starttime: POSIX timestamp in seconds (the caller passes
            ``time.time()``).
        lookback: Number of minutes to go back, as a number or numeric text.

    Returns:
        The resulting UTC time of day formatted as ``HH:MM:SS``.

    Note:
        Only the time of day is returned. A window that reaches back past
        midnight UTC therefore yields a clock time later than the start time.
    """
    # float() also accepts the text that input() returns on Python 3, where
    # 60 * "120" would otherwise be string repetition
    lookback_seconds = 60 * float(lookback)
    # Convert to UTC-time
    moment = time.gmtime(starttime - lookback_seconds)
    # Create time-string
    return time.strftime('%H:%M:%S', moment)

def getGDF(startdate, lookbacktime):
    """Load the tweets stored for *startdate* after *lookbacktime*.

    Args:
        startdate: Value compared for equality with the ``date`` column
            (the caller passes ``YYYY-MM-DD`` text).
        lookbacktime: Exclusive lower bound for the ``time`` column
            (the caller passes ``HH:MM:SS`` text).

    Returns:
        A GeoDataFrame with all columns of ``rawtweets``, using ``geom`` as
        the geometry column.
    """
    # Setup connection details
    con = psycopg2.connect(database=dbname, user=user, password=password)
    try:
        # Let the driver quote the values instead of formatting them into the SQL
        sql = "SELECT * FROM rawtweets WHERE date = %s AND time > %s"
        # Read PostGis into GeoPandas DataFrame
        return gpd.GeoDataFrame.from_postgis(
            sql, con, geom_col='geom', params=(startdate, lookbacktime)
        )
    finally:
        # The result is fully loaded at this point, so the connection can go
        con.close()

def extractcoords(point):
    """Pull the two coordinates out of text containing a WKT point.

    Args:
        point: Text such as ``POINT (-0.1277 51.5073)``. The point may be
            embedded in more text; the first match is used.

    Returns:
        A tuple of two floats in the order they are written in the WKT
        (the caller treats them as longitude, latitude).

    Raises:
        ValueError: If no ``POINT (<number> <number>)`` is present.
    """
    # A signed decimal with optional fraction and exponent
    number = r'[-+]?(?:[0-9]+\.?[0-9]*|\.[0-9]+)(?:[eE][-+]?[0-9]+)?'
    match = re.search(r'POINT \((%s) (%s)\)' % (number, number), point)
    if match is None:
        raise ValueError("no WKT POINT with two coordinates in %r" % (point,))

    lon_wgs = float(match.group(1))
    lat_wgs = float(match.group(2))
    return (lon_wgs, lat_wgs)

def hashtaglist_from_row(gdf_row):
    """Return the hashtags held in the ``text`` field of *gdf_row*.

    The field is converted with ``str()`` and parsed as optional digits,
    whitespace, the hashtag text, then the word ``Name``.
    NOTE(review): this looks tailored to the repr of a one-row pandas
    selection ("<index>    <text>\\nName: text, ..."); confirm with the caller.

    Args:
        gdf_row: Object supporting ``gdf_row['text']``.

    Returns:
        List of the space-separated, non-empty tokens of the hashtag text.

    Raises:
        ValueError: If the converted field does not have the expected layout.
    """
    # Retrieve the hashtags out of the DataFrame
    text = gdf_row['text']
    match = re.search(r'[0-9]*\s*(.*)\s*Name', str(text))
    if match is None:
        raise ValueError("unexpected layout of the text field: %r" % (str(text),))

    # Split by single spaces; consecutive spaces yield empty tokens, drop those
    return [tag for tag in match.group(1).split(" ") if tag != '']

def make_tweet_hash_count_dict(gdf, bufferdist):
    """Collect, per tweet, its hashtags, position and the tweets close to it.

    Args:
        gdf: DataFrame with at least the columns ``id``, ``text`` and ``geom``.
        bufferdist: Buffer radius around each tweet, in the units of the
            projected coordinates returned by ``point_transform``.

    Returns:
        Dict mapping each tweet id to a dict with the keys
        ``'hashlist'`` (list of hashtags), ``'coordinates'``
        (``[lat, lon]`` as parsed by ``extractcoords``) and ``'overlapid'``
        (ids of all tweets whose projected point lies inside this tweet's
        buffer; the tweet's own id is normally among them).
    """
    # First pass: parse and reproject every tweet exactly once, instead of
    # redoing it for every origin/child combination
    tweets = []
    for tweet_id in gdf['id']:
        # Get one row of information
        row = gdf[gdf.id == tweet_id]

        # Retrieve a list of hashtags from the row
        hashtags = hashtaglist_from_row(row)

        # Retrieve point geometry
        point_wkt = str(row['geom'])
        lon_wgs, lat_wgs = extractcoords(point_wkt)

        # Get lon lat in UK CRS
        lon_uk, lat_uk = point_transform(point_wkt)

        # The axes are swapped here for every point alike, which leaves the
        # distances between the points unchanged
        point_uk = Point(lat_uk, lon_uk)

        tweets.append((tweet_id, hashtags, [lat_wgs, lon_wgs], point_uk))

    # Second pass: find for every tweet the tweets that fall inside its buffer
    tweet_dict = {}
    for tweet_id_orig, hashtags_orig, coordinates, point_uk_orig in tweets:
        # Create buffer for point
        single_buf = point_uk_orig.buffer(bufferdist)

        # Ids of all points that lie within the buffer
        overlaplist_id = [
            tweet_id_child
            for tweet_id_child, _, _, point_uk_child in tweets
            if single_buf.contains(point_uk_child)
        ]

        # Combine the id with the hashtags of the tweet, its coordinates and its neighbours
        tweet_dict[tweet_id_orig] = {
            'hashlist': hashtags_orig,
            'coordinates': coordinates,
            'overlapid': overlaplist_id,
        }

    return tweet_dict
        
def make_map(tweet_dict):
    """Draw every tweet of *tweet_dict* on a folium map.

    Each tweet gets exactly one marker:

    * a blue 'stats' marker with a histogram popup plus a buffer circle,
      when it has neighbours and ``json_histograms`` produced a chart of
      the hashtags it shares with them;
    * otherwise a gray 'thumbs-up' marker with its hashtags as popup, when
      it has hashtags;
    * otherwise a gray 'thumbs-down' marker without popup.

    Args:
        tweet_dict: Dict of tweet id -> dict with the keys ``'hashlist'``,
            ``'coordinates'`` and ``'overlapid'``.

    Returns:
        The folium map.

    Note:
        The circle radius is read from the module-level ``bufferdist``.
    """
    # Initialize final output map
    tweetmap = folium.Map(location=[51.507351, -0.127758], zoom_start=12, control_scale=True)

    # Initialize a MarkerCluster
    marker_cluster = folium.MarkerCluster("tweets").add_to(tweetmap)

    # Create graph and plot each tweet
    for tweet_id_orig in tweet_dict:
        tweet = tweet_dict[tweet_id_orig]
        overlap_ids = tweet['overlapid']
        hashtags_orig = tweet['hashlist']
        coordinates = tweet['coordinates']

        hist = None
        filepath = None
        # More than one id means there are neighbours besides the tweet itself
        if len(overlap_ids) > 1:
            # Hashtags of the neighbours that the origin tweet uses as well
            hashlist = []
            for tweet_id_child in overlap_ids:
                if tweet_id_child == tweet_id_orig:
                    continue
                for hashchild in tweet_dict[tweet_id_child]['hashlist']:
                    if hashchild in hashtags_orig:
                        hashlist.append(hashchild)

            # Get the vincent.Bar object (None when nothing is shared often enough)
            hist, filepath = json_histograms(tweet_id_orig, count(hashlist))

        if hist is not None:
            # Plot point and area; the popup is sized to the histogram
            width = hist.grammar()['width']
            tweetmap.simple_marker(coordinates, popup=(hist, filepath),
                                   popup_width=(width + 50), marker_color='blue', marker_icon='stats')
            tweetmap.circle_marker(coordinates,
                                   radius=bufferdist, fill_color='#1dcaff', fill_opacity=0.1,
                                   line_color='#00aced')
        # If there's no histogram, make a thumb-up point with popup.
        # This branch is now also reached by tweets that have neighbours but
        # no histogram; they used to get no marker at all.
        elif hashtags_orig != []:
            folium.Marker(coordinates, popup=str(hashtags_orig),
                          icon=folium.Icon(color='lightgray', icon='thumbs-up')).add_to(marker_cluster)
        # If there's no hashtag, make a thumb-down point without popup
        else:
            folium.Marker(coordinates,
                          icon=folium.Icon(color='lightgray', icon='thumbs-down')).add_to(marker_cluster)
    return tweetmap

## Make map

In [27]:
def _as_number(answer):
    """Return *answer* as int or float; accepts a number or numeric text."""
    if isinstance(answer, (int, float)):
        return answer
    text = str(answer).strip()
    try:
        return int(text)
    except ValueError:
        return float(text)

# Let user set the lookbacktime and bufferdistance.
# input() returns text on Python 3, so the answers are converted to numbers
# before they are used for arithmetic and as buffer radius.
lookback = _as_number(input("How far back in time (minutes) would you like to go? "))
bufferdist = _as_number(input("How large can your buffer (meters) distance be? "))

# Start calculation time
start = time.time()
lookbacktime = getlookbacktime(start, lookback)

# Clear the data folder
if os.path.isdir('data'):
    shutil.rmtree('data')
os.mkdir('data')

# Create output folder if needed
if not os.path.isdir('output'):
    os.mkdir('output')

# Get the date
starttime = datetime.datetime.utcfromtimestamp(start)
startdate = starttime.strftime('%Y-%m-%d')

# Get the data from the DataBase in a GeoPandas DataFrame
gdf = getGDF(startdate, lookbacktime)

# Make a dictionary of all tweets, containing their hashtags and the ids of all points within the buffer
tweet_dict = make_tweet_hash_count_dict(gdf, bufferdist)

# Make the map out of the tweet dictionary
tweetmap = make_map(tweet_dict)
# NOTE: starttime is formatted with str(), so the file name contains a space and colons
tweetmap.save('output/MAP_time_%s_lb_%s_buf_%s.html' % (starttime, lookback, bufferdist))

# Plot map
tweetmap

How far back in time (minutes) would you like to go? 120
How large can your buffer (meters) distance be? 200




## Function used to make a histogram of all collected hashtags

In [49]:
def general_histogram(tweetdict, count_threshold=None):
    """Build a bar chart of the hashtags used across all tweets.

    Args:
        tweetdict: Dict of tweet id -> dict with a ``'hashlist'`` entry.
        count_threshold: Minimum count a hashtag needs to be shown, as a
            number or numeric text. When omitted, the user is prompted.

    Returns:
        The vincent bar chart, or ``None`` when no hashtag reaches the
        threshold.
    """
    if count_threshold is None:
        count_threshold = input("What is the minimum count of hashtags you want to display? \
                            \n (Think about the amount data that comes with the specified lookbacktime) ")
    # input() returns text on Python 3; compare numerically either way
    count_threshold = float(count_threshold)

    # Make a list of all hashes, taken from the argument rather than from a global
    all_hasheslist = []
    for tweet_id in tweetdict:
        all_hasheslist += tweetdict[tweet_id]['hashlist']

    # Make count dictionary of hashes
    allhashdict = count(all_hasheslist)

    # Keep the hashes whose count reaches the requested minimum
    general_dict = {
        hashtag: total
        for hashtag, total in allhashdict.items()
        if total >= count_threshold
    }

    # Nothing to chart
    if not general_dict:
        return None

    # Get the number of hashtags to be able to set the width of the graph
    numberhashes = len(general_dict)
    # Create the Bar-Graph
    hist = vincent.Bar(general_dict, width=(numberhashes * 50), height=200)
    # Set the angle of the x-labels
    ax = AxisProperties(labels=PropertySet(angle=ValueRef(value=10)))
    hist.axes[0].properties = ax
    # Set distance between bars
    hist.scales['x'].padding = 0.2
    # Set axis titles
    hist.axis_titles(x='HashTag', y='Count')
    return hist

## Make the Histogram

In [50]:
# Get the General Histogram
histogram = general_histogram(tweet_dict) # <-- Be aware that it needs the tweet_dictionary created in the previous cell.
# general_histogram returns None when there is nothing to chart
if histogram is not None:
    # The JSON and HTML files are named after the same run settings
    run_settings = (starttime, lookback, bufferdist)
    histogram.to_json('output/HIST_time_%s_lb_%s_buf_%s.json' % run_settings,
                      html_out=True, html_path='output/HIST_time_%s_lb_%s_buf_%s.html' % run_settings)
# Show Histogram
histogram

What is the minimum count of hashtags you want to display?                             
 (Think about the amount data that comes with the specified lookbacktime) 5
