### Geography of Tweet Sentiment
This notebook does the following:
- Loads the tweet data from the files specified below and keeps only the geotagged tweets
- Uses NLTK to get sentiment for each tweet
- Converts the tweet sentiments to a GeoDataFrame
- Creates a GeoDataFrame from the provided locations and buffers the points
- Performs a spatial join between tweets and locations
- Averages sentiment by locations
- Visualizes the results

In [32]:
# Directory that holds the tweet JSON files read by this notebook.
tweet_directory = 'tweets/'

# Landmark name -> [latitude, longitude].
Locations = {
    'timeSquare': [40.759232, -73.984694],
    'woldTradecenter': [40.711702, -74.012722],
    'brooklynBridge': [40.705720, -73.996349],
}

# Buffer radius drawn around each landmark point. The points are built from
# raw lon/lat values with no projection, so this is expressed in degrees.
GeoBuffer = 0.008

In [13]:
from __future__ import print_function
import os
import pandas as pd
import geopandas as gpd
from matplotlib import pyplot as plt
import requests 
import json
import numpy as np
import io
import json
import nltk
import shapely
from geopandas.tools import sjoin
from fiona.crs import from_epsg
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Shared VADER analyzer, created once and reused for every sentence scored
# by getSentimentTweet.
# NOTE(review): SentimentIntensityAnalyzer presumably needs the NLTK
# `vader_lexicon` resource to be installed -- confirm the environment has it.
sid = SentimentIntensityAnalyzer()

# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

In [14]:
def getSentimentTweet(t):
    """Return the mean VADER compound score over the sentences of a tweet.

    The text is split into sentences with NLTK's sentence tokenizer, each
    sentence is scored with the notebook-level ``sid`` analyzer, and the
    per-sentence ``"compound"`` values are averaged.

    Parameters
    ----------
    t : str
        Raw tweet text.

    Returns
    -------
    float
        Mean compound score, or NaN when the tokenizer yields no sentences
        (for example an empty string).
    """
    sentences = nltk.tokenize.sent_tokenize(t)
    if not sentences:
        # np.mean([]) would also give NaN, but it emits a
        # "Mean of empty slice" RuntimeWarning; return the same value quietly.
        return np.nan
    return np.mean([sid.polarity_scores(sentence)["compound"]
                    for sentence in sentences])

In [15]:
def loadGeoTaggedTwitterData(files=None):
    """Load tweets from JSON files and keep only the geotagged ones.

    Parameters
    ----------
    files : list of str, optional
        Paths of the JSON files to read. When omitted, the notebook-level
        ``tweetFiles`` list is used, so the original zero-argument call
        keeps working. Each file is expected to contain a JSON array of
        tweet objects.

    Returns
    -------
    list of tuple
        One ``(text, coordinates)`` pair per tweet whose ``'geo'`` entry is
        present and not null. ``coordinates`` is the value stored under
        ``geo['coordinates']``; the rest of the notebook reads it as
        ``[lat, lon]``.

    Notes
    -----
    Prints the total number of tweets read and the number that are geotagged.
    An empty file list yields an empty result instead of an IndexError.
    """
    if files is None:
        files = tweetFiles

    data = []
    for path in files:
        with open(path) as json_file:
            data.extend(json.load(json_file))
    print("Tweets: ", len(data))

    # .get() tolerates tweets that have no 'geo' key at all, not just null.
    tweets = [(d['text'], d['geo']['coordinates'])
              for d in data if d.get('geo') is not None]
    print('Geotagged: ', len(tweets))
    return tweets

In [16]:
def getSentimentGeoDf(tweets):
    """Build a GeoDataFrame of tweet points with a sentiment score each.

    Parameters
    ----------
    tweets : list of tuple
        ``(text, coordinates)`` pairs, where ``coordinates[0]`` is read as
        latitude and ``coordinates[1]`` as longitude.

    Returns
    -------
    geopandas.GeoDataFrame
        One row per tweet with columns ``lat``, ``lon``, ``sentiment``,
        ``lonlat`` (a ``(lon, lat)`` tuple) and ``geometry`` (a shapely
        Point built from ``lonlat``). No CRS is assigned.
    """
    sentiment = [getSentimentTweet(t[0]) for t in tweets]
    lat = [t[1][0] for t in tweets]
    lon = [t[1][1] for t in tweets]

    df = pd.DataFrame({'lat': lat, 'lon': lon, 'sentiment': sentiment})
    df['lonlat'] = list(zip(df["lon"].values, df["lat"].values))

    # Build the points directly rather than via DataFrame.applymap, which is
    # deprecated in recent pandas and returned a one-column frame here.
    geometry = [shapely.geometry.Point(xy) for xy in df['lonlat']]
    return gpd.GeoDataFrame(df, geometry=geometry)

In [17]:
def getGeoDataFrameLocations(locationDict, radius=None):
    """Build a GeoDataFrame of buffered landmark areas.

    Parameters
    ----------
    locationDict : dict
        Maps a landmark name to ``[latitude, longitude]``.
    radius : float, optional
        Buffer distance applied around each landmark point, in the same
        units as the coordinates. Defaults to the notebook-level
        ``GeoBuffer`` constant.

    Returns
    -------
    geopandas.GeoDataFrame
        One row per landmark with columns ``lat``, ``lon``, ``Landmark``,
        ``lonlat`` (a ``(lon, lat)`` tuple) and ``geometry`` (the buffered
        point). No CRS is assigned.
    """
    if radius is None:
        radius = GeoBuffer

    # Materialise the keys once so names and coordinates stay aligned and the
    # 'Landmark' column is built from a plain list rather than a dict view.
    names = list(locationDict.keys())
    lat = [locationDict[name][0] for name in names]
    lon = [locationDict[name][1] for name in names]

    df = pd.DataFrame({'lat': lat, 'lon': lon, 'Landmark': names})
    df['lonlat'] = list(zip(df["lon"].values, df["lat"].values))

    # Point + buffer in one pass; replaces the deprecated DataFrame.applymap
    # followed by a per-row apply.
    geometry = [shapely.geometry.Point(xy).buffer(radius) for xy in df['lonlat']]
    return gpd.GeoDataFrame(df, geometry=geometry)

In [26]:
# Buffered landmark areas built from the Locations config; used as the left
# side of the spatial join further down.
landmarkGdf = getGeoDataFrameLocations(Locations)

In [19]:
# Collect the tweet files in a deterministic (sorted) order. Hidden entries
# and sub-directories that os.listdir also returns (e.g. .ipynb_checkpoints,
# .DS_Store) are skipped because they would make the JSON loading fail.
tweetFiles = [os.path.join(tweet_directory, name)
              for name in sorted(os.listdir(tweet_directory))
              if not name.startswith('.')]
tweetFiles = [path for path in tweetFiles if os.path.isfile(path)]
tweets = loadGeoTaggedTwitterData()

Tweets:  64776
Geotagged:  9942


In [27]:
# Score every geotagged tweet and turn the result into a GeoDataFrame of points.
tweetGdf = getSentimentGeoDf(tweets)
print(tweetGdf.shape)
# Preview only the first rows rather than displaying the whole frame.
tweetGdf.head(3)

(9942, 5)


Unnamed: 0,lat,lon,sentiment,lonlat,geometry
0,40.898333,-74.029827,0.0,"(-74.02982712, 40.89833309)",POINT (-74.02982711999999 40.89833309)
1,40.847698,-73.169601,0.0,"(-73.169601, 40.847698)",POINT (-73.169601 40.847698)
2,40.847698,-73.169601,0.0,"(-73.169601, 40.847698)",POINT (-73.169601 40.847698)


In [28]:
# Sanity check: shape after dropping rows with a missing lat or lon. The
# result is only displayed, not assigned, so tweetGdf itself is unchanged.
tweetGdf.dropna(subset=["lat", "lon"]).shape

(9942, 5)

In [29]:
# Inner spatial join: keeps one row per (landmark buffer, tweet point) pair
# matched by sjoin's default spatial predicate; columns present on both
# sides come back with _left / _right suffixes.
resultGdf = gpd.sjoin(landmarkGdf,tweetGdf,how='inner')
resultGdf.shape

(2008, 10)

In [30]:
# Group the joined rows by landmark once and reuse the grouping for both
# the mean sentiment and the number of scored tweets.
sentimentByLandmark = resultGdf[['Landmark', 'sentiment']].groupby('Landmark')

landmarkSent = sentimentByLandmark.mean()
print(landmarkSent.shape)

# count() tallies non-null sentiment values per landmark.
landmarkCount = sentimentByLandmark.count().rename(columns={'sentiment': 'count'})

# Both frames are indexed by Landmark, so an index join lines them up.
landmarkSent = landmarkSent.join(landmarkCount, how='left')

(3, 1)


In [31]:
landmarkSent

Unnamed: 0_level_0,sentiment,count
Landmark,Unnamed: 1_level_1,Unnamed: 2_level_1
brooklynBridge,0.111466,70
timeSquare,0.112754,334
woldTradecenter,0.120509,1604


In [33]:
# Move the Landmark index into a regular column before exporting. The guard
# keeps the cell idempotent: re-running it would otherwise call reset_index()
# on an already-reset frame and add a spurious 'index' column to the CSV.
if 'Landmark' not in landmarkSent.columns:
    landmarkSent = landmarkSent.reset_index()
landmarkSent.to_csv('landmarkSent.csv', index=False)