In [None]:
import json
import csv
import os
import glob
import sys
import datetime
import time
import re
import ast
import itertools
import collections
import nltk
from nltk import bigrams
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import geopandas as gpd
from shapely.geometry import Point, Polygon, box, MultiPolygon
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
# single VADER analyzer instance, created once here and reused when scoring tweet text
analyzer = SentimentIntensityAnalyzer()

### Get tweet data and create pandas dataframe

In [None]:
# label for the period covered by the dataset; reused in output file names and plot titles
date = "aug_oct22"
print(date)

In [None]:
# path to directory containing tweet json files
directory = "[path to folder with fracking_local_***.json files]"

# read every tweet json file in the directory into its own dataframe,
# printing the file name and its tweet count as a sanity check
tweet_frames = []
for filename in sorted(os.listdir(directory)):  # sorted -> same load order on every run
    # skip anything in the folder that is not a json file (hidden files, csv exports, ...)
    if not filename.lower().endswith('.json'):
        continue
    print(filename)
    # explicit utf-8 so tweet text decodes the same way on every platform
    with open(os.path.join(directory, filename), encoding='utf-8') as tweet_file:
        data = json.load(tweet_file)
    print(len(data))  # to check the number of tweets in the file
    tweet_frames.append(pd.json_normalize(data))

# build the tweet dataframe with a single concat instead of growing it file by file;
# ignore_index gives every tweet a unique row label rather than repeating 0..n per file
if tweet_frames:
    df_tweets = pd.concat(tweet_frames, ignore_index=True)
else:
    # no json files found: keep an empty dataframe so the checks below still run
    df_tweets = pd.DataFrame()

# check number of tweets in dataframe is as expected and look at the first 5 rows
print(len(df_tweets))

# .head() returns the top 5 (by default) lines of the dataframe
df_tweets.head()

#### Calculate sentiment of tweet text and add to dataframe

In [None]:
# score every tweet text once with the VADER analyzer (previously each text was scored
# four times, once per column), then copy each score into its own dataframe column
sentiment_scores = [analyzer.polarity_scores(text) for text in df_tweets['source_text']]
for score_name in ('compound', 'neg', 'neu', 'pos'):
    # plain lists are assigned positionally, so this does not depend on the row labels
    df_tweets[score_name] = [scores[score_name] for scores in sentiment_scores]
df_tweets.head()

### Use pandas for some initial analysis of the dataset

In [None]:
# number of rows in the tweet dataframe that have a non-null 'source_id'
print(df_tweets['source_id'].count())

In [None]:
# remove rows that are exact duplicates across every column (the first occurrence is kept);
# rebinding the name instead of using inplace=True keeps the cell free of in-place mutation
df_tweets = df_tweets.drop_duplicates()
# number of tweets left after de-duplication
print(df_tweets['source_id'].count())

In [None]:
# parse the 'source_timestamp' strings (YYYY-MM-DD HH:MM:SS) into datetime values
parsed_timestamps = pd.to_datetime(df_tweets['source_timestamp'], format='%Y-%m-%d %H:%M:%S')
df_tweets['source_timestamp'] = parsed_timestamps
# put the tweets in chronological order
df_tweets = df_tweets.sort_values('source_timestamp')
print(df_tweets.head())

#### Plot a timeseries

In [None]:
# prepare data to plot tweet count and sentiment by hour
# ('h' is the hourly frequency alias; the upper-case 'H' spelling is deprecated in recent pandas)
hourly_tweets = df_tweets.resample('h', on='source_timestamp')
# number of tweets (non-null source_id) in each hour
tweet_counts_h = hourly_tweets['source_id'].count()
# mean compound sentiment score of the tweets in each hour
sentiment_mean_h = hourly_tweets['compound'].mean()

In [None]:
# plot tweet count timeseries by hour
# (this cell runs before the daily series `tweet_counts_d` exists, so it must use the
# hourly series; referencing the daily one here fails on a fresh Run All)
fig, ax = plt.subplots()
tweet_counts_h.plot(kind="line", fontsize=10, ax=ax)
ax.set_title("Number of tweets by hour")
fig.tight_layout()
plt.show()

In [None]:
# two stacked panels: hourly tweet count on top, hourly mean sentiment underneath
fig, (ax_count, ax_sentiment) = plt.subplots(2, 1)

# plot tweet count timeseries by hour
tweet_counts_h.plot(kind="line", fontsize=10, xlabel='Date', ylabel='number of tweets', ax=ax_count)
ax_count.set_title("Number of tweets by hour (retweets excluded)")

# plot tweet sentiment timeseries by hour (y-label spelling corrected to "polarity")
sentiment_mean_h.plot(kind="line", fontsize=10, xlabel='Date', ylabel='mean sentiment polarity score', color="green", ax=ax_sentiment)

fig.tight_layout()
plt.show()

In [None]:
# The hourly plot is a bit messy, so aggregate the tweets per day instead
daily_tweets = df_tweets.resample('D', on='source_timestamp')
# number of tweets (non-null source_id) on each day
tweet_counts_d = daily_tweets['source_id'].count()
# mean compound sentiment score of the tweets on each day
sentiment_mean_d = daily_tweets['compound'].mean()

In [None]:
# two stacked panels: daily tweet count on top, daily mean sentiment underneath
fig, (ax_count, ax_sentiment) = plt.subplots(2, 1)

# plot tweet count timeseries by day
tweet_counts_d.plot(kind="line", fontsize=10, xlabel='Date', ylabel='number of tweets', ax=ax_count)
ax_count.set_title("Number of tweets by day (retweets excluded)")

# plot tweet sentiment timeseries by day (y-label spelling corrected to "polarity")
sentiment_mean_d.plot(kind="line", fontsize=10, xlabel='Date', ylabel='mean sentiment polarity score', color="green", ax=ax_sentiment)

fig.tight_layout()
plt.show()

### Now use Geopandas to plot the data on a map

#### First, use each tweet's location data to create a geodataframe ready for mapping with GeoPandas

In [None]:
# parse the WKT strings in the 'location' column into geometries
tweet_geometry = gpd.GeoSeries.from_wkt(df_tweets['location'])
# wrap the tweet dataframe as a geodataframe, declaring the geometries as EPSG:4326
gdf = gpd.GeoDataFrame(df_tweets, geometry=tweet_geometry, crs=4326)
gdf.head()

## Tweets by world country

#### Need to get Countries_WGS84.shp file
#### Use GeoPandas to read shape data into a new geodataframe

In [None]:
# location of the world countries shapefile (replace the bracketed part with your own folder)
countries = "[path/to/folder/containing/shapefile/]Countries_WGS84.shp"
# read the shapefile into a geodataframe
map_countries = gpd.read_file(countries)
# preview the first rows to see which columns the shapefile provides
map_countries.head()

#### Now let's merge our tweet geodataframe and our countries geodataframe using 'sjoin' - this will allow us to assign tweet location to a country polygon from the shapefile data

In [None]:
# express the tweet geometries in EPSG:4326, the coordinate reference system used for the shapefile join
gdf_countries = gdf.to_crs(epsg=4326)
# spatial join: keep only tweets whose geometry intersects a country polygon and
# attach that country's shapefile columns to the tweet row
tweets_countries = gdf_countries.sjoin(map_countries, how="inner", predicate='intersects')
tweets_countries.head()

##### OPTIONAL: Send data to a csv file for checking/further analysis

In [None]:
# write the country-tagged tweets to a csv file whose name carries the dataset label `date`
outfile = f"[path/to/folder/]fracking_local_{date}.csv"
tweets_countries.to_csv(outfile)

#### List the number of tweets (tweet count) by country - check if this looks sensible

In [None]:
# number of joined tweets per country name, as a two-column frame: CNTRY_NAME, count
country_sizes = tweets_countries.groupby("CNTRY_NAME").size()
tweets_by_country = country_sizes.reset_index(name='count')
tweets_by_country

#### Join the countries geodataframe with the per-country tweet counts using the country name column (`CNTRY_NAME`)

In [None]:
# attach the per-country tweet counts to the country polygons, matching rows on CNTRY_NAME
country_counts = tweets_by_country.set_index('CNTRY_NAME')
merged = map_countries.set_index('CNTRY_NAME').join(country_counts)
# countries without any matching tweets come out of the join as NaN; record them as 0
merged["count"] = merged["count"].fillna(0)
# show the maximum of every numeric column (includes the highest tweet count)
merged.max(numeric_only=True)

In [None]:
# preview the first rows of the joined country / tweet-count geodataframe
merged.head()

### Now let's make a map!

#### Plotting tweet count for each country polygon

In [None]:
# set a variable that will call whatever column we want to visualise on the map
variable = 'count'
# set the range for the choropleth from the chosen column, so changing `variable` is enough
vmin, vmax = merged[variable].min(), merged[variable].max()
# create figure and axes for Matplotlib
fig, ax = plt.subplots(1, figsize=(30,12))

# draw the country polygons coloured by `variable`; vmin/vmax are passed explicitly so the
# map uses exactly the same colour scale as the colorbar created below
merged.plot(column=variable, cmap='OrRd', linewidth=0.8, ax=ax, edgecolor='0.8', vmin=vmin, vmax=vmax)

# remove the axis
ax.axis('off')
# add a title
title = 'Number of fracking tweets by country ' + date
ax.set_title(title, fontdict={'fontsize': '25', 'fontweight' : '3'})
# create an annotation for the data source
ax.annotate('using located tweets containing the keyword "fracking" in english',xy=(0.1, .08), xycoords='figure fraction', horizontalalignment='left', verticalalignment='top', fontsize=12, color='#555555')

# Create colorbar as a legend
sm = plt.cm.ScalarMappable(cmap='OrRd', norm=plt.Normalize(vmin=vmin, vmax=vmax))
# empty array for the data range (set through the public API rather than the private `_A`)
sm.set_array([])
# add the colorbar to the figure
cbar = fig.colorbar(sm, label=r'Number of tweets', format='%1.0f', ax=ax)
#saving our map as .png file.
#fig_name = '/path/to/folder/frackingtweets_country_english_' + date + '.png'
# fig.savefig(fig_name, dpi=300)

## Your turn!!

### Plot tweet text sentiment instead of tweet count - using the same steps as above for tweet count map

In [None]:
#use the tweet dataframe to group the compound sentiment of tweets by country


In [None]:
# join the geodataframe with the cleaned up csv dataframe



In [None]:
# set a variable that will call whatever column we want to visualise on the map
variable = 

# set the range for the choropleth


# create figure and axes for Matplotlib

# remove the axis

# add a title

# create an annotation for the data source

# Create colorbar as a legend

# empty array for the data range

# add the colorbar to the figure



### Plot Tweet count/sentiment by European country admin level 1 using the same steps as above for country

In [None]:
# You will need the "ref-nuts-2016-3035_LEVL_1" shapefile. Read the shapefile using Geopandas.


In [None]:
# use sjoin to merge the map data with the located tweet data
# you may need to transform the Geodataframe to a different coordinate reference system (crs)
# https://geopandas.org/en/stable/docs/user_guide/projections.html
# https://geopandas.org/en/stable/docs/reference/api/geopandas.GeoDataFrame.to_crs.html



In [None]:
#outfile = "[path to outfile]_tweets_europe.csv"
#tweets_europe.to_csv(outfile)

#### Plot tweet count by admin area

In [None]:
#group tweet counts by NUTS_NAME


In [None]:
# join the geodataframe with the cleaned up csv dataframe

# update any NaN values in the count column to 0 (optional)


In [None]:
# Make a map of tweet count by European country admin level 1!


#### Now plot sentiment by admin area

In [None]:
#group tweet sentiment columns ('compound', 'neg' and 'pos') by NUTS_NAME


In [None]:
# joining the geodataframe with the cleaned up csv dataframe


In [None]:
#take a look at the minimum and maximum compound sentiment values (helps with deciding scale for colouring on map)


#### Plot compound sentiment of tweets by admin area

#### Now have a go at plotting 'neg' and 'pos' sentiment values in two subplots, side by side.

### Now let's use the European country admin level 3 shapefile for a specific date in the dataset

In [None]:
# Get the nuts level 3 shapefile (NUTS_RG_01M_2016_3035_LEVL_3.shp) and use geopandas to create the geodataframe


In [None]:
# Decide what specific date you would like to plot tweets for. Use format yyyymmdd.
specific_date = '20221019'
# boolean switch: True plots only tweets from `specific_date`, False plots the whole dataset
use_date = True

In [None]:
# reproject the tweet geometries to EPSG:3035 before joining with the NUTS level 3 polygons
# (`map_europe_L3` is not created in this cell; it must already hold the level 3 shapefile)
gdf_europe_L3 = gdf.to_crs(epsg=3035)
# spatial join: keep only tweets whose geometry intersects a NUTS level 3 polygon
tweets_europe_L3 = gdf_europe_L3.sjoin(map_europe_L3, how="inner", predicate='intersects')
if use_date:
    # restrict to tweets whose 'source_date' equals the chosen date string
    on_specific_date = tweets_europe_L3['source_date'] == specific_date
    tweets_europe_L3 = tweets_europe_L3[on_specific_date]
tweets_europe_L3.head()

In [None]:
#option to save output to csv again
#tweets_europe_L3.to_csv("/path/to/folder/fracking_local_europeL3_" + specific_date + ".csv")

In [None]:
# number of joined tweets per NUTS level 3 area, as a two-column frame: NUTS_NAME, count
nuts_sizes = tweets_europe_L3.groupby("NUTS_NAME").size()
tweets_europe_by_country_L3 = nuts_sizes.reset_index(name='count')
tweets_europe_by_country_L3

In [None]:
# attach the per-area tweet counts to the NUTS level 3 polygons, matching rows on NUTS_NAME;
# areas without any matching tweets keep NaN in 'count' (no fill is applied here)
nuts_counts = tweets_europe_by_country_L3.set_index('NUTS_NAME')
merged_europe_L3 = map_europe_L3.set_index('NUTS_NAME').join(nuts_counts)
# show the maximum of every numeric column (includes the highest tweet count)
merged_europe_L3.max(numeric_only=True)

In [None]:
# set a variable that will call whatever column we want to visualise on the map
variable = 'count'
# set the range for the choropleth from the chosen column, so changing `variable` is enough
vmin, vmax = merged_europe_L3[variable].min(), merged_europe_L3[variable].max()
# label used in the title and file name: the specific date when filtering by date, otherwise
# the dataset label. A separate name is used so the notebook-wide `date` is not overwritten,
# which would silently change the titles/file names of earlier cells if they are re-run.
date_label = specific_date if use_date else date
# create figure and axes for Matplotlib
fig, ax = plt.subplots(1, figsize=(30,12))

# draw the NUTS level 3 polygons coloured by `variable`; vmin/vmax are passed explicitly so
# the map uses exactly the same colour scale as the colorbar created below
merged_europe_L3.plot(column=variable, cmap='OrRd', linewidth=0.8, ax=ax, edgecolor='0.8', vmin=vmin, vmax=vmax)

# add a point and label on the map to show the location of London
# (coordinates are EPSG:3035; transform long and lat co-ordinates using https://epsg.io/transform)
ax.plot(3621142.41, 3204082.16, markersize=5, marker='o', color='black')
ax.text(3621142.41, 3204082.16, '  London', color='black', size=15, ha='left', va='center')


# remove the axis
ax.axis('off')
# add a title
title = 'Number of fracking tweets by European admin level 3 ' + date_label
ax.set_title(title, fontdict={'fontsize': '25', 'fontweight' : '3'})
# create an annotation for the data source
#ax.annotate('using tweets containing the keyword "fracking" in english',xy=(0.02, .02), xycoords='axes fraction', horizontalalignment='left', verticalalignment='top', fontsize=12, color='#555555')

# Create colorbar as a legend
sm = plt.cm.ScalarMappable(cmap='OrRd', norm=plt.Normalize(vmin=vmin, vmax=vmax))
# empty array for the data range (set through the public API rather than the private `_A`)
sm.set_array([])
# add the colorbar to the figure
cbar = fig.colorbar(sm, label=r'Number of tweets', format='%1.0f', ax=ax)
#saving our map as .png file.
#fig_name = '/path/to/folder/frackingtweets_europeL3_english_' + date_label + '.png'
#fig.savefig(fig_name, dpi=300)

### Plot sentiment ('compound') for this date as well

In [None]:
# group mean compound sentiment by NUTS_NAME


In [None]:
# joining the geodataframe with the cleaned up csv dataframe


In [None]:
# take a look at max and min values


In [None]:
# plot the map


### Plot negative tweets only

### Plot Positive tweets only

### Plot tweets using London borough shapefile

In [None]:
# location of the London borough shapefile; replace the bracketed part with your own folder
# (placeholder used instead of a machine-specific absolute path, like the other paths in this notebook)
london = "[path/to/folder/containing/shapefile/]statistical-gis-boundaries-london/ESRI/London_Borough_Excluding_MHW.shp"


In [None]:
#Things to think about:
    # - How to normalise for tweet activity in a location for population/propensity for tweets from that location 