# Getting Twitter Data for Cities that Have Declared a Climate Emergency #

This notebook focuses on getting Twitter data (all tweets) from the 10 largest cities (population-wise) that have declared a climate emergency. 

    -Los Angeles
    -Seattle 
    -Denver
    -New York
    -Chicago
    -San Diego
    -San Jose
    -Austin
    -San Francisco
    -Boston

This first portion of the notebook is dedicated to printing an output that we'll use for a library called "Twitterscraper." This package uses CL for data collection. We'll load in the data back into this notebook. 

https://github.com/taspinar/twitterscraper
    
Once the data from twitterscraper is loaded, for the last portion, we'll then merge all of the cities' data into one large dataset for analysis. 

In [304]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import json      # library for working with JSON-formatted text strings
import pprint as pp    # library for cleanly printing Python data structures
import seaborn as sns
import twitterscraper as ts
from twitterscraper import query_tweets #library downloaded
import os as os

import subprocess #this enables us to pass CL code directly from Jupyter Notebooks 
from subprocess import Popen

## Creating a Twitterscraper Command ## 

The code below scrapes Twitter accounts from each city, scrapes *all* of their tweets, and makes one big JSON file. Rather than pasting the command into the CL, this function uses "subprocess" (a standard library already with Python) to pass the command directly through Jupyter Notebooks. 


In [440]:
def json_to_df(json_files):
    data_frames = []
    
    for file in json_files:
        print (file)
        with open(file) as f:
            data = json.load(f)
        
        d = {'username': [x['username'] for x in data],
        'time': [x['timestamp'] for x in data],
        'tweet': [x['text'] for x in data],
        'likes': [x['likes'] for x in data],
        'replies': [x['replies'] for x in data],
        'user_ID' : [x['screen_name'] for x in data]}
    
        data_frames.append(pd.DataFrame.from_dict(d))
    return data_frames

def combine_data(data_frames): #this will allow us to merge dataframes "*" allows us to pass X dataframes
    return pd.concat(data_frames)


def buildQuery(accounts):
    scraper_query = ''
    
    #this builds our search query
    for index, each_account in enumerate (accounts):
        next_index = index + 1 #this is so that we don't have an extra "OR" at the end, it "knows" the last thing
        if next_index > len(accounts) - 1: 
            scraper_query = scraper_query + "from:"+ each_account
        else:
            scraper_query = scraper_query + "from:"+ each_account + " OR "
            
    return scraper_query

def launch(command, output):
    print (command)
    
    outputFile = open(output, 'w+')
    p = Popen(command, stdout=outputFile, stderr=outputFile, universal_newlines=True)
    output, errors = p.communicate()
    #p.wait() # Wait for sub process to finish before moving on to make frame 
    
    if errors:
        print (errors)
    myoutput.close()
            
def scrape(accounts):
    data_files = []
    
    for user in accounts:
        path_to_output_file = user + ".txt" #we'll get both txt and json, but just ignore txt
        path_to_data_file = user + ".JSON"
        data_files.append(path_to_data_file)
        
        query = 'from: ' + user
        command = ["twitterscraper", query, 
                   "--lang", "en", "--all", "-ow", "-p", "40", "-o", path_to_data_file]
        launch(command, path_to_output_file)
 
    return data_files 

Below, I created a list of all the accounts I wish to scrape (I broke it up into 3 "searches" because this process is extremely time-consuming). However, using "scrape()" you can input all the accounts, it'll just an hour or so to get all the data.

In [385]:
climate_emergency_accounts = ["SeattleOPCD", "CityofSeattle", "seattledot", "SeattleOSE", "kcmetrobus", 
                             "LACity", "LADOTofficial", "lacountyparks", "HCIDLA", "Planning4LA", "metrolosangeles", "PortofLA", 
                             "NYC_DOT", "NYCParks", "NYCHA", "NYCPlanning", "nycemergencymgt", "MTA"]
                             
climate_emergency_output = scrape(climate_emergency_accounts) 



['twitterscraper', 'from: SeattleOPCD', '--lang', 'en', '-o', 'SeattleOPCD.JSON', '--all', '-ow', '-p', '40']
None
['twitterscraper', 'from: CityofSeattle', '--lang', 'en', '-o', 'CityofSeattle.JSON', '--all', '-ow', '-p', '40']
None
['twitterscraper', 'from: seattledot', '--lang', 'en', '-o', 'seattledot.JSON', '--all', '-ow', '-p', '40']
None
['twitterscraper', 'from: SeattleOSE', '--lang', 'en', '-o', 'SeattleOSE.JSON', '--all', '-ow', '-p', '40']
None
['twitterscraper', 'from: kcmetrobus', '--lang', 'en', '-o', 'kcmetrobus.JSON', '--all', '-ow', '-p', '40']
None
['twitterscraper', 'from: LACity', '--lang', 'en', '-o', 'LACity.JSON', '--all', '-ow', '-p', '40']
None
['twitterscraper', 'from: LADOTofficial', '--lang', 'en', '-o', 'LADOTofficial.JSON', '--all', '-ow', '-p', '40']
None
['twitterscraper', 'from: lacountyparks', '--lang', 'en', '-o', 'lacountyparks.JSON', '--all', '-ow', '-p', '40']
None
['twitterscraper', 'from: HCIDLA', '--lang', 'en', '-o', 'HCIDLA.JSON', '--all', '-o

In [392]:
climate_emergency_accounts_2 = ["chicago", "ChicagoDOT", "ChicagoParks", "ChicagoDOH", "ChicagoDPD", "ChicagoOEMC", "cta", 
                             "CityofSanDiego", "sandiegoparks", "SanDiegoPlan", "ReadySanDiego", "sdcountydpw", "sdmts", "portofsandiego"]


climate_emergency_output_2 = scrape(climate_emergency_accounts_2)

['twitterscraper', 'from: chicago', '--lang', 'en', '--all', '-ow', '-p', '40', '-o', 'chicago.JSON']
['twitterscraper', 'from: ChicagoDOT', '--lang', 'en', '--all', '-ow', '-p', '40', '-o', 'ChicagoDOT.JSON']
['twitterscraper', 'from: ChicagoParks', '--lang', 'en', '--all', '-ow', '-p', '40', '-o', 'ChicagoParks.JSON']
['twitterscraper', 'from: ChicagoDOH', '--lang', 'en', '--all', '-ow', '-p', '40', '-o', 'ChicagoDOH.JSON']
['twitterscraper', 'from: ChicagoDPD', '--lang', 'en', '--all', '-ow', '-p', '40', '-o', 'ChicagoDPD.JSON']
['twitterscraper', 'from: ChicagoOEMC', '--lang', 'en', '--all', '-ow', '-p', '40', '-o', 'ChicagoOEMC.JSON']
['twitterscraper', 'from: cta', '--lang', 'en', '--all', '-ow', '-p', '40', '-o', 'cta.JSON']
['twitterscraper', 'from: CityofSanDiego', '--lang', 'en', '--all', '-ow', '-p', '40', '-o', 'CityofSanDiego.JSON']
['twitterscraper', 'from: sandiegoparks', '--lang', 'en', '--all', '-ow', '-p', '40', '-o', 'sandiegoparks.JSON']
['twitterscraper', 'from: Sa

In [393]:
climate_emergency_accounts_3 = ["CityofSanJose", "SanJoseDOT", "sjparksandrec", "sjcityhousing", "buildingsanjose", "VTA", 
                             "austintexasgo", "austinmobility", "AustinCityParks", "Hacanet", "ImagineAustin", "AustinHSEM", "AusPublicHealth", "CapMetroATX", 
                             "SFEnvironment", "sfmta_muni", "RecParkSF", "sfplanning", "SF_emergency", "sfpublicworks", "SFPort", 
                             "CityOfBoston", "BostonEnviro", "BostonBTD", "BostonParksDept", "BHA_Boston", "BostonPlans", "AlertBoston", "HealthyBoston", "MBTA", 
                             "DenverCityGov", "SustainableDen", "DenverDOTI", "denverparksrec", "DenverCPD", "DDPHE"] 

climate_emergency_output_3 = scrape(climate_emergency_accounts_3)


['twitterscraper', 'from: CityofSanJose', '--lang', 'en', '--all', '-ow', '-p', '40', '-o', 'CityofSanJose.JSON']
['twitterscraper', 'from: SanJoseDOT', '--lang', 'en', '--all', '-ow', '-p', '40', '-o', 'SanJoseDOT.JSON']
['twitterscraper', 'from: sjparksandrec', '--lang', 'en', '--all', '-ow', '-p', '40', '-o', 'sjparksandrec.JSON']
['twitterscraper', 'from: sjcityhousing', '--lang', 'en', '--all', '-ow', '-p', '40', '-o', 'sjcityhousing.JSON']
['twitterscraper', 'from: buildingsanjose', '--lang', 'en', '--all', '-ow', '-p', '40', '-o', 'buildingsanjose.JSON']
['twitterscraper', 'from: VTA', '--lang', 'en', '--all', '-ow', '-p', '40', '-o', 'VTA.JSON']
['twitterscraper', 'from: austintexasgo', '--lang', 'en', '--all', '-ow', '-p', '40', '-o', 'austintexasgo.JSON']
['twitterscraper', 'from: austinmobility', '--lang', 'en', '--all', '-ow', '-p', '40', '-o', 'austinmobility.JSON']
['twitterscraper', 'from: AustinCityParks', '--lang', 'en', '--all', '-ow', '-p', '40', '-o', 'AustinCityPar

In [434]:
#ignore the below, my original function didn't return a list with .JSON - and I didn't want to re-run the scraping process. 

climate_emergency_accounts = ["SeattleOPCD", "CityofSeattle", "seattledot", "SeattleOSE", "kcmetrobus", 
                             "LACity", "LADOTofficial", "lacountyparks", "HCIDLA", "Planning4LA", "metrolosangeles", "PortofLA", 
                             "NYC_DOT", "NYCParks", "NYCHA", "NYCPlanning", "nycemergencymgt", "MTA"]

climate_emergency_output_1 = []
for account in climate_emergency_accounts:
    climate_emergency_output_1.append(account + ".JSON")
    
print (climate_emergency_output_1)

['SeattleOPCD.JSON', 'CityofSeattle.JSON', 'seattledot.JSON', 'SeattleOSE.JSON', 'kcmetrobus.JSON', 'LACity.JSON', 'LADOTofficial.JSON', 'lacountyparks.JSON', 'HCIDLA.JSON', 'Planning4LA.JSON', 'metrolosangeles.JSON', 'PortofLA.JSON', 'NYC_DOT.JSON', 'NYCParks.JSON', 'NYCHA.JSON', 'NYCPlanning.JSON', 'nycemergencymgt.JSON', 'MTA.JSON']


## Converting JSONs to DataFrames ##

json_to_df() takes the json list output above and converts all the data into a list of dataframes. 

In [433]:
dataframe_1 = json_to_df(climate_emergency_output_1)

dataframe_2 = json_to_df(climate_emergency_output_2)

dataframe_3 = json_to_df(climate_emergency_output_3)

SeattleOPCD.JSON
CityofSeattle.JSON
seattledot.JSON
SeattleOSE.JSON
kcmetrobus.JSON
LACity.JSON
LADOTofficial.JSON
lacountyparks.JSON
HCIDLA.JSON
Planning4LA.JSON
metrolosangeles.JSON
PortofLA.JSON
NYC_DOT.JSON
NYCParks.JSON
NYCHA.JSON
NYCPlanning.JSON
nycemergencymgt.JSON
MTA.JSON
chicago.JSON
ChicagoDOT.JSON
ChicagoParks.JSON
ChicagoDOH.JSON
ChicagoDPD.JSON
ChicagoOEMC.JSON
cta.JSON
CityofSanDiego.JSON
sandiegoparks.JSON
SanDiegoPlan.JSON
ReadySanDiego.JSON
sdcountydpw.JSON
sdmts.JSON
portofsandiego.JSON
CityofSanJose.JSON
SanJoseDOT.JSON
sjparksandrec.JSON
sjcityhousing.JSON
buildingsanjose.JSON
VTA.JSON
austintexasgo.JSON
austinmobility.JSON
AustinCityParks.JSON
Hacanet.JSON
ImagineAustin.JSON
AustinHSEM.JSON
AusPublicHealth.JSON
CapMetroATX.JSON
SFEnvironment.JSON
sfmta_muni.JSON
RecParkSF.JSON
sfplanning.JSON
SF_emergency.JSON
sfpublicworks.JSON
SFPort.JSON
CityOfBoston.JSON
BostonEnviro.JSON
BostonBTD.JSON
BostonParksDept.JSON
BHA_Boston.JSON
BostonPlans.JSON
AlertBoston.JSON
He

In [459]:
merge1 = combine_data(dataframe_1)
merge2 = combine_data(dataframe_2)
merge3 = combine_data(dataframe_3)

frames = [merge1, merge2, merge3]

result = pd.concat(frames)
result

to_keep = ["SeattleOPCD", "CityofSeattle", "seattledot", "SeattleOSE", "kcmetrobus", 
            "LACity", "LADOTofficial", "lacountyparks", "HCIDLA", "Planning4LA", "metrolosangeles", "PortofLA", 
            "NYC_DOT", "NYCParks", "NYCHA", "NYCPlanning", "nycemergencymgt", "MTA",
            "CityofSanJose", "SanJoseDOT", "sjparksandrec", "sjcityhousing", "buildingsanjose", "VTA", 
            "austintexasgo", "austinmobility", "AustinCityParks", "Hacanet", "ImagineAustin", "AustinHSEM", "AusPublicHealth", "CapMetroATX", 
            "SFEnvironment", "sfmta_muni", "RecParkSF", "sfplanning", "SF_emergency", "sfpublicworks", "SFPort", 
            "CityOfBoston", "BostonEnviro", "BostonBTD", "BostonParksDept", "BHA_Boston", "BostonPlans", "AlertBoston", "HealthyBoston", "MBTA", 
            "DenverCityGov", "SustainableDen", "DenverDOTI", "denverparksrec", "DenverCPD", "DDPHE", 
             "chicago", "ChicagoDOT", "ChicagoParks", "ChicagoDOH", "ChicagoDPD", "ChicagoOEMC", "cta", 
            "CityofSanDiego", "sandiegoparks", "SanDiegoPlan", "ReadySanDiego", "sdcountydpw", "sdmts", "portofsandiego"]

final_results = result[~result['user_ID'].isin(to_keep) == False] # the code above got all mentions & replies
final_results.head(5)

Unnamed: 0,username,time,tweet,likes,replies,user_ID
0,Seattle Office of Planning & Community Develop...,2016-01-08T23:37:18,OPCD staff visited Lake City today to meet wit...,1,0,SeattleOPCD
1,Seattle Office of Planning & Community Develop...,2016-01-08T23:37:18,OPCD staff visited Lake City today to meet wit...,1,0,SeattleOPCD
3,Seattle Office of Planning & Community Develop...,2017-10-05T18:31:02,Bonus #ThrowbackThursday post: Ticket stub fro...,0,0,SeattleOPCD
4,Seattle Office of Planning & Community Develop...,2017-09-29T20:17:27,Here's some photos from yesterday's successful...,1,0,SeattleOPCD
5,Seattle Office of Planning & Community Develop...,2017-09-15T15:30:05,"A startup aims to stop gentrification, with he...",1,0,SeattleOPCD


In [None]:
final_results.to_csv("Final Results.csv")