# Scrape tweets regarding global warming and fossil fuels from 2015 - 2018

### Get tweets from 2015 - 2018 regarding global warming and fossil fuels:

In [None]:
from searchtweets import ResultStream, gen_rule_payload, load_credentials, collect_results

In [None]:
#credentials
# Location of the credentials file handed to searchtweets.
credentials_file = "/Users/keerthanasankar/Documents/input-onlineyamltools.txt"
premium_search_args = load_credentials(credentials_file, account_type="premium")

In [None]:
#query criteria
# Search rule for "global warming" between 2017-06-01 and 2017-12-31,
# requesting 100 results per API call.
rule = gen_rule_payload(
    "global warming",
    from_date="2017-06-01",
    to_date="2017-12-31",
    results_per_call=100,
)
print(rule)

In [None]:
#call api and collect tweets
# Adjust max_results if you need a different number of tweets.
tweets = collect_results(
    rule,
    result_stream_args=premium_search_args,
    max_results=500,
)

In [None]:
#see results
# Print the text of the first ten collected tweets. A plain loop is used
# because the printing is a side effect; no throwaway list of None is built.
for tweet in tweets[0:10]:
    print(tweet.all_text)

In [None]:
#write data to file
import json

# Serialize the collected tweets as one JSON document and store it on disk.
with open('2017-global.json', 'w') as outfile:
    outfile.write(json.dumps(tweets))

### Obtain location data for tweets:

In [None]:
import json

# Tweets are stored in the file "fname". The scraping cell writes them with
# json.dump, i.e. as a single JSON array; a file holding one JSON tweet per
# line is accepted as well.
fname = '2017-global.json'


def _load_tweets(path):
    """Return the list of tweet dicts stored in *path*.

    The file may contain one JSON document (an array of tweets or a single
    tweet) or one JSON tweet per line.
    """
    with open(path, 'r') as f:
        content = f.read()
    try:
        parsed = json.loads(content)
    except ValueError:
        # Not a single JSON document: fall back to one JSON value per line.
        parsed = [json.loads(line) for line in content.splitlines() if line.strip()]
    return parsed if isinstance(parsed, list) else [parsed]


def _primary_geo(tweet):
    """Return ``(primary_geo, geo_type)`` for *tweet*.

    Preference order: exact tweet coordinates, then the tweet's place, then
    the free-text location from the user's profile.
    """
    coordinates = tweet.get('coordinates')
    if coordinates:
        # The point is stored as a two-element list; the output keeps the
        # order the original script intended: element 1 first, then element 0
        # (presumably "latitude, longitude" -- confirm against the Twitter
        # tweet-object documentation).
        point = coordinates['coordinates']
        return str(point[1]) + ", " + str(point[0]), "Tweet coordinates"
    place = tweet.get('place')
    if place:
        return place['full_name'] + ", " + place['country'], "Tweet place"
    return (tweet.get('user') or {}).get('location'), "User location"


# Dictionary to later be stored as JSON. All data is included in the list 'data'.
users_with_geodata = {
    "data": []
}
all_users = []       # every distinct user id, in first-seen order
total_tweets = 0     # tweets that carry a user id

_seen_user_ids = set()   # O(1) membership test for all_users
_geo_users_by_id = {}    # user id -> entry stored in users_with_geodata['data']

# Every tweet in the file is processed: no fixed count is assumed, so short
# files do not raise IndexError and long files are not truncated.
for tweet in _load_tweets(fname):
    user = tweet.get('user') or {}
    user_id = user.get('id')
    if not user_id:
        continue
    total_tweets += 1

    # If the user is already listed, only increase their tweet count.
    if user_id in _seen_user_ids:
        known_user = _geo_users_by_id.get(user_id)
        if known_user is not None:
            known_user["features"]["tweets"] += 1
        continue

    _seen_user_ids.add(user_id)
    all_users.append(user_id)

    # Give users some data to find them by. user_id is listed separately
    # to make iterating this data later easier.
    user_data = {
        "user_id": user_id,
        "tweet": tweet.get("text"),
        "features": {
            "name": user.get('name'),
            "id": user_id,
            "screen_name": user.get('screen_name'),
            "tweets": 1,
            "location": user.get('location'),
        }
    }

    # The geodata is read from the current tweet (not from the first tweet
    # in the file).
    primary_geo, geo_type = _primary_geo(tweet)
    user_data["features"]["primary_geo"] = primary_geo
    user_data["features"]["geo_type"] = geo_type

    # Add only tweets with some geo data to .json. Drop this condition if you
    # want to include all tweets.
    if primary_geo:
        users_with_geodata['data'].append(user_data)
        _geo_users_by_id[user_id] = user_data

# Total number of tweets by users that had geodata. Computed once from the
# per-user counters so that each tweet is counted exactly once.
geo_tweets = sum(user["features"]["tweets"] for user in users_with_geodata["data"])

# Get some aggregated numbers on the data
print("The file included " + str(len(all_users)) + " unique users who tweeted with or without geo data")
print("The file included " + str(len(users_with_geodata['data'])) + " unique users who tweeted with geo data, including 'location'")
print("The users with geo data tweeted " + str(geo_tweets) + " out of the total " + str(total_tweets) + " of tweets.")

# Save data to JSON file
# Stream the pretty-printed (4-space indent) structure straight into the file.
with open('2017-global-locations.json', 'w') as fout:
    json.dump(users_with_geodata, fout, indent=4)

### Clean location data and get state name:

In [None]:
import warnings
# Silence every warning for the rest of the session.
warnings.filterwarnings('ignore')
import pandas as pd

In [None]:
from geopy.geocoders import Nominatim

In [None]:
# Shared Nominatim geocoder (user agent "viz_project"); find_state calls its geocode method.
geolocator = Nominatim(user_agent="viz_project")

In [None]:
#convert json to csv before running this code

# One frame per topic ("global" / "fossil") and year (2015-2018).
global_15 = pd.read_csv('2015-global-locations.csv')
global_16 = pd.read_csv('2016-global-locations.csv')
fossil_15 = pd.read_csv('2015-fossil-locations.csv')
fossil_16 = pd.read_csv('2016-fossil-locations.csv')
global_17 = pd.read_csv('2017-global-locations.csv')
global_18 = pd.read_csv('2018-global-locations.csv')
fossil_17 = pd.read_csv('2017-fossil-locations.csv')
# The 2018 file gets its own name; it previously overwrote fossil_17 and left
# fossil_18 undefined.
fossil_18 = pd.read_csv('2018-fossil-locations.csv')

In [None]:
#new column for states
# Start with an empty string in every row of the 2017 global frame.
global_17['state']=''

In [None]:
#return the state name
def _state_from_match(match):
    """Return the state part of a geocoder match, or '' if it is not in the USA.

    *match* is the value returned by ``geolocator.geocode``; ``None`` means the
    geocoder found nothing.
    """
    if match is None:
        return ''
    parts = match.raw['display_name'].split(',')
    # display_name is a comma-separated string; the last part is compared with
    # ' USA' and the part before it is taken as the state. The value is stored
    # exactly as split (including its leading space) so existing output files
    # stay comparable.
    if len(parts) >= 2 and parts[-1] == ' USA':
        return parts[-2]
    return ''


def find_state(df):
    """Fill the 'state' column of *df* in place by geocoding each row's location.

    Each row's ``features__location`` value is sent to the module-level
    ``geolocator``. Rows whose location is missing, cannot be geocoded, or does
    not resolve to the USA get an empty string.

    Args:
        df: DataFrame with a ``features__location`` column. A ``state`` column
            is created if it does not exist yet.

    Returns:
        None. *df* is modified in place.
    """
    if 'state' not in df.columns:
        df['state'] = ''

    # Many users share the same location string; geocode each distinct string
    # only once per call.
    resolved = {}
    for index, row in df.iterrows():
        location = row.features__location
        state = ''
        # Skip missing values (NaN floats) and blank strings instead of
        # sending them to the geocoder.
        if isinstance(location, str) and location.strip():
            try:
                if location not in resolved:
                    resolved[location] = _state_from_match(geolocator.geocode(location))
                state = resolved[location]
            except Exception:
                # Best effort: a failed lookup (network error, unexpected
                # response) leaves this row empty. Failures are not cached,
                # so a later row with the same location is retried.
                state = ''
        # DataFrame.set_value was removed from pandas; .at is the scalar setter.
        df.at[index, 'state'] = state

In [None]:
#call function
# Geocode the 2017 global frame; one geocoder request is made per row.
find_state(global_17)

In [None]:
#run function on all files
# Each call fills the 'state' column of the given frame in place.
# NOTE(review): global_17 is also processed by the single call in the previous
# cell, so it is geocoded again here.
# NOTE(review): fossil_18 must have been assigned by the CSV-loading cell --
# verify before running.
find_state(global_15)
find_state(fossil_15)
find_state(fossil_16)
find_state(global_16)
find_state(fossil_17)
find_state(fossil_18)
find_state(global_17)
find_state(global_18)

In [None]:
#print data to file
# Write every frame to its own CSV file; index=False leaves out the row index.
# NOTE(review): fossil_18 must have been assigned by the CSV-loading cell --
# verify before running.
global_15.to_csv('global_15_tweets.csv', index=False)
global_16.to_csv('global_16_tweets.csv', index=False)
fossil_15.to_csv('fossil_15_tweets.csv', index=False)
fossil_16.to_csv('fossil_16_tweets.csv', index=False)
global_17.to_csv('global_17_tweets.csv', index=False)
global_18.to_csv('global_18_tweets.csv', index=False)
fossil_17.to_csv('fossil_17_tweets.csv', index=False)
fossil_18.to_csv('fossil_18_tweets.csv', index=False)

### Resources:

https://pypi.org/project/searchtweets/1.0/

http://www.mikaelbrunila.fi/2017/03/27/scraping-extracting-mapping-geodata-twitter/

https://pypi.org/project/geopy/