## Tweepy API Scrape 
_Authors: Yichen Hu & Christopher Thompson_

### Imports ###

In [2]:
import tweepy
import pandas as pd
from geopy.geocoders import Nominatim
from geopy.distance import geodesic
import json

# Note: tweepy, pandas and geopy are third-party packages; run `pip install` for any that are missing.

### GeoPy Functions / Code ###

In [3]:
# Shared Nominatim geocoder client; the user agent string identifies this project to the service.
geolocator = Nominatim(user_agent="DSI_US_8_LA")

In [4]:
def latlong(point):
    """Geocode a user-entered address and return its coordinates.

    Parameters
    ----------
    point : str
        Address or place description passed to ``geolocator.geocode``.

    Returns
    -------
    tuple
        ``(latitude, longitude)`` of the geocoder's match.

    Raises
    ------
    ValueError
        If the geocoder returns no match for ``point``.
    """
    coord = geolocator.geocode(point)
    # geocode() gives back None when nothing matches; report that clearly
    # instead of failing with an AttributeError on `None.latitude`.
    if coord is None:
        raise ValueError("Could not geocode address: {!r}".format(point))
    return (coord.latitude, coord.longitude)

In [5]:
# Example call: geocode a Los Angeles street address into a (latitude, longitude) tuple.
cord = latlong('360 East 2nd Street, Los Angeles, CA')

### Twitter Authentication / Tweepy Initialization ###

In [6]:
# Load the Twitter API credentials from a local JSON file so that no keys
# are hardcoded in the notebook. The context manager closes the file handle
# as soon as parsing finishes.
with open('creds/twitter_creds.json') as json_file:
    json_data = json.load(json_file)

In [7]:
# Code taken from http://www.dealingdata.net/2016/07/23/PoGo-Series-Tweepy/
# Build the OAuth handler from the consumer key pair, then attach the access token pair.
auth = tweepy.OAuthHandler(
    json_data['API key'],
    json_data['API secret key'],
)
auth.set_access_token(
    json_data['Access token'],
    json_data['Access token secret'],
)

# Twitter API wrapper used by the rest of the notebook.
api = tweepy.API(auth)

# Error handling.
# NOTE(review): tweepy.API(...) returns an object that is presumably always truthy,
# so this check likely never fires; a real credential check would need an API call — confirm.
if not api:
    print ("Problem connecting to API")

In [8]:
def hashtag_process(raw):
    """Return the hashtag texts of a tweet, leaving out 'LAtraffic'.

    ``raw`` is a sequence of hashtag entities, each exposing ``.get('text')``.
    An empty sequence yields ``None``; otherwise a list of the remaining
    hashtag texts is returned (possibly empty).
    """
    # Guard clause: a tweet without any hashtag entities maps to None.
    if len(raw) == 0:
        return None

    tag_texts = (entity.get('text') for entity in raw)
    return [text for text in tag_texts if text != 'LAtraffic']

TweetSearch Function Parameters

- location - latitude and longitude of the point to search around (for example, the output of the latlong function)
- distance - search radius in miles around the location (default is 25)
- user - Twitter account whose timeline the Tweets are pulled from (default is TotalTrafficLA)
- limit - number of tweets to initially pull from the user before applying location filtering (default is 100)

In [9]:
def TweetSearch(location,distance = 25,user = 'TotalTrafficLA',limit = 100):
    """Collect a user's geotagged timeline tweets that lie within a radius of a point.

    Parameters
    ----------
    location
        Latitude/longitude of the search center, in a form ``geodesic``
        accepts (e.g. the tuple returned by ``latlong``).
    distance : int or float, default 25
        Search radius in miles.
    user : str, default 'TotalTrafficLA'
        Twitter account whose timeline is read.
    limit : int, default 100
        Number of timeline tweets to pull before the location filter is applied.

    Returns
    -------
    pandas.DataFrame
        One row per tweet inside the radius, with the columns 'Tweet Text',
        'Author', 'Coordinates', 'Hashtags (Location)' and
        'Creation Time (UTC)'. The frame has zero rows (but the same
        columns) when no tweet qualifies.
    """
    columns = ['Tweet Text','Author','Coordinates','Hashtags (Location)','Creation Time (UTC)']
    rows = []

    # NOTE(review): newer tweepy releases may expect screen_name=/user_id=
    # instead of id= here — confirm against the installed version.
    timeline = tweepy.Cursor(api.user_timeline,id=user,tweet_mode='extended').items(limit)

    for tweet in timeline:
        # Tweets without a geotag cannot be distance-filtered, so skip them.
        if tweet.geo is None:
            continue

        coordinates = tweet.geo['coordinates']
        # Only keep tweets from within the location radius.
        if geodesic(location,coordinates).miles <= distance:
            rows.append({
                'Tweet Text': tweet.full_text,
                'Author': tweet.author.screen_name,
                'Coordinates': coordinates,
                # hashtag_process returns the tweet's hashtags in a clean form
                'Hashtags (Location)': hashtag_process(tweet.entities['hashtags']),
                'Creation Time (UTC)': tweet.created_at,
            })

    # Building from records with explicit columns keeps the column order fixed
    # (even when no tweet matched) and lets pandas infer per-column dtypes.
    return pd.DataFrame(rows, columns=columns)

In [10]:
# Run the search around a fixed "latitude, longitude" string, using the default
# radius, account and limit. Per the original note, these points are meant to be passed in from Flask.
df = TweetSearch("34.1581656060606, -118.333947181818")

In [11]:
# Display the tweets returned by the search.
df

Unnamed: 0,Tweet Text,Author,Coordinates,Hashtags (Location),Creation Time (UTC)
0,A crash is blocking the carpool lane. in #Arca...,TotalTrafficLA,"[34.14892, -118.0632]",[Arcadia],2019-07-29 16:23:49
1,A crash is blocking the carpool lane. in #Arca...,TotalTrafficLA,"[34.14892, -118.0632]",[Arcadia],2019-07-29 16:12:13
2,A motorcycle crash was moved to the right shou...,TotalTrafficLA,"[33.97027, -118.3747]",[Westchester],2019-07-29 15:19:49
3,A motorcycle crash is blocking the left lane. ...,TotalTrafficLA,"[33.97027, -118.3747]",[Westchester],2019-07-29 14:53:08
4,A crash was moved to the right shoulder. in #W...,TotalTrafficLA,"[34.17033, -118.6103]",[WoodlandHills],2019-07-29 14:52:02
5,A motorcycle crash is blocking the left lane. ...,TotalTrafficLA,"[33.97577, -118.3852]",[CulverCity],2019-07-29 14:43:22
6,A crash was moved to the right shoulder. in #W...,TotalTrafficLA,"[34.17033, -118.6103]",[WoodlandHills],2019-07-29 14:38:27
7,"Accident, left lane blocked in #WoodlandHills ...",TotalTrafficLA,"[34.17033, -118.6103]",[WoodlandHills],2019-07-29 14:26:26
8,A stalled car was cleared from the left lane. ...,TotalTrafficLA,"[33.92564, -118.3216]",[Hawthorne],2019-07-29 13:41:56
9,Crash blocking the trans to the 110 north in #...,TotalTrafficLA,"[34.07887, -118.22]",[ElysianPark],2019-07-28 18:08:04


In [None]:
# Save the scraped tweets to CSV without the row index.
# NOTE(review): assumes the data/ directory already exists — confirm.
df.to_csv('data/first_scrape.csv',index=False)