In [1]:
import tweepy
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import cnfg
from tqdm import tqdm
from pymongo import MongoClient
import json
import pandas as pd
import numpy as np
from textblob import TextBlob

import sys
import os

%matplotlib inline

In [2]:
config = cnfg.load(".twitter_config.txt")

auth = tweepy.OAuthHandler(config["consumer_key"],
                           config["consumer_secret"])
auth.set_access_token(config["access_token"],
                      config["access_token_secret"])

# api=tweepy.API(auth,wait_on_rate_limit=True)

api = tweepy.API(auth)

In [3]:
#You can check how many queries you have left using rate_limit_status() method
api.rate_limit_status()['resources']['search']

{'/search/tweets': {'limit': 180, 'remaining': 180, 'reset': 1542332264}}

In [4]:
#Switching to application authentication
auth = tweepy.AppAuthHandler(config["consumer_key"], config["consumer_secret"])

#Setting up new api wrapper, using authentication only
api = tweepy.API(auth, wait_on_rate_limit=True,wait_on_rate_limit_notify=True)
 
#Error handling
if (not api):
    print ("Problem Connecting to API")
 

In [5]:
#You can check how many queries you have left using rate_limit_status() method
api.rate_limit_status()['resources']['search']

{'/search/tweets': {'limit': 450, 'remaining': 450, 'reset': 1542332266}}

# Set keywords for Twitter search

In [6]:
#This is what we are searching for
#We can restrict the location of tweets using place:id 
#We can search for multiple phrases using OR
searchQuery = '#globalwarming OR #climatechange OR #climate OR' \
              '"global warming" OR "climate change" OR "emmisions" OR "sea level" OR "sea-level" OR' \
              '"fossil fuels" OR "greenhouse gas" '\
              '     -filter:retweets AND -filter:replies'

In [7]:
#Maximum number of tweets we want to collect 
maxTweets = 30000

#The twitter Search API allows up to 100 tweets per query
tweetsPerQry = 100

# Get tweets in the last week

In [8]:
tweetCount = 0
tweet_dict_list = []


#Tell the Cursor method that we want to use the Search API (api.search)
#Also tell Cursor our query, and the maximum number of tweets to return
for t in tqdm(tweepy.Cursor(api.search,q=searchQuery).items(maxTweets)) : 
    d = {}
    d['created_at'] = t.created_at
    d['favorite_count'] = t.favorite_count
    d['retweet_count'] = t.retweet_count
    d['text'] = t.text
    d['screen_name'] = t.user.screen_name
    
    tweet_dict_list.append(d)
    
    tweetCount += 1

#Display how many tweets we have collected
print("Downloaded {0} tweets".format(tweetCount))


6718it [00:52, 116.74it/s]Rate limit reached. Sleeping for: 849
13445it [16:03, 100.94it/s]Rate limit reached. Sleeping for: 847
20123it [31:09, 109.45it/s]Rate limit reached. Sleeping for: 848
26824it [46:21, 116.13it/s]Rate limit reached. Sleeping for: 843
30000it [1:00:57,  8.20it/s] 

Downloaded 30000 tweets





# Add document to MongoDB


In [9]:
# Connect to monogodb server.  Make sure it's running first!
client = MongoClient('localhost', 27017)

client.list_database_names()

['admin', 'climate_db', 'climate_db2', 'config', 'local', 'wildlife_db']

In [11]:
# Connect to the database for the first time
climate_db3 = client['climate_db3']

# Create a collection
climate_collection3 = climate_db3.tweets

In [12]:
climate_collection3.insert_many(tweet_dict_list)

<pymongo.results.InsertManyResult at 0x1a213691c8>

# Get updated document from database and create a dataframe

In [13]:
res = climate_db3['tweets'].find()


In [14]:
res_list = [item for item in res]


In [15]:
df = pd.DataFrame(res_list)

df.head()

Unnamed: 0,_id,created_at,favorite_count,retweet_count,screen_name,text
0,5bee2ad7b2857344b2eca5c5,2018-11-16 01:22:48,0,0,DPonizil,"Incredibly proud , heartened and relived to ha..."
1,5bee2ad7b2857344b2eca5c6,2018-11-16 01:22:47,0,0,AtNickJackson,Fun drinking game for this weekend: Take one s...
2,5bee2ad7b2857344b2eca5c7,2018-11-16 01:22:43,0,0,PoliticPR,"Climate Change, Steel, Migration Bedevil G20 C..."
3,5bee2ad7b2857344b2eca5c8,2018-11-16 01:22:36,0,0,Carol_Daniels,Your Children’s Yellowstone Will Be Radically ...
4,5bee2ad7b2857344b2eca5c9,2018-11-16 01:22:32,0,0,Boxer751,https://t.co/PilHN2Grbx \n\nCongratulations De...


In [89]:
auth = tweepy.OAuthHandler(config["consumer_key"],
                           config["consumer_secret"])
auth.set_access_token(config["access_token"],
                      config["access_token_secret"])

# api=tweepy.API(auth,wait_on_rate_limit=True)

api = tweepy.API(auth)

In [90]:
#You can check how many queries you have left using rate_limit_status() method
api.rate_limit_status()['resources']['search']

{'/search/tweets': {'limit': 180, 'remaining': 180, 'reset': 1541976133}}

# Set up a listener

In [100]:
from tweepy import Stream
from tweepy.streaming import StreamListener


In [133]:
class listener(StreamListener):

    def on_data(self, data):
        data = json.loads(data)
        
        # Pull the fields we want, and throw it into our mongodb database
        if data['lang'] == 'en' and (not data['retweeted']) and ('RT @' not in data['text']):
            tweet_document = {}

            tweet_document['created_at'] = data['created_at']
            tweet_document['favorite_count'] = data['favorite_count']
            tweet_document['retweet_count'] = data['retweet_count']

            #Get full text if it is an extended tweet
            if 'extended_tweet' in data.keys():
                tweet_document['text'] = data['extended_tweet']['full_text']
            else:
                tweet_document['text'] = data['text']

            tweet_document['screen_name'] = data['user']['screen_name']


           # print(tweet_document)
           # print('\n')
            climate_collection.insert_one(tweet_document)
        
        return True

    def on_error(self, status):
        print(status)


In [134]:
twitterStream = Stream(auth, listener())
data = twitterStream.filter(track=["global warming", "climate", "climate change", "environment"])

401
401
401
401
401
401
401
401
401
401
401
401
401
401
401
401


KeyboardInterrupt: 