# Scraper Pickle

In [26]:
!pip install twitterscraper==1.4.0



In [27]:
import pandas as pd
import time, datetime
import re
import pickle

from datetime import timedelta
from twitterscraper import query_tweets
from nltk.tokenize import RegexpTokenizer

___

In [28]:
    def scrape_twitter(place):
        """Scrape today's English-language power-outage tweets near ``place``.

        Runs a keyword search through ``query_tweets`` restricted to tweets
        within 15 miles of ``place`` (retweets excluded) and dated from today
        up to tomorrow, then normalizes the text of every tweet returned.

        Parameters
        ----------
        place : str
            Location name interpolated into the ``near:"..."`` search operator.

        Returns
        -------
        pandas.Series
            Series named ``'tweet'`` with one cleaned string per scraped tweet:
            picture/http URLs removed, lower-cased, and reduced to
            space-separated runs of letters, digits and ``&``.
            Duplicates are NOT removed.
        """
        # scrape twitter for tweets containing certain keywords
        # NOTE(review): "power interuption" is misspelled in the query; left as-is
        # because changing the search terms changes which tweets are collected.
        query_string = f'"power outage" OR "power is out" OR "power\'s out" OR "blackouts" OR "blackout" -"video game" OR "power failure" OR "power failures" OR "no electricity" OR "power shortage" OR "electrical failure" OR "power loss" OR "power cuts" OR "power cut" OR "power went out" OR "power interuption" OR "brownout" OR "power goes out" OR "brownouts" OR "without power" near:"{place}" within:15mi -filter:retweets'

        # read the clock once so both bounds describe the same day even if the
        # call happens right around midnight
        today = datetime.datetime.today().date()
        list_of_tweets = query_tweets(query_string,
                                      begindate=today,
                                      enddate=today + timedelta(days=1),
                                      poolsize=2,
                                      lang="en")

        # instantiate the tokenizer: keeps runs of letters, digits and '&'
        tknr = RegexpTokenizer(r'[a-zA-Z&0-9]+')

        cleaned = []
        for tweet in list_of_tweets:
            # remove any twitter pic urls (dots escaped so only the literal
            # host name "pic.twitter.com" matches)
            post = re.sub(r'pic\.twitter\.com\S+', '', tweet.text).strip()
            # remove any http urls
            post = re.sub(r'http\S+', '', post).strip()
            # lower-case, tokenize and re-join with single spaces
            cleaned.append(" ".join(tknr.tokenize(post.lower())))

        # build the dataframe and hand back only the text column
        df = pd.DataFrame({
            'tweet': cleaned,
        })

        return df['tweet']

In [29]:
# Scrape outage-related tweets near California; scrape_twitter returns the cleaned 'tweet' Series
df = scrape_twitter('California')

INFO: queries: ['"power outage" OR "power is out" OR "power\'s out" OR "blackouts" OR "blackout" -"video game" OR "power failure" OR "power failures" OR "no electricity" OR "power shortage" OR "electrical failure" OR "power loss" OR "power cuts" OR "power cut" OR "power went out" OR "power interuption" OR "brownout" OR "power goes out" OR "brownouts" OR "without power" near:"California" within:15mi -filter:retweets since:2019-11-08 until:2019-11-09']
INFO: Querying "power outage" OR "power is out" OR "power's out" OR "blackouts" OR "blackout" -"video game" OR "power failure" OR "power failures" OR "no electricity" OR "power shortage" OR "electrical failure" OR "power loss" OR "power cuts" OR "power cut" OR "power went out" OR "power interuption" OR "brownout" OR "power goes out" OR "brownouts" OR "without power" near:"California" within:15mi -filter:retweets since:2019-11-08 until:2019-11-09
INFO: Scraping tweets from https://twitter.com/search?f=tweets&vertical=default&q="power%20outa

In [30]:
# Check how many tweets the scrape returned
df.shape

(52,)

___
Bring in our pickled model and run a prediction

In [12]:
# Load the previously trained model from disk. The context manager closes the
# file handle as soon as unpickling finishes.
# NOTE(review): pickle.load can execute arbitrary code — only load files you trust.
with open('../Files/final_model.p', 'rb') as model_file:
    model = pickle.load(model_file)

In [13]:
# Probability estimates for the scraped tweets (presumably one row per tweet — TODO confirm)
prediction = model.predict_proba(df)

In [14]:
# Average of the second probability column across all rows.
# NOTE(review): avg_prob is defined in a later cell of this notebook, so on a
# fresh kernel that definition has to run before this cell.
avg_prob(prediction)

0.30866537937779914

In [7]:
def avg_prob(some_array):
    """Return the mean of the element at index 1 across the rows of ``some_array``.

    ``some_array`` is any sized iterable of indexable rows (for example the
    2-D array returned by ``predict_proba``). Raises ``ZeroDivisionError``
    when it contains no rows.
    """
    total = sum(row[1] for row in some_array)
    return total / len(some_array)

# NOTE(review): the saved output of this cell is a NameError — `prediction`
# (model.predict_proba(df)) must exist before this call is run.
avg_prob(prediction)

NameError: name 'prediction' is not defined

In [7]:
# Score the output of clean_scrape(df) as a single document.
# NOTE(review): clean_scrape is not defined in any cell visible in this notebook.
model.predict_proba([clean_scrape(df)])

array([[0.75, 0.25]])

In [22]:
# Run the model on every distinct tweet separately, then print the mean of those predictions
preds = [model.predict([tweet]) for tweet in df.drop_duplicates()['tweet']]

print(sum(preds)/len(preds))

[0.16666667]


In [23]:
# New York: re-scrape (overwriting df), then show both the predicted label and
# the probability estimates for the output of clean_scrape(df).
# NOTE(review): clean_scrape is called twice and is not defined in the visible
# cells — confirm it is safe to call repeatedly on the same df.
df = scrape_twitter('New York')
print(model.predict([clean_scrape(df)]))
print(model.predict_proba([clean_scrape(df)]))

INFO: queries: ['"power outage" OR "power is out" OR "power\'s out" OR "blackouts" OR "blackout" -"video game" OR "power failure" OR "power failures" OR "no electricity" OR "power shortage" OR "electrical failure" OR "power loss" OR "power cuts" OR "power cut" OR "power went out" OR "power interuption" OR "brownout" OR "power goes out" OR "brownouts" OR "without power" near:"New York" within:15mi -filter:retweets since:2019-11-07 until:2019-11-08']
INFO: Querying "power outage" OR "power is out" OR "power's out" OR "blackouts" OR "blackout" -"video game" OR "power failure" OR "power failures" OR "no electricity" OR "power shortage" OR "electrical failure" OR "power loss" OR "power cuts" OR "power cut" OR "power went out" OR "power interuption" OR "brownout" OR "power goes out" OR "brownouts" OR "without power" near:"New York" within:15mi -filter:retweets since:2019-11-07 until:2019-11-08
INFO: Scraping tweets from https://twitter.com/search?f=tweets&vertical=default&q="power%20outage"%

[0]
[[0.75 0.25]]


In [24]:
# Predict each distinct tweet on its own and print the mean of the predictions
preds = [model.predict([tweet]) for tweet in df.drop_duplicates()['tweet']]

print(sum(preds)/len(preds))

[0.22807018]


In [25]:
# Inspect the text that clean_scrape produces from the current scrape
clean_scrape(df)

'wait did a bobcat cause your power outage i literally just tweeted the same thing last night it sounds conspiratorial but feels very much part of the broad media blackout on the sanders campaign designed to confuse voters into thinking that she is the real threat i can t wait to blackout by 4pm on saturday me blackout drunk when my friend asks if i want a patr n shot so a blackout drunk rapist with gambling debts is considered a life to emulate no wonder the gop is so rotten appropriately for a brand that once created a blackout in a can four loko s hard seltzer will be 12 abv pg&e says wildfires the bankruptcy case blackouts and other expenses will cost up to 6 3 billion this year those things really start to add up and that doesn t even include the kincade fire by markchediak willwwade pcg nwcnewstream nwc operations now being restored after jpsco power outage sections of st catherine and st andrew affected nwcnewstream nwc operations now being restored after jpsco power outage sect

In [30]:
# Take the tweets at index labels 0, 1 and 2 and score them as one string.
# NOTE(review): the strings are concatenated with no separator, so the last
# word of one tweet runs into the first word of the next — confirm intended.
tweet1, tweet2, tweet3 = (df['tweet'][idx] for idx in range(3))
model.predict_proba([tweet1 + tweet2 + tweet3])

array([[0.51388889, 0.48611111]])

In [28]:
# Probability estimates for the first tweet on its own
model.predict_proba([tweet1])

array([[0.71590909, 0.28409091]])

In [29]:
# Probability estimates for the second tweet on its own
model.predict_proba([tweet2])

array([[0.625, 0.375]])

In [31]:
# Probability estimates for the third tweet on its own
model.predict_proba([tweet3])

array([[0.26858836, 0.73141164]])

In [36]:
# Mean of the second probability column over the three individually scored tweets
sum(model.predict_proba([single])[0][1] for single in (tweet1, tweet2, tweet3))/3

0.4635008490073145

In [38]:
# Score each distinct tweet separately, keeping the second predict_proba column,
# and display the average over all of them
preds_proba = [model.predict_proba([tweet])[0][1] for tweet in df.drop_duplicates()['tweet']]

sum(preds_proba)/len(preds_proba)

0.39003812273359606

In [39]:
# Michigan: re-scrape and overwrite df with the Michigan results
df = scrape_twitter('Michigan')

INFO: queries: ['"power outage" OR "power is out" OR "power\'s out" OR "blackouts" OR "blackout" -"video game" OR "power failure" OR "power failures" OR "no electricity" OR "power shortage" OR "electrical failure" OR "power loss" OR "power cuts" OR "power cut" OR "power went out" OR "power interuption" OR "brownout" OR "power goes out" OR "brownouts" OR "without power" near:"Michigan" within:15mi -filter:retweets since:2019-11-07 until:2019-11-08']
INFO: Querying "power outage" OR "power is out" OR "power's out" OR "blackouts" OR "blackout" -"video game" OR "power failure" OR "power failures" OR "no electricity" OR "power shortage" OR "electrical failure" OR "power loss" OR "power cuts" OR "power cut" OR "power went out" OR "power interuption" OR "brownout" OR "power goes out" OR "brownouts" OR "without power" near:"Michigan" within:15mi -filter:retweets since:2019-11-07 until:2019-11-08
INFO: Scraping tweets from https://twitter.com/search?f=tweets&vertical=default&q="power%20outage"%

In [40]:
# Peek at the first rows of the scrape.
# NOTE(review): the saved output shows tweet/location/time_stamp columns, while
# scrape_twitter as defined in this notebook returns only the 'tweet' Series —
# the output presumably comes from a different version of the function.
df.head()

Unnamed: 0,tweet,location,time_stamp
0,Blackout*,Michigan,2019-11-07 18:53:14
1,Going live very soon!! Come hang out at http:/...,Michigan,2019-11-07 15:03:27
2,Don't give @arobach too much credit here. The ...,Michigan,2019-11-07 14:57:05
3,BREAKING: Lindsey Graham is proud of sexual as...,Michigan,2019-11-07 14:05:33
4,I'm thinking early[ish] morning stream tomorro...,Michigan,2019-11-07 06:34:06


In [44]:
# Probability estimates for the Michigan scrape, scored as one clean_scrape(df) document
model.predict_proba([clean_scrape(df)])

array([[0.375, 0.625]])

In [42]:
# Peek at the first rows again; in the saved output the tweet text is now lower-cased and stripped of punctuation
df.head()

Unnamed: 0,tweet,location,time_stamp
0,blackout,Michigan,2019-11-07 18:53:14
1,going live very soon come hang out at gonna st...,Michigan,2019-11-07 15:03:27
2,don t give arobach too much credit here the vi...,Michigan,2019-11-07 14:57:05
3,breaking lindsey graham is proud of sexual ass...,Michigan,2019-11-07 14:05:33
4,i m thinking early ish morning stream tomorrow...,Michigan,2019-11-07 06:34:06


In [43]:
# Per-tweet scoring: collect the second predict_proba column for every distinct
# tweet and display the mean
preds_proba = [model.predict_proba([tweet])[0][1] for tweet in df.drop_duplicates()['tweet']]

sum(preds_proba)/len(preds_proba)

0.4211761979305609

In [45]:
# California: re-scrape and overwrite df with the California results
df = scrape_twitter('California')

INFO: queries: ['"power outage" OR "power is out" OR "power\'s out" OR "blackouts" OR "blackout" -"video game" OR "power failure" OR "power failures" OR "no electricity" OR "power shortage" OR "electrical failure" OR "power loss" OR "power cuts" OR "power cut" OR "power went out" OR "power interuption" OR "brownout" OR "power goes out" OR "brownouts" OR "without power" near:"California" within:15mi -filter:retweets since:2019-11-07 until:2019-11-08']
INFO: Querying "power outage" OR "power is out" OR "power's out" OR "blackouts" OR "blackout" -"video game" OR "power failure" OR "power failures" OR "no electricity" OR "power shortage" OR "electrical failure" OR "power loss" OR "power cuts" OR "power cut" OR "power went out" OR "power interuption" OR "brownout" OR "power goes out" OR "brownouts" OR "without power" near:"California" within:15mi -filter:retweets since:2019-11-07 until:2019-11-08
INFO: Scraping tweets from https://twitter.com/search?f=tweets&vertical=default&q="power%20outa

In [46]:
# Probability estimates for the California scrape, scored as one clean_scrape(df) document
model.predict_proba([clean_scrape(df)])

array([[1., 0.]])

In [47]:
# Average, over the distinct tweets, of the second predict_proba column
# obtained by scoring each tweet individually
preds_proba = [model.predict_proba([tweet])[0][1] for tweet in df.drop_duplicates()['tweet']]

sum(preds_proba)/len(preds_proba)

0.3888219018216934

In [69]:
# New York (this cell was labelled "Texas", but the call scrapes New York)
df = scrape_twitter('New York')

INFO: queries: ['"power outage" OR "power is out" OR "power\'s out" OR "blackouts" OR "blackout" -"video game" OR "power failure" OR "power failures" OR "no electricity" OR "power shortage" OR "electrical failure" OR "power loss" OR "power cuts" OR "power cut" OR "power went out" OR "power interuption" OR "brownout" OR "power goes out" OR "brownouts" OR "without power" near:"New York" within:15mi -filter:retweets since:2019-06-29 until:2019-06-30', '"power outage" OR "power is out" OR "power\'s out" OR "blackouts" OR "blackout" -"video game" OR "power failure" OR "power failures" OR "no electricity" OR "power shortage" OR "electrical failure" OR "power loss" OR "power cuts" OR "power cut" OR "power went out" OR "power interuption" OR "brownout" OR "power goes out" OR "brownouts" OR "without power" near:"New York" within:15mi -filter:retweets since:2019-06-30 until:2019-07-01']
INFO: Querying "power outage" OR "power is out" OR "power's out" OR "blackouts" OR "blackout" -"video game" OR

In [72]:
# Probability estimates for this scrape, scored as one clean_scrape(df) document
model.predict_proba([clean_scrape(df)])

array([[0.5, 0.5]])

In [73]:
# Score the distinct tweets one at a time and display the mean of the second
# predict_proba column
preds_proba = [model.predict_proba([tweet])[0][1] for tweet in df.drop_duplicates()['tweet']]

sum(preds_proba)/len(preds_proba)

0.5397915698943349

In [75]:
# Show the scrape with duplicate rows removed (the input to the per-tweet scoring)
df.drop_duplicates()

Unnamed: 0,tweet,location,time_stamp
0,iight so i got the bunny skin on cod and it s ...,New York,2019-06-30 22:10:01
1,listening to i couldn t be more in love by the...,New York,2019-06-30 21:08:01
2,good morning world time to blackout at nyc pri...,New York,2019-06-30 15:45:25
3,we need a blackout party this summer,New York,2019-06-30 02:49:43
4,we re one blackout away from full anarchy at c...,New York,2019-06-30 00:46:19
5,just here sitting in the dark with grandma dri...,New York,2019-06-30 00:34:26
12,had an incredible sleep at the ny marriott dow...,New York,2019-06-29 17:08:43
13,wow the blackout blinds here are the best ever...,New York,2019-06-29 17:05:28
14,the dessert brooklyn blackout by twolittleredh...,New York,2019-06-29 01:38:15
