The idea behind this code, and some of the code itself, was taken from the DC-2 group's project, found here: https://github.com/ariellem2/Disaster_Response_Maps

In [1]:
#Imports

import pandas as pd
import regex as re
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer

In [2]:
# Load the full Twitter scrape from a CSV file located in the working directory

df = pd.read_csv('FullWoosleyTweets.csv')

In [3]:
# Confirm the data loaded as expected by previewing the first rows
df.head()

Unnamed: 0,Full Text,Author,Creation Time,Hashtags
0,Autumn Sun | sailor_marimoon soaking up some g...,StormulaOne,Fri Nov 30 20:16:20 +0000 2018,
1,!!woolsey fire!! all lanes shutdown until furt...,TotalTrafficLA,Fri Nov 30 12:16:39 +0000 2018,#Malibu #LAtraffic
2,!!woolsey fire!! all lanes shutdown until furt...,TotalTrafficLA,Fri Nov 30 12:16:38 +0000 2018,#Malibu #LAtraffic
3,Nov 8th the Woolsey fire broke out and burned ...,irishcntrychic3,Fri Nov 30 04:18:35 +0000 2018,
4,Yes! This is one of the groups the boys & I he...,BondKathi,Thu Nov 29 05:01:38 +0000 2018,#Repost


In [4]:
# Rows 1 and 2 look like duplicates in the truncated preview, so display the
# full text of row 1 to compare it against row 2
df.loc[1,"Full Text"]

'!!woolsey fire!! all lanes shutdown until further notice in #Malibu on Hwy 23 SB between Mulholland Hwy and CA-1 #LAtraffic'

In [5]:
# Full text of row 2, for comparison with row 1
df.loc[2,"Full Text"]

'!!woolsey fire!! all lanes shutdown until further notice in #Malibu on Hwy 23 NB between CA-1 and Mulholland Hwy #LAtraffic'

In [6]:
# Instantiate the tokenizer and lemmatizer.
# The pattern is a raw string: '\s' in a normal string literal is an invalid escape
# sequence that newer Python versions warn about. With gaps=True the pattern
# describes the separators, so the text is split on runs of whitespace.
tokenizer = RegexpTokenizer(r'\s+', gaps=True)
lemmatizer = WordNetLemmatizer()

In [7]:
# Function that removes web addresses, removes punctuation (except the '#' of hashtags),
# converts text to lowercase, and lemmatizes each remaining token
def processText(raw_text):
    """Return a cleaned, lemmatized, space-joined version of ``raw_text``.

    The text is split with the module-level ``tokenizer``. Each token is
    lowercased; tokens containing 'http' are dropped; every character other
    than ASCII letters, digits and '#' is removed; and each non-empty
    remainder is passed through the module-level ``lemmatizer``.

    Parameters
    ----------
    raw_text : str
        The original tweet text.

    Returns
    -------
    str
        The processed tokens joined by single spaces. Non-string input
        (e.g. the NaN pandas uses for a missing value) yields ``""``
        instead of raising inside the tokenizer.
    """
    if not isinstance(raw_text, str):
        return ""

    cleaned_tokens = []
    for token in tokenizer.tokenize(raw_text):
        # Lowercase before the URL check so 'HTTP://...' and 'Https://...' are dropped too
        token = token.lower()
        if 'http' in token:  # remove web address
            continue
        stripped = re.sub("[^A-Za-z0-9#]", "", token)
        if stripped:
            cleaned_tokens.append(lemmatizer.lemmatize(stripped))

    return " ".join(cleaned_tokens)

In [8]:
# Build the cleaned 'text' column by running processText over every tweet's full text
df['text'] = df['Full Text'].map(processText)

In [9]:
# Preview the frame again to check the newly added 'text' column
df.head()

Unnamed: 0,Full Text,Author,Creation Time,Hashtags,text
0,Autumn Sun | sailor_marimoon soaking up some g...,StormulaOne,Fri Nov 30 20:16:20 +0000 2018,,autumn sun sailormarimoon soaking up some gold...
1,!!woolsey fire!! all lanes shutdown until furt...,TotalTrafficLA,Fri Nov 30 12:16:39 +0000 2018,#Malibu #LAtraffic,woolsey fire all lane shutdown until further n...
2,!!woolsey fire!! all lanes shutdown until furt...,TotalTrafficLA,Fri Nov 30 12:16:38 +0000 2018,#Malibu #LAtraffic,woolsey fire all lane shutdown until further n...
3,Nov 8th the Woolsey fire broke out and burned ...,irishcntrychic3,Fri Nov 30 04:18:35 +0000 2018,,nov 8th the woolsey fire broke out and burned ...
4,Yes! This is one of the groups the boys & I he...,BondKathi,Thu Nov 29 05:01:38 +0000 2018,#Repost,yes this is one of the group the boy i helped ...


In [10]:
# Keyword lists used to filter tweets. The initial word list was taken from the
# DSI - DC - 2 group and then modified based on analysis from Chris T. and Yichen H.
# Every keyword starts with a space so that it only matches at the beginning of a
# word (' rd' matches "rd" and "rds" but not "third"). Matching is by prefix, so
# ' reopen' also catches "reopened" -- and ' ave' also catches "average".
word_list1 = [' road', ' street',' rd', ' hwy', ' highway', ' ave', ' avenue',' intersection',' lane']
word_list2 = [' closed',' closure', ' blocked', ' flooded',' shutdown']
not_word_list = [' cleared', ' opened', ' reopen']


def qualified_tweet(tweet_text):
    """Return True if ``tweet_text`` is a qualified tweet.

    A tweet qualifies when it contains at least one keyword from
    ``word_list1`` and at least one from ``word_list2``, and no keyword
    from ``not_word_list``.

    Parameters
    ----------
    tweet_text : str
        Tweet text to test; the keywords are lowercase, so this is expected
        to be lowercased already.

    Returns
    -------
    bool
    """
    # Prepend a space so a keyword can also match the very first word of the tweet
    padded_text = " " + tweet_text

    def contains_any(keywords):
        return any(keyword in padded_text for keyword in keywords)

    # Plain boolean logic instead of bitwise operators on 0/1 integer flags
    return (
        contains_any(word_list1)
        and contains_any(word_list2)
        and not contains_any(not_word_list)
    )

In [11]:
# Flag each tweet as useful (True) or not (False) based on its cleaned text
df['useful tweet'] = df['text'].map(qualified_tweet)

In [12]:
# How many 'road closure' tweets did we get?
# 'useful tweet' holds the True/False result of qualified_tweet, so it is used directly as the row mask
df_closed = df[df['useful tweet']]
df_closed.shape

(98, 6)

In [13]:
#Display full text of tweet
#Code taken from https://stackoverflow.com/questions/25351968/how-to-display-full-non-truncated-dataframe-information-in-html-when-convertin

# None means "no limit" on column width. The older -1 sentinel was deprecated in
# pandas 1.0 and is rejected by later pandas versions.
pd.set_option('display.max_colwidth', None)

In [15]:
# Explore the full text of the first few road closure tweets
df_closed['Full Text'].head()

1     !!woolsey fire!! all lanes shutdown until further notice in #Malibu on Hwy 23 SB between Mulholland Hwy and CA-1 #LAtraffic                 
2     !!woolsey fire!! all lanes shutdown until further notice in #Malibu on Hwy 23 NB between CA-1 and Mulholland Hwy #LAtraffic                 
6     !!woolsey fire!! all lanes shutdown until further notice in #LosAngeles on Hwy 23 SB between Mulholland Hwy and CA-1 #LAtraffic             
26    !! #woolseyfire closure !! road closed due to the woolsey fire. in #PointMugu on Pacific Coast Hwy SB before Mulh... http://bit.ly/10F395r  
28    !! #woolseyfire closure !! road closed due to the woolsey fire. in #PointMagu on Pacific Coast Hwy SB before Las Posas Rd to Heathercliff Rd
Name: Full Text, dtype: object

In [16]:
# How many non-road closure tweets do we have?
# Invert the True/False 'useful tweet' flag to select the tweets that were excluded
df_notclosed = df[~df['useful tweet']]
df_notclosed.shape

(1866, 6)

In [17]:
# Export the excluded tweets (without the index column) so they can be reviewed
# for additional keywords we should add
df_notclosed.to_csv('current_excluded_tweets.csv',index=False)