In [24]:
import pandas as pd
from nltk.tokenize import word_tokenize, sent_tokenize
import nltk
import spacy
import en_core_web_sm

In [2]:
## data read
df_storm = pd.read_csv('stormWords.csv') 
df_twitter = pd.read_csv('twitterWords.csv') 

In [3]:
# storm data 
df_storm.head()

Unnamed: 0,word
0,USFS
1,NWAC
2,station
3,Hurricane
4,Ridge


In [4]:
# twitter data
df_twitter.head()

Unnamed: 0,word
0,28
1,deaths
2,due
3,Hurricane
4,Sandy


In [5]:
# Data Pre-processing 
nlp = en_core_web_sm.load()

# Covert to lower case
df_storm["word"] = df_storm["word"].str.lower()
df_twitter["word"] = df_twitter["word"].str.lower()

In [6]:
# Lemmetize the text 

df_storm["word"] = df_storm["word"].apply(lambda row: ' '.join([w.lemma_ for w in nlp(row)]))
df_twitter["word"] = df_twitter["word"].apply(lambda row: ' '.join([w.lemma_ for w in nlp(row)]))

In [15]:
# POS Tagging
def getPosTag(text):
    text = word_tokenize(text)
    return nltk.pos_tag(text) 

df_storm["Pos_Tag"] = df_storm.apply(lambda row: getPosTag(row['word']), axis=1)
df_twitter["Pos_Tag"] = df_twitter.apply(lambda row: getPosTag(row['word']), axis=1)

In [16]:
from collections import Counter 
pos = df_storm["Pos_Tag"]
tags = []
for word in df_storm["Pos_Tag"]:
    tags.append(word[0][1])
count_storm = Counter(tags)
print("POS Tagging for Storm Data: ")
print(count_storm)

POS Tagging for Storm Data: 
Counter({'NN': 7026, 'JJ': 1235, 'CD': 838, 'RB': 497, 'IN': 409, 'DT': 214, '(': 177, 'NNS': 154, 'VB': 127, 'VBG': 93, 'VBN': 61, '$': 29, 'JJS': 23, 'LS': 14, 'VBD': 9, 'SYM': 8, 'RBR': 6, 'MD': 3, 'CC': 1})


In [17]:
postags = []
for word in df_twitter["Pos_Tag"]:
    postags.append(word[0][1])
count_twitter = Counter(postags)
print("POS Tagging for Twitter Data:")
print(count_twitter)

POS Tagging for Twitter Data:
Counter({'NN': 1357, '#': 206, 'JJ': 151, 'CD': 102, 'RB': 77, 'VB': 76, 'IN': 73, 'NNS': 61, 'DT': 34, ':': 24, 'VBG': 16, 'CC': 13, 'MD': 10, 'RBR': 8, '$': 8, '(': 8, 'VBD': 7, 'JJS': 7, '.': 5, 'SYM': 4, 'VBN': 3, "''": 2, 'WP': 2, 'WRB': 2, 'TO': 2, 'POS': 2})


In [18]:
# percentage of Nouns in Storm Data
perStorm = ((7026+154)/ len(df_storm["Pos_Tag"]) ) * 100
print("Percenatge of  Nouns in Storm Data: ", perStorm)

Percenatge of  Nouns in Storm Data:  65.72683998535335


In [19]:
# percentage of Nouns in Twitter Data
perTwitter = ((1399 + 61)/ len(df_twitter["Pos_Tag"]) ) * 100
print("Percenatge of  Nouns in Storm Data: ", perTwitter)

Percenatge of  Nouns in Storm Data:  64.60176991150442


In [20]:
# Sentiment Analysis
from textblob import TextBlob

def getSentiment(text):
    senti = TextBlob(text)
    return senti.sentiment.polarity
df_storm["Sentiment"] = df_storm.apply(lambda row: getSentiment(row['word']), axis=1)
df_twitter["Sentiment"] = df_twitter.apply(lambda row: getSentiment(row['word']), axis=1)

In [21]:
print("Average Sentiment Score for Storm data: ",df_storm["Sentiment"].mean())

Average Sentiment Score for Storm data:  0.011878668927597858


In [22]:
print("Average Sentiment Score for Twitter data: ",df_twitter["Sentiment"].mean())

Average Sentiment Score for Twitter data:  0.0021541792449615107
