In [50]:
#Install packages
#!pip install python-twitter
!pip install TwitterAPI



In [4]:
# Import packages
import json
import re
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from TwitterAPI import TwitterAPI
from nltk import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
from string import punctuation
from bs4 import BeautifulSoup
from nltk.corpus import stopwords 

In [232]:
train_file = "/Users/nescobar/Dropbox/Indiana/Social_Media_Mining/Project/smm2018/data/raw/csv/annotation/training_set_final_2.csv"
test_file = "/Users/nescobar/Dropbox/Indiana/Social_Media_Mining/Project/smm2018/data/raw/csv/annotation/random_new_t.csv"

consumer_key = ''
consumer_secret = ''
access_token_key = ''
access_token_secret= ''

In [233]:
#api = TwitterAPI(consumer_key, consumer_secret, auth_type='oAuth2')

In [244]:
class ProcessTweets:
    
    def __init__(self):
        self._stopwords=set(list(punctuation)+['ATUSER','URL','IMG'])
        
    def get_data(query="", source="file", path=train_file, feed="search/tweets",api=api, maxid=0, n=100):
        try:
            if source == "file":
                harvey_df = pd.read_csv(path, encoding = 'ISO-8859-1')
                return harvey_df
            else:
                if maxid == 0:
                    return pd.read_json(json.dumps([t for t in api.request(feed, {'q':query,'count':n})]))
                else:
                    return pd.read_json(json.dumps([t for t in api.request(feed, {'q': query, 'count': n, 'max_id': maxid})]))            
        except:
            print("Error while getting data")
            return None
    
    def process_tweets(self, list_of_tweets):
        processed_tweets=[]
        for tweet in list_of_tweets:
            processed_tweets.append((self._process_tweet(tweet)))
        return processed_tweets
    
    def _process_tweet(self,tweet):
        try:
            # Unescape from HTML
            #tweet = html.unescape(tweet)
            #tweet = BeautifulSoup(tweet, 'lxml').get_text()
            # 3a. Convert to lower case
            tweet = tweet.lower()
            # 3b. Replace links with the word URL 
            tweet = re.sub('((www\.[^\s]+)|(https?://[^\s]+))',' ',tweet) 
            # 3c. Replace @username with "AT_USER"
            tweet = re.sub('@[^\s]+',' ',tweet)                
            # 3d. Replace #word with word 
            tweet = re.sub(r'#([^\s]+)',r'\1',tweet)
            # 3e. Replace images with the word IMG 
            tweet = re.sub(r'\bpic.twitter.com\s+', ' ', tweet)
            # 3f Keep only words with letters
            tweet = re.sub('[^a-zA-Z]',' ',tweet)
            # 3g. Remove RT
            tweet = re.sub(r'\brt([\b\s])', ' ', tweet)
            # Apply Lemmatization
            lemmatizer = WordNetLemmatizer()           
            tweet = [lemmatizer.lemmatize(word) for word in tweet.split() if word not in self._stopwords and len(word)>1]  
            return (" ".join(tweet)).strip()
        except:
            print("Error with tweet: ", tweet)
            return None

In [251]:
# Process tweets from file
tweet_processor = ProcessTweets()
raw_tweets = tweet_processor.get_data()
#[print(t) for t in raw_tweets['text'][:10]]
cleaned_tweets = tweet_processor.process_tweets(raw_tweets['text'])

In [246]:
# Create train DF 
train_df = pd.DataFrame(cleaned_tweets, columns=['text'])
train_df['target'] = raw_tweets.label
train_df=train_df.drop_duplicates('text')
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 333 entries, 0 to 349
Data columns (total 2 columns):
text      333 non-null object
target    333 non-null object
dtypes: object(2)
memory usage: 7.8+ KB


In [467]:
# Top ngrams in tweets
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter

filter_file = "/Users/nescobar/Dropbox/Indiana/Social_Media_Mining/Project/data/csv/filter_tweets_water.csv"

filtered_tweets = tweet_processor.get_data(path=filter_file)
cleaned_filtered_tweets = tweet_processor.process_tweets(filtered_tweets['text'])

vect = TfidfVectorizer(ngram_range=(2,5))
summaries = "".join(cleaned_filtered_tweets)
ngrams_summaries = vect.build_analyzer()(summaries)

[bigram for bigram in Counter(ngrams_summaries).most_common(20) if "harvey" not in bigram[0]]

[('high water', 104),
 ('water rescue', 58),
 ('flood water', 53),
 ('water level', 43),
 ('rising water', 20),
 ('water crossing', 20),
 ('low water', 18),
 ('low water crossing', 17),
 ('high water rescue', 16),
 ('corpus christi', 15),
 ('water rise', 14),
 ('stay safe', 14),
 ('turn around', 13),
 ('food water', 13),
 ('port lavaca', 13),
 ('lot water', 12)]

In [237]:
# Testing with different classifiers
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from time import time
from sklearn.pipeline import Pipeline
from sklearn import metrics

# Start running time
t0 = time()

# Build a pipeline that contains vectorizer, transform and classifier

# Multinomial NB
clf = Pipeline([('tfidf', TfidfVectorizer(lowercase=True, stop_words="english")),
                 ('clf', MultinomialNB()),
                ])
scores = cross_val_score(clf, train_df.text, train_df.target, cv=5)
print("Multinomial Naive Bayes: ", np.mean(scores))

# Logistic Regression
clf_lr = Pipeline([('tfidf', TfidfVectorizer(lowercase=True, stop_words="english")),
                 ('clf', LogisticRegression()),
                ])
scores = cross_val_score(clf_lr, train_df.text, train_df.target, cv=5)
print("Logistic Regression: ", np.mean(scores))

# Logistic Regression with SGD 
clf_lr_sgd = Pipeline([('tfidf', TfidfVectorizer(lowercase=True, stop_words="english")),
                 ('clf', SGDClassifier(loss='log', penalty='l1',
                                            alpha=1e-3, random_state=42,
                                            max_iter=100, tol=None)),
                ])
scores = cross_val_score(clf_lr_sgd, train_df.text, train_df.target, cv=5)
print("Logistic Regression with SGD: ", np.mean(scores))

# SVM
clf = Pipeline([('tfidf', TfidfVectorizer(lowercase=True, stop_words="english")),
                 ('clf', SVC()),
                ])
scores = cross_val_score(clf, train_df.text, train_df.target, cv=5)
print("SVM: ", np.mean(scores))

# SVM with SGD (Stochastic Gradient Descent) 
clf = Pipeline([('tfidf', TfidfVectorizer(lowercase=True, stop_words="english")),
                 ('clf', SGDClassifier(loss='hinge', penalty='l1',
                                            alpha=1e-3, random_state=42,
                                            max_iter=100, tol=None)),
                ])
scores = cross_val_score(clf, train_df.text, train_df.target, cv=5)
print("SVM with SGD: ", np.mean(scores))

# Print running time for training and predicting
print('Total running time', time() - t0)


Multinomial Naive Bayes:  0.582501928858
Logistic Regression:  0.690384973528
Logistic Regression with SGD:  0.732677521483
SVM:  0.348335860803
SVM with SGD:  0.738425519462
Total running time 1.1801233291625977


In [238]:
# Parameter tuning with Grid Search
from sklearn.model_selection import GridSearchCV

parameters = {'tfidf__ngram_range' : [(1,1),(1,2)],
             'clf__alpha': (1e-1, 1e-3, 1e-5),
             'clf__max_iter': (10,50,100),
             'clf__penalty': ('l1','l2','elasticnet'),
             'clf__fit_intercept': (True,False),
             'clf__class_weight': (None,'balanced'),
             'clf__warm_start': (True, False)
             }

gs_clf = GridSearchCV(clf_lr_sgd, parameters, n_jobs=-1)
gs_clf.fit(train_df.text, train_df.target)

print(gs_clf.best_score_)
print(gs_clf.best_params_)

0.741741741742
{'clf__alpha': 0.001, 'clf__class_weight': None, 'clf__fit_intercept': True, 'clf__max_iter': 100, 'clf__penalty': 'l1', 'clf__warm_start': True, 'tfidf__ngram_range': (1, 1)}


In [276]:
from time import time
from sklearn.pipeline import Pipeline
from sklearn import metrics
import pickle as pkl
import numpy as np

# Start running time
t0 = time()

# Get test data from recent tweets
harvey_recent_df = pkl.load(open("harvey_april_2018.pkl", "rb"))
#harvey_tweets = tweet_processor.get_data(path=test_file)
#harvey_tweets.columns=['text']
# Drop duplicates
harvey_recent_df.drop_duplicates('text', inplace=True)

# Clean up testing data set
clean_test_df = tweet_processor.process_tweets(harvey_recent_df.text)
#harvey_tweets.drop_duplicates('text', inplace=True)
#clean_test_df = tweet_processor.process_tweets(harvey_tweets['text'])
#clean_test_df
# Test the classifier
#gs_clf.fit(train_df.text, train_df.target)
predicted = gs_clf.predict_proba(clean_test_df)

# Print running time for training and predicting
print('Total running time', time() - t0)

#print(metrics.classification_report(y_test, predicted))
print(gs_clf.classes_)

#print(metrics.classification_report(y_test, predicted))
# Print predicted category for each tweet
c_donation=c_other=c_relocation=c_volunteering=c_na=0
for tweet, category in zip(harvey_recent_df.text, predicted):
    print('%r => %s' % (tweet, category))
    # Consider only predictions with > 0.5 probability
    if category[0] > 0.5:
        c_donation += 1
    elif category[1] > 0.5:
        c_other += 1
    elif category[2] > 0.5:
        c_relocation += 1
    elif category[3] > 0.5:
        c_volunteering += 1
    else:
        c_na += 1

total_tweets = len(harvey_recent_df.text)
print("Total tweets analyzed", total_tweets)
print("Total tweets classified as DONATION {} {}%"
      .format(c_donation, (c_donation/total_tweets)*100))
print("Total tweets classified as OTHER {} {}%"
      .format(c_other, (c_other/total_tweets)*100))
print("Total tweets classified as RELOCATION {} {}%"
      .format(c_relocation, (c_relocation/total_tweets)*100))
print("Total tweets classified as VOLUNTEERING {} {}%"
      .format(c_volunteering, (c_volunteering/total_tweets)*100))
print("Total tweets classified as NOT CLASSIFIED {} {}%"
      .format(c_na, (c_na/total_tweets)*100))


Total running time 0.33081698417663574
['donation' 'other' 'relocation' 'volunteering']
'Respiratory problems, pneumonia, headaches, nausea and dizziness – all are side effects that some first responders and community members cited after being exposed to a toxic mix of chemicals at an Arkema chemical plant during # HurricaneHarvey. http://trib.it/9W pic.twitter.com/vXWuj3dElX' => [ 0.31938091  0.37897407  0.1360919   0.16555311]
'# HurricaneHarvey update. # TexasStronghttps://gov.texas.gov/news/post/commission-to-rebuild-texas-after-hurricane-harvey-update-issue-20 …' => [ 0.33245161  0.32133849  0.17388151  0.17232839]
'# Telehealth in Times of Natural Disaster. https://jamanetwork.com/journals/jama/fullarticle/2677450 … # HurricaneHarvey' => [ 0.33605348  0.31398568  0.17576539  0.17419545]
'Great speech by @ tedcruz honoring # BorderPatrol agents in Edinburg, Texas. # ToughAsTexas # BorderSecurity # JavierVegaJr. # NewtonAzrak # HurricaneHarvey @ BPUnionhttps://twitter.com/dailysign

'Disaster: # HurricaneHarvey Occurred 220 days ago Local charities helping the recovery: - @ HoustonFoodBank - @ HouUnitedWay - @ HoustonHumane - @ Montrose_Center - @ commitforlife - @ YMCAHouston' => [ 0.31441937  0.29560115  0.16445015  0.22552933]
'This 5-part series – “Stacy Lewis: Winning for Houston” – chronicles the moments when Stacy Lewis won the LPGA Portland Classic and decided to donate – and match – her winnings to # HurricaneHarvey relief efforts. @ espnW # hurricaneharvey @ KPMGGolf http://bit.ly/2H6r4ZO pic.twitter.com/A107fmX4Ze' => [ 0.72892973  0.06007525  0.07233846  0.13865656]
'Following analysis of the damage and # flood impacts of # HurricaneHarvey, # Houston’s Dept of # PublicWorks is recommending that all new structures in the 100- and 500-year floodplains be elevated to 2 ft above the 500-year flood elevation. http://ow.ly/tbgw30jeHtn # infrastructure' => [ 0.26460626  0.51027968  0.1383965   0.08671756]
'Could financial assurance mandates help avoid future 

In [275]:
predicted

array([[ 0.68832782,  0.11441899,  0.06118224,  0.13607094],
       [ 0.34031426,  0.30528779,  0.1779939 ,  0.17640405],
       [ 0.34449734,  0.29674851,  0.18018177,  0.17857238],
       ..., 
       [ 0.37510749,  0.35454523,  0.07590795,  0.19443934],
       [ 0.34449734,  0.29674851,  0.18018177,  0.17857238],
       [ 0.41851691,  0.24623704,  0.15017721,  0.18506884]])

In [277]:
train_df.groupby(['target']).size()

target
donation        116
other            75
relocation       70
volunteering     72
dtype: int64

In [278]:
import operator
coefs = [] 
words = []

tfidf_vectorizer = TfidfVectorizer(lowercase=True, stop_words="english")
tfidf_vectorizer.fit_transform(train_df.text)

# Create list of coefficients and words
for k,v in tfidf_vectorizer.vocabulary_.items():
    coefs.append(clf.named_steps['clf'].coef_[0][tfidf_vectorizer.vocabulary_[k]])
    words.append(k)

# Create pairs of (words, coefficients)
pairs = []
for p in zip(words, coefs): 
    pairs.append(p)

# Sort pairs by coefficient
pairs.sort(reverse=True, key=operator.itemgetter(1))

# Print words with highest coefficients 
[print(k,v) for k,v in pairs[:10]]

AttributeError: 'SGDClassifier' object has no attribute 'coef_'

In [None]:
# Visualization