In [1]:
#Install packages
#!pip install python-twitter
#!pip install TwitterAPI

In [2]:
# Import packages
import json
import re
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from TwitterAPI import TwitterAPI
from nltk import PorterStemmer, WordNetLemmatizer, LancasterStemmer
from nltk.tokenize import word_tokenize
from string import punctuation
from bs4 import BeautifulSoup
from nltk.corpus import stopwords 

In [3]:
#train_file = "/Users/nescobar/Dropbox/Indiana/Social_Media_Mining/Project/smm2018/data/raw/csv/annotation/training_set_final_4.csv"
#test_file = "/Users/nescobar/Dropbox/Indiana/Social_Media_Mining/Project/smm2018/data/raw/csv/annotation/random_new_t.csv"

consumer_key = ''
consumer_secret = ''
access_token_key = ''
access_token_secret= ''

In [4]:
#api = TwitterAPI(consumer_key, consumer_secret, auth_type='oAuth2')

In [19]:
class ProcessTweets:
    
    def __init__(self):
        self.normalization = 'pt_stem'
        self._stopwords=set(stopwords.words('english')+list(punctuation)+['AT_USER','URL','IMG']
                                                                        +['twitter','com','twitter.com','pic','hurricaneharvey'])
                                                                         #'texas','houston'])
        
    def get_data(query="", source="file", path=None, feed="search/tweets",api=None, maxid=0, n=100):
        try:
            if source == "file":
                harvey_df = pd.read_csv(path, encoding = 'ISO-8859-1')
                return harvey_df
            #else:
            #    if maxid == 0:
            #        return pd.read_json(json.dumps([t for t in api.request(feed, {'q':query,'count':n})]))
            #    else:
            #        return pd.read_json(json.dumps([t for t in api.request(feed, {'q': query, 'count': n, 'max_id': maxid})]))            
        except:
            print("Error while getting data")
            return None
    
    def process_tweets(self, list_of_tweets):
        processed_tweets=[]
        for tweet in list_of_tweets:
            processed_tweets.append((self._process_tweet(tweet)))
        return processed_tweets
    
    def _process_tweet(self,tweet):
        try:
            # Unescape from HTML
            #tweet = html.unescape(tweet)
            #tweet = BeautifulSoup(tweet, 'lxml').get_text()
            # 3a. Convert to lower case
            tweet = tweet.lower()
            # 3b. Replace links with the word URL 
            tweet = re.sub('((www\.[^\s]+)|(https?://[^\s]+))',' ',tweet) 
            # 3c. Replace @username with "AT_USER"
            tweet = re.sub('@[^\s]+',' ',tweet)                
            # 3d. Replace #word with word 
            tweet = re.sub(r'#([^\s]+)',r'\1',tweet)
            # 3e. Replace images with the word IMG 
            tweet = re.sub(r'\bpic.twitter.com\s+', ' ', tweet)
            # 3f Keep only words with letters
            tweet = re.sub('[^a-zA-Z]',' ',tweet)
            # 3g. Remove RT
            tweet = re.sub(r'\brt([\b\s])', ' ', tweet)
            
            # Apply Lemmatization
            if (self.normalization == 'wn_lem'):
                lemmatizer = WordNetLemmatizer() 
                tweet = [lemmatizer.lemmatize(word) for word in tweet.split() if word not in self._stopwords and len(word)>1]  
            elif (self.normalization == 'lc_stem'):
                st = LancasterStemmer()
                tweet = [st.stem(word) for word in tweet.split() if word not in self._stopwords and len(word)>1]  
            elif (self.normalization == 'pt_stem'):
                pt = PorterStemmer()
                tweet = [pt.stem(word) for word in tweet.split() if word not in self._stopwords and len(word)>1]  
            else:
                lemmatizer = WordNetLemmatizer() 
                tweet = [lemmatizer.lemmatize(word) for word in tweet.split() if word not in self._stopwords and len(word)>1]  
            
            return (" ".join(tweet)).strip()
        
        except:
            print("Error with tweet: ", tweet)
            return None

In [20]:
# Create train DF 
from sklearn.utils import shuffle
import pickle as pkl

tweet_processor = ProcessTweets()
train_df = pkl.load(open("training_set.pkl", "rb"))
train_df['text'] = tweet_processor.process_tweets(train_df['text'])
train_df.columns = ['text','target']
train_df = shuffle(train_df)

In [21]:
# Testing with different classifiers
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from time import time
from sklearn.pipeline import Pipeline
from sklearn import metrics

# Start running time
t0 = time()

# Build a pipeline that contains vectorizer, transform and classifier

# Multinomial NB
clf = Pipeline([('tfidf', TfidfVectorizer(lowercase=True, stop_words="english")),
                 ('clf', MultinomialNB()),
                ])
scores = cross_val_score(clf, train_df.text, train_df.target, cv=5)
print("Multinomial Naive Bayes: ", np.mean(scores))

# Logistic Regression
clf_lr = Pipeline([('tfidf', TfidfVectorizer(lowercase=True, stop_words="english")),
                 ('clf', LogisticRegression()),
                ])
scores = cross_val_score(clf_lr, train_df.text, train_df.target, cv=5)
print("Logistic Regression: ", np.mean(scores))

# Logistic Regression with SGD 
clf_lr_sgd = Pipeline([('tfidf', TfidfVectorizer(lowercase=True, stop_words="english")),
                 ('clf', SGDClassifier(loss='log', penalty='l1',
                                            alpha=1e-3, random_state=42,
                                            max_iter=100, tol=None)),
                ])
scores = cross_val_score(clf_lr_sgd, train_df.text, train_df.target, cv=5)
print("Logistic Regression with SGD: ", np.mean(scores))

# SVM
clf = Pipeline([('tfidf', TfidfVectorizer(lowercase=True, stop_words="english")),
                 ('clf', SVC()),
                ])
scores = cross_val_score(clf, train_df.text, train_df.target, cv=5)
print("SVM: ", np.mean(scores))

# SVM with SGD (Stochastic Gradient Descent) 
clf = Pipeline([('tfidf', TfidfVectorizer(lowercase=True, stop_words="english")),
                 ('clf', SGDClassifier(loss='hinge', penalty='l1',
                                            alpha=1e-3, random_state=42,
                                            max_iter=100, tol=None)),
                ])
scores = cross_val_score(clf, train_df.text, train_df.target, cv=5)
print("SVM with SGD: ", np.mean(scores))

# Print running time for training and predicting
print('Total running time', time() - t0)

Multinomial Naive Bayes:  0.878431372549
Logistic Regression:  0.933333333333
Logistic Regression with SGD:  0.933333333333
SVM:  0.870588235294
SVM with SGD:  0.960784313725
Total running time 0.4905397891998291


In [22]:
# Parameter tuning with Grid Search
from sklearn.model_selection import GridSearchCV

parameters = {'tfidf__ngram_range' : [(1,1),(1,2)],
             'clf__alpha': (1e-1, 1e-3, 1e-5),
             'clf__max_iter': (10,50,100),
             'clf__penalty': ('l1','l2','elasticnet'),
             'clf__fit_intercept': (True,False),
             'clf__class_weight': (None,'balanced'),
             'clf__warm_start': (True, False)
             }

gs_clf = GridSearchCV(clf_lr_sgd, parameters, n_jobs=-1)
gs_clf.fit(train_df.text, train_df.target)

print(gs_clf.best_score_)
print(gs_clf.best_params_)

0.972549019608
{'clf__alpha': 1e-05, 'clf__class_weight': None, 'clf__fit_intercept': True, 'clf__max_iter': 50, 'clf__penalty': 'l1', 'clf__warm_start': True, 'tfidf__ngram_range': (1, 2)}


In [23]:
from time import time
from sklearn.pipeline import Pipeline
from sklearn import metrics
import pickle as pkl
import numpy as np

# Start running time
t0 = time()

pkl_files = ['harvey_august_2017.pkl', 'harvey_september_2017.pkl', 'harvey_oct_dec_2017.pkl'
            ,'harvey_january_2018.pkl','harvey_february_2018.pkl', 'harvey_march_2018.pkl', 
             'harvey_april_2018.pkl']

print(gs_clf.classes_)

# Loop over each .pkl file
for file in pkl_files:
    harvey_recent_df = pkl.load(open(file, "rb"))
    harvey_recent_df.drop_duplicates('text', inplace=True)
    clean_test_df = tweet_processor.process_tweets(harvey_recent_df.text)
    predicted = gs_clf.predict_proba(clean_test_df)
    
    c_donation=c_other=c_relocation=c_volunteering=c_na=0
    for tweet, category in zip(harvey_recent_df.text, predicted):

        # Consider only predictions with > 0.5 probability
        if category[0] > 0.5:
            c_donation += 1
        elif category[1] > 0.5:
            c_relocation += 1    
        elif category[2] > 0.5:
            c_volunteering += 1
        else:
            c_na += 1

    total_tweets = len(harvey_recent_df.text)
    print("File: ", file)
    print("Total tweets analyzed:", total_tweets)
    print("Total tweets classified as DONATION: {} {}%"
            .format(c_donation, np.around((c_donation/total_tweets)*100),decimals=2))
    print("Total tweets classified as RELOCATION: {} {}%"
            .format(c_relocation, np.around((c_relocation/total_tweets)*100),decimals=2))
    print("Total tweets classified as VOLUNTEERING: {} {}%"
            .format(c_volunteering, np.around((c_volunteering/total_tweets)*100),decimals=2))
    print("Total tweets classified as NOT CLASSIFIED: {} {}%"
            .format(c_na, np.around((c_na/total_tweets)*100),decimals=2))

    print("*"*50)
    
print('Total running time', time() - t0)

['donation' 'relocation' 'volunteering']
File:  harvey_august_2017.pkl
Total tweets analyzed: 984
Total tweets classified as DONATION: 365 37.0%
Total tweets classified as RELOCATION: 236 24.0%
Total tweets classified as VOLUNTEERING: 199 20.0%
Total tweets classified as NOT CLASSIFIED: 184 19.0%
**************************************************
File:  harvey_september_2017.pkl
Total tweets analyzed: 987
Total tweets classified as DONATION: 497 50.0%
Total tweets classified as RELOCATION: 133 13.0%
Total tweets classified as VOLUNTEERING: 182 18.0%
Total tweets classified as NOT CLASSIFIED: 175 18.0%
**************************************************
File:  harvey_oct_dec_2017.pkl
Total tweets analyzed: 949
Total tweets classified as DONATION: 324 34.0%
Total tweets classified as RELOCATION: 177 19.0%
Total tweets classified as VOLUNTEERING: 191 20.0%
Total tweets classified as NOT CLASSIFIED: 257 27.0%
**************************************************
File:  harvey_january_2018.pkl


In [149]:
predicted

array([[ 0.5444483 ,  0.21327866,  0.24227304],
       [ 0.42658016,  0.3072784 ,  0.26614144],
       [ 0.45202719,  0.31336168,  0.23461113],
       ..., 
       [ 0.40005084,  0.28816854,  0.31178063],
       [ 0.82042923,  0.00932606,  0.17024471],
       [ 0.42658016,  0.3072784 ,  0.26614144]])

In [15]:
train_df.groupby(['target']).size()

target
donation        85
relocation      85
volunteering    85
dtype: int64

In [25]:
import operator
coefs = [] 
words = []

tfidf_vectorizer = TfidfVectorizer(lowercase=True, stop_words="english")
tfidf_vectorizer.fit_transform(train_df.text)

clf_lr.fit(train_df.text, train_df.target)

# Create list of coefficients and words
for k,v in tfidf_vectorizer.vocabulary_.items():
    coefs.append(clf_lr.named_steps['clf'].coef_[0][tfidf_vectorizer.vocabulary_[k]])
    words.append(k)

# Create pairs of (words, coefficients)
pairs = []
for p in zip(words, coefs): 
    pairs.append(p)

# Sort pairs by coefficient
pairs.sort(reverse=True, key=operator.itemgetter(1))

# Print words with highest coefficients 
[print(k,v) for k,v in pairs[:10]]

donat 2.98541714119
blood 1.98904089138
money 1.61216513005
rais 1.18705777748
consid 0.930953543509
make 0.829254781668
fundrais 0.770181604168
brought 0.770181604168
dc 0.770181604168
autism 0.764404531788


[None, None, None, None, None, None, None, None, None, None]

In [162]:
# Top ngrams in tweets
# from sklearn.feature_extraction.text import TfidfVectorizer
# from collections import Counter

# vect = TfidfVectorizer(ngram_range=(1,2))
# summaries = "".join(clean_test_df)
# ngrams_summaries = vect.build_analyzer()(summaries)

# [bigram for bigram in Counter(ngrams_summaries).most_common(20) ]