In [119]:
#Install packages
#!pip install python-twitter
#!pip install TwitterAPI

In [120]:
# Import packages
import json
import re
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from TwitterAPI import TwitterAPI
from nltk import PorterStemmer, WordNetLemmatizer, LancasterStemmer
from nltk.tokenize import word_tokenize
from string import punctuation
from bs4 import BeautifulSoup
from nltk.corpus import stopwords 

In [121]:
train_file = "/Users/nescobar/Dropbox/Indiana/Social_Media_Mining/Project/smm2018/data/raw/csv/annotation/training_set_final_4.csv"
test_file = "/Users/nescobar/Dropbox/Indiana/Social_Media_Mining/Project/smm2018/data/raw/csv/annotation/random_new_t.csv"

consumer_key = ''
consumer_secret = ''
access_token_key = ''
access_token_secret= ''

In [122]:
#api = TwitterAPI(consumer_key, consumer_secret, auth_type='oAuth2')

In [131]:
class ProcessTweets:
    
    def __init__(self):
        self.normalization = 'pt_stem'
        self._stopwords=set(stopwords.words('english')+list(punctuation)+['AT_USER','URL','IMG']
                                                                        +['twitter','com','twitter.com','pic','hurricaneharvey'])
                                                                         #'texas','houston'])
        
    def get_data(query="", source="file", path=train_file, feed="search/tweets",api=None, maxid=0, n=100):
        try:
            if source == "file":
                harvey_df = pd.read_csv(path, encoding = 'ISO-8859-1')
                return harvey_df
            #else:
            #    if maxid == 0:
            #        return pd.read_json(json.dumps([t for t in api.request(feed, {'q':query,'count':n})]))
            #    else:
            #        return pd.read_json(json.dumps([t for t in api.request(feed, {'q': query, 'count': n, 'max_id': maxid})]))            
        except:
            print("Error while getting data")
            return None
    
    def process_tweets(self, list_of_tweets):
        processed_tweets=[]
        for tweet in list_of_tweets:
            processed_tweets.append((self._process_tweet(tweet)))
        return processed_tweets
    
    def _process_tweet(self,tweet):
        try:
            # Unescape from HTML
            #tweet = html.unescape(tweet)
            #tweet = BeautifulSoup(tweet, 'lxml').get_text()
            # 3a. Convert to lower case
            tweet = tweet.lower()
            # 3b. Replace links with the word URL 
            tweet = re.sub('((www\.[^\s]+)|(https?://[^\s]+))',' ',tweet) 
            # 3c. Replace @username with "AT_USER"
            tweet = re.sub('@[^\s]+',' ',tweet)                
            # 3d. Replace #word with word 
            tweet = re.sub(r'#([^\s]+)',r'\1',tweet)
            # 3e. Replace images with the word IMG 
            tweet = re.sub(r'\bpic.twitter.com\s+', ' ', tweet)
            # 3f Keep only words with letters
            tweet = re.sub('[^a-zA-Z]',' ',tweet)
            # 3g. Remove RT
            tweet = re.sub(r'\brt([\b\s])', ' ', tweet)
            
            # Apply Lemmatization
            if (self.normalization == 'wn_lem'):
                lemmatizer = WordNetLemmatizer() 
                tweet = [lemmatizer.lemmatize(word) for word in tweet.split() if word not in self._stopwords and len(word)>1]  
            elif (self.normalization == 'lc_stem'):
                st = LancasterStemmer()
                tweet = [st.stem(word) for word in tweet.split() if word not in self._stopwords and len(word)>1]  
            elif (self.normalization == 'pt_stem'):
                pt = PorterStemmer()
                tweet = [pt.stem(word) for word in tweet.split() if word not in self._stopwords and len(word)>1]  
            else:
                lemmatizer = WordNetLemmatizer() 
                tweet = [lemmatizer.lemmatize(word) for word in tweet.split() if word not in self._stopwords and len(word)>1]  
            
            return (" ".join(tweet)).strip()
        
        except:
            print("Error with tweet: ", tweet)
            return None

In [132]:
# Process tweets from file
#tweet_processor = ProcessTweets()
#raw_tweets = tweet_processor.get_data()
#train_df = pkl.load(open("training_set.pkl", "rb"))
#raw_tweets.drop('Unnamed: 0', 1, inplace=True)
#raw_tweets.columns=['text','label']
#[print(t) for t in raw_tweets['text'][:10]]
#train_df['text'] = tweet_processor.process_tweets(raw_tweets['text'])
#train_df.columns = ['text','target']

In [133]:
# Create train DF 
from sklearn.utils import shuffle
import pickle as pkl

#train_df = pd.DataFrame(cleaned_tweets, columns=['text'])
#train_df = pkl.load(open("training_set.pkl", "rb"))
#train_df['target'] = raw_tweets.label
#train_df = shuffle(train_df)
#train_df=train_df.drop_duplicates('text')
#train_df.info()
#train_df
tweet_processor = ProcessTweets()
train_df = pkl.load(open("training_set.pkl", "rb"))
train_df['text'] = tweet_processor.process_tweets(raw_tweets['text'])
train_df.columns = ['text','target']
train_df = shuffle(train_df)

In [134]:
# Testing with different classifiers
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from time import time
from sklearn.pipeline import Pipeline
from sklearn import metrics

# Start running time
t0 = time()

# Build a pipeline that contains vectorizer, transform and classifier

# Multinomial NB
clf = Pipeline([('tfidf', TfidfVectorizer(lowercase=True, stop_words="english")),
                 ('clf', MultinomialNB()),
                ])
scores = cross_val_score(clf, train_df.text, train_df.target, cv=5)
print("Multinomial Naive Bayes: ", np.mean(scores))

# Logistic Regression
clf_lr = Pipeline([('tfidf', TfidfVectorizer(lowercase=True, stop_words="english")),
                 ('clf', LogisticRegression()),
                ])
scores = cross_val_score(clf_lr, train_df.text, train_df.target, cv=5)
print("Logistic Regression: ", np.mean(scores))

# Logistic Regression with SGD 
clf_lr_sgd = Pipeline([('tfidf', TfidfVectorizer(lowercase=True, stop_words="english")),
                 ('clf', SGDClassifier(loss='log', penalty='l1',
                                            alpha=1e-3, random_state=42,
                                            max_iter=100, tol=None)),
                ])
scores = cross_val_score(clf_lr_sgd, train_df.text, train_df.target, cv=5)
print("Logistic Regression with SGD: ", np.mean(scores))

# SVM
clf = Pipeline([('tfidf', TfidfVectorizer(lowercase=True, stop_words="english")),
                 ('clf', SVC()),
                ])
scores = cross_val_score(clf, train_df.text, train_df.target, cv=5)
print("SVM: ", np.mean(scores))

# SVM with SGD (Stochastic Gradient Descent) 
clf = Pipeline([('tfidf', TfidfVectorizer(lowercase=True, stop_words="english")),
                 ('clf', SGDClassifier(loss='hinge', penalty='l1',
                                            alpha=1e-3, random_state=42,
                                            max_iter=100, tol=None)),
                ])
scores = cross_val_score(clf, train_df.text, train_df.target, cv=5)
print("SVM with SGD: ", np.mean(scores))

# Print running time for training and predicting
print('Total running time', time() - t0)


Multinomial Naive Bayes:  0.878431372549
Logistic Regression:  0.917647058824
Logistic Regression with SGD:  0.933333333333
SVM:  0.870588235294
SVM with SGD:  0.929411764706
Total running time 0.4950888156890869


In [137]:
# Parameter tuning with Grid Search
from sklearn.model_selection import GridSearchCV

parameters = {'tfidf__ngram_range' : [(1,1),(1,2)],
             'clf__alpha': (1e-1, 1e-3, 1e-5),
             'clf__max_iter': (10,50,100),
             'clf__penalty': ('l1','l2','elasticnet'),
             'clf__fit_intercept': (True,False),
             'clf__class_weight': (None,'balanced'),
             'clf__warm_start': (True, False)
             }

gs_clf = GridSearchCV(clf_lr_sgd, parameters, n_jobs=-1)
gs_clf.fit(train_df.text, train_df.target)

print(gs_clf.best_score_)
print(gs_clf.best_params_)

0.933333333333
{'clf__alpha': 0.001, 'clf__class_weight': None, 'clf__fit_intercept': True, 'clf__max_iter': 50, 'clf__penalty': 'l1', 'clf__warm_start': True, 'tfidf__ngram_range': (1, 1)}


In [138]:
from time import time
from sklearn.pipeline import Pipeline
from sklearn import metrics
import pickle as pkl
import numpy as np

# Start running time
t0 = time()

# Get test data from recent tweets
harvey_recent_df = pkl.load(open("harvey_august_2017.pkl", "rb"))
#harvey_tweets = tweet_processor.get_data(path=test_file)
#harvey_tweets.columns=['text']
# Drop duplicates
harvey_recent_df.drop_duplicates('text', inplace=True)

# Clean up testing data set
clean_test_df = tweet_processor.process_tweets(harvey_recent_df.text)
#harvey_tweets.drop_duplicates('text', inplace=True)
#clean_test_df = tweet_processor.process_tweets(harvey_tweets['text'])
#clean_test_df
# Test the classifier
#gs_clf.fit(train_df.text, train_df.target)
predicted = gs_clf.predict_proba(clean_test_df)

# Print running time for training and predicting
print('Total running time', time() - t0)

#print(metrics.classification_report(y_test, predicted))
print(gs_clf.classes_)

#print(metrics.classification_report(y_test, predicted))
# Print predicted category for each tweet
c_donation=c_other=c_relocation=c_volunteering=c_na=0
for tweet, category in zip(harvey_recent_df.text, predicted):
    #print('%r => %s' % (tweet, category))
    # Consider only predictions with > 0.5 probability
    if category[0] > 0.5:
        c_donation += 1
        #print('%r => %s' % (tweet, category))
    #elif category[1] > 0.5:
    #    c_other += 1
    elif category[1] > 0.5:
        c_relocation += 1
        
    elif category[2] > 0.5:
        c_volunteering += 1
    else:
        c_na += 1
        

total_tweets = len(harvey_recent_df.text)
print("Total tweets analyzed", total_tweets)
print("Total tweets classified as DONATION {} {}%"
      .format(c_donation, (c_donation/total_tweets)*100))
#print("Total tweets classified as OTHER {} {}%"
#      .format(c_other, (c_other/total_tweets)*100))
print("Total tweets classified as RELOCATION {} {}%"
      .format(c_relocation, (c_relocation/total_tweets)*100))
print("Total tweets classified as VOLUNTEERING {} {}%"
      .format(c_volunteering, (c_volunteering/total_tweets)*100))
print("Total tweets classified as NOT CLASSIFIED {} {}%"
      .format(c_na, (c_na/total_tweets)*100))


Total running time 0.39755702018737793
['donation' 'relocation' 'volunteering']
Total tweets analyzed 960
Total tweets classified as DONATION 340 35.41666666666667%
Total tweets classified as RELOCATION 25 2.604166666666667%
Total tweets classified as VOLUNTEERING 36 3.75%
Total tweets classified as NOT CLASSIFIED 559 58.229166666666664%


In [326]:
predicted

array([[ 0.57718519,  0.11725008,  0.30556473],
       [ 0.36818036,  0.31979225,  0.31202739],
       [ 0.76682582,  0.03907531,  0.19409887],
       ..., 
       [ 0.35040743,  0.32606952,  0.32352305],
       [ 0.76594021,  0.05720387,  0.17685592],
       [ 0.40584469,  0.30224107,  0.29191424]])

In [98]:
train_df.groupby(['target']).size()

target
donation        85
relocation      85
volunteering    85
dtype: int64

In [99]:
import operator
coefs = [] 
words = []

tfidf_vectorizer = TfidfVectorizer(lowercase=True, stop_words="english")
tfidf_vectorizer.fit_transform(train_df.text)

# Create list of coefficients and words
for k,v in tfidf_vectorizer.vocabulary_.items():
    coefs.append(clf.named_steps['clf'].coef_[0][tfidf_vectorizer.vocabulary_[k]])
    words.append(k)

# Create pairs of (words, coefficients)
pairs = []
for p in zip(words, coefs): 
    pairs.append(p)

# Sort pairs by coefficient
pairs.sort(reverse=True, key=operator.itemgetter(1))

# Print words with highest coefficients 
[print(k,v) for k,v in pairs[:10]]

AttributeError: 'SGDClassifier' object has no attribute 'coef_'

In [100]:
# Top ngrams in tweets
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter

#filter_file = "/Users/nescobar/Dropbox/Indiana/Social_Media_Mining/Project/data/csv/filter_tweets_water.csv"

#filtered_tweets = tweet_processor.get_data(path=filter_file)
#cleaned_filtered_tweets = tweet_processor.process_tweets(filtered_tweets['text'])

vect = TfidfVectorizer(ngram_range=(1,2))
summaries = "".join(clean_test_df)
ngrams_summaries = vect.build_analyzer()(summaries)

[bigram for bigram in Counter(ngrams_summaries).most_common(20) ]

[('help', 136),
 ('donateaphoto', 131),
 ('houston', 122),
 ('hur', 114),
 ('texa', 104),
 ('harvey', 91),
 ('jnj', 87),
 ('jnj donateaphoto', 87),
 ('hurricanemar', 81),
 ('hurricaneirm', 79),
 ('christmas', 77),
 ('famy', 73),
 ('stil', 67),
 ('new', 64),
 ('don', 61),
 ('hom', 59),
 ('nee', 59),
 ('us', 57),
 ('via', 55),
 ('flood', 54)]