In [1]:
# dependencies
import nltk
import string
import re
import unicodedata
import nltk.classify.util
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import twitter_samples
import pprint

# Load dependencies
from pymongo import MongoClient
from pymongo.errors import ConnectionFailure
import pandas as pd
import datetime
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split, GridSearchCV

In [2]:
# Configuration
# MongoDB location handed to MongoClient in the connection cell. None leaves
# the choice of host to pymongo's defaults (presumably a local server --
# confirm for your deployment).
mongo_host = None
# Name of the MongoDB *database* (not a collection): the connection cell uses
# it as client[client_name] and then reads the "twitter" collection from it.
client_name = "fundamentals"

In [3]:
# Create connection
try:
    client = MongoClient(mongo_host)
    # Creating the client does not prove the server is reachable, so run a
    # cheap command to force a round trip and surface connection problems
    # here. "ping" is used instead of the deprecated "ismaster" command.
    client.admin.command('ping')
except ConnectionFailure:
    print("Connection to MongoDB server could not be established")
    # Re-raise instead of calling exit(): the error stays visible in the cell
    # output and "Run All" stops here, rather than the session being ended.
    raise

# Database and collection handles used by the later cells.
db = client[client_name]
twitter_db = db.twitter

In [4]:
class Sanitizer(object):
    """
    Class for sanitizing twitter messages.

    The ``remove_*`` methods and ``lemmatize`` take a string and return a
    string; ``tokenize`` and ``sanitize`` return a list of string tokens.

    @example        ```
                    sanitizer = Sanitizer()

                    with open('mytextfile.txt', 'r') as f:
                        for line in f:
                            print(sanitizer.sanitize(line))
                    ```

    @dependencies   nltk, string, re, unicodedata
    @author         Tycho Atsma <tycho.atsma@student.uva.nl>
    @file           Sanitizer.py
    @documentation  public
    @copyright      University of Amsterdam
    """
    # Characters stripped by remove_punctuation().
    punctuation = string.punctuation
    # Stopword corpus; queried per language in remove_stopwords().
    stopwords = nltk.corpus.stopwords
    # Classes (not instances) so that subclasses can swap the implementation.
    Lemmatizer = nltk.stem.WordNetLemmatizer
    Tokenizer = nltk.tokenize.TweetTokenizer

    def __init__(self):
        """
        Constructor.
        """
        # Translation table mapping every punctuation character to "deleted".
        # Built from the class-level ``punctuation`` attribute so the two can
        # never disagree.
        self.punctuation_table = str.maketrans("", "", self.punctuation)
        self.lemmatizer = self.Lemmatizer()
        self.tokenizer = self.Tokenizer()
        # language -> frozenset of stopwords; filled lazily so the corpus is
        # queried once per language instead of once per message.
        self._stopword_cache = {}

    def remove_punctuation(self, message):
        """
        Method to remove punctuation from a twitter message.
        @param  string  Twitter message.
        @return string
        """
        return message.translate(self.punctuation_table).strip()

    def remove_stopwords(self, message, language="english"):
        """
        Method to remove stopwords from a twitter message.
        The message is split on whitespace and the remaining tokens are
        re-joined with single spaces.
        @param  string  Twitter message.
        @param  string  Language of the stopwords (default: english).
        @return string
        """
        stops = self._stopword_cache.get(language)
        if stops is None:
            # A set gives constant-time membership tests; the word list would
            # otherwise be scanned linearly for every token.
            stops = frozenset(self.stopwords.words(language))
            self._stopword_cache[language] = stops
        return " ".join(token for token in message.split() if token not in stops)

    def remove_links(self, message):
        """
        Method to remove links from a twitter message.
        @param  string  Twitter message.
        @return string
        """
        # source: https://stackoverflow.com/questions/3809401/what-is-a-good-regular-expression-to-match-a-url
        pattern = r"(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]+\.[^\s]{2,}|www\.[a-zA-Z0-9]+\.[^\s]{2,})"
        return re.sub(pattern, "", message).strip()

    def remove_usertags(self, message):
        """
        Method to remove usertags from a twitter message.
        The "@" and every word character directly following it are dropped.
        @param  string  Twitter message.
        @return string
        """
        pattern = r"(\@\w*)"
        return re.sub(pattern, "", message).strip()

    def remove_hashtags(self, message):
        """
        Method to remove hashtags from a twitter message.
        The "#" and every word character directly following it are dropped,
        so the tagged word itself is removed as well.
        @param  string  Twitter message.
        @return string
        """
        pattern = r"(\#\w*)"
        return re.sub(pattern, "", message).strip()

    def lemmatize(self, message):
        """
        Method to lemmatize a twitter message.
        Each whitespace-separated token is lemmatized on its own.
        @param  string  Twitter message.
        @return string
        """
        tokens = message.split()
        return " ".join([self.lemmatizer.lemmatize(token) for token in tokens])

    def tokenize(self, message):
        """
        Method to tokenize a twitter message.
        @param  string  Twitter message.
        @return list    Tokens produced by the configured tokenizer.
        """
        return self.tokenizer.tokenize(message)

    def sanitize(self, message):
        """
        Method to sanitize a twitter message.
        @param  string  Twitter message.
        @return list    Sanitized tokens, each longer than one character.
        """
        # 1. we need to normalize the message (lower case, NFC-composed)
        message = unicodedata.normalize("NFC", message.lower())

        # 2. we need to get rid of specific types of tokens
        message = self.remove_links(message)
        message = self.remove_usertags(message)
        message = self.remove_hashtags(message)

        # 3. we need to get rid of language noise
        # NOTE(review): punctuation is stripped before stopword removal, so a
        # contraction such as "don't" reaches the stopword filter as "dont" --
        # confirm this ordering is intended.
        message = self.remove_punctuation(message)
        message = self.remove_stopwords(message)
        message = self.lemmatize(message)

        # 4. we need to tokenize the message
        message = self.tokenize(message)

        # expose the sanitized message without single characters
        return [token for token in message if len(token) > 1]

In [55]:
# Shared Sanitizer instance used to clean the tweets in the cells below.
sanitizer = Sanitizer()

def word_feats(words):
    """
    Build a feature dict that marks every given word as present.

    @param  iterable  Words (tokens) of one text.
    @return dict      Maps each distinct word to True; duplicates collapse
                      into a single key.
    """
    return {word: True for word in words}

# Sanitize the negative example tweets of the twitter_samples corpus
# (these are tweets, not movie reviews).
neg_tweets = [sanitizer.sanitize(tweet) for tweet
              in twitter_samples.strings('negative_tweets.json')]

# Sanitize the positive example tweets of the same corpus.
pos_tweets = [sanitizer.sanitize(tweet) for tweet 
              in twitter_samples.strings('positive_tweets.json')]

# Pair the features of every negative tweet with the label 'neg'.
negfeats = [(word_feats(f), 'neg') for f in neg_tweets]

# Pair the features of every positive tweet with the label 'pos'.
posfeats = [(word_feats(f), 'pos') for f in pos_tweets]

# Split points at three quarters of each class: everything before the cutoff
# is used for training, the remaining quarter for testing.
negcutoff = int(len(negfeats)*3/4)
poscutoff = int(len(posfeats)*3/4)

In [56]:
# Training set: the negative and positive examples before their cutoffs.
trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]

# Test set: the negative and positive examples from their cutoffs onwards.
testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]

print ('train on %d instances, test on %d instances' % (len(trainfeats), len(testfeats)))

# Train a NaiveBayesClassifier
classifier = NaiveBayesClassifier.train(trainfeats)

# Test the trained classifier and display the most informative features.
print ('accuracy:', nltk.classify.util.accuracy(classifier, testfeats))
classifier.show_most_informative_features()

train on 7500 instances, test on 2500 instances
accuracy: 0.7208
Most Informative Features
                follower = True              pos : neg    =     26.2 : 1.0
                    glad = True              pos : neg    =     25.7 : 1.0
                 arrived = True              pos : neg    =     21.8 : 1.0
                     sad = True              neg : pos    =     20.3 : 1.0
                    sick = True              neg : pos    =     19.7 : 1.0
               community = True              pos : neg    =     16.3 : 1.0
                      ll = True              neg : pos    =     14.6 : 1.0
                    miss = True              neg : pos    =     13.6 : 1.0
                      aw = True              neg : pos    =     13.0 : 1.0
                     ugh = True              neg : pos    =     13.0 : 1.0


In [44]:

def word_feats(words):
    """
    Copy the given words into a new list.

    NOTE: this redefines the `word_feats` declared in an earlier cell, which
    returns a dict instead of a list; whichever cell ran last wins.

    @param  iterable  Words (tokens) of one text.
    @return list      The words in their original order, duplicates kept.
    """
    return list(words)

# Sanitize the negative example tweets of the twitter_samples corpus
# (these are tweets, not movie reviews).
neg_tweets_sk = [sanitizer.sanitize(tweet) for tweet
              in twitter_samples.strings('negative_tweets.json')]

# Sanitize the positive example tweets of the same corpus.
pos_tweets_sk = [sanitizer.sanitize(tweet) for tweet 
              in twitter_samples.strings('positive_tweets.json')]

# Pair the token list of every negative tweet with the label 'neg'.
negfeats_sk = [(word_feats(f), 'neg') for f in neg_tweets_sk]

# Pair the token list of every positive tweet with the label 'pos'.
posfeats_sk = [(word_feats(f), 'pos') for f in pos_tweets_sk]

# All negatives followed by all positives.
total_tweet_dataset = negfeats_sk + posfeats_sk
# Re-join the tokens with spaces to get one string per tweet ...
tweets_feat = [" ".join(tweet[0]) for tweet in total_tweet_dataset]
# ... and collect the matching 'neg' / 'pos' labels in the same order.
tweets_target = [tweet[1] for tweet in total_tweet_dataset]

In [50]:

# Split dataset into training set and test set
# 75% training and 25% test
# random_state is fixed to a constant so the split does not change between runs.
x_train, x_test, y_train, y_test = train_test_split(tweets_feat, tweets_target, 
                                                                        test_size=0.25,random_state=109) 

In [52]:
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
# Text classification pipeline: token counts -> TF-IDF transform ->
# multinomial Naive Bayes.
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB())])
# Hyper-parameter grid for the pipeline above. Each key is
# "<step name>__<parameter>"; the grid spans 3 * 2 * 2 * 3 = 36 combinations.
tuned_parameters = {
    'vect__ngram_range': [(1, 1), (1, 2), (2, 2)],
    'tfidf__use_idf': (True, False),
    'tfidf__norm': ('l1', 'l2'),
    'clf__alpha': [1, 1e-1, 1e-2]
}

In [53]:
from sklearn.metrics import classification_report
# Grid search over tuned_parameters with 10-fold cross-validation on the
# training split, scored by accuracy.
clf = GridSearchCV(text_clf, tuned_parameters, cv=10, scoring='accuracy')
clf.fit(x_train, y_train)

# Report precision / recall / F1 on the held-out test split.
print(classification_report(y_test, clf.predict(x_test), digits=4))

              precision    recall  f1-score   support

         neg     0.7267    0.7961    0.7598      1236
         pos     0.7801    0.7073    0.7419      1264

    accuracy                         0.7512      2500
   macro avg     0.7534    0.7517    0.7509      2500
weighted avg     0.7537    0.7512    0.7508      2500



In [140]:
# Test setup

def get_words_in_tweets(text):
    """
    Flatten a collection of tweet strings into one list of words.

    Each tweet is split on single spaces, so consecutive spaces and empty
    tweets yield empty-string entries.

    @param  iterable  Tweet strings.
    @return list      All words of all tweets, in order, duplicates kept.
    """
    return [word for tweet in text for word in tweet.split(" ")]

def get_words_in_tweets_correct(text):
    """
    Flatten labelled feature dicts into one list of words.

    @param  iterable  (feature dict, sentiment) pairs; only the dict keys
                      are used, the sentiment is ignored.
    @return list      Keys of all feature dicts, in order of appearance.
    """
    return [word for feats, _sentiment in text for word in feats.keys()]

def get_word_features(wordlist):
    """
    Reduce a list of words to its vocabulary.

    The words are counted with nltk.FreqDist, but only the keys of that
    distribution are returned; the counts themselves are discarded.

    @param  iterable  Words, duplicates allowed.
    @return           Keys view of the frequency distribution.
    """
    return nltk.FreqDist(wordlist).keys()


def extract_features_fast(document):
    """
    Build a presence/absence feature dict for one document.

    Relies on the notebook-level `tweet_word_features_set` being defined
    before the function is called.

    @param  iterable  Words of the document.
    @return dict      One entry per word in `tweet_word_features_set`,
                      True when the document contains that word.
    """
    present = frozenset(document)
    features = {}
    for word in tweet_word_features_set:
        features[word] = word in present
    return features

In [141]:
# Check if both techniques have the same length vocabulary

# Vocabulary obtained by splitting the joined tweets on spaces.
get_words = get_words_in_tweets(tweets_feat)
tweet_word_features = get_word_features(get_words)
tweet_word_features_set = set(tweet_word_features)

# Vocabulary learned by CountVectorizer with its default settings.
vector = CountVectorizer()
X = vector.fit_transform(tweets_feat)

# Show both sizes as one tuple: a notebook cell only displays its last
# expression, so two separate len(...) lines would hide the first number.
# vocabulary_ (term -> column index) is used for the vectorizer size because
# get_feature_names() is not available in newer scikit-learn releases.
len(vector.vocabulary_), len(tweet_word_features)

In [98]:
# Classify tweets

# Use MongoDB for low memory use and to be able to stop and resume: tweets
# that already carry a label are excluded by the $match stage below.

# Extract sanitized text and ID of unclassified tweets
pipeline_skl =[{"$match":{"sanitized_text": {"$exists": True }
                            # Find only tweets without a classification label
                           ,"twitter_sentiment_skl":{"$exists" : False}
                           }},
                 {"$project" :{"_id" : 1,"id" :1 , "sanitized_text": 1}}
#                  ,{"$limit": 10}
                ]
                       
# Create a cursor to iterate over
cursor_skl =twitter_db.aggregate(pipeline_skl)

In [99]:
# Classify every tweet returned by the cursor and store the label in MongoDB.
number = 0                                 # tweets classified so far
intervaltime = datetime.datetime.now()     # start of the current progress interval

for item in cursor_skl:
    # sanitized_text is joined with spaces into a single string, the same
    # form the classifier was trained on.
    # NOTE(review): assumes sanitized_text is stored as a list of tokens -- confirm.
    classification = clf.predict([" ".join(item["sanitized_text"])])

    # Write the single predicted label back onto the same document.
    twitter_db.update_one(
        filter={"_id": item["_id"]},
        update={"$set": {"twitter_sentiment_skl": classification[0]}}
    )
    number += 1

    # Progress report once every 10000 tweets.
    if number % 10000 == 0:
        # Report real seconds; formatting the timedelta itself would print
        # "H:MM:SS.ffffff" under a message that says "seconds".
        elapsed_seconds = (datetime.datetime.now() - intervaltime).total_seconds()
        print(number," tweets classified!")
        print("Classification rate is 10.000 tweets in %s seconds" % round(elapsed_seconds, 1))
        print("Last classification was: %s" % classification)
        intervaltime = datetime.datetime.now()

In [100]:
# Count the tweets that carry a twitter_sentiment_skl label
twitter_db.count_documents({"twitter_sentiment_skl":{"$exists" : True}})

347276

In [7]:
# Test equality: how often do the two stored sentiment labels agree?
# NOTE(review): movie_sentiment_skl is not written anywhere in this notebook;
# presumably it comes from a separate classification run -- confirm.
names = ['realDonaldTrump', 'HillaryClinton']
# Select US tweets that carry both labels and mention at least one of the names.
pipeline_test_eq = [{"$match" : {"place.country_code" : "US",
                                 "movie_sentiment_skl":{"$exists" : True},
                                 "twitter_sentiment_skl":{"$exists" : True},
                                 "entities.user_mentions.screen_name":{"$in": names}
                                  }},
                    # Expose the two labels under shorter column names.
                    {"$project": { "_id" : 1, "id": 1, "sentiment_movies":"$movie_sentiment_skl" ,
                                    "sentiment_tweet":"$twitter_sentiment_skl"
                                   }},
#                     {"$limit" : 0}
                   ]
test_counter = twitter_db.aggregate(pipeline_test_eq)
test_df = pd.DataFrame(test_counter)

# First shape: all selected tweets; second shape: the rows where both labels match.
print(test_df.shape)
print(test_df[test_df["sentiment_tweet"]  ==  test_df["sentiment_movies"]].shape)

(347276, 4)
(191367, 4)


In [101]:
# Screen names of interest; names[0] and names[1] are used for the two
# Mentions_* flags below.
names = ['realDonaldTrump', 'HillaryClinton']
# Select classified US tweets that mention at least one of the names.
pipeline_sen_final = [{"$match" : {"place.country_code" : "US",
                                   "twitter_sentiment_skl":{"$exists" : True},
                                   "entities.user_mentions.screen_name":{"$in": names}
                                  }},
                      # Keep the ids, the sentiment label and the state, and add one
                      # boolean per name telling whether it is among the mentioned
                      # screen names of the tweet.
                      {"$project": { "_id" : 1, "id": 1, "sentiment":"$twitter_sentiment_skl" ,
                                    "state": "$place.state",
                                    "Mentions_Trump": {"$in": [names[0],
                                                "$entities.user_mentions.screen_name"]},
                                     "Mentions_Clinton": {"$in": [names[1],
                                                "$entities.user_mentions.screen_name"]}
                                   }}]
final_counter = twitter_db.aggregate(pipeline_sen_final)

# Load the whole aggregation result into a DataFrame.
sen_final_df = pd.DataFrame(final_counter)

In [102]:
# Label every tweet by who it mentions: "both", "trump" or "clinton".
# The three masks are mutually exclusive.
series_both = sen_final_df["Mentions_Trump"] & sen_final_df["Mentions_Clinton"]
series_trump = sen_final_df["Mentions_Trump"] & ~sen_final_df["Mentions_Clinton"]
series_clinton = sen_final_df["Mentions_Clinton"] & ~sen_final_df["Mentions_Trump"]

# Start with an empty column; the first count shows how many rows are unlabelled.
sen_final_df["Mentions"] = None
print(sen_final_df["Mentions"].isna().sum())

sen_final_df.loc[series_both, "Mentions"] = "both"
sen_final_df.loc[series_trump, "Mentions"] = "trump"
sen_final_df.loc[series_clinton, "Mentions"] = "clinton"

# Rows still unlabelled after the three assignments.
print(sen_final_df["Mentions"].isna().sum())
sen_final_df.head()

347276
0


Unnamed: 0,_id,id,sentiment,state,Mentions_Trump,Mentions_Clinton,Mentions
0,5d7f499f98cd5b074bafb1a2,764039733076897792,neg,Louisiana,True,False,trump
1,5d7f499f98cd5b074bafb1a6,764039917924069376,neg,California,True,False,trump
2,5d7f499f98cd5b074bafb1a8,764039926161604608,neg,New Jersey,True,False,trump
3,5d7f499f98cd5b074bafb1a9,764039928116240384,neg,Texas,False,True,clinton
4,5d7f499f98cd5b074bafb1ab,764039948567576576,pos,Maryland,True,False,trump


In [103]:
# Number of tweets per (state, Mentions, sentiment) combination; after the
# groupby the remaining "id" column holds the count.
state_summary_inc_both = sen_final_df[["id","state", "Mentions","sentiment"]].groupby(
    ["state", "Mentions","sentiment"]).count()
# Keep only the rows whose Mentions level is "clinton" or "trump" (all states),
# which drops the tweets labelled "both".
state_summary_twitter = state_summary_inc_both.loc[(slice(None),["clinton", "trump"]),:]
# Persist the summary; the path is relative to the working directory.
state_summary_twitter.to_pickle("datasets/state_sen_summary_twitter_skl.pkl")

In [104]:
# Preview the first rows of the per-state summary.
state_summary_twitter.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,id
state,Mentions,sentiment,Unnamed: 3_level_1
Alabama,clinton,neg,526
Alabama,clinton,pos,420
Alabama,trump,neg,1921
Alabama,trump,pos,2112
Alaska,clinton,neg,111
