In [1]:
# dependencies
import nltk
import string
import re
import unicodedata
import nltk.classify.util
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import twitter_samples
import pprint

# Load dependencies
from pymongo import MongoClient
from pymongo.errors import ConnectionFailure
import pandas as pd
import datetime
import numpy as np
import sklearn as sk

In [2]:
# Configuration
# Location MongoDB:
mongo_host = None
# Name collection
client_name = "fundamentals"

In [3]:
# Create connection
try:
    client = MongoClient(mongo_host)
    client.admin.command('ismaster')
    db = client[client_name] 
    twitter_db = db.twitter
    
except ConnectionFailure:
    print("Connection to MongoDB server could not be established")
    exit()

In [4]:
class Sanitizer(object):
    """
    Class for sanitizing twitter messages.

    @example        ```
                    sanitizer = Sanitizer()

                    with open('mytextfile.txt', 'r') as f:
                        for line in f:
                            print(sanitizer.sanitize(line))
                    ```

    @dependencies   nltk, string, re
    @author         Tycho Atsma <tycho.atsma@student.uva.nl>
    @file           Sanitizer.py
    @documentation  public
    @copyright      University of Amsterdam
    """
    punctuation = string.punctuation
    stopwords = nltk.corpus.stopwords
    Lemmatizer = nltk.stem.WordNetLemmatizer
    Tokenizer = nltk.tokenize.TweetTokenizer

    def __init__(self):
        """
        Constructor.
        """
        self.punctuation_table = dict((ord(char), None) for char in string.punctuation) 
        self.lemmatizer = self.Lemmatizer()
        self.tokenizer = self.Tokenizer()

    def remove_punctuation(self, message):
        """
        Method to remove punctuation from a twitter message.
        @param  string  Twitter message.
        @return string
        """
        return message.translate(self.punctuation_table).strip()

    def remove_stopwords(self, message, language="english"):
        """
        Method to remove stopwords from a twitter message.
        @param  string  Twitter message.
        @param  string  Language of the stopwords (default: english).
        @return string
        """
        stops = self.stopwords.words(language)
        tokens = message.split()
        return " ".join([token for token in tokens if token not in stops]).strip()

    def remove_links(self, message):
        """
        Method to remove links from a twitter message.
        @param  string  Twitter message.
        @return string
        """
        # source: https://stackoverflow.com/questions/3809401/what-is-a-good-regular-expression-to-match-a-url
        pattern = r"(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]+\.[^\s]{2,}|www\.[a-zA-Z0-9]+\.[^\s]{2,})"
        return re.sub(pattern, "", message).strip()

    def remove_usertags(self, message):
        """
        Method to remove usertags from a twitter message.
        @param  string  Twitter message.
        @return string
        """
        pattern = r"(\@\w*)"
        return re.sub(pattern, "", message).strip()

    def remove_hashtags(self, message):
        """
        Method to remove hashtags from a twitter message.
        @param  string  Twitter message.
        @return string
        """
        pattern = r"(\#\w*)"
        return re.sub(pattern, "", message).strip()

    def lemmatize(self, message):
        """
        Method to lemmatize a twitter message.
        @param  string  Twitter message.
        @return string
        """
        tokens = message.split()
        return " ".join([self.lemmatizer.lemmatize(token) for token in tokens])

    def tokenize(self, message):
        """
        Method to tokenize a twitter message.
        @param  string  Twitter message.
        @return string
        """
        return self.tokenizer.tokenize(message)

    def sanitize(self, message):
        """
        Method to sanitize a twitter message.
        @param  string  Twitter message.
        @return string
        """
        # 1. we need to normalize the message
        message = unicodedata.normalize("NFC", message.lower().encode('utf8').decode('utf8'))

        # 2. we need to get rid of specific types of tokens
        message = self.remove_links(message)
        message = self.remove_usertags(message)
        message = self.remove_hashtags(message)

        # 3. we need to get rid of language noise
        message = self.remove_punctuation(message)
        message = self.remove_stopwords(message)
        message = self.lemmatize(message)

        # 4. we need tokenize the message
        message = self.tokenize(message)

        # expose the sanitized message without single characters
        return [token for token in message if len(token) > 1]

In [6]:
sanitizer = Sanitizer()

# A function that extracts which words exist in a text based on a list of words to which we compare.
def word_feats(words):
        return dict([(word, True) for word in words])

# Get the negative reviews for movies    
neg_tweets = [sanitizer.sanitize(tweet) for tweet
              in twitter_samples.strings('negative_tweets.json')]

# Get the positive reviews for movies
pos_tweets = [sanitizer.sanitize(tweet) for tweet 
              in twitter_samples.strings('positive_tweets.json')]

# Find the features that most correspond to negative reviews    
negfeats = [(word_feats(f), 'neg') for f in neg_tweets]

# Find the features that most correspond to positive reviews
posfeats = [(word_feats(f), 'pos') for f in pos_tweets]

# We would only use 7500 instances to train on. The quarter of the reviews left is for testing purposes.
negcutoff = int(len(negfeats)*3/4)
poscutoff = int(len(posfeats)*3/4)

In [7]:
# Construct the training dataset containing 50% positive reviews and 50% negative reviews
trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]

# Construct the negative dataset containing 50% positive reviews and 50% negative reviews
testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]

print ('train on %d instances, test on %d instances' % (len(trainfeats), len(testfeats)))

# Train a NaiveBayesClassifier
classifier = NaiveBayesClassifier.train(trainfeats)

# Test the trained classifier and display the most informative features.
print ('accuracy:', nltk.classify.util.accuracy(classifier, testfeats))
classifier.show_most_informative_features()

train on 7500 instances, test on 2500 instances
accuracy: 0.7208
Most Informative Features
                follower = True              pos : neg    =     26.2 : 1.0
                    glad = True              pos : neg    =     25.7 : 1.0
                 arrived = True              pos : neg    =     21.8 : 1.0
                     sad = True              neg : pos    =     20.3 : 1.0
                    sick = True              neg : pos    =     19.7 : 1.0
               community = True              pos : neg    =     16.3 : 1.0
                      ll = True              neg : pos    =     14.6 : 1.0
                    miss = True              neg : pos    =     13.6 : 1.0
              definitely = True              pos : neg    =     13.0 : 1.0
                     ugh = True              neg : pos    =     13.0 : 1.0


In [8]:
# # Load in tweets:

# # Create pickle of DataFrame of all us tweets with added state locations and mentions
# names = ['realDonaldTrump', 'HillaryClinton']

# print("Number of US country tweets", twitter_db.count_documents(filter = 
#  {"place.country_code" : "US"}))


# # Set up pipeline to find tweets mentioning one or both candidates 
# # and create columns with booleans for the mention of each candidate
# pipeline_sen_state = [{"$match" : {"place.country_code" : "US",
#                                    "place.state":{"$exists" : True},
#                                    "sanitized_text":{"$exists" : True},
#                                    "entities.user_mentions.screen_name":{"$in": names}
#                                   }},
#                       {"$project": { "_id" : 1, "id": 1,"text": 1, "sanitized_text" :1}}
#                         ]
# sen_counter = twitter_db.aggregate(pipeline_sen_state)

# sen_state_df = pd.DataFrame(sen_counter)
# print("Number of tweets from the US mentioning one or both candidates: "+str(
#     sen_state_df.shape[0]))

In [9]:
# sen_state_df.head()

In [10]:
# Sanitize tweet text

# sen_state_df["sanitized_text"] = [sanitizer.sanitize(tweet) for tweet 
#                               in sen_state_df["text"]]

# sen_state_df.head()

In [11]:
# # Drop empty rows

# sen_state_df = sen_state_df[sen_state_df['sanitized_text'].map(lambda d: len(d)) > 0]

# sen_state_df.head()

In [12]:
# Output Sanitized text into MongoDB

# matched = 0
# modified = 0
# for tweet in sen_state_df.itertuples():
#     result = twitter_db.update_many(
#         filter = {"_id" : tweet[1]},
#         update = {"$set": { "sanitized_text" : tweet.sanitized_text}}
#                                    )
#     matched += result.matched_count
#     modified += result.modified_count


# print("Matched: "+str(matched)+", Modified: "+  str(modified))


In [13]:
# pipeline_san =[{"$match":{"_id" : sen_state_df["_id"].values[0],
#                "sanitized_text": {"$exists": True }}},
#             {"$project" :{"_id" : 1, "Feat": {"$in":['collusion',"$sanitized_text"]}}}]
                       

# cursor =twitter_db.aggregate(pipeline_san)
# test = list(cursor)
# test

In [14]:
# Create a dictionary measuring word frequencies
def get_words_in_tweets(text):
    all_words = []
    for words in text:
        all_words.extend(words)
    return all_words

def get_words_in_tweets_correct(text):
    all_words = []
    for words, sentiment in text:
        all_words.extend(words.keys())
    return all_words

In [15]:
get_words = get_words_in_tweets_correct(trainfeats)
# get_words = get_words_in_tweets(sen_state_df["sanitized_text"])

In [16]:
# Input: the list of words
# Output: the frequency of those words apearing in tweets
def get_word_features(wordlist):
    wordlist = nltk.FreqDist(wordlist)
    word_features = wordlist.keys()
    return word_features

In [17]:
tweet_word_features = get_word_features(get_words)
tweet_word_features_set = set(tweet_word_features)
len(tweet_word_features_set)

8906

In [18]:
# tweet_word_features_filtered = dict()
# wordlist = nltk.FreqDist(get_words)
# for k, v in wordlist.items():
#     # Filter out all words that are only once in the data
#     if v > 1:
#         tweet_word_features_filtered[k] = v

# tweet_word_features_filtered = tweet_word_features_filtered.keys()
# len(tweet_word_features_filtered)
    

In [19]:
# Construct our features based on which tweets contain which word
def extract_features(document):    
    document_words = set(document)
    features = {}
    for word in tweet_word_features:
        features[word] = (word in document)
    return features

def extract_features_fast(document):
    document_words = set(document)
    features = {word: (word in document_words) for word in  tweet_word_features_set}
    return features

In [20]:
# Test which extractor is faster
import timeit
print(timeit.timeit('extract_features(["this", "is", "a", "test", "follower"])',
                    "from __main__ import extract_features", number=100))
print(timeit.timeit('extract_features_fast(["this", "is", "a", "test", "follower"])',
                    "from __main__ import extract_features_fast", number=100))

# print(timeit.timeit('classifier.classify(extract_features_fast(["this", "is", "a", "test", "follower"]))',
#                     "from __main__ import extract_features_fast, classifier", number=100))
extract_features(["this", "is", "a", "test", "follower"]) == extract_features_fast(["this", "is", "a", "test", "follower"])

0.19429867499999887
0.12867898800000432


True

In [21]:
# del sen_state_df

In [22]:
# Extra code used to delete field from mondogDB

# result = twitter_db.update_many( filter = {"twitter_sentiment_correct": {"$exists": True }},
#                        update = {"$unset": { "twitter_sentiment_correct" : False }})
# matched = result.matched_count
# modified = result.modified_count
# print("Matched: "+str(matched)+", Modified: "+  str(modified))

In [23]:
# pipeline_feat_test =[{"$match":{"twitter_sentiment":{"$exists" : True}}},
#                      {"$project" :{"_id" : 1,"id" :1 ,"state": "$place.state",
#                                    "Mentions_Trump": {"$in": [names[0],
#                                                 "$entities.user_mentions.screen_name"]},
#                                    "Mentions_Clinton": {"$in": [names[1],
#                                                 "$entities.user_mentions.screen_name"]},
#                                    "twitter_sentiment": 1}},
#                     {"$limit" : 2}]
                       

# cursor =twitter_db.aggregate(pipeline_feat_test)
# test = list(cursor)
# for item in test:
#     print(item["twitter_sentiment"])

In [24]:
# for item in test:
#     print(item["features"])
#     print(item["id"],classifier.classify(item["features"]))

In [25]:
# list(sen_state_df["_id"][:2].values[slice(2)])

In [36]:
# Classify tweets

# Use mongoDB for low memory use and being able to stop and start

# Extract sanitized text and ID of unclassified tweets
pipeline_class =[{"$match":{"sanitized_text": {"$exists": True }
                            # Find only tweets without a classification label
                           ,"twitter_sentiment_correct":{"$exists" : False}
                           }},
                 {"$project" :{"_id" : 1,"id" :1 , "sanitized_text": 1}}
#                  ,{"$limit": 10}
                ]
                       
# Create cursur to itterate over
cursor_collection =twitter_db.aggregate(pipeline_class)

In [37]:
number = 0
intervaltime=datetime.datetime.now()
# t_sample_result = []

for item in cursor_collection:
    features = extract_features_fast(item["sanitized_text"])
    classification = classifier.classify(features)
    twitter_db.update_one(
        filter = {"_id" : item["_id"]},
        update = {"$set": { "twitter_sentiment_correct" : classification}}
    )
#     result = (item["id"],classification)
#     t_sample_result.extend(result)
    number+= 1
    
    if number % 1000 == 0:
        print(number," tweets classified!")
        print("Classification rate is 1000 tweets in %s seconds" % str(datetime.datetime.now()-intervaltime))
        print("Last classification was: %s" % classification)
        intervaltime=datetime.datetime.now()


1000  tweets classified!
Classification rate is 1000 tweets in 0:01:14.896998 seconds
Last classification was: pos
2000  tweets classified!
Classification rate is 1000 tweets in 0:00:58.949378 seconds
Last classification was: pos
3000  tweets classified!
Classification rate is 1000 tweets in 0:01:06.724490 seconds
Last classification was: neg
4000  tweets classified!
Classification rate is 1000 tweets in 0:01:04.434036 seconds
Last classification was: neg
5000  tweets classified!
Classification rate is 1000 tweets in 0:00:56.443005 seconds
Last classification was: neg
6000  tweets classified!
Classification rate is 1000 tweets in 0:00:52.427199 seconds
Last classification was: neg
7000  tweets classified!
Classification rate is 1000 tweets in 0:00:53.314005 seconds
Last classification was: neg
8000  tweets classified!
Classification rate is 1000 tweets in 0:01:03.813025 seconds
Last classification was: neg
9000  tweets classified!
Classification rate is 1000 tweets in 0:01:03.493012 se

KeyboardInterrupt: 

In [None]:
names = ['realDonaldTrump', 'HillaryClinton']
pipeline_sen_final = [{"$match" : {"place.country_code" : "US",
                                   "twitter_sentiment_correct":{"$exists" : True},
                                   "entities.user_mentions.screen_name":{"$in": names}
                                  }},
                      {"$project": { "_id" : 1, "id": 1, "sentiment":"$twitter_sentiment_correct" ,
                                    "state": "$place.state",
                                    "Mentions_Trump": {"$in": [names[0],
                                                "$entities.user_mentions.screen_name"]},
                                     "Mentions_Clinton": {"$in": [names[1],
                                                "$entities.user_mentions.screen_name"]}
                                   }}]
final_counter = twitter_db.aggregate(pipeline_sen_final)

sen_final_df = pd.DataFrame(final_counter)

In [35]:
# Count total amount of classified tweets
twitter_db.count_documents({"twitter_sentiment_correct":{"$exists" : True}})

234247

In [None]:
# # The tweet we are about to classify
# san_text = sen_state_df['sanitized_text'].tolist()

# sent_list= [""]*len(sen_state_df['sanitized_text'].tolist())
# feat_list = [extract_features(t) for t in san_text[:5]]


In [None]:
# print(san_text[0],feat_list[0])

In [None]:
# number = 0

# for feat in feat_list:
#     starttime=datetime.datetime.now()
#     foo = classifier.classify(feat)
#     sent_list[number] = foo
#     number+= 1
    
#     if number % 100 == 0:
#         print(number," tweets classified!")
#         print("Classification rate is 100 tweets in %s seconds" % str(datetime.datetime.now()-starttime))
#         print("Last classification was: %s" % foo)

        

In [None]:
# total_df['sentiment'] = pd.Series(sent_list)

In [None]:
# Assign mentions column to Trump, Clinton or Both
series_both = (sen_final_df["Mentions_Trump"] & sen_final_df["Mentions_Clinton"])
series_trump = (sen_final_df["Mentions_Trump"] & ~sen_final_df["Mentions_Clinton"])
series_clinton = (~sen_final_df["Mentions_Trump"] & sen_final_df["Mentions_Clinton"])
sen_final_df["Mentions"] = None
print(sum(sen_final_df["Mentions"].isna()))
sen_final_df.loc[series_both,"Mentions"] = "both"
sen_final_df.loc[series_trump,"Mentions"] = "trump"
sen_final_df.loc[series_clinton,"Mentions"] = "clinton"
print(sum(sen_final_df["Mentions"].isna()))
sen_final_df.head()

In [None]:
state_summary_inc_both = sen_final_df[["id","state", "Mentions","sentiment"]].groupby(
    ["state", "Mentions","sentiment"]).count()
state_summary_twitter = state_summary_inc_both.loc[(slice(None),["clinton", "trump"]),:]
state_summary_twitter.to_pickle("datasets/state_sen_summary_twitter_sample.pkl")

In [None]:
state_summary_twitter.head()