In [1]:
import warnings
# Install a blanket "ignore" filter: no warning of any category is shown for the
# rest of the session.
# NOTE(review): this also hides DeprecationWarning/FutureWarning messages emitted by
# the libraries used in later cells, so API deprecations go unnoticed — consider
# narrowing the filter to specific categories or modules.
warnings.filterwarnings('ignore')

---

## Load Examples

In [2]:
import pandas as pd

# Read the header-less, tab-separated example file: first column is the tweet URL,
# second column is the tweet text.
tweets = pd.read_csv(
    'data/tweets-examples.tsv',
    sep='\t',
    header=None,
    names=['id', 'message'],
)

# Keep only the last path segment of each URL as the identifier and index by it.
tweets['id'] = [url.split('/')[-1] for url in tweets['id']]
tweets = tweets.set_index('id')

---

## `vaderSentiment`

In [3]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

analyzer = SentimentIntensityAnalyzer()


def find_sentiment(tweet):
    """Classify a tweet as 'neutral', 'positive' or 'negative' with VADER.

    The decision is based solely on the ``compound`` entry of
    ``analyzer.polarity_scores(tweet)``.
    """
    compound = analyzer.polarity_scores(tweet)['compound']

    # A compound score within 0.05 of zero (bounds included) counts as neutral.
    if -0.05 <= compound <= 0.05:
        return 'neutral'

    return 'positive' if compound > 0 else 'negative'


tweets['vader'] = tweets['message'].apply(find_sentiment)

---

## TextBlob - `PatternAnalyzer`

In [4]:
from textblob import TextBlob, Blobber
from textblob.sentiments import PatternAnalyzer

blobber = Blobber(analyzer=PatternAnalyzer())


def find_sentiment(tweet):
    """Classify a tweet as 'neutral', 'positive' or 'negative' with TextBlob.

    The decision is based solely on the ``polarity`` attribute of
    ``blobber(tweet).sentiment``.
    """
    polarity = blobber(tweet).sentiment.polarity

    # A polarity within 0.05 of zero (bounds included) counts as neutral.
    if -0.05 <= polarity <= 0.05:
        return 'neutral'

    return 'positive' if polarity > 0 else 'negative'


tweets['textblob'] = tweets['message'].apply(find_sentiment)

---

## BERTweet

In [5]:
import torch
from transformers import AutoModel, AutoModelForSequenceClassification, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base", normalization=True)
bertweet = AutoModelForSequenceClassification.from_pretrained("checkpoints/checkpoint-2500/", num_labels=3)

# Tokenize all messages as one padded batch and get tensors back directly.
# truncation=True keeps an over-long message from exceeding the tokenizer's
# configured maximum length instead of failing inside the model.
encoded = tokenizer(list(tweets['message']), padding=True, truncation=True, return_tensors='pt')
input_ids = encoded['input_ids']

# The attention mask has to be forwarded together with the ids: without it the model
# attends to the padding tokens, so a tweet's prediction would depend on how long the
# other tweets in the batch are.
# Inference only, so skip autograd bookkeeping.
with torch.no_grad():
    outputs = bertweet(input_ids=input_ids, attention_mask=encoded['attention_mask'])

# outputs[0] holds the classification logits. Softmax is monotonic, so taking the
# argmax of the raw logits selects the same class without the extra (dim-less,
# deprecated) softmax call.
predictions = torch.argmax(outputs[0], dim=1).tolist()

# NOTE(review): the index -> label order is assumed to match the one used when the
# checkpoint was fine-tuned — confirm against the training code.
idx2label = {0: 'neutral', 1: 'positive', 2: 'negative'}
tweets['bertweet'] = [idx2label[pred] for pred in predictions]

Special tokens have been added in the vocabulary, make sure the associated word embedding are fine-tuned or trained.


---

## $n$-gram Naive Bayes

In [6]:
import re

# Clean and prepare messages:
eyes_regex = r'[8:=;]'
nose_regex = r"['`\-]?"

def replace_hash_text(match):
    """Build the replacement text for a hashtag regex match.

    ``match.group(1)`` is the hashtag body (the text after ``#``). An all-uppercase
    body is kept as is; any other body is split into alphanumeric pieces at word
    boundaries and before capitals, digits or underscores, and the pieces are
    joined with single spaces. The result is always prefixed with ``'<HASHTAG> '``.
    """
    tag_body = match.group(1)

    if not tag_body.isupper():
        pieces = re.findall(r'([a-zA-Z0-9]+?)(?=\b|[A-Z0-9_])', tag_body)
        tag_body = ' '.join(pieces)

    return '<HASHTAG> ' + tag_body

# Normalise the raw messages into the lower-cased, placeholder-annotated text that is
# stored in tweets['processed'].
#
# Every pattern-based substitution passes regex=True explicitly. Older pandas treated
# the pattern as a regular expression by default, but pandas >= 2.0 defaults to a
# literal match: the patterns below would then silently stop matching, and the calls
# using a callable replacement or `flags=` would raise ValueError.
processed = tweets['message']

# NOTE(review): unicode_escape-decoding text that is already a str can garble
# non-ASCII characters. Left unchanged because the pickled classifier was presumably
# trained on identically preprocessed text — confirm before altering.
processed = processed.str.decode('unicode_escape', errors='ignore')
processed = processed.str.strip('"')  # drop leading and trailing double quotes
processed = processed.str.replace('""', '"', regex=False)  # collapse doubled quotes
processed = processed.str.replace(r'https?://\S+\b|www\.(\w+\.)+\S*', '<URL>', regex=True)  # replace URLs
processed = processed.str.replace(r'([/()\[\]])', r' \1 ', regex=True)  # force splitting words appended with slashes/parentheses/brackets
processed = processed.str.replace(r'@\w+', '<USER>', regex=True)  # replace usernames
processed = processed.str.replace(r'[-+]?[.\d]*[\d]+[:,.\d]*', ' <NUMBER> ', regex=True)  # replace numbers
processed = processed.str.replace(r'#(\S+)', replace_hash_text, regex=True)  # expand hashtags

# Emoticons: an "eyes" character, an optional "nose", then a mouth (or mirrored).
processed = processed.str.replace(eyes_regex + nose_regex + r'[)d]+|[)d]+' + nose_regex + eyes_regex, '<SMILE>', flags=re.IGNORECASE, regex=True)
processed = processed.str.replace(eyes_regex + nose_regex + r'p+', '<LOLFACE>', flags=re.IGNORECASE, regex=True)
processed = processed.str.replace(eyes_regex + nose_regex + r'\(+|\)+' + nose_regex + eyes_regex, '<SADFACE>', regex=True)
processed = processed.str.replace(eyes_regex + nose_regex + r'[/|l*]', '<NEUTRALFACE>', regex=True)
processed = processed.str.replace(r'<3', '<HEART>', regex=True)

processed = processed.str.replace(r'([!?.]){2,}', r'\1 <REPEAT>', regex=True)  # mark punctuation repetitions (eg. "!!!" => "! <REPEAT>")
processed = processed.str.replace(r'\b(\S*?)(.)\2{2,}\b', r'\1\2 <ELONG>', regex=True)  # mark elongated words (eg. "wayyyy" => "way <ELONG>")
processed = processed.str.replace(r'\s+', ' ', regex=True)  # collapse every whitespace run into one space
processed = processed.str.strip()
processed = processed.str.lower()

tweets['processed'] = processed

In [7]:
import nltk

tokenizer = nltk.tokenize.TweetTokenizer()

def features(tweet):
    """Turn a tweet into a presence-feature dict of its 1-, 2- and 3-grams.

    The tweet is split with the module-level ``tokenizer``. Each token, and each
    space-joined bigram and trigram of consecutive tokens, becomes a key mapped
    to ``True``.
    """
    words = tokenizer.tokenize(tweet)

    # Unigrams first, then bigrams, then trigrams (same insertion order as merging
    # three separate dicts).
    present = dict.fromkeys(words, True)
    for gram in nltk.bigrams(words):
        present[' '.join(gram)] = True
    for gram in nltk.trigrams(words):
        present[' '.join(gram)] = True

    return present

In [8]:
import pickle

# Restore the previously trained classifier from disk.
# NOTE(review): unpickling executes code embedded in the file — only load a pickle
# that comes from a trusted source.
with open('nb-classifier.pickle', 'rb') as handle:
    classifier = pickle.load(handle)

# Build the n-gram feature dict for every preprocessed tweet, classify it, then
# discard the intermediate 'processed' column.
feature_sets = tweets['processed'].apply(features)
tweets['nb'] = feature_sets.apply(classifier.classify)
tweets = tweets.drop(columns=['processed'])

# classifier.show_most_informative_features(100)

---

In [9]:
tweets

Unnamed: 0_level_0,message,vader,textblob,bertweet,nb
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1373967439944425475,Why the EU Should Not Create a Separate AI Pro...,negative,neutral,neutral,negative
1383337183013072904,Delighted to see this finally published: Trust...,positive,positive,positive,positive
1395764999029657607,Great congratulations to Bashar Nuseibeh @BNus...,positive,positive,positive,positive
1397891830306512899,I’m not enjoying this spider! #spider,negative,negative,negative,positive
1396373953208332288,"Well done UK, we did ourselves proud once agai...",positive,positive,positive,negative
1395686278813859845,So excited to have @TLevingstone at our #CRTAI...,negative,positive,positive,positive
1387447572256657414,We mourn the passing of Apollo 11 astronaut Mi...,positive,positive,positive,negative
1397881733937238016,I really enjoyed talking to the amazing & bril...,positive,positive,positive,positive
1395788559953670154,Thank you for the opportunity to speak to the ...,positive,positive,positive,positive


In [10]:
# Turn each status id in the index back into a tweet URL (the resulting index is
# unnamed) before exporting.
tweets.index = ['https://twitter.com/profile/status/' + str(s_id) for s_id in tweets.index]

tweets.to_csv('outputs/example-predictions.tsv', sep='\t')