In [1]:
import warnings
# Blanket filter: from this point on every warning category is suppressed,
# which also hides deprecation notices emitted by the libraries used below.
warnings.filterwarnings('ignore')

In [2]:
import nltk
from nltk.classify import NaiveBayesClassifier

import csv
import pandas as pd
import re
import tqdm

In [3]:
# Training tweets: tab-separated file read without a header row; only the
# first three columns are loaded and the first one (id) becomes the index.
tweets = pd.read_csv(
    'data/twitter-train.tsv',
    sep='\t',
    names=['id', 'label', 'message'],
    usecols=[0, 1, 2],
    index_col=0,
    dtype={'label': 'category'},
    quoting=csv.QUOTE_NONE,  # double quotes are kept as ordinary characters
)

In [4]:
# Clean and prepare messages:
# Character-class fragments concatenated into the emoticon patterns applied to
# the messages further down in this cell.
eyes_regex = r'[8:=;]'  # exactly one "eyes" character: 8, :, = or ;
nose_regex = r"['`\-]?"  # an optional "nose" character: ', ` or -
def replace_hash_text(match):
    """Build the replacement text for one hashtag match.

    ``match`` is a regex match whose first group holds the hashtag text
    without the leading ``#``.  The result always starts with ``'<HASHTAG> '``.
    Text for which ``str.isupper()`` is true is appended unchanged; any other
    text is broken into alphanumeric chunks (a chunk ends at a word boundary
    or right before an upper-case letter, digit or underscore) and the chunks
    are joined with single spaces.
    """
    body = match.group(1)
    if not body.isupper():
        chunks = re.findall(r'([a-zA-Z0-9]+?)(?=\b|[A-Z0-9_])', body)
        body = ' '.join(chunks)
    return '<HASHTAG> ' + body


def clean_messages(messages):
    """Normalise raw tweet text into the lower-cased, placeholder-tagged form.

    The steps run in the order written below and several of them depend on
    that order (e.g. URLs are replaced before slashes are padded).

    Every pattern-based ``Series.str.replace`` call passes ``regex=True``
    explicitly: pandas >= 2.0 defaults to literal matching (``regex=False``),
    under which the patterns below would no longer be interpreted and the
    callable replacement / ``flags=`` arguments are rejected.

    Parameters
    ----------
    messages : pandas.Series
        Raw tweet texts.

    Returns
    -------
    pandas.Series
        A new Series of cleaned texts; ``messages`` itself is not modified.
    """
    # Emoticon patterns assembled from the shared eyes/nose fragments.
    smile_regex = eyes_regex + nose_regex + r'[)d]+|[)d]+' + nose_regex + eyes_regex
    lolface_regex = eyes_regex + nose_regex + r'p+'
    sadface_regex = eyes_regex + nose_regex + r'\(+|\)+' + nose_regex + eyes_regex
    neutralface_regex = eyes_regex + nose_regex + r'[/|l*]'

    # Decode backslash escape sequences; undecodable input is ignored instead of raising.
    messages = messages.str.decode('unicode_escape', errors='ignore')
    messages = messages.str.strip('"')  # strip leading and trailing double quotes
    messages = messages.str.replace('""', '"', regex=False)  # collapse doubled quotes
    messages = messages.str.replace(r'https?://\S+\b|www\.(\w+\.)+\S*', '<URL>', regex=True)  # replace URLs
    # Force splitting words appended with slashes/parentheses/brackets.
    # NOTE(review): this pads ')' and '(' with spaces *before* the emoticon
    # patterns below run, so emoticons such as ':)' or ':(' no longer have the
    # mouth adjacent to the eyes when those patterns are applied (the recorded
    # '. : (' feature is consistent with that). Confirm the intended order.
    messages = messages.str.replace(r'([/()\[\]])', r' \1 ', regex=True)
    messages = messages.str.replace(r'@\w+', '<USER>', regex=True)  # replace usernames
    # Replace numbers.
    # NOTE(review): every digit is consumed here, so the '8' eyes of the
    # emoticon patterns and the '<3' heart below cannot match afterwards.
    messages = messages.str.replace(r'[-+]?[.\d]*[\d]+[:,.\d]*', ' <NUMBER> ', regex=True)
    messages = messages.str.replace(r'#(\S+)', replace_hash_text, regex=True)  # tag and split hashtags
    messages = messages.str.replace(smile_regex, '<SMILE>', flags=re.IGNORECASE, regex=True)
    messages = messages.str.replace(lolface_regex, '<LOLFACE>', flags=re.IGNORECASE, regex=True)
    messages = messages.str.replace(sadface_regex, '<SADFACE>', regex=True)
    messages = messages.str.replace(neutralface_regex, '<NEUTRALFACE>', regex=True)
    messages = messages.str.replace(r'<3', '<HEART>', regex=False)  # literal text, no pattern needed
    # Mark punctuation repetitions (eg. "!!!" => "! <REPEAT>")
    messages = messages.str.replace(r'([!?.]){2,}', r'\1 <REPEAT>', regex=True)
    # Mark elongated words (eg. "wayyyy" => "way <ELONG>")
    messages = messages.str.replace(r'\b(\S*?)(.)\2{2,}\b', r'\1\2 <ELONG>', regex=True)
    # Replace all whitespace runs by a single space, then trim the ends.
    messages = messages.str.replace(r'\s+', ' ', regex=True)
    messages = messages.str.strip()
    # Lower-casing comes last, so the placeholder tags end up lower-cased too.
    messages = messages.str.lower()
    return messages


tweets['message'] = clean_messages(tweets['message'])

In [5]:
# Single tokenizer instance shared by every call to features().
tokenizer = nltk.tokenize.TweetTokenizer()


def features(tweet):
    """Build the presence-feature dict for one tweet.

    The tweet is tokenized with the module-level ``tokenizer``.  Every token,
    every pair of adjacent tokens and every triple of adjacent tokens (the
    latter two joined with single spaces) becomes a key mapped to ``True``.
    """
    words = tokenizer.tokenize(tweet)

    feats = {word: True for word in words}
    feats.update((' '.join(pair), True) for pair in nltk.bigrams(words))
    feats.update((' '.join(triple), True) for triple in nltk.trigrams(words))
    return feats

In [6]:
# One sub-frame per sentiment label.
positive_tweets = tweets.loc[tweets['label'].eq('positive')]
negative_tweets = tweets.loc[tweets['label'].eq('negative')]
neutral_tweets = tweets.loc[tweets['label'].eq('neutral')]

In [7]:
# Pair each message's feature dict with the label of the frame it came from.
positive_feats = [(feats, 'positive') for feats in map(features, positive_tweets['message'])]
negative_feats = [(feats, 'negative') for feats in map(features, negative_tweets['message'])]
neutral_feats = [(feats, 'neutral') for feats in map(features, neutral_tweets['message'])]

# Train on the concatenation, ordered negative, neutral, positive.
classifier = NaiveBayesClassifier.train(
    negative_feats + neutral_feats + positive_feats
)

In [8]:
# Test tweets: same tab-separated, header-less three-column layout as the
# training file.
# NOTE(review): the cleaning steps in cell In [4] are applied only to
# tweets['message']; the messages classified below reach features() exactly
# as read from disk. Confirm whether they should get the same cleaning.
tweets_test = pd.read_csv(
    'data/SemEval2017-task4-test.subtask-A.english.txt',
    sep='\t',
    names=['id', 'label', 'message'],
    usecols=[0, 1, 2],
    index_col=0,
    dtype={'label': 'category'},
    quoting=csv.QUOTE_NONE,
)

# Predicted label for every test message, in file order (with a progress bar).
sentiments = [
    classifier.classify(features(message))
    for message in tqdm.tqdm(tweets_test['message'])
]

100%|██████████| 12284/12284 [00:05<00:00, 2202.54it/s]


In [9]:
# Print the trained classifier's most informative features (listing recorded below).
classifier.show_most_informative_features()

Most Informative Features
          happy <number> = True           positi : neutra =    114.6 : 1.0
              so excited = True           positi : neutra =     66.1 : 1.0
                   syria = True           negati : positi =     56.7 : 1.0
       happy <number> th = True           positi : neutra =     55.6 : 1.0
                  fucked = True           negati : positi =     51.6 : 1.0
                 erdogan = True           negati : positi =     49.3 : 1.0
            be the worst = True           negati : neutra =     49.0 : 1.0
            <user> happy = True           positi : neutra =     46.5 : 1.0
                   . : ( = True           negati : neutra =     45.1 : 1.0
                 the ira = True           negati : positi =     44.6 : 1.0


In [10]:
# Write one "<tweet id><TAB><predicted label>" line per test tweet.
with open('outputs/nb-predictions.txt', 'w') as predictions_file:
    predictions_file.writelines(
        '\t'.join((str(tweet_id), label)) + '\n'
        for tweet_id, label in zip(tweets_test.index, sentiments)
    )

In [11]:
# Run the Perl scorer script on the gold-label file and the predictions file written above.
# NOTE(review): the three numbers it prints are presumably the scorer's metrics —
# confirm which metric each column is against the script itself.
!perl SemEval2017_task4_test_scorer_subtaskA.pl data/SemEval2017_task4_subtaskA_test_english_gold.txt outputs/nb-predictions.txt

outputs/nb-predictions.txt	0.562	0.554	0.537	


In [12]:
import pickle

# Serialise the trained classifier to disk using the newest pickle protocol
# supported by the running interpreter.
with open('nb-classifier.pickle', 'wb') as model_file:
    pickle.dump(
        classifier,
        model_file,
        protocol=pickle.HIGHEST_PROTOCOL,
    )