In [None]:
import pandas as pd
import numpy as np
import re
import itertools
import math
from string import punctuation
from unidecode import unidecode
from nltk.tokenize import word_tokenize, TweetTokenizer
from nltk.stem import PorterStemmer
import html
from multiprocessing import Pool
from tqdm import tqdm
import string
import en_core_web_sm
import warnings
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score


# Locations of the development and evaluation CSV files read by the "Input" cell.
DEV_PATH = 'DSL2122_january_dataset/development.csv'
EVAL_PATH = 'DSL2122_january_dataset/evaluation.csv'

# Default worker count for `parallel` and for the scikit-learn estimators built below.
N_JOBS = 12
# Patterns used by `countable_features` to count URLs, hashtags, @-targets and
# all-uppercase words of two or more letters.
URL_REGEX = r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"
HASHTAG_REGEX = r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)"
TARGET_REGEX = r"(?:@[\w_]+)"
CAPITALIZED_WORDS_REGEX = r"\b[A-Z]{2,}\b"

EMOTICONS = {
    ('smiley',): [':-)', ':-))', ':-)))', ':)', ':))', ':)))', ':-]', ':]', ':-3', ':3', ':->', ':>', '8-)', ':o)', ':-}', ':}', ':c)', ':^)', '=]', '=)'],
    ('laugh',): [':-d', ':d', '8-d', '8d', 'x-d', 'xd', '=d', '=3', 'b^d', '>^_^<', '<^!^>', '^/^', '（*^_^*）', '(^<^) (^.^)', '(^^)', '(^.^)', '(^_^.)', '(^_^)', '(^j^)', '(*^.^*)', '(^—^）', '(#^.^#)', '(*^^)v', '(^_^)v'],
    ('sad',): [':-(', ':(', ':-c', ':c', ':-<', ':<', ':-[', ':[', ':-||', '>:[', ':{', ':@', '>:(', ":'-)", ":')", "('_')", '(/_;)', '(t_t) (;_;)', '(;_;', '(;_:)', '(;o;)', '(:_;)', '(tot)', ';_;', ';-;', ';n;', ';;', 'q.q', 't.t', 'qq', 'q_q', '(＾v＾)', '(＾ｕ＾)', '(^)o(^)', '(^o^)', ')^o^('],
    ('crying',): [":'-(", ":'("],
    ('horror',): ["d-':"],
    ('disgust',): ['d:<'],
    ('playful',): ['d:', ':-p', ':p', 'x-p', 'xp', ':-þ', ':þ', ':b', '=p', '>:p', ':-b'],
    ('dismay',): ['d8', 'd;', 'd=', 'dx'],
    ('surprise',): [':-o', ':o'],
    ('shock',): [':-0'],
    ('yawn',): ['8-0', '>:o'],
    ('kiss',): [':-*', ':*'],
    ('tongue', 'tied'): [':x', ':-x', ':-#', ':#', ':-&', ':&'],
    ('wink',): [';-)', ';)', '*-)', '*)', ';-]', ';]', ';^)', ':-,', ';d', '(^_-)'],
    ('annoyed',): [':-/', ':/', ':-[.]', '>:[(\\)]', '>:/', ':[(\\)]', '=/', '=[(\\)]', ':l', '=l', ':s'],
    ('straight',): [':-|', ':|'],
    ('embarrassed',): [':$'],
    ('angel',): ['o:-)', 'o:)', '0:-3', '0:3', '0:-)', '0:)', '0;^)'],
    ('evil',): ['>:-)', '>:)', '}:-)', '}:)', '3:-)', '3:)', '>;)'],
    ('cool',): ['|;-)'],
    ('bored',): ['|-o'],
    ('tongue',): [':-j'],
    ('party',): ['#-)'],
    ('drunk',): ['%-)', '%)'],
    ('sick',): [':-###..', ':###..'],
    ('dump',): ['<:-|'],
    ('troubled',): ['(>_<)', '(>_<)>'],
    ('baby',): ["(';')"],
    ('shy',): ['(^^>``', '(^_^;)', '(-_-;)', '(~_~;) (・.・;)'],
    ('sleeping',): ['(-_-)zzz'],
    ('confused',): ['((+_+))', '(+o+)'],
    ('ultraman',): ['(o|o)'],
    ('joyful',): ['^_^', '(^_^)/', '(^o^)／'],
    ('respect',): ['(__)', '_(._.)_', '<(_ _)>', '<m(__)m>', 'm(__)m', 'm(_ _)m'],
    ('shame',): ['(-.-)', '(-_-)', '(一一)', '(；一_一)'],
    ('tired',): ['(=_=)'],
    ('cat',): ['(=^·^=)', '(=^··^=)', '=_^= '],
    ('looking', 'down'): ['(..)', '(._.)'],
    ('giggling',): ['^m^'],
    ('confusion',): ['(・・?', '(?_?)'],
    ('waving',): ['（^—^）', '(;_;)/~~~', '(^.^)/~~~', '(-_-)/~~~ ($··)/~~~', '(t_t)/~~~', '(tot)/~~~'],
    ('excited',): ['(*^0^*)'],
    ('amazed',): ['(*_*)', '(*_*;', '(+_+)', '(@_@)'],
    ('music',): ['((d[-_-]b))'],
    ('worried',): ['(-"-)', '(ーー;)'],
    ('eyeglasses',): ['(^0_0^)'],
    ('surprised',): [':o o_o', 'o_0', '(o.o)', 'oo'],
    ('surpised',): ['o.o'],
    ('dissatisfied',): ['(*￣m￣)'],
    ('deflated',): ["('a`)"],
}

# Flat lookup built from EMOTICONS: emoticon string -> list of the word(s) naming it.
# When an emoticon is listed under several names, the last listing wins.
EMOTICONS_DICT = {
    emoticon: list(names)
    for names, emoticons in EMOTICONS.items()
    for emoticon in emoticons
}

In [None]:
def tokenize(text):
    """Turn a text into a list of Porter-stemmed tokens.

    The text is transliterated with ``unidecode`` and split with NLTK's
    ``TweetTokenizer``. Tokens found in ``EMOTICONS_DICT`` are replaced by the
    word(s) naming the emoticon. Every resulting piece is split again with
    ``word_tokenize``, unless that split happens next to a punctuation
    character (between its first two parts), in which case the piece is kept
    whole. Each part is finally stemmed.

    References:
    https://pythonspot.com/nltk-stop-words/
    https://pythonspot.com/nltk-stemming/
    """
    stemmer = PorterStemmer()
    stems = []
    for raw_token in TweetTokenizer().tokenize(unidecode(text)):
        # Emoticons become their descriptive words; other tokens pass through.
        for piece in EMOTICONS_DICT.get(raw_token, [raw_token]):
            parts = word_tokenize(piece)
            if len(parts) > 1:
                boundary = (parts[0][-1], parts[1][0])
                if any([char in punctuation for char in boundary]):
                    # Splitting at punctuation would break the token: keep it whole.
                    parts = [piece]
            stems.extend(stemmer.stem(part) for part in parts)
    return stems


def tokenize_texts_it(args):
    """Worker used with `parallel`: unpack an ``(index, text, extra_args)`` task and tokenize its text."""
    _, text, _ = args
    return tokenize(text)


def tokenize_texts(texts):
    """Tokenize every element of ``texts`` via `parallel` and return the list of token lists."""
    token_lists = parallel(tokenize_texts_it, texts)
    return token_lists


# twitter tokenized, w/o stop words, stemmed, tokenized, ...
def wordset(tokenized_texts):
    """Count token occurrences over all tokenized texts.

    tokenized_texts: iterable of token lists.
    Returns a dict mapping each token to its total number of occurrences.
    """
    counts = {}
    for token in itertools.chain.from_iterable(tokenized_texts):
        counts[token] = counts.get(token, 0) + 1
    return counts

# sentiments is an iterable of integers in {0,1}


def ngrams(texts, words, sentiments=None):
    """Build signed, weighted 1-gram and 2-gram features for every text.

    Each text is tokenized with `tokenize`. A negation token ("not", "nor",
    "n't", "no") flips the sign of the following tokens to -1 until a stop
    token (closing punctuation or a stemmed conjunction) resets it to +1.
    Negation tokens and tokens of length <= 1 are not kept as features.
    A kept token containing a character repeated three or more times gets
    weight ``1.5 * sign`` and is rewritten with each such run shortened to one
    or two characters, choosing the variant most frequent in ``words``. If no
    variant is known, every run is shortened to two characters. All other
    kept tokens get weight ``sign``.

    texts: sized iterable of strings.
    words: dict token -> occurrence count (see `wordset`).
    sentiments: iterable of integers in {0, 1}, one per text; defaults to all 0.

    Returns ``(ngram_features, all_weights, last_words, sent_count)``:
    - ngram_features: one dict per text mapping an n-gram (tokens joined with
      '-') to the sum, over its occurrences, of the mean weight of its tokens;
    - all_weights: one list per text with the weight of each kept token;
    - last_words: one ``(token, weight)`` pair per text for the last kept
      token, or ``('', 0)`` when no token was kept;
    - sent_count: ``[number of texts with label 0, number with label 1]``.
    """
    elongated = re.compile(r'((.)\2{2,})')  # a character repeated 3+ times
    neg = set(("not", "nor", "n't", "no"))
    stop_neg = set("!),-.;>?]_}")
    stop_j = ["but", "so", "after", "before", "although", "even", "because", "as", "if", "till", "until", "unless", "once", "while", "whereas", "spite", "despite", "addition", "furthermore", "however", "anyway",
              "therefore", "consequently", "though", "provided", "since", "whenever", "wherever", "besides", "conversely", "later", "moreover", "nonetheless", "thereafter", "thus", "whatever", "whoever", "cause"]
    # Conjunctions are stemmed like any other token so they match tokenized text.
    stop_neg.update(tokenize(w)[0] for w in stop_j)

    ngram_features = []
    all_weights = []
    last_words = []
    sent_count = [0, 0]
    if sentiments is None:
        sentiments = [0]*len(texts)

    for sentiment, text in tqdm(zip(sentiments, texts), total=len(texts)):
        # Keep the usable tokens together with the negation sign in force.
        ls = []
        signs = []
        sign = 1
        for s in tokenize(text):
            if s in neg:
                sign = -1
            else:
                if s in stop_neg:
                    sign = 1
                if len(s) > 1:
                    ls.append(s)
                    signs.append(sign)
        sent_count[sentiment] += 1

        ngram_feature = {}
        weights = []
        lw = ('', 0)
        if ls:
            for i, (s, token_sign) in enumerate(zip(ls, signs)):
                matches = [found[0] for found in elongated.findall(s)]
                if not matches:
                    weights.append(token_sign)
                    continue
                # Elongated words ("sooooo") count as emphasised.
                weights.append(1.5*token_sign)
                best, best_occ = None, 0
                for cuts in itertools.product((1, 2), repeat=len(matches)):
                    candidate = s
                    for run, keep in zip(matches, cuts):
                        candidate = candidate.replace(run, run[:keep], 1)
                    occ = words.get(candidate, 0)
                    if occ > best_occ:
                        best, best_occ = candidate, occ
                # No known variant: fall back to the last candidate, in which
                # every run has been shortened to two characters.
                ls[i] = candidate if best is None else best

            lw = (ls[-1], weights[-1])
            for n in (1, 2):  # 1-grams and 2-grams
                # There are exactly len(ls) - n + 1 windows of n tokens. The
                # bound must not exceed that, otherwise the last token is
                # re-counted as a truncated "bigram" and an empty key appears.
                for j in range(len(ls) - n + 1):
                    gram = '-'.join(ls[j:j+n])
                    # an n-gram weighs the average of its tokens' weights
                    ngram_feature[gram] = ngram_feature.get(
                        gram, 0) + sum(weights[j:j+n])/n

        ngram_features.append(ngram_feature)
        last_words.append(lw)
        all_weights.append(weights)
    return ngram_features, all_weights, last_words, sent_count


def nlogn(n):
    """Return ``n * log2(n)``, using 0 as the value for ``n == 0``."""
    return n*math.log2(n) if n != 0 else 0


def so_ne_it(so, ne, freq, freq_sent, sp, w, fw):
    """Store the semantic orientation and the entropy-based score of one n-gram.

    so, ne: output dicts, updated in place at key ``w``.
    freq: unused here; kept for the call signature.
    freq_sent: ``[weights seen with class 0, weights seen with class 1]`` dicts.
    sp: ``[p(class 0), p(class 1)]`` class priors.
    w: the n-gram; fw: its total accumulated weight (must be non-zero).
    """
    p_pos = freq_sent[1].get(w, 0)/fw
    p_neg = freq_sent[0].get(w, 0)/fw
    # SO(w) = log(p(+|w)/p(+)) - log(p(-|w)/p(-)); only its sign is stored.
    so[w] = 1 if p_pos/sp[1] > p_neg/sp[0] else -1
    # ne(w) = 1 - H(w), with H(w) = -(p(+|w).log2 p(+|w) + p(-|w).log2 p(-|w))
    ne[w] = 1 + nlogn(p_pos) + nlogn(p_neg)


def ngram_pop(to_remove, d):
    """Delete from dict ``d``, in place, every key that also belongs to ``to_remove``."""
    doomed = [key for key in d if key in to_remove]
    for key in doomed:
        del d[key]


def so_ne(sentiments, ngram_features, sent_count):
    """Derive the lexicon (orientation and entropy score) from weighted n-grams.

    sentiments: label (0 or 1) of each text.
    ngram_features: per-text dicts n-gram -> signed weight; n-grams whose total
        absolute weight is below 3 are removed from these dicts in place.
    sent_count: ``[number of texts with label 0, number with label 1]``.

    Returns ``(so, ne, freq, freq_sent)``: orientation (+1/-1) and entropy-based
    score per retained n-gram, total absolute weight per n-gram, and the same
    weight split by class.
    """
    freq = {}
    freq_sent = [{}, {}]
    for label, grams in zip(sentiments, ngram_features):
        for gram, weight in grams.items():
            magnitude = abs(weight)
            freq[gram] = freq.get(gram, 0) + magnitude
            # A non-positive (negated) weight is credited to the opposite class.
            side = label if weight > 0 else 1-label
            freq_sent[side][gram] = freq_sent[side].get(gram, 0) + magnitude

    # Drop n-grams that are too rare to be informative.
    rare = {gram for gram, total in freq.items() if total < 3}
    for gram in rare:
        freq_sent[0].pop(gram, 0)
        freq_sent[1].pop(gram, 0)
        freq.pop(gram)
    for grams in ngram_features:
        ngram_pop(rare, grams)

    n_texts = sent_count[0] + sent_count[1]
    sp = [sent_count[0]/n_texts, sent_count[1]/n_texts]
    so = {}
    ne = {}
    for gram, total in freq.items():
        so_ne_it(so, ne, freq, freq_sent, sp, gram, total)
    return so, ne, freq, freq_sent


def lex_features_it(ngrams, lw, so, ne):
    """Compute the five lexicon features of one text.

    ngrams: dict n-gram -> signed weight for the text.
    lw: ``(last_token, weight)`` pair of the text.
    so: dict n-gram -> orientation (+1/-1); unknown n-grams count as 0.
    ne: dict n-gram -> entropy-based score; unknown n-grams count as 0.

    Returns ``(n_pos, n_neg, score_pos, score_neg, lwp)``: total absolute weight
    of the positive and of the negative n-grams, the same totals scaled by
    ``ne``, and the polarity score of the last token.

    Raises IndexError (chained to the original error) when ``lw`` is malformed.
    """
    n_pos = n_neg = score_pos = score_neg = 0
    for gram, weight in ngrams.items():
        polarity = so.get(gram, 0)*weight
        if polarity > 0:
            n_pos += abs(weight)
            score_pos += ne.get(gram, 0)*abs(weight)  # so+ne based
        elif polarity < 0:
            n_neg += abs(weight)
            score_neg += ne.get(gram, 0)*abs(weight)  # so+ne based
    try:
        # last word polarity (as score)
        lwp = ne.get(lw[0], 0)*so.get(lw[0], 0)*lw[1]
    except Exception as exc:
        # Same exception type as before, but the offending value and the real
        # cause are kept, and KeyboardInterrupt/SystemExit pass through.
        raise IndexError(f'malformed last-word entry: {lw!r}') from exc
    return (n_pos, n_neg, score_pos, score_neg, lwp)


def lex_features(ngram_features, last_words, so, ne):
    """Apply `lex_features_it` to every (n-gram dict, last word) pair and return the list of feature tuples."""
    return [
        lex_features_it(grams, last_word, so, ne)
        for grams, last_word in zip(ngram_features, last_words)
    ]


def parallel(target, iterabile, args=(), n_jobs=N_JOBS):
    """Map ``target`` over ``iterabile`` in a process pool, preserving order.

    target: callable receiving one ``(index, item, args)`` tuple.
    iterabile: sized iterable of items.
    args: extra value handed unchanged to every call.
    n_jobs: number of worker processes.
    Returns the list of results, in input order.
    """
    size = len(iterabile)
    tasks = zip(range(size), iterabile, itertools.repeat(args, size))
    with Pool(processes=n_jobs) as pool:
        return list(pool.imap(target, tasks))


def tweet_word_sent(tweet_word, so, ne):
    """Score a single tweet token against the lexicon.

    The token goes through the same emoticon replacement, ``word_tokenize``
    split and Porter stemming as in `tokenize`; the result is the sum of
    ``so * ne`` over the resulting stems (unknown stems contribute 0).
    """
    stemmer = PorterStemmer()
    stems = []
    for piece in EMOTICONS_DICT.get(tweet_word, [tweet_word]):
        parts = word_tokenize(piece)
        if len(parts) > 1:
            boundary = (parts[0][-1], parts[1][0])
            if any([char in punctuation for char in boundary]):
                # Splitting at punctuation would break the token: keep it whole.
                parts = [piece]
        stems += [stemmer.stem(part) for part in parts]
    return sum([so.get(stem, 0)*ne.get(stem, 0) for stem in stems])


def countable_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Extract countable features from ``df['text']``, add them as columns and
    replace ``text`` with its cleaned form. ``df`` is modified in place and
    also returned.

    Countable features (all measured on the HTML-unescaped text):
    - '#urls': number of urls
    - '#hashtags': number of hashtags (#)
    - '#targets': number of targets (@)
    - '#capitalized_words': number of uppercase words having length > 1
    - '#question_marks': number of question marks
    - '#esclamation_marks': number of exclamation marks
    - '#quotes': number of double-quote characters divided by two
    - '#words': number of whitespace-separated words in the cleaned text
    - 'avg_words_length': mean word length in the cleaned text (0 if empty)

    The text is then cleaned as follows:
    - html entities are unescaped (before feature extraction)
    - urls are removed
    - hashtag symbols # are removed, keeping the words that follow them
    - targets are removed
    - runs of whitespace are compressed to a single space
    - leading and trailing whitespace is removed
    - the entire text is converted to lowercase
    """
    # Column names are runtime identifiers used by later cells: the spelling
    # '#esclamation_marks' is kept on purpose.
    counts = {
        '#urls': [],
        '#hashtags': [],
        '#targets': [],
        '#capitalized_words': [],
        '#words': [],
        'avg_words_length': [],
        '#question_marks': [],
        '#esclamation_marks': [],
        '#quotes': [],
    }
    cleaned_texts = []

    for raw_text in df['text']:
        text = html.unescape(raw_text)

        counts['#urls'].append(sum(1 for _ in re.finditer(URL_REGEX, text)))
        counts['#hashtags'].append(len(re.findall(HASHTAG_REGEX, text)))
        counts['#targets'].append(len(re.findall(TARGET_REGEX, text)))
        counts['#capitalized_words'].append(
            len(re.findall(CAPITALIZED_WORDS_REGEX, text)))
        counts['#question_marks'].append(text.count('?'))
        counts['#esclamation_marks'].append(text.count('!'))
        counts['#quotes'].append(text.count('"') / 2)

        # Remove whole regex matches. Joining the groups returned by findall
        # garbles URLs containing parentheses, and removing '@bob' by plain
        # string replacement would also eat the start of '@bobby'.
        stripped = re.sub(URL_REGEX, '', text)
        stripped = re.sub(TARGET_REGEX, '', stripped)
        stripped = stripped.replace('#', '')
        cleaned = re.sub(r'\s+', ' ', stripped).lower().strip(string.whitespace)

        tokens = cleaned.split()
        counts['#words'].append(len(tokens))
        counts['avg_words_length'].append(
            np.mean([len(token) for token in tokens]) if tokens else 0)
        cleaned_texts.append(cleaned)

    for column, values in counts.items():
        df[column] = values
    df['text'] = cleaned_texts

    return df


def pos_features(df: pd.Series, so, ne) -> pd.DataFrame:
    """
    Aggregate lexicon scores per part of speech for every text.

    df: Series of texts (its index is reused for the result).
    so, ne: lexicon dicts forwarded to `tweet_word_sent`.

    Each text is parsed with the spaCy ``en_core_web_sm`` pipeline and every
    token tagged NOUN, ADJ, VERB, ADV or INTJ is scored with
    `tweet_word_sent`.

    return: DataFrame with columns
    so_sum(NOUN), so_sum(ADJ), so_sum(VERB), so_sum(ADV), so_sum(INTJ)
    (sum of the signs of the token scores) and
    sone_sum(NOUN), sone_sum(ADJ), sone_sum(VERB), sone_sum(ADV), sone_sum(INTJ)
    (sum of the token scores).
    """
    nlp = en_core_web_sm.load()
    POS_FEATURES = ['NOUN', 'ADJ', 'VERB', 'ADV', 'INTJ']
    features = []

    # Series.items() is used because Series.iteritems() no longer exists in
    # pandas 2.0; the total lets tqdm show real progress.
    for _, text in tqdm(df.items(), total=len(df)):
        doc = nlp(text)

        pos_sone = [[tweet_word_sent(token.text, so, ne) for token in doc if token.pos_ == tag]
                    for tag in POS_FEATURES]

        pos_sone_scores = tuple(sum(scores) for scores in pos_sone)
        pos_so_scores = tuple(sum(np.sign(score) for score in scores)
                              for scores in pos_sone)

        features.append(pos_so_scores + pos_sone_scores)

    columns = [f'so_sum({p})' for p in POS_FEATURES] + \
        [f'sone_sum({p})' for p in POS_FEATURES]

    return pd.DataFrame(features, index=df.index, columns=columns)

def user_features(df: pd.DataFrame, transformer: pd.DataFrame=None, default_values: list=None):
    """
    Replace the ``user`` column of ``df`` with two per-user statistics.

    df: DataFrame with a 'user' column (and a 'sentiment' column when fitting); modified in place.
    transformer: DataFrame indexed by user with columns '#tweets' and 'avg_sentiment'.
        When None it is computed from ``df`` (fit); otherwise it is applied as given (transform).
    default_values: [#tweets, avg_sentiment] used for users missing from ``transformer``.
        When None it is set to [0, mean sentiment of ``df``].

    Returns (df, transformer, default_values).
    """
    if transformer is None:
        transformer = pd.pivot_table(data=df, index='user', values='sentiment', aggfunc=['count', 'mean'])
        transformer.columns = ['#tweets', 'avg_sentiment']

    if default_values is None:
        default_values = [0, df['sentiment'].sum()/len(df)]

    def stats_for(user):
        # Known users get their own row; everyone else gets the defaults.
        if user in transformer.index:
            return transformer.loc[user].values
        return default_values

    per_row_stats = df['user'].map(stats_for)
    df['#tweets'] = [stats[0] for stats in per_row_stats]
    df['avg_sentiment'] = [stats[1] for stats in per_row_stats]
    df.drop('user', axis=1, inplace=True)

    return df, transformer, default_values

class LexiconExtractor:
    """
    Builds a custom lexicon from a labelled dataframe and adds the lexicon
    features to that dataframe and to a second, unlabelled one.
    """
    lex_columns_names = ['#positive_ngrams', '#negative_ngrams', 'sum_positive_entropies', 'sum_negative_entropies', 'last_word_polarity']

    def __init__(self, df_fit_transform, sentiments, df_transform):
        """
        Build the lexicon from ``df_fit_transform['text']`` and ``sentiments``,
        then write the lexicon feature columns into both dataframes (in place).
        """
        # Fit: vocabulary, weighted n-grams and the lexicon itself.
        fit_texts = df_fit_transform['text']
        self.words = wordset(tokenize_texts(fit_texts))
        (self.ngram_features,
         self.all_weights,
         self.last_words,
         self.sent_count) = ngrams(texts=fit_texts, words=self.words, sentiments=sentiments)
        (self.so,
         self.ne,
         self.freq,
         self.freq_sent) = so_ne(sentiments, self.ngram_features, self.sent_count)
        self.lex = lex_features(self.ngram_features, self.last_words, self.so, self.ne)
        df_fit_transform[self.lex_columns_names] = self.lex

        # Transform: the second dataframe only reuses the fitted vocabulary and lexicon.
        other_ngrams, _, other_last_words, _ = ngrams(texts=df_transform['text'], words=self.words)
        df_transform[self.lex_columns_names] = lex_features(other_ngrams, other_last_words, self.so, self.ne)

        self.df1 = df_fit_transform
        self.df2 = df_transform

    def get_DataFrames(self):
        """
        Return the two transformed dataframes (fitted one first).
        """
        return self.df1, self.df2

    def get_so_ne(self):
        """Return the ``(so, ne)`` lexicon dictionaries."""
        return self.so, self.ne

    def get_freq(self):
        """Return the total weight of every retained n-gram."""
        return self.freq

    def get_words(self):
        """Return the token -> occurrence count dictionary of the fitted texts."""
        return self.words

def add_pos_features(df, df_text, so, ne):
    """
    Return ``df`` with the columns produced by `pos_features` for ``df_text`` appended.
    """
    return pd.concat([df, pos_features(df_text, so, ne)], axis=1)

def date_to_ts(df):
    """
    Converts dates to timestamps
    """
    df['date'] = df['date'].map(lambda str: ' '.join(str.replace('PDT', '').split()[1:5]))
    df['date'] = pd.to_datetime(df['date'])
    df['timestamp'] = df['date'].values.astype(np.int64)
    df.drop('date', axis=1, inplace=True)

    return df

# Silence FutureWarning and UserWarning messages from here on.
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)

Input

In [None]:
from sklearn.model_selection import train_test_split

# Load the development set, keeping only the columns at positions 0, 2, 4 and 5.
# NOTE(review): presumably sentiment, date, user and text, since later cells
# read those names — confirm against the CSV header.
with open(DEV_PATH, 'r') as f:
	df_dev = pd.read_csv(f, usecols=[0, 2, 4, 5])

# The evaluation set is loaded with the columns at positions 1, 3 and 4.
with open(EVAL_PATH, 'r') as f:
	df_eval = pd.read_csv(f, usecols=[1, 3, 4])

Drop Duplicates

In [None]:
# Collapse rows repeating the same (text, sentiment) pair, keeping the first one.
df_dev.drop_duplicates(subset=['text', 'sentiment'], keep='first', inplace=True, ignore_index=True)
# A text that is still duplicated now carries conflicting sentiments: drop every copy (keep=False).
df_dev.drop_duplicates(subset='text', keep=False, inplace=True, ignore_index=True)

Twitter-specific and Textual features

In [None]:
# Adds the count columns and replaces `text` with its cleaned, lower-cased form.
df_dev = countable_features(df_dev)
df_eval = countable_features(df_eval)

Train, Validation, Test Split


In [None]:
# 80% / 20% split of the development set, then 25% of the 80% is held out for
# validation: 60% train, 20% validation, 20% test overall.
df_train_valid, df_test = train_test_split(df_dev, train_size=.8, test_size=.2, shuffle=True, random_state=42)
df_train, df_valid = train_test_split(df_train_valid, test_size=.25, shuffle=True, random_state=42)

# Validation


Non-textual (based on user) features

In [None]:
# Per-user statistics are computed on the training split only, then applied
# unchanged to the validation split.
df_train, user_map, default_values = user_features(df_train) # fit
df_valid, _, _ = user_features(df_valid, user_map, default_values) # transform

Split targets

In [None]:
# The target is taken by position.
# NOTE(review): this relies on `sentiment` being the first column of the frames — confirm.
sentiments_train = df_train.iloc[:, 0]
df_train = df_train.iloc[:, 1:]

sentiments_valid = df_valid.iloc[:, 0]
df_valid = df_valid.iloc[:, 1:]

Lexicon-Based features

In [None]:
# The lexicon is built from the training texts and labels; its feature columns
# are added to both the training and the validation frame.
lexicon = LexiconExtractor(df_train, sentiments_train, df_valid)

df_train, df_valid = lexicon.get_DataFrames()
so, ne = lexicon.get_so_ne()

Save & Drop Text

In [None]:
# Keep the texts aside for the PoS and BoW cells, then remove them from the feature frames.
df_train_text = df_train['text']
df_valid_text = df_valid['text']
df_train.drop(['text'], axis=1, inplace=True)
df_valid.drop(['text'], axis=1, inplace=True)

PoS-oriented features

In [None]:
# Append the per-part-of-speech lexicon scores computed from the saved texts.
df_train = add_pos_features(df_train, df_train_text, so, ne)
df_valid = add_pos_features(df_valid, df_valid_text, so, ne)

BoW features

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# One document per class: all training texts with sentiment 0 joined together, then all with sentiment 1.
train_docs = [' '.join([t for t, s in zip(df_train_text, sentiments_train) if s==i]) for i in [0, 1]]

# The vectorizer is fitted on those two class documents only, keeping at most 123 uni/bi-gram features.
tfidf = TfidfVectorizer(stop_words='english', ngram_range=(1, 2), max_features=123, binary=False)
tfidf.fit(train_docs)
vocab = tfidf.vocabulary_

# Each individual tweet is then projected onto that vocabulary.
tfidf_train = tfidf.transform(df_train_text)
tfidf_valid = tfidf.transform(df_valid_text)

# NOTE(review): the columns are labelled with the sorted vocabulary terms; this assumes the
# matrix columns follow that same order — confirm (tfidf.get_feature_names_out() would make it explicit).
df_train = pd.concat([df_train, pd.DataFrame.sparse.from_spmatrix(tfidf_train, index=df_train.index, columns=sorted(vocab.keys()))], axis=1)
df_valid = pd.concat([df_valid, pd.DataFrame.sparse.from_spmatrix(tfidf_valid, index=df_valid.index, columns=sorted(vocab.keys()))], axis=1)

Non-textual (timestamp)

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Replace the `date` column by an integer `timestamp` column.
df_train = date_to_ts(df_train)
df_valid = date_to_ts(df_valid)

# The scaling range is learned from the training timestamps only and reused for validation.
scaler = MinMaxScaler()
scaler.fit(df_train.loc[:, ['timestamp']])

df_train.loc[:, ['timestamp']] = scaler.transform(df_train.loc[:, ['timestamp']])
df_valid.loc[:, ['timestamp']] = scaler.transform(df_valid.loc[:, ['timestamp']])

Model Selection

In [None]:
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score

# Candidate models compared on the validation split.
classifiers = [
    RandomForestClassifier(n_jobs=N_JOBS),
    # NOTE(review): `base_estimator` appears to have been renamed `estimator` in later
    # scikit-learn releases — confirm against the installed version.
    BaggingClassifier(base_estimator=SVC(), max_samples=.25, max_features=.25, n_estimators=N_JOBS*2, n_jobs=N_JOBS),
    GaussianNB(),
    Pipeline([('scaler', MinMaxScaler()), ('knn', KNeighborsClassifier(n_jobs=N_JOBS))])] # KNN with normalized features

# Fit each candidate on the training split and print its macro-averaged F1 on the validation split.
for clf in classifiers:
    clf.fit(df_train, sentiments_train)
    sentiments_pred = clf.predict(df_valid)
    f1 = f1_score(sentiments_valid, sentiments_pred, average='macro')
    print(f'{f1} : {clf}')

In [None]:
# Same scaled KNN pipeline as in the comparison above, but with distance-weighted votes.
x = Pipeline([('scaler', MinMaxScaler()), ('knn', KNeighborsClassifier(n_jobs=N_JOBS, weights='distance'))])

x.fit(df_train, sentiments_train)
sentiments_pred = x.predict(df_valid)
f1 = f1_score(sentiments_valid, sentiments_pred, average='macro')
# Label the score with the pipeline evaluated here, not with the `clf`
# variable left over from the loop of the previous cell.
print(f'{f1} : {x}')

Hyperparameter Tuning

In [None]:
# Exhaustive grid over three random-forest hyper-parameters, scored on the validation split.
criterion = ['gini', 'entropy']
# NOTE(review): 'auto' may be rejected by recent scikit-learn releases — confirm against the installed version.
max_features = [1, 2, 3, 4, 5, 'auto', 'log2']
min_samples_split = [2, 3, 4, 5, 6]

for f in max_features:
    for s in min_samples_split:
        for c in criterion:
            rf_tuning = RandomForestClassifier(
                n_estimators=200,
                random_state=42,
                max_features=f,
                criterion=c,
                min_samples_split=s,
                n_jobs=N_JOBS,
            )

            rf_tuning.fit(df_train, sentiments_train)
            pred_tuning = rf_tuning.predict(df_valid)
            score = f1_score(sentiments_valid, pred_tuning, average='macro')

            # After the loops, `rf_tuning` is the model of the last combination tried, not the best one.
            print(f'criterion={c}\tmax_features={f}\tmin_samples_split={s}\tf1_score={score}')

Tf-idf tuning

In [None]:
# Remove the TF-IDF columns added by the BoW cell so every setting below starts from the same base features.
# NOTE(review): the drop is done in place and `vocab` is overwritten in the loop, so this cell
# presumably cannot be re-run without first re-running the BoW cell — confirm.
df_train.drop(columns=vocab, axis=1, inplace=True)
df_valid.drop(columns=vocab, axis=1, inplace=True)

from sklearn.feature_extraction.text import TfidfVectorizer

# One document per class, as in the BoW cell.
train_docs = [' '.join([t for t, s in zip(df_train_text, sentiments_train) if s==i]) for i in [0, 1]]


# Try several vocabulary sizes with a fixed random-forest configuration.
for i in [50, 75, 100, 125, 150, 175, 200]:
    tfidf = TfidfVectorizer(stop_words='english', ngram_range=(1, 2), max_features=i, binary=False)
    tfidf.fit(train_docs)
    vocab = tfidf.vocabulary_

    tfidf_train = tfidf.transform(df_train_text)
    tfidf_valid = tfidf.transform(df_valid_text)

    # `tfidf_train` / `tfidf_valid` are rebound here from the sparse matrices to the full feature frames.
    tfidf_train = pd.concat([df_train, pd.DataFrame.sparse.from_spmatrix(tfidf_train, index=df_train.index, columns=sorted(vocab.keys()))], axis=1)
    tfidf_valid = pd.concat([df_valid, pd.DataFrame.sparse.from_spmatrix(tfidf_valid, index=df_valid.index, columns=sorted(vocab.keys()))], axis=1)

    rf = RandomForestClassifier(
                n_estimators=200,
                random_state=42,
                max_features=5,
                criterion='gini',
                min_samples_split=5,
                n_jobs=N_JOBS,
            )
    rf.fit(tfidf_train, sentiments_train)
    pred = rf.predict(tfidf_valid)

    print(f"n_features={i}\tf1={f1_score(sentiments_valid, pred, average='macro')}")

Feature Importances

In [None]:
# Pair every feature name with its importance in `rf_tuning` and rank from most to least important.
# The pairs are sorted straight from the zip (not from a set) so that features
# with equal importance always come out in the same order.
feature_importances = sorted(
    zip(rf_tuning.feature_names_in_, rf_tuning.feature_importances_),
    key=lambda pair: pair[1],
    reverse=True,
)
feature_importances

# Test

In [None]:
# Rebuild every feature with train+validation (80% of the development set) as the
# fitting data and the held-out test split as the transformed data. The steps
# mirror the validation cells above, in the same order.
df_train = df_train_valid # use 80% of development set to test the classifier

# user features
df_train, user_map, default_values = user_features(df_train) # fit
df_test, _, _ = user_features(df_test, user_map, default_values) # transform

# The target is taken by position (first column).
sentiments_train = df_train.iloc[:, 0]
df_train = df_train.iloc[:, 1:]
sentiments_test = df_test.iloc[:, 0]
df_test = df_test.iloc[:, 1:]

# lexicon features
lexicon = LexiconExtractor(df_train, sentiments_train, df_test)
df_train, df_test = lexicon.get_DataFrames()
so, ne = lexicon.get_so_ne()

# Keep the texts aside for the PoS and BoW steps, then drop them from the feature frames.
df_train_text = df_train['text']
df_test_text = df_test['text']
df_train.drop(['text'], axis=1, inplace=True)
df_test.drop(['text'], axis=1, inplace=True)

# pos oriented features
df_train = add_pos_features(df_train, df_train_text, so, ne)
df_test = add_pos_features(df_test, df_test_text, so, ne)

# BoW features
train_docs = [' '.join([t for t, s in zip(df_train_text, sentiments_train) if s==i]) for i in [0, 1]]
tfidf = TfidfVectorizer(stop_words='english', ngram_range=(1, 2), max_features=123, binary=False)
tfidf.fit(train_docs)
vocab = tfidf.vocabulary_
tfidf_train = tfidf.transform(df_train_text)
tfidf_test = tfidf.transform(df_test_text)
df_train = pd.concat([df_train, pd.DataFrame.sparse.from_spmatrix(tfidf_train, index=df_train.index, columns=sorted(vocab.keys()))], axis=1)
df_test = pd.concat([df_test, pd.DataFrame.sparse.from_spmatrix(tfidf_test, index=df_test.index, columns=sorted(vocab.keys()))], axis=1)

# timestamp feature
df_train = date_to_ts(df_train)
df_test = date_to_ts(df_test)

# The scaling range is learned from the training timestamps only.
scaler = MinMaxScaler()
scaler.fit(df_train.loc[:, ['timestamp']])
df_train.loc[:, ['timestamp']] = scaler.transform(df_train.loc[:, ['timestamp']])
df_test.loc[:, ['timestamp']] = scaler.transform(df_test.loc[:, ['timestamp']])

Test

In [None]:
# Random forest with the same max_features / criterion / min_samples_split values
# used in the TF-IDF tuning cell, here with 1000 trees.
rf_test = RandomForestClassifier(
    n_estimators=1000,
    random_state=42,
    max_features=5,
    criterion='gini',
    min_samples_split=5,
    n_jobs=N_JOBS,
    verbose=1
    )

# Fit on train+validation and score (macro-averaged F1) on the held-out test split.
rf_test.fit(df_train, sentiments_train)
pred_test = rf_test.predict(df_test)
f1_test = f1_score(sentiments_test, pred_test, average='macro')

f1_test

In [None]:
# Column names grouped by feature family; each group is removed in turn by the ablation cell below.
features = {}

features['Lexicon-based'] = [
    '#positive_ngrams',
    '#negative_ngrams',
    'sum_positive_entropies',
    'sum_negative_entropies',
    'last_word_polarity'
    ]
    
features['PoS-oriented'] = [
    'so_sum(NOUN)',
    'so_sum(ADJ)',
    'so_sum(VERB)',
    'so_sum(ADV)',
    'so_sum(INTJ)',
    'sone_sum(NOUN)',
    'sone_sum(ADJ)',
    'sone_sum(VERB)',
    'sone_sum(ADV)',
    'sone_sum(INTJ)'
    ]

features['Twitter-specific'] = [
    '#urls',
    '#hashtags',
    '#targets'
    ]

features['Textual'] = [
    '#capitalized_words',
    '#words',
    'avg_words_length',
    '#question_marks',
    '#esclamation_marks',
    '#quotes'
    ]

features['Non-Textual'] = [
    'timestamp',
    '#tweets',
    'avg_sentiment'
    ]
    
# `vocab` is the fitted vectorizer's vocabulary; presumably only its keys
# (the TF-IDF column names) matter when this group is dropped — confirm.
features['BoW'] = vocab

Remove one group of features at a time

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score

results = []

# One run per feature group (that group's columns removed), followed by the
# baseline that keeps every column.
# The loop variables must not be called `list`: rebinding that name in the
# notebook namespace breaks every later `list(...)` call made by the helper
# functions defined above.
ablations = [*features.items(), ('all features', [])]

for group_name, dropped_columns in ablations:
    df_train_subset = df_train.drop(dropped_columns, axis=1)
    df_test_subset = df_test.drop(dropped_columns, axis=1)

    rf_subset = RandomForestClassifier(n_estimators=100, n_jobs=N_JOBS, random_state=42)
    rf_subset.fit(df_train_subset, sentiments_train)
    pred_subset = rf_subset.predict(df_test_subset)
    score = f1_score(sentiments_test, pred_subset, average='macro')
    results.append((group_name, score))

Plot results

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
df_plot = pd.DataFrame({'F1-Macro Score' : [x[1] for x in results], 'Features group removed' : [x[0] for x in results]})

fig, ax = plt.subplots()
ax.set_xlim(0,1)
ax.grid()
# Draw the bars once, on the axes created above, and label each bar with its score.
chart = sns.barplot(data=df_plot, x='F1-Macro Score', y='Features group removed', orient='h', color='tab:blue', ci=None, ax=ax)
chart.bar_label(chart.containers[0])

# `quality`, `optimize` and `progressive` are JPEG-only options and do not
# apply to an SVG file, so they are not passed here.
fig.savefig('bars.svg', format='svg', pad_inches=50, dpi=300)

# Evaluation

In [None]:
# 100% of development set is used
# Same feature pipeline as in the validation and test sections: everything is
# fitted on the development set and applied to the evaluation set.

# user features
df_dev, user_map, default_values = user_features(df_dev) # fit
df_eval, _, _ = user_features(df_eval, user_map, default_values) # transform

# The target is taken by position (first column); df_eval is not split here.
sentiments_dev = df_dev.iloc[:, 0]
df_dev = df_dev.iloc[:, 1:]

# lexicon features
lexicon = LexiconExtractor(df_dev, sentiments_dev, df_eval)
df_dev, df_eval = lexicon.get_DataFrames()
so, ne = lexicon.get_so_ne()

# Keep the texts aside for the PoS and BoW steps, then drop them from the feature frames.
df_dev_text = df_dev['text']
df_eval_text = df_eval['text']
df_dev.drop(['text'], axis=1, inplace=True)
df_eval.drop(['text'], axis=1, inplace=True)

# pos oriented features
df_dev = add_pos_features(df_dev, df_dev_text, so, ne)
df_eval = add_pos_features(df_eval, df_eval_text, so, ne)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler
# BoW features
dev_docs = [' '.join([t for t, s in zip(df_dev_text, sentiments_dev) if s==i]) for i in [0, 1]]
tfidf = TfidfVectorizer(stop_words='english', ngram_range=(1, 2), max_features=123, binary=False)
tfidf.fit(dev_docs)
vocab = tfidf.vocabulary_
tfidf_dev = tfidf.transform(df_dev_text)
tfidf_eval = tfidf.transform(df_eval_text)
df_dev = pd.concat([df_dev, pd.DataFrame.sparse.from_spmatrix(tfidf_dev, index=df_dev.index, columns=sorted(vocab.keys()))], axis=1)
df_eval = pd.concat([df_eval, pd.DataFrame.sparse.from_spmatrix(tfidf_eval, index=df_eval.index, columns=sorted(vocab.keys()))], axis=1)

# timestamp feature
df_dev = date_to_ts(df_dev)
df_eval = date_to_ts(df_eval)
# The scaling range is learned from the development timestamps only.
scaler = MinMaxScaler()
scaler.fit(df_dev.loc[:, ['timestamp']])
df_dev.loc[:, ['timestamp']] = scaler.transform(df_dev.loc[:, ['timestamp']])
df_eval.loc[:, ['timestamp']] = scaler.transform(df_eval.loc[:, ['timestamp']])

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Same random-forest configuration as the test model, here with 5000 trees.
rf_eval = RandomForestClassifier(
	n_estimators=5000,
	random_state=42,
	max_features=5,
	criterion='gini',
	min_samples_split=5,
	n_jobs=N_JOBS,
	verbose=1
	)

# Fit on the whole development set and predict the evaluation set.
rf_eval.fit(df_dev, sentiments_dev)
pred_eval = rf_eval.predict(df_eval)

Output

In [None]:
import csv

# (row position, predicted label) pairs, in prediction order.
out = [(i, p) for (i, p) in enumerate(pred_eval)]

OUT_FILE = 'out_20_01.csv'

# newline='' lets the csv module control line endings itself, as its
# documentation requires; without it some platforms get blank lines between rows.
with open(OUT_FILE, 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(["Id", "Predicted"])
    writer.writerows(out)

In [None]:
import seaborn as sns
from nltk.corpus import stopwords
from nltk.corpus import words as english_words
import matplotlib.pyplot as plt

freq = lexicon.get_freq()
words = lexicon.get_words()

sone = {k:so[k]*ne[k] for k in so.keys()}
stopWords = set(stopwords.words('english'))
# The English word list comes from the NLTK `words` corpus. `words` above is
# the lexicon's token-count dict and has no `.words()` method. A set keeps
# the membership test below fast.
word_list = set(english_words.words())
# The 20 frequent (total weight > 2000), non-stop-word English n-grams with the largest |so * ne|.
best_words = sorted([(k,v*so[k]) for k, v in ne.items() if freq[k] > 2000 and k not in stopWords and k in word_list], key=lambda x: abs(x[1]), reverse=True)[:20]
w = [i[0] for i in best_words]
values = [i[1] for i in best_words]

red = np.array([.84, .15, .16])
blue = np.array([.12, .47, .71])
white = np.array([1, 1, 1])

# Bar colour fades from white to red (positive score) or to blue (non-positive score) with |2 * score|.
sns.barplot(y=w, x=values, orient='h', palette=[red*abs(v*2) + white*(1-abs(2*v)) if v > 0 else blue*abs(2*v) + white*(1-abs(v*2)) for v in values])

# plt.savefig('fig.eps', format='eps')