In [1]:
#!/usr/bin/env python3

"""Loads the data, pre-trained embeddings, and feature sets, trains a voting classifier for task B, and subsequently
   tests the model on the held-out test data."""

from nltk.tokenize import TweetTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import cross_val_predict
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
import numpy as np
import logging
import codecs

from sklearn.multiclass import OneVsRestClassifier
from random import randint
import gensim.models
import word2vecReaderUtils as utils
from word2vecReader import *
import json

from load import parse_dataset

In [2]:
def ngramFeaturize(corpus):
    """Build a dense unigram + bigram count matrix for ``corpus``.

    Tweets are tokenized with NLTK's ``TweetTokenizer`` (``preserve_case=False``,
    ``reduce_len=True``, ``strip_handles=True``) and counted with a
    ``CountVectorizer`` over n-grams of length 1 and 2.

    Args:
        corpus: iterable of raw tweet strings.

    Returns:
        Dense array with one row per tweet and one column per n-gram.
    """
    tokenizer = TweetTokenizer(preserve_case=False, reduce_len=True, strip_handles=True).tokenize
    bigram_vectorizer = CountVectorizer(ngram_range=(1, 2), tokenizer=tokenizer, min_df=1)

    X = bigram_vectorizer.fit_transform(corpus).toarray()
    # Report the vocabulary size from the matrix width: `get_feature_names()`
    # was removed in scikit-learn 1.2, while the column count is available in
    # every version.
    print("len features", X.shape[1])
    return X

In [6]:
  def wvVectors(corpus):
    """Look up an embedding for every known token of every tweet in ``corpus``.

    Each tweet is tokenized with ``TweetTokenizer`` and every token is looked
    up first in the Twitter word2vec model, then in the emoji2vec model.
    Tokens found in neither model are skipped. Emoji vectors are zero-padded
    on the right so that they have the same width as the word vectors.

    Both models are loaded from disk on every call.

    Args:
        corpus: iterable of raw tweet strings.

    Returns:
        A list with one entry per tweet; each entry is the (possibly empty)
        list of vectors found for that tweet's tokens, in token order.
    """
    # Width every returned vector should have; wvConcatVectorsFeaturize pads
    # missing positions with np.zeros(400), so the two must agree.
    # NOTE(review): assumes the Twitter word2vec model is 400-dimensional - confirm.
    WORD_VECTOR_DIM = 400

    tokenizer = TweetTokenizer(preserve_case=False, reduce_len=True, strip_handles=True).tokenize
    wvModel = Word2Vec.load_word2vec_format('../install_dependencies/word2vec_twitter_model.bin', binary=True)
    emojiModel = gensim.models.KeyedVectors.load_word2vec_format('../../Task3/extra_resources/emoji2vec.bin', binary=True)

    tweetVectors = []
    for tweet in corpus:
        tokens = tokenizer(tweet)
        sentVectors = []
        for word in tokens:
            if word in wvModel:
                sentVectors.append(wvModel[word])
            elif word in emojiModel:
                emojiVector = emojiModel[word]
                # Pad up to the word-vector width (100 zeros for a
                # 300-dimensional emoji vector, as before).
                padding = np.zeros(WORD_VECTOR_DIM - len(emojiVector))
                sentVectors.append(np.concatenate((emojiVector, padding)))
        if not sentVectors:
            # No token was covered by either model; the tweet keeps an empty list.
            print("empty sentence", tokens)
        tweetVectors.append(sentVectors)
    return tweetVectors

In [7]:
def wvConcatVectorsFeaturize(corpus, maxlength_w=41, vector_dim=400):
    """Turn each tweet into one fixed-length vector of concatenated embeddings.

    The per-token vectors returned by ``wvVectors`` are laid out back to back.
    Tweets with fewer than ``maxlength_w`` vectors are right-padded with
    zeros; vectors beyond position ``maxlength_w`` are dropped.

    Args:
        corpus: iterable of raw tweet strings.
        maxlength_w: number of token slots per tweet. The default of 41 was
            chosen from both the train and the test data.
        vector_dim: width of a single token vector.

    Returns:
        A list with one float64 array of length ``maxlength_w * vector_dim``
        per tweet.
    """
    concatenatedVectors = []
    for vecs in wvVectors(corpus):
        # Preallocate the zero-padded row and fill in the slots that exist,
        # instead of re-copying a growing array with np.concatenate per token.
        concatVec = np.zeros(maxlength_w * vector_dim)
        for slot, vec in enumerate(vecs[:maxlength_w]):
            start = slot * vector_dim
            concatVec[start:start + vector_dim] = vec
        concatenatedVectors.append(concatVec)
    return concatenatedVectors

In [9]:
# Experiment settings

DATASET_FP = "../datasets/train/SemEval2018-T3-train-taskB_emoji.txt"
TASK = "B" # Define, A or B
FNAME = './predictions-task' + TASK + '.txt'
# The cross-validation cell is responsible for closing this handle.
PREDICTIONSFILE = open(FNAME, "w")
EXTRA_FEATURES = 1

# Load the training tweets and the gold test tweets with their labels.
corpus, y = parse_dataset(DATASET_FP)

test_corpus, test_y = parse_dataset('../datasets/goldtest_TaskB/SemEval2018-T3_gold_test_taskB_emoji.txt')

# One n-gram count matrix over train + test so both share the same columns;
# rows [0, len(corpus)) are the training tweets, the rest are the test tweets.
# NOTE(review): this fits the n-gram vocabulary on the test tweets as well.
X_bigram = ngramFeaturize(corpus+test_corpus)

# Concatenated word/emoji embeddings for the training tweets.
X = wvConcatVectorsFeaturize(corpus)

X = list(X)
y = np.array(y)

if EXTRA_FEATURES:
    # Pass the path so numpy opens and closes the file itself; the previous
    # np.load(open(...)) left the file handle open.
    extraFeatures = np.load('./all_train_feats_TaskB.npy')
    # Final training row = embeddings + n-gram counts + precomputed features.
    for i, embedding_row in enumerate(X):
        X[i] = np.concatenate((embedding_row, X_bigram[i], extraFeatures[i]))
print("X dimension",len(X[0]))
X = np.array(X)

# Pairs of [label, number of training tweets with that label].
class_counts = np.asarray(np.unique(y, return_counts=True)).T.tolist()
print ("class counts:", class_counts)

len features 60360
empty sentence ['illridewithyou']
empty sentence []
X dimension 76847
class counts: [[0, 1923], [1, 1390], [2, 316], [3, 205]]


In [10]:
# Number of folds used for cross-validation.
K_FOLDS = 10

# Three liblinear one-vs-rest logistic regressions that differ only in how
# heavily classes 2 and 3 are weighted (x3, x1 and x2 respectively).
clf1, clf2, clf3 = (
    LogisticRegression(multi_class='ovr', solver='liblinear', class_weight=weights)
    for weights in (
        {0: 1, 1: 1, 2: 3, 3: 3},
        {0: 1, 1: 1, 2: 1, 3: 1},
        {0: 1, 1: 1, 2: 2, 3: 2},
    )
)

# Soft voting over the three estimators.
CLF = VotingClassifier(
    estimators=list(zip(('lr1', 'lr2', 'lr3'), (clf1, clf2, clf3))),
    voting='soft',
    n_jobs=-1,
)

In [15]:
# Out-of-fold predictions: an array of the same size as `y` in which every
# entry was predicted by a model that did not see that sample during fitting.
predicted = cross_val_predict(CLF, X, y, cv=K_FOLDS)

# The F1 computation depends on the task. Each branch prints only the scores
# it computed; previously the macro score was printed unconditionally, which
# raised a NameError for task A.
task = TASK.lower()
if task == 'a':
    score = metrics.f1_score(y, predicted, pos_label=1)
    print ("F1-score Task", TASK, score)
elif task == 'b':
    # average=None returns the F1-score of every class separately.
    score = metrics.f1_score(y, predicted, average=None)
    score_ = metrics.f1_score(y, predicted, average='macro')
    print ("F1-score Task", TASK, score)
    print ("F1-score Task", TASK, score_)
else:
    raise ValueError("TASK must be 'A' or 'B', got {!r}".format(TASK))

# Release the handle opened in the settings cell (closing twice is harmless)
# and reopen the file by name, so this cell can be re-run and the file is
# closed even if writing fails.
PREDICTIONSFILE.close()
with open(FNAME, "w") as predictions_out:
    for p in predicted:
        predictions_out.write("{}\n".format(p))

F1-score Task B [ 0.70644391  0.65843331  0.27684964  0.04444444]
F1-score Task B 0.421542827585


In [16]:
# The n-gram matrix was built over train + test; the rows after the training
# tweets belong to the test tweets.
X_bigram_vec_test = X_bigram[len(corpus):]
X_test = wvConcatVectorsFeaturize(test_corpus)

X_test = list(X_test)

if EXTRA_FEATURES:
    extraFeatures = np.load("./all_test_feats_TaskB.npy")
    # Same column layout as the training rows: embeddings + n-grams + extras.
    for i, embedding_row in enumerate(X_test):
        X_test[i] = np.concatenate((embedding_row, X_bigram_vec_test[i], extraFeatures[i]))
print("X_test dimension",len(X_test[0]))
X_test = np.array(X_test)

print("Fit on the whole Train ...")
CLF.fit(X, y)

print("Ready to TEST")


y_test_predicted = CLF.predict(X_test)

with open('test_prediction_TaskB_shiva.txt', 'w') as f:
    # The loop variable must not be called `y`: that would overwrite the
    # training labels and break any later CLF.fit(X, y) or re-run of the
    # cross-validation cell.
    for predicted_label in y_test_predicted:
        f.write(str(predicted_label)+"\n")

# Per-class and macro-averaged F1 against the gold test labels.
score = metrics.f1_score(test_y, y_test_predicted, average=None)
score_ = metrics.f1_score(test_y, y_test_predicted, average='macro')
print ("F1-score Task", TASK, score)
print ("F1-score Task", TASK, score_)

X_test dimension 76847
Fit on the whole Train ...
Ready to TEST
F1-score Task B [ 0.78991597  0.56493506  0.31404959  0.        ]
F1-score Task B 0.417225154525
