In [5]:
import os
import time
import sys
import re
from subprocess import call
import numpy as np
from nltk import TweetTokenizer
from nltk.tokenize.stanford import StanfordTokenizer

# Downloading the models

As mentioned in the readme, here are the pretrained models you can download:

- [sent2vec_wiki_unigrams](https://drive.google.com/open?id=0B6VhzidiLvjSa19uYWlLUEkzX3c) 5GB (600dim, trained on english wikipedia)
- [sent2vec_wiki_bigrams](https://drive.google.com/open?id=0B6VhzidiLvjSaER5YkJUdWdPWU0) 16GB (700dim, trained on english wikipedia)
- [sent2vec_twitter_unigrams](https://drive.google.com/open?id=0B6VhzidiLvjSaVFLM0xJNk9DTzg) 13GB (700dim, trained on english tweets)
- [sent2vec_twitter_bigrams](https://drive.google.com/open?id=0B6VhzidiLvjSeHI4cmdQdXpTRHc) 23GB (700dim, trained on english tweets)
- [sent2vec_toronto books_unigrams](https://drive.google.com/open?id=0B6VhzidiLvjSOWdGM0tOX1lUNEk) 2GB (700dim, trained on the [BookCorpus dataset](http://yknzhu.wixsite.com/mbweb))
- [sent2vec_toronto books_bigrams](https://drive.google.com/open?id=0B6VhzidiLvjSdENLSEhrdWprQ0k) 7GB (700dim, trained on the [BookCorpus dataset](http://yknzhu.wixsite.com/mbweb))

# Code provided by Pagliardini et al.
For more details, please visit authors' [repository](https://github.com/epfml/sent2vec).

In [6]:
FASTTEXT_EXEC_PATH = os.path.abspath("./fasttext")

BASE_SNLP_PATH = "/home/aires/repositories/sent2vec/models/stanford-corenlp-full-2017-06-09/"
SNLP_TAGGER_JAR = os.path.join(BASE_SNLP_PATH, "stanford-corenlp-3.8.0.jar")

MODEL_WIKI_UNIGRAMS = os.path.abspath("models/wiki_unigrams.bin")
# MODEL_WIKI_BIGRAMS = os.path.abspath("./sent2vec_wiki_bigrams")
# MODEL_TORONTOBOOKS_UNIGRAMS = os.path.abspath("./sent2vec_wiki_unigrams")
# MODEL_TORONTOBOOKS_BIGRAMS = os.path.abspath("./sent2vec_wiki_bigrams")
# MODEL_TWITTER_UNIGRAMS = os.path.abspath('./sent2vec_twitter_unigrams')
# MODEL_TWITTER_BIGRAMS = os.path.abspath('./sent2vec_twitter_bigrams')

# Generating sentence embeddings

Now you can just run the following cells:

## Utils for tokenization

In [7]:
def tokenize(tknzr, sentence, to_lower=True):
    """Arguments:
        - tknzr: a tokenizer implementing the NLTK tokenizer interface
        - sentence: a string to be tokenized
        - to_lower: lowercasing or not
    """
    sentence = sentence.strip()
    sentence = ' '.join([format_token(x) for x in tknzr.tokenize(sentence)])
    if to_lower:
        sentence = sentence.lower()
    sentence = re.sub('((www\.[^\s]+)|(https?://[^\s]+)|(http?://[^\s]+))','<url>',sentence) #replace urls by <url>
    sentence = re.sub('(\@[^\s]+)','<user>',sentence) #replace @user268 by <user>
    filter(lambda word: ' ' not in word, sentence)
    return sentence

def format_token(token):
    """"""
    if token == '-LRB-':
        token = '('
    elif token == '-RRB-':
        token = ')'
    elif token == '-RSB-':
        token = ']'
    elif token == '-LSB-':
        token = '['
    elif token == '-LCB-':
        token = '{'
    elif token == '-RCB-':
        token = '}'
    return token

def tokenize_sentences(tknzr, sentences, to_lower=True):
    """Arguments:
        - tknzr: a tokenizer implementing the NLTK tokenizer interface
        - sentences: a list of sentences
        - to_lower: lowercasing or not
    """
    return [tokenize(tknzr, s, to_lower) for s in sentences]

## Utils for inferring embeddings

In [8]:
def get_embeddings_for_preprocessed_sentences(sentences, model_path, fasttext_exec_path):
    """Arguments:
        - sentences: a list of preprocessed sentences
        - model_path: a path to the sent2vec .bin model
        - fasttext_exec_path: a path to the fasttext executable
    """
    timestamp = str(time.time())
#     print timestamp
    test_path = os.path.abspath('./'+timestamp+'_fasttext.test.txt')
    embeddings_path = os.path.abspath('./'+timestamp+'_fasttext.embeddings.txt')
#     print test_path, embeddings_path
    dump_text_to_disk(test_path, sentences)
    call(fasttext_exec_path+
          ' print-sentence-vectors '+
          model_path + ' < '+
          test_path + ' > ' +
          embeddings_path, shell=True)
    embeddings = read_embeddings(embeddings_path)
    os.remove(test_path)
    os.remove(embeddings_path)
#     print len(embeddings), len(sentences)
#     print embeddings, sentences
    assert(len(sentences) == len(embeddings))
    return np.array(embeddings)

def read_embeddings(embeddings_path):
    """Arguments:
        - embeddings_path: path to the embeddings
    """
#     print "I'm reading here. ", embeddings_path
    with open(embeddings_path, 'r') as in_stream:
        embeddings = []
        for line in in_stream:
#             print line
            line = '['+line.replace(' ',',')+']'
            embeddings.append(eval(line))
        return embeddings
    return []

def dump_text_to_disk(file_path, X, Y=None):
    """Arguments:
        - file_path: where to dump the data
        - X: list of sentences to dump
        - Y: labels, if any
    """
    with open(file_path, 'w') as out_stream:
#         print "start writting here. y:", Y
        if Y is not None:
            for x, y in zip(X, Y):
                out_stream.write('__label__'+str(y)+' '+x+' \n')
        else:
            for x in X:
#                 print x
                out_stream.write(x+' \n')

def get_sentence_embeddings(sentences, ngram='unigrams', model='concat_wiki_twitter'):
    """ Returns a numpy matrix of embeddings for one of the published models. It
    handles tokenization and can be given raw sentences.
    Arguments:
        - ngram: 'unigrams' or 'bigrams'
        - model: 'wiki', 'twitter', or 'concat_wiki_twitter'
        - sentences: a list of raw sentences ['Once upon a time', 'This is another sentence.', ...]
    """
    wiki_embeddings = None
    twitter_embbedings = None
    tokenized_sentences_NLTK_tweets = None
    tokenized_sentences_SNLP = None
    if model == "wiki" or model == 'concat_wiki_twitter':
        tknzr = StanfordTokenizer(SNLP_TAGGER_JAR, encoding='utf-8')
        s = ' <delimiter> '.join(sentences) #just a trick to make things faster
        tokenized_sentences_SNLP = tokenize_sentences(tknzr, [s])
        tokenized_sentences_SNLP = tokenized_sentences_SNLP[0].split(' <delimiter> ')
        assert(len(tokenized_sentences_SNLP) == len(sentences))
        if ngram == 'unigrams':
            wiki_embeddings = get_embeddings_for_preprocessed_sentences(tokenized_sentences_SNLP, \
                                     MODEL_WIKI_UNIGRAMS, FASTTEXT_EXEC_PATH)
        else:
            wiki_embeddings = get_embeddings_for_preprocessed_sentences(tokenized_sentences_SNLP, \
                                     MODEL_WIKI_BIGRAMS, FASTTEXT_EXEC_PATH)
    if model == "twitter" or model == 'concat_wiki_twitter':
        tknzr = TweetTokenizer()
        tokenized_sentences_NLTK_tweets = tokenize_sentences(tknzr, sentences)
        if ngram == 'unigrams':
            twitter_embbedings = get_embeddings_for_preprocessed_sentences(tokenized_sentences_NLTK_tweets, \
                                     MODEL_TWITTER_UNIGRAMS, FASTTEXT_EXEC_PATH)
        else:
            twitter_embbedings = get_embeddings_for_preprocessed_sentences(tokenized_sentences_NLTK_tweets, \
                                     MODEL_TWITTER_BIGRAMS, FASTTEXT_EXEC_PATH)
    if model == "twitter":
        return twitter_embbedings
    elif model == "wiki":
        return wiki_embeddings
    elif model == "concat_wiki_twitter":
        return np.concatenate((wiki_embeddings, twitter_embbedings), axis=1)
    sys.exit(-1)

# Norm Conflict Identification

In this section of the code we adapt to use sent2vec on the norm conflict identification.

Notice that you need to change the path of conflicts_path and non_conflicts_path in order to make it work.

In [9]:
import random
import pandas as pd

In [10]:
# CONSTANTS
conflicts_path = '/home/aires/datasets/conflicts/conflicts.csv'
non_conflicts_path = '/home/aires/datasets/conflicts/non-conflicts.csv'
CONFLICT = 1
N_CONFLICT = 0

In [11]:
norms_seen = [] # Account new norms.
cnflcts = []
nn_cnflcts = []
y = [] # Norm pair classes (Either 1 for conflicts or 0 for non-conflicts).

# Read conflicts.
df_conflict = pd.read_csv(conflicts_path)

conf_rows = len(df_conflict) # Get the number of conflicting samples.

for i in range(conf_rows):
    # Get norm pair.
    norm1, norm2 = df_conflict['norm1'][i], df_conflict['norm2'][i]
    
    cnflcts.append((norm1, norm2)) # Save the conflicting pair.
    y.append(CONFLICT) # Mark the pair as a conflicting one.
    
    if norm1 not in norms_seen:
        # Avoid adding duplicates.
        norms_seen.append(norm1)
    if norm2 not in norms_seen:
        norms_seen.append(norm2)

# Read non-conflicts.
df_non_conflict = pd.read_csv(non_conflicts_path)

non_conf_rows = len(df_non_conflict)

for i in range(conf_rows):
    
    j = random.randint(0, non_conf_rows) # Get a random pair of non-conflicting norms.
    # We select a random non conflict pair because we have many more non-conflicting cases than conflicts.
    
    norm1, norm2 = df_non_conflict['norm1'][j], df_non_conflict['norm2'][j]
    
    nn_cnflcts.append((norm1, norm2))
    y.append(N_CONFLICT) # Mark the pair as a non-conflicting one.
    
    if norm1 not in norms_seen:
        norms_seen.append(norm1)
    if norm2 not in norms_seen:
        norms_seen.append(norm2)
        
norms = dict() # Create a dictionary to acess norms addresing an index for each one.
for i, x in enumerate(norms_seen):
    norms[x] = i

IOError: File /home/aires/datasets/conflicts/conflicts.csv does not exist

In [35]:
# Generating embeddings
embeddings = get_sentence_embeddings(norms_seen, ngram='unigrams', model='wiki')

The StanfordTokenizer will be deprecated in version 3.2.5.
Please use [91mnltk.parse.corenlp.CoreNLPTokenizer[0m instead.'


In [36]:
threshold = 2

In [65]:
# Create folds.
from sklearn.model_selection import KFold
random_st = 32
n_folds = [7, 8, 9, 10]

In [40]:
def generate_offset(train, train_indexes):    
    # Create offset.
    embedding_sum = np.zeros(embeddings.shape[1]) # Create and empty array to receive the sum of all
                                                  # conflicting embeddings.
    for i in train_indexes:
        # Sum all embeddings to one.
        embedding_sum += embeddings[norms[train[i][0]]] - embeddings[norms[train[i][1]]]

    # Take the mean to obtain the offset.
    return embedding_sum / train_indexes.shape[0]

In [74]:
# Making a test over real conflicts.
fold = 0

for n_fold in n_folds:
    # Run over the number of folds.
    acc_sum = 0
    
    kf = KFold(n_splits=n_fold, shuffle=True, random_state=random_st)
    
    print "\t\tUsing a total of %d folds." % n_fold
    
    for train_index, test_index in kf.split(cnflcts):

        y_gold = []
        y_pred = []

        print "Working on fold %d." % fold
        fold += 1

        # Generating offset for this fold.
        offset = generate_offset(cnflcts, train_index)

        for i in test_index:
            # Create diffs for conflicts.
            y_gold.append(CONFLICT)
            diff = embeddings[norms[cnflcts[i][0]]] - embeddings[norms[cnflcts[i][1]]]
            conflict_diff = np.linalg.norm(offset - diff)

            if conflict_diff < threshold:
                y_pred.append(CONFLICT)
            else:
                y_pred.append(N_CONFLICT)

            j = random.randint(0, len(nn_cnflcts) - 1)

            diff = embeddings[norms[nn_cnflcts[j][0]]] - embeddings[norms[nn_cnflcts[j][1]]]
            conflict_diff = np.linalg.norm(offset - diff)
            y_gold.append(N_CONFLICT)

            if conflict_diff < threshold:
                y_pred.append(CONFLICT)
            else:
                y_pred.append(N_CONFLICT)

        acc = accuracy_score(y_gold, y_pred)
        print "Accuracy: %.2f" % acc
        acc_sum += acc

    mean_acc = acc_sum / n_fold
    print "Mean accuracy: %.2f" % mean_acc

		Using a total of 7 folds.
Working on fold 0.
Accuracy: 0.93
Working on fold 1.
Accuracy: 0.97
Working on fold 2.
Accuracy: 0.93
Working on fold 3.
Accuracy: 0.97
Working on fold 4.
Accuracy: 0.97
Working on fold 5.
Accuracy: 0.90
Working on fold 6.
Accuracy: 0.96
Mean accuracy: 0.95
		Using a total of 8 folds.
Working on fold 7.
Accuracy: 0.96
Working on fold 8.
Accuracy: 0.96
Working on fold 9.
Accuracy: 0.92
Working on fold 10.
Accuracy: 0.96
Working on fold 11.
Accuracy: 0.96
Working on fold 12.
Accuracy: 0.96
Working on fold 13.
Accuracy: 0.88
Working on fold 14.
Accuracy: 0.96
Mean accuracy: 0.95
		Using a total of 9 folds.
Working on fold 15.
Accuracy: 0.96
Working on fold 16.
Accuracy: 0.96
Working on fold 17.
Accuracy: 0.96
Working on fold 18.
Accuracy: 0.92
Working on fold 19.
Accuracy: 0.96
Working on fold 20.
Accuracy: 0.95
Working on fold 21.
Accuracy: 0.95
Working on fold 22.
Accuracy: 0.91
Working on fold 23.
Accuracy: 0.95
Mean accuracy: 0.95
		Using a total of 10 fold