In [79]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
# from nltk import word_tokenize

from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models.keyedvectors import KeyedVectors

import os
from collections import defaultdict, Counter
from tqdm import tqdm_notebook as tqdm
import pickle
from operator import itemgetter
from IPython.core.debugger import set_trace
import numpy as np
import string

data_dirpath = '/usr2/mamille2/11-830_data/hw4'

# Train author ID simple classifier

In [2]:
# Load data
data = {}
folds = ['classtrain', 'train', 'dev', 'test']

for f in folds:
    data[f] = pd.read_csv(f'/usr2/mamille2/11-830_data/hw4/{f}.csv')
#     print(len(data[f]))

vec = CountVectorizer(min_df=2)
vec.fit(data['classtrain']['text'])

X = {}
for f in folds:
    X[f] = vec.transform(data[f]['text'])
#     print(X[f].shape)

label_dict = {'leia': 0, 'luke': 1}

y = {}
for f in folds:
    y[f] = data[f]['label'].map(lambda x: label_dict[x])
#     print(y[f].shape)

# for f in ['classtrain', 'dev', 'test']:

# Train
clf = LogisticRegression()
clf.fit(X['classtrain'], y['classtrain'])

# Get scores
for f in ['classtrain', 'dev', 'test']:
    print(f)
    preds = clf.predict(X[f])
    print(f"Accuracy: {accuracy_score(preds, y[f])}")
    print(f"Precision: {precision_score(preds, y[f])}")
    print(f"Recall: {recall_score(preds, y[f])}")
    print(f"F1: {f1_score(preds, y[f])}")
    print()

classtrain
Accuracy: 0.97163
Precision: 0.9809
Recall: 0.9630451430478921
F1: 0.9718905743755388

dev
Accuracy: 0.5670080142475512
Precision: 0.5422974176313446
Recall: 0.5704918032786885
F1: 0.5560374343757133

test
Accuracy: 0.95695
Precision: 0.9737
Recall: 0.9421383647798742
F1: 0.9576592082616179



# Examine data

In [12]:
data['train']

Unnamed: 0,label,text
0,luke,i ' d use this place of business again .
1,luke,this business was very highly rated and rightl...
2,luke,great food when you can get in .
3,luke,the pasta was fresh and tasty .
4,luke,salt & pepper calamari - delicious .
5,luke,the kids devoured these .
6,luke,will be a staple on subsequent trips .
7,luke,water boiled fish - this dish was phenomenal .
8,luke,starts slow & builds & builds .
9,luke,"great , great dish ."


In [29]:
# Label distribution
len(data['train'][data['train']['label']=='leia'])/len(data['train'])

0.5

In [30]:
# Label distribution
len(data['test'][data['test']['label']=='leia'])/len(data['test'])

0.5

In [12]:
lexica['luke']

['wife',
 '.',
 'good',
 'is',
 'girlfriend',
 'a',
 'of',
 'you',
 'place',
 'great',
 'excellent',
 'the',
 'beer',
 'as',
 'quality',
 'value',
 'burger',
 'average',
 'in',
 'some',
 'its',
 'las',
 'this',
 'best',
 'solid',
 'food',
 'price',
 'beers',
 'selection',
 'better',
 'real',
 'guys',
 'from',
 'business',
 'buddy',
 'strip',
 'well',
 'bbq',
 'vegas',
 'casino',
 "''",
 'outstanding',
 'has',
 'star',
 'service',
 'steak',
 'these',
 'phoenix',
 'joint',
 'style',
 'sports',
 '``',
 'valley',
 'notch',
 'simply',
 'decent',
 'places',
 'expect',
 'or',
 'gf',
 'seems',
 'if',
 'bachelor',
 'game',
 'bad',
 'your',
 'high',
 'ribs',
 'http://url',
 'chinese',
 'wifey',
 'review',
 'pizza',
 'than',
 'prices',
 'fair',
 '`',
 'folks',
 'bar',
 'bucks',
 'brisket',
 "'ll",
 'most',
 'burgers',
 'city',
 'chain',
 'certainly',
 'steaks',
 'buffet',
 'mexican',
 'craft',
 'years',
 'ipa',
 'standard',
 'charlotte',
 'cool',
 'at',
 'rating',
 'town',
 ';',
 'italian',
 ',',

In [13]:
lexica['leia']

['!',
 'husband',
 'we',
 'boyfriend',
 'so',
 'i',
 'love',
 'our',
 'my',
 'yummy',
 'was',
 '!!',
 'hubby',
 'delicious',
 ':-rrb-',
 'she',
 'loved',
 'bf',
 'us',
 'he',
 'cute',
 'were',
 'super',
 '&',
 'because',
 'ordered',
 "n't",
 'did',
 'definitely',
 'friend',
 'chocolate',
 'amazing',
 'lovely',
 'salad',
 '!!!',
 'her',
 'delish',
 'sweet',
 'fabulous',
 'it',
 'got',
 'beautiful',
 'cake',
 'yum',
 '-',
 'had',
 'mom',
 'hair',
 'came',
 'birthday',
 'happy',
 'excited',
 'and',
 'favorite',
 'dessert',
 'wonderful',
 'tea',
 'cream',
 'wanted',
 'really',
 '?!',
 'girl',
 'sister',
 'also',
 'salon',
 'wedding',
 ':-lrb-',
 'since',
 'girls',
 'too',
 'massage',
 'am',
 'adorable',
 'gorgeous',
 'sooo',
 'husbands',
 'server',
 'hubs',
 'cupcakes',
 'die',
 'him',
 'strawberry',
 ';-rrb-',
 'omg',
 'cupcake',
 'bachelorette',
 'totally',
 '-lrb-',
 'could',
 'veggie',
 'huge',
 'spa',
 'okay',
 '!!!!',
 'rude',
 'lol',
 'his',
 'which',
 'asked',
 'soooo',
 '*',
 'fun

# Add to Leia and Luke lexica

In [94]:
# Build probability tables
assoc = {'leia': {}, 'luke': {}}
texts = {}
texts['leia'] = [w for d in data['train'][data['train']['label']=='leia']['text'].tolist() for w in d.split() if not w in stops]
texts['luke'] = [w for d in data['train'][data['train']['label']=='luke']['text'].tolist() for w in d.split() if not w in stops]
train_vocab = set(texts['leia']).union(set(texts['luke']))
counters = {'leia': Counter(texts['leia']), 'luke': Counter(texts['luke'])}
total_counter = counters['leia'] + counters['luke']
doc_count = {name: len(data['train'][data['train']['label']==name]) for name in authors}

for name in authors:
    for w in tqdm(train_vocab):
        assoc[name][w] = np.log((counters[name][w]/doc_count[name]) / (total_counter[w]/len(data['train'])))

HBox(children=(IntProgress(value=0, max=17348), HTML(value='')))

  del sys.path[0]





HBox(children=(IntProgress(value=0, max=17348), HTML(value='')))




In [95]:
# Sorted associations
sorted_assoc = {name: sorted(assoc[name].items(), key=itemgetter(1), reverse=True) for name in authors}

In [96]:
# Take top n and add to lexica
top = 10000

for name in authors:
    print(name)
    for i in range(top):
        wd, assoc = sorted_assoc[name][i]
        if not wd in lexica[name]:
            lexica[name].append(wd)
            
    print(len(lexica[name]))

leia
10804
luke
10617


In [98]:
# Save new lexica
for name in authors:
    with open(os.path.join(data_dirpath, f'{name}_{top}.txt'), 'w') as f:
        for w in lexica[name]:
            f.write(w+'\n')

# Identify, replace author-specific terms

In [11]:
lexica = {}
stops = ['a', 'the', 'of'] + [c for c in string.punctuation]
for name in ['leia', 'luke']:
    with open(os.path.join(data_dirpath, f'{name}.txt')) as f:
        lexica[name] = [w for w in f.read().splitlines() if not w in stops]
    print(len(lexica[name]))

3000
2997


In [12]:
glove_model = KeyedVectors.load_word2vec_format("/usr2/mamille2/11-830_data/hw4/gensim_glove_6Bvectors300d.txt", binary=False)

len(glove_model.vocab)

400000

## Build mapping from leia > luke lexicon

In [37]:
conversions = {}

In [24]:
# Load extended conversion list
conv_path = os.path.join(data_dirpath, 'lexicon_sims_10000.pkl')
print('Loading conversion lists...')
with open(conv_path, 'rb') as f:
    conversions['extended'] = pickle.load(f)
print(len(conversion['luke']))
print(len(conversion['leia']))

Loading conversion lists...
10166
10187


In [30]:
# Load original conversion list
conv_path = os.path.join(data_dirpath, 'lexicon_sims_3000.pkl')
with open(conv_path, 'rb') as f:
    conversions['orig'] = pickle.load(f)
print(len(conversion['leia']))
print(len(conversion['luke']))

2917
2679


In [36]:
for name in authors:
    print(len(special_conversion[name]))

10187
10166


In [38]:
conversions['extended'] = special_conversion

In [14]:
conversion = {}
conversion['leia'] = {} # Target is Leia's speech
conversion['luke'] = {}

authors = ['leia', 'luke']
for name in authors:
    print(name)
    other = [a for a in authors if a!=name][0]
    
    for w in tqdm(lexica[other]):
        if w in glove_model.vocab:
            conversion[name][w] = sorted([(glove_model.similarity(w, other_w), rank, other_w) for rank, other_w in enumerate(lexica[name]) if other_w in glove_model.vocab], reverse=True)

# Save similarities
# conv_path = os.path.join(data_dirpath, 'lexicon_sims_luke_extended_leia_orig.pkl')
# with open(conv_path, 'wb') as f:
#     pickle.dump(conversion, f)

leia


HBox(children=(IntProgress(value=0, max=2997), HTML(value='')))

KeyboardInterrupt: 

In [7]:
conversion['leia']['wife']

[(0.86463905943091457, 1, 'husband'),
 (0.84308741832967493, 140, 'daughter'),
 (0.80870149281341941, 460, 'mother'),
 (0.7343635981724328, 563, 'married'),
 (0.70373893088560702, 1086, 'daughters'),
 (0.69169295249234264, 62, 'sister'),
 (0.68089037680147291, 8099, 'father'),
 (0.67667515116835109, 7550, 'grandmother'),
 (0.67516156845518138, 35, 'her'),
 (0.6694897401467258, 572, 'niece'),
 (0.66527166451062381, 192, 'son'),
 (0.65429199982189712, 29, 'friend'),
 (0.62811667698436624, 1460, 'granddaughter'),
 (0.62243296618177568, 468, 'cousin'),
 (0.61271282713328368, 9267, 'couple'),
 (0.60902210852241245, 745, 'aunt'),
 (0.60336544379573442, 15, 'she'),
 (0.60022967604749222, 1630, 'brother'),
 (0.59869925990638828, 151, 'woman'),
 (0.58550354983788977, 4716, 'whom'),
 (0.57655378387779466, 807, 'lady'),
 (0.57410036263328246, 3, 'boyfriend'),
 (0.5518234406347734, 9099, 'marry'),
 (0.54136929771820019, 326, 'parents'),
 (0.54084921718376222, 1117, 'herself'),
 (0.5399629205253027

## Obfuscate

In [80]:
def obfuscate(text, tgt, threshold, conversion, mod):
    
    toks = text.split()
    obf_toks = []
    
    for t in toks:
        if t in conversion[tgt] and conversion[tgt][t][0][0] > threshold:
#             obf_toks.append(conversion[name][t][0][-1])
            candidates = []
            for i in range(len(conversion[tgt][t])):
                item = conversion[tgt][t][i]
                if item[0] > threshold and not item[-1] in stops:
                    candidates.append(item)
                else:
                    break
                    
            # Select highest ranked among candidates
#             ranked = sorted(conversion[name][t], key=itemgetter(1))
#             for i in range(len(ranked)):
#                 if ranked[i][0] > threshold:
#                     obf_toks.append(ranked[i][-1])
#                     break
                    
            best = sorted(candidates, key=itemgetter(1))[0]
            if mod == 'mod':
                if best[-1] == t:
                    obf_toks.append(best[-1])
                else:
                    obf_toks.append(best[-1]+'z')
            else:
                obf_toks.append(best[-1])
            
        else:
#             if mod == 'mod':
#                 obf_toks.append(t+'z')
#             else:
#                 obf_toks.append(t)
            obf_toks.append(t)
            
    return ' '.join(obf_toks)

In [86]:
# folds = ['train', 'dev', 'test']
# folds = ['train', 'test']
folds = ['test']
authors = ['leia', 'luke']
# thresholds = [0.3, 0.5, 0.7]
thresholds = [0.5, 0.7]
# thresholds = [0.3]
lexicon = 'extended'
# lexicon = 'orig'
label_dict = {'leia': 0, 'luke': 1}
reverse_label_dict = {0: 'leia', 1: 'luke'}
# mod = 'mod'
mod = 'mod'

for t in thresholds:
    print(t)
    
    X = {}
    y = {}
    
    for f in folds:
        for name in authors:
            data[f][f'obfuscated_{lexicon}_{t}_{name}_{mod}'] = \
                    list(map(lambda x: obfuscate(x, name, t, conversions[lexicon], mod), tqdm(data[f]['text'].tolist())))

        for name in authors:
            print(f"Target: {name}")
            X[f] = vec.transform(data[f][f'obfuscated_{lexicon}_{t}_{name}_{mod}'])
            y[f] = data[f]['label'].map(lambda x: label_dict[x])

            # Get scores
            preds = clf.predict(X[f])
            data[f][f'obfuscated_{lexicon}_{t}_{name}_{mod}_preds'] = [reverse_label_dict[p] for p in preds]
            print(f"Accuracy: {accuracy_score(preds, y[f])}")
    #         print(f"Precision: {precision_score(preds, y[f])}")
    #         print(f"Recall: {recall_score(preds, y[f])}")
    #         print(f"F1: {f1_score(preds, y[f])}")
            print()
        print()
    print()

0.5


HBox(children=(IntProgress(value=0, max=20000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=20000), HTML(value='')))


Target: leia
Accuracy: 0.59965

Target: luke
Accuracy: 0.5597



0.7


HBox(children=(IntProgress(value=0, max=20000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=20000), HTML(value='')))


Target: leia
Accuracy: 0.6196

Target: luke
Accuracy: 0.59205





In [84]:
data['test'].columns

Index(['label', 'text', 'obfuscated_leia', 'obfuscated_luke',
       'obfuscated_extended_0.3_leia', 'obfuscated_extended_0.3_luke',
       'obfuscated_orig_0.3_leia', 'obfuscated_orig_0.3_luke',
       'obfuscated_extended_0.3_leia_mod', 'obfuscated_extended_0.3_luke_mod',
       'obfuscated_extended_0.3_leia_mod_preds',
       'obfuscated_extended_0.3_luke_mod_preds',
       'obfuscated_extended_0.5_leia_mod', 'obfuscated_extended_0.5_luke_mod',
       'obfuscated_extended_0.5_leia_mod_preds',
       'obfuscated_extended_0.5_luke_mod_preds',
       'obfuscated_extended_0.7_leia_mod', 'obfuscated_extended_0.7_luke_mod',
       'obfuscated_extended_0.7_leia_mod_preds',
       'obfuscated_extended_0.7_luke_mod_preds',
       'obfuscated_extended_0.5_leia_nomod',
       'obfuscated_extended_0.5_luke_nomod',
       'obfuscated_extended_0.5_leia_nomod_preds',
       'obfuscated_extended_0.5_luke_nomod_preds',
       'obfuscated_extended_0.7_leia_nomod',
       'obfuscated_extended_0.7_luke

## Error analysis

In [75]:
pd.set_option('display.max_colwidth', -1)

In [90]:
data['test'].loc[:, ['text','obfuscated_extended_0.5_leia_mod', 'obfuscated_extended_0.5_luke_mod',
                    'label', 'obfuscated_extended_0.5_leia_mod_preds', 'obfuscated_extended_0.5_luke_mod_preds']]

Unnamed: 0,text,obfuscated_extended_0.5_leia_mod,obfuscated_extended_0.5_luke_mod,label,obfuscated_extended_0.5_leia_mod_preds,obfuscated_extended_0.5_luke_mod_preds
0,ok but not special .,!z wez wez special wez,ok .z not special .,luke,luke,luke
1,my wife ordered the tenderloin sliders .,husbandz husbandz orderedz the grilledz sliderz wez,goodz wife judgez the steakz sliders .,luke,luke,luke
2,not the best i ever had but good .,wez the n'tz wez wez wez wez wez wez,not the best .z ever somez .z good .,luke,luke,luke
3,"while not traditional , both are tasty .",soz wez folkz wez wez wez deliciousz wez,.z not traditional .z .z are tasty .,luke,luke,luke
4,all excellent and perfectly cooked .,wez wonderfulz soz perfectz deliciousz wez,all excellent .z seemsz cooked .,luke,luke,luke
5,they are the most flavorful of the animal .,wez wez the soz deliciousz of the petz wez,.z are the most saucesz of the meatz .,luke,luke,luke
6,"and the bonus , no corkage .",soz the bonusz wez wez corkage wez,.z the bonusz .z no corkage .,luke,luke,luke
7,the food was extremely edible .,the drinksz soz soz fruitz wez,the food .z wellz edible .,luke,luke,luke
8,the chicken and broccoli is also very good .,the saladz soz veggiesz soz soz wez wez wez,the steakz .z turnipsz is .z very good .,luke,luke,luke
9,i ordered yellowtail and they gave me red snapper .,wez orderedz yellowtail soz wez gave !z greenz snapper wez,.z judgez yellowtail .z .z thatz goodz soxz grouperz .,luke,luke,luke


In [95]:
# True positives
sel = data['test'][(data['test']['obfuscated_extended_0.5_leia_mod_preds']=='leia') & (data['test']['obfuscated_extended_0.5_luke_mod_preds']=='luke')]
sel.loc[:, ['text','obfuscated_extended_0.5_leia_mod', 'obfuscated_extended_0.5_luke_mod', 
                    'label', 'obfuscated_extended_0.5_leia_mod_preds', 'obfuscated_extended_0.5_luke_mod_preds']]

Unnamed: 0,text,obfuscated_extended_0.5_leia_mod,obfuscated_extended_0.5_luke_mod,label,obfuscated_extended_0.5_leia_mod_preds,obfuscated_extended_0.5_luke_mod_preds
4381,love the chocolate strawberry truffle cake .,love the chocolate strawberry trufflez cake wez,goodz the piez shortcakez porciniz cookedz .,luke,leia,luke
5610,we had the vegetarian gluten-free prix fixe .,we wez the vegetarian gluten-free prix fixe wez,.z somez the cuisinez gluten-free grandz fixe .,luke,leia,luke
10000,"on monday , i spoke to him on the phone and he said he could come by to check out my car the following afternoon .",itz camez wez wez spoke wez him itz the phone soz he said he wez wez wasz wez checkedz wez husbandz mercedesz the following afternoon wez,.z onz .z .z interviewedz .z .z .z the internetz .z .z ''z .z .z .z thisz .z youz .z goodz vehiclez the .z herez .,leia,leia,luke
10003,"when they brought us a 0 course menu , we said that we were there for the st patrick ' s day irish stew , guinness etc . .",when wez brought us a 2z course dessertz wez we said wez we soz wez soz the chapelz patrick --z sz day scottishz soupz wez guinessz etc wez wez,.z .z .z americanz a 0 course buffetz .z .z ''z .z .z .z .z .z the stz patrick `z sz .z irish stew .z guinness etc . .,leia,leia,luke
10009,it was just two of us so we totally felt like little <UNK> with 0 big pretzels and a pizza !,wez soz wez werez of us so we reallyz felt wez soz <UNK> soz 2z soz pretzels soz a cheesez !z,.z .z .z inz of americanz .z .z seemsz certainlyz .z .z <UNK> .z 0 big nachosz .z a pizza youz,leia,leia,luke
10018,we stumbled upon this gem as we were walking up crescent street to find something that appealed to both of us .,we slippedz upon wez diamondsz soz we soz walking wez crescent streetsz wez wez wez wez appealed wez wez of us wez,.z stumbledz onz this gem as .z .z bikingz .z crescent street .z goodz goodz .z courtz .z .z of americanz .,leia,leia,luke
10032,"she made my fiancã © , dog and i very comfortable and it felt like a friend or family member was <UNK> us .",she soz husbandz fiancã © wez petz soz wez wez uncomfortablez soz wez felt wez a friend soz husbandz member soz <UNK> us wez,wifez .z goodz fiancã © .z horsez .z .z very quitez .z .z certainlyz .z a wifez .z wifez electedz .z <UNK> americanz .,leia,leia,luke
10056,"okay , so i had it pretty good , being waited on by a young hostess , treated to glasses of wine and super comfortable seating in our own private box .",okayz wez so wez wez wez wez wez wez wez waited itz wasz a girlz hostess wez treatmentz wez glasses of dessertz soz super uncomfortablez seatedz wasz our soz privatez boxesz wez,okz .z .z .z somez .z pretty good .z .z waitsz .z thisz a agedz hostessesz .z beingz .z bottlesz of beerz .z bowlz quitez seating .z goodz youz businessz boxz .,leia,leia,luke
10060,we had the flourless chocolate cake with the blood orange <UNK> sorbet was out of control .,we wez the flourless chocolate cake soz the bloodz orangez <UNK> sorbet soz wez of gripz wez,.z somez the flourless piez cookedz .z the urinez yellowz <UNK> gazpachoz .z .z of control .,leia,leia,luke
10079,the onion soup was so rich and creamy along with the roast peppers and quinoa salad that added a bit of texture .,the saladz saladz soz so richerz soz creamy andz soz the deliciousz cucumberz soz quinoa deliciousz wez wez a soz of creamyz wez,the onion cookedz .z .z wealthz .z saucesz wellz .z the roast chiliz .z quinoa steakz .z .z a goodz of texture .,leia,leia,luke


In [96]:
# Incorrectly Luke
sel = data['test'][(data['test']['obfuscated_extended_0.5_leia_mod_preds']=='luke') & (data['test']['obfuscated_extended_0.5_luke_mod_preds']=='luke') &\
                      (data['test']['label']=='luke')]
sel.loc[:, ['text','obfuscated_extended_0.5_leia_mod', 'obfuscated_extended_0.5_luke_mod', 
                    'label', 'obfuscated_extended_0.5_leia_mod_preds', 'obfuscated_extended_0.5_luke_mod_preds']]

Unnamed: 0,text,obfuscated_extended_0.5_leia_mod,obfuscated_extended_0.5_luke_mod,label,obfuscated_extended_0.5_leia_mod_preds,obfuscated_extended_0.5_luke_mod_preds
0,ok but not special .,!z wez wez special wez,ok .z not special .,luke,luke,luke
1,my wife ordered the tenderloin sliders .,husbandz husbandz orderedz the grilledz sliderz wez,goodz wife judgez the steakz sliders .,luke,luke,luke
2,not the best i ever had but good .,wez the n'tz wez wez wez wez wez wez,not the best .z ever somez .z good .,luke,luke,luke
3,"while not traditional , both are tasty .",soz wez folkz wez wez wez deliciousz wez,.z not traditional .z .z are tasty .,luke,luke,luke
4,all excellent and perfectly cooked .,wez wonderfulz soz perfectz deliciousz wez,all excellent .z seemsz cooked .,luke,luke,luke
5,they are the most flavorful of the animal .,wez wez the soz deliciousz of the petz wez,.z are the most saucesz of the meatz .,luke,luke,luke
6,"and the bonus , no corkage .",soz the bonusz wez wez corkage wez,.z the bonusz .z no corkage .,luke,luke,luke
7,the food was extremely edible .,the drinksz soz soz fruitz wez,the food .z wellz edible .,luke,luke,luke
8,the chicken and broccoli is also very good .,the saladz soz veggiesz soz soz wez wez wez,the steakz .z turnipsz is .z very good .,luke,luke,luke
9,i ordered yellowtail and they gave me red snapper .,wez orderedz yellowtail soz wez gave !z greenz snapper wez,.z judgez yellowtail .z .z thatz goodz soxz grouperz .,luke,luke,luke


In [98]:
# Incorrectly Leia
sel = data['test'][(data['test']['obfuscated_extended_0.5_leia_mod_preds']=='leia') & (data['test']['obfuscated_extended_0.5_luke_mod_preds']=='leia') &\
                      (data['test']['label']=='leia')]
sel.loc[:, ['text','obfuscated_extended_0.5_leia_mod', 'obfuscated_extended_0.5_luke_mod', 
                    'label', 'obfuscated_extended_0.5_leia_mod_preds', 'obfuscated_extended_0.5_luke_mod_preds']]

Unnamed: 0,text,obfuscated_extended_0.5_leia_mod,obfuscated_extended_0.5_luke_mod,label,obfuscated_extended_0.5_leia_mod_preds,obfuscated_extended_0.5_luke_mod_preds
10001,"we ended up getting a side of the bbq sauce to dip our fries in , it was <UNK> ! ! ! the area isnt that great and the casino is old , but the craps table is hot and it was fun !",we ended wez wez a butz of the bbq deliciousz wez dip our crispyz wasz wez wez soz <UNK> !z !z !z the outsidez isnt wez iz soz the harrahz soz girlz wez wez the craps table soz warmz soz wez soz lovez !z,.z nearlyz .z goodz a onz of the bbq steakz .z dropz goodz fries .z .z .z .z <UNK> youz youz youz the cityz isnt .z great .z the casino is old .z .z the craps roomz is hot .z .z .z goodz youz,leia,leia,leia
10002,"our drink order was wrong , the champagne we ordered was delivered by a bartender who could not even open the bottle - lrb - i had to do it for him - rrb - and we waited ages for water .",our milkz order soz wez wez the champagne we orderedz soz speechz wasz a waitressz who wez wez wez opensz the champagnez - lrb - wez wez wez wez wez soz him - rrb - soz we waited childrenz soz waterz wez,goodz beerz shouldz .z wrong .z the bottlesz .z judgez .z delivered thisz a busboyz wifez .z not .z openingz the beerz - lrb - .z somez .z .z .z .z .z - rrb - .z .z waitsz agedz .z supplyz .,leia,leia,leia
10015,"first of all , i love the name of this place : - rrb - i ' ll be purchasing a t-shirt , lol !",soz of wez wez wez love the originallyz of wez wez ;z - rrb - wez --z jz wez purchasez a jeansz wez lol !z,.z of all .z .z goodz the samez of this place : - rrb - .z `z ll be buyz a shirtz .z lol youz,leia,leia,leia
10027,"all of the food we had was great , we did the <UNK> buffet one morning and all of the <UNK> and dinners for the business trip we were on were great .",wez of the drinksz we wez soz iz wez we wez the <UNK> dessertz wez morning soz wez of the <UNK> soz brunchz soz the privatez dayz we soz itz soz iz wez,all of the food .z somez .z great .z .z .z the <UNK> buffet .z newsz .z all of the <UNK> .z mealz .z the business trip .z .z .z .z great .,leia,leia,leia
10033,we ' ve been back many many times since and received the same incredible food and service - lrb - we <UNK> the servers here - rrb - that originally made us fall in love with grimaldi ' s .,we --z ve wez wez wez wez evenz since soz received the wez amazingz drinksz soz service - lrb - we <UNK> the serverz wez - rrb - wez originally soz us becausez wasz love soz grimaldi --z sz wez,.z `z ve .z .z .z .z .z .z .z fromz the .z incredible food .z service - lrb - .z <UNK> the softwarez here - rrb - .z builtz .z americanz fall .z goodz .z grimaldi `z sz .,leia,leia,leia
10034,"overall i ' d give the sandwich 0 stars , partly because i should ' ve asked for less mustard and partly because i wish they had challah and finally because i ordered the wrong thing .",decreasez wez --z dz wez the saladz 2z actorz wez becausez because wez wez --z ve asked soz soz honeyz soz becausez because wez wish wez wez challah soz wez because wez orderedz the wez wez wez,overall .z `z fz .z the sandwich 0 stars .z resultz .z .z should `z ve youz .z .z mustard .z resultz .z .z goodz .z somez briochez .z ifz .z .z judgez the wrong goodz .,leia,leia,leia
10042,"she never makes eye contact with me - lrb - i am demanding that way - good customer service , eye contact , smile - rrb - and today she was chatting it up with another customer waiting at the bar .",she wez soz eyesz contact soz !z - lrb - wez iz demanding wez wez - wez customerz service wez eyesz contact wez smile - rrb - soz wez she soz chatting wez wez soz soz customerz waiting timez the barsz wez,wifez .z goodz eyez contact .z goodz - lrb - .z veryz demandz .z way - good customersz service .z eyez contact .z smilingz - rrb - .z .z wifez .z jokingz .z .z .z .z customersz 'llz .z the bar .,leia,leia,leia
10050,it ' s all really high platforms the cute bilingual sandals are not there i asked an employee & what he showed me was n't in the same planet not sure i ' ll go back & wo n't be referring friend there for sure,wez --z sz wez wez high platforms the cute languagez sandals wez wez wez wez asked wasz workersz & wez he hadz !z soz wez wasz the wez planet wez wez wez --z jz !z wez & wez wez wez askedz friend wez soz wez,.z `z sz all goodz high platforms the stupidz bilingual shortsz are not .z .z youz an contractorz & what .z seenz goodz .z .z .z the .z planet not .z .z `z ll .z .z & wo .z be whatz wifez .z .z .z,leia,leia,leia
10052,"the parking is the same , the outside is the same , the stage is the same , and the concert goers are the same .",the entrancez soz the wez wez the outsidez soz the wez wez the stage soz the wez wez soz the concert goers wez the wez wez,the parking is the .z .z the .z is the .z .z the stagesz is the .z .z .z the performedz goers are the .z .,leia,leia,leia
10069,"at the end of the day i returned to find my bed made , new towels , and all of the garbage still under the beds .",timez the wez of the day wez shez wez wez husbandz bed soz wez whichz towelsz wez soz wez of the wastez wez whichz the bedz wez,.z the .z of the .z .z yearsz .z goodz goodz asleepz .z .z new sheetsz .z .z all of the garbage .z under the roomsz .,leia,leia,leia


## 1-time

In [7]:
# Estimate coverage
data_wds = set(vec.get_feature_names())
embed_wds = set(glove_model.vocab.keys())
print(len(data_wds))
print(len(embed_wds))
coverage = data_wds.intersection(embed_wds)
print(len(coverage))

13657
400000
13287


In [4]:
glove2word2vec(glove_input_file="/usr2/mamille2/11-830_data/hw4/glove.6B.300d.txt", word2vec_output_file="/usr2/mamille2/11-830_data/hw4/gensim_glove_6Bvectors300d.txt")

(400000, 300)