In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [None]:
df = pd.read_csv('../data/processed/wordgame_20170807.csv')
df['word'] = df['word'].astype(str)
df['association'] = df['association'].astype(str)

# later
with open('../data/processed/sources.csv') as f:
    sources_list = f.read().splitlines()
print(sources_list)
df.head()

In [None]:
print("Dataset shape: " + str(df.shape))
print("Number of sources: " + str(len(df['forumID'].unique())))

### Syntactic features
...

In [None]:
df['tf'] = (100*df.groupby(['word'])['word'].transform('count'))/len(df) #percentage
print(df.word.value_counts().head(7).index.tolist())
print("Mean: "+str(df.tf.mean())+"\tMedian: "+str(df.tf.median()))  

In [None]:
df['pair'] = df.apply(lambda r: str(r.word) + ":" + str(r.association), axis=1)
df['pf'] = (100*df.groupby(['pair'])['pair'].transform('count'))/len(df)
df.pair.value_counts().head(7).index.tolist()

In [None]:
df['len1'] = df['word'].apply(lambda x:len(x))
df['len2'] = df['association'].apply(lambda x:len(x))
df['ldiff'] = df['len1'] - df['len2'] # length difference
print("Mean: "+str(df.len1.mean())+"\tMedian: "+str(df.len1.median()))  
print("Mean: "+str(df.ldiff.mean())+"\tMedian: "+str(df.ldiff.median()))  

In [None]:
import Levenshtein
df['levenshtein'] = df.apply(lambda r:Levenshtein.distance(r.word, r.association), axis=1)
print("Mean: "+str(df.levenshtein.mean())+"\tMedian: "+str(df.levenshtein.median()))

In [None]:
import os
df['prefix'] = df.apply(lambda r: os.path.commonprefix([r.word, r.association]), axis=1)
df['pl']= (100*df['prefix'].apply(lambda x: len(x)))/(0.5*(df['len1']+df['len2']))
df['suffix'] = df.apply(lambda r: os.path.commonprefix([r.word[::-1], r.association[::-1]]), axis=1)
df['suffix'] = df['suffix'].apply(lambda x:x[::-1]) #re-reverse suffix
df['sl']= (100*df['suffix'].apply(lambda x: len(x)))/(0.5*(df.len1+df.len2))

In [None]:
print(df.prefix.value_counts().head(20).index.tolist())
print("Mean: "+str(df.pl.mean())+"\tMedian: "+str(df.pl.median()))
print(df.suffix.value_counts().head(20).index.tolist())
print("Mean: "+str(df.sl.mean())+"\tMedian: "+str(df.sl.median()))

### Semantic features

In [None]:
from gensim.models.keyedvectors import KeyedVectors
w2v_model = KeyedVectors.load_word2vec_format('../data/external/GoogleNews-vectors-negative300.bin', binary=True)
print('Loaded word embeddings')

In [None]:
df['inw2v'] = df.apply(lambda r:((r.word in w2v_model.vocab) & (r.association in w2v_model.vocab)), axis=1)
print("Mean: "+str(df.inw2v.mean()))

In [None]:
df['sim'] = 0
df.ix[df.inw2v, 'sim'] = df.ix[df.inw2v].apply(lambda r:w2v_model.similarity(r.word, r.association), axis=1)
print("Mean: "+str(df.sim.mean())+"\tMedian: "+str(df.sim.median()))
print(df[(df.sim<0.18)&(df.sim>0.17)].pair.head().tolist())

In [26]:
df['wv1'] = df['word'].apply(lambda x: np.zeros(300)) 
df['wv2'] = df['word'].apply(lambda x: np.zeros(300)) 

df.ix[df.inw2v, 'wv1'] = df.ix[df.inw2v, 'word'].apply(lambda x: w2v_model.word_vec(x)) 
df.ix[df.inw2v, 'wv2'] = df.ix[df.inw2v, 'association'].apply(lambda x: w2v_model.word_vec(x)) 

In [35]:
from nltk.corpus import wordnet as wn
dog = wn.synset("dog.n.01")
for h in dog.hypernyms():
    print(h.lemma_names)

<bound method Synset.lemma_names of Synset('canine.n.02')>
<bound method Synset.lemma_names of Synset('domestic_animal.n.01')>
