In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [2]:
# Load the processed word-association table and force both text columns to
# str so that the string operations further down (len, slicing, "+") are safe.
# NOTE(review): astype(str) renders missing values as the literal 'nan' --
# confirm that is the intended treatment for empty cells.
df = pd.read_csv('../data/processed/wordgame_20170807.csv')
for text_col in ('word', 'association'):
    df[text_col] = df[text_col].astype(str)

# Source (forum) abbreviations, one entry per line of the file.
with open('../data/processed/sources.csv') as sources_file:
    sources_list = sources_file.read().splitlines()
print(sources_list)
df.head()

['AC', 'BC', 'CC', 'ECF', 'GOG', 'LEF', 'SAS', 'TF', 'U2', 'WP']


Unnamed: 0,user,forum,word,association,forumID
0,My smile is relief,U2,children,cute,8
1,Habanerose,GOG,journey,adventure,4
2,Judge,AC,listerine,antiseptic,0
3,Kleetus,GOG,whale,hello,4
4,le_chevalier,GOG,no,conspiracy,4


In [3]:
# Sanity check: (rows, columns) of the frame and the number of distinct forum ids.
n_sources = len(df['forumID'].unique())
print("Dataset shape: %s" % (df.shape,))
print("Number of sources: %s" % n_sources)

Dataset shape: (330395, 5)
Number of sources: 10


### Syntactic features
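Surface-level features computed directly from the `word` and `association` strings: term and pair frequencies, string lengths, Levenshtein distance, and shared prefixes/suffixes.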

In [4]:
# Term frequency: percentage of all rows whose `word` equals this row's word.
word_counts = df['word'].value_counts()
df['tf'] = (100 * df['word'].map(word_counts)) / len(df)  # percentage
# Seven most frequent cue words, then summary statistics of tf.
print(word_counts.head(7).index.tolist())
print("Mean: %s\tMedian: %s" % (df['tf'].mean(), df['tf'].median()))

['water', 'music', 'time', 'money', 'love', 'fire', 'food']
Mean: 0.0188537006299	Median: 0.007869368483179226


In [5]:
# Build a "word:association" key for every row.
# Series concatenation is vectorised; the previous row-wise DataFrame.apply
# built one Series object per row just to join two strings. astype(str) keeps
# the result identical to the old str(...) + ":" + str(...) expression.
df['pair'] = df['word'].astype(str) + ":" + df['association'].astype(str)
# pf: percentage of all rows that share this exact pair.
df['pf'] = (100 * df.groupby('pair')['pair'].transform('count')) / len(df)
# Seven most frequent pairs (last expression, so the notebook displays it).
df.pair.value_counts().head(7).index.tolist()

['me:you',
 'man:woman',
 'up:down',
 'time:clock',
 'meow:meow',
 'green:grass',
 'dog:cat']

In [6]:
# Character counts of the cue word (len1) and the association (len2), plus
# their signed difference.
df['len1'] = df['word'].map(len)
df['len2'] = df['association'].map(len)
df['ldiff'] = df['len1'] - df['len2']  # length difference
# Summary statistics: first for len1, then for ldiff.
for length_col in ('len1', 'ldiff'):
    print("Mean: %s\tMedian: %s" % (df[length_col].mean(), df[length_col].median()))

Mean: 6.77352865509	Median: 6.0
Mean: -0.000853523812406	Median: 0.0


In [7]:
import Levenshtein
# Edit distance between each cue word and its association, computed pairwise
# over the two text columns.
df['levenshtein'] = [
    Levenshtein.distance(cue, response)
    for cue, response in zip(df['word'], df['association'])
]
print("Mean: %s\tMedian: %s" % (df['levenshtein'].mean(), df['levenshtein'].median()))

Mean: 6.80292982642	Median: 6.0


In [8]:
import os
def _shared_prefix(first, second):
    """Return the longest leading run of characters common to both strings."""
    # os.path.commonprefix compares character by character, so it works on
    # arbitrary strings, not just paths.
    return os.path.commonprefix([first, second])


# Average length of the two strings in each row; denominator for both shares.
mean_len = 0.5 * (df['len1'] + df['len2'])

# Shared prefix and its length as a percentage of the average string length.
df['prefix'] = [
    _shared_prefix(cue, response)
    for cue, response in zip(df['word'], df['association'])
]
df['pl'] = (100 * df['prefix'].map(len)) / mean_len

# Shared suffix: common prefix of the reversed strings, reversed back.
df['suffix'] = [
    _shared_prefix(cue[::-1], response[::-1])[::-1]
    for cue, response in zip(df['word'], df['association'])
]
df['sl'] = (100 * df['suffix'].map(len)) / mean_len

In [9]:
# For prefixes and then suffixes: the 20 most common values, followed by the
# mean/median of the corresponding percentage column.
for affix_col, share_col in (('prefix', 'pl'), ('suffix', 'sl')):
    print(df[affix_col].value_counts().head(20).index.tolist())
    print("Mean: %s\tMedian: %s" % (df[share_col].mean(), df[share_col].median()))

['', 's', 'c', 'b', 'p', 'm', 'f', 't', 'd', 'a', 'w', 'h', 'r', 'g', 'l', 'e', 're', 'co', 'n', 'i']
Mean: 3.40372050678	Median: 0.0
['', 'e', 's', 't', 'y', 'n', 'ed', 'er', 'ing', 'r', 'd', 'es', 'l', 'a', 'tion', 'k', 'ion', 'on', 'o', 'ation']
Mean: 4.8668958518	Median: 0.0


### Semantic features

In [10]:
from gensim.models.keyedvectors import KeyedVectors
# Pre-trained word2vec vectors stored in the binary word2vec format.
# Loading the full GoogleNews file is slow and memory-hungry, hence the
# confirmation message once it is done.
w2v_path = '../data/external/GoogleNews-vectors-negative300.bin'
w2v_model = KeyedVectors.load_word2vec_format(w2v_path, binary=True)
print('Loaded word embeddings')

Loaded word embeddings


In [11]:
def _has_vector(token):
    """Return True when the loaded embedding model contains `token`.

    Uses the model's own membership test instead of the `.vocab` attribute:
    `.vocab` was removed in gensim 4.0, whereas `in` works on KeyedVectors in
    both older and newer releases.
    """
    return token in w2v_model


# inw2v is True only when BOTH the cue word and the association have a vector.
# Column-wise map + boolean AND replaces the slower row-wise DataFrame.apply.
df['inw2v'] = df['word'].map(_has_vector) & df['association'].map(_has_vector)
# Fraction of rows for which both words are covered by the embeddings.
print("Mean: "+str(df.inw2v.mean()))

Mean: 0.819273899423


In [None]:
# Embedding similarity between cue word and association.
# Rows where either word lacks a vector (inw2v is False) keep the default 0.0.
# - .loc replaces DataFrame.ix, which was deprecated in pandas 0.20 and
#   removed in pandas 1.0.
# - The column starts as float (0.0, not 0) so that writing similarity scores
#   into it does not require an int -> float upcast.
in_vocab = df['inw2v']
df['sim'] = 0.0
df.loc[in_vocab, 'sim'] = [
    w2v_model.similarity(cue, response)
    for cue, response in zip(df.loc[in_vocab, 'word'], df.loc[in_vocab, 'association'])
]
print("Mean: "+str(df.sim.mean())+"\tMedian: "+str(df.sim.median()))
# A few example pairs from a narrow similarity band, for eyeballing.
print(df[(df.sim<0.18)&(df.sim>0.17)].pair.head().tolist())

In [26]:
# Attach the embedding of the cue word (wv1) and of the association (wv2) to
# every row. Rows where inw2v is False keep an all-zero vector in both columns.
# - The vector length comes from the loaded model instead of a hard-coded 300.
# - .loc replaces DataFrame.ix (removed in pandas 1.0).
# - w2v_model[token] is the version-independent way to fetch a single vector
#   (word_vec is deprecated in gensim 4).
vector_dim = w2v_model.vector_size
in_vocab = df['inw2v']

# A fresh zero array per row, so that no two rows share the same object.
df['wv1'] = df['word'].apply(lambda _: np.zeros(vector_dim))
df['wv2'] = df['word'].apply(lambda _: np.zeros(vector_dim))

df.loc[in_vocab, 'wv1'] = df.loc[in_vocab, 'word'].apply(lambda token: w2v_model[token])
df.loc[in_vocab, 'wv2'] = df.loc[in_vocab, 'association'].apply(lambda token: w2v_model[token])

In [24]:
from nltk.corpus import wordnet as wn
# WordNet exploration for a single example word: collect every lemma name
# across all synsets of "light", and the first antonym of each lemma that has one.
# (Earlier experiment, kept as a note: iterating wn.synset("water.n.01").hypernyms()
# and .hyponyms() and printing each .name().)
synonyms = []
antonyms = []

for synset in wn.synsets("light"):
    for lemma in synset.lemmas():
        synonyms.append(lemma.name())
        opposites = lemma.antonyms()
        if opposites:
            antonyms.append(opposites[0].name())

# Sets remove the duplicates that arise when a lemma appears in several synsets.
print(set(synonyms))
print(set(antonyms))

{'luminosity', 'light_up', 'tripping', 'spark', 'Light', 'faint', 'visible_radiation', 'luminance', 'illumine', 'visible_light', 'illuminate', 'brightness', 'lightsome', 'clear', 'sparkle', 'weak', 'get_down', 'promiscuous', 'fire_up', 'twinkle', 'dismount', 'abstemious', 'short', 'fall', 'unaccented', 'ignitor', 'perch', 'easy', 'lightly', 'light', 'wanton', 'idle', 'light-colored', 'Christ_Within', 'lighter', 'lightness', 'Inner_Light', 'clean', 'lighting', 'igniter', 'ignite', 'calorie-free', 'light-headed', 'light_source', 'brightness_level', 'scant', 'swooning', 'sluttish', 'luminousness', 'lite', 'illume', 'Light_Within', 'alight', 'unhorse', 'illumination', 'wakeful', 'unclouded', 'lightheaded', 'low-cal', 'get_off', 'loose'}
{'heavy', 'extinguish', 'dark'}


In [15]:
# Persist the feature table.
# wv1/wv2 hold one numpy array per row; in a CSV they would be written as
# multi-line string reprs that read_csv does not turn back into arrays, so
# they are left out of the export. errors='ignore' keeps this cell runnable
# when the vector columns were never created.
df.drop(['wv1', 'wv2'], axis=1, errors='ignore').to_csv(
    "../data/processed/wordgame_20170807_ft.csv", sep=',', index=False)