In [1]:
import csv
import itertools
import json
import os
import pickle
import time
from collections import defaultdict
from itertools import product
from itertools import permutations

import networkx as nx
import nltk
from nltk.corpus import wordnet as wn
from nltk.corpus import words
from nltk.stem import WordNetLemmatizer
from nltk.stem.lancaster import LancasterStemmer
from nltk.util import ngrams

# Lexicon

In [2]:
# No English lexicon

# POS tagger

In [3]:
def pos_tagger(tokens):
    return nltk.pos_tag(tokens, tagset='universal')

# Techniques

In [4]:
def is_english_techniques_file(filename):
    return filename.startswith('en_') and filename.endswith('_techniques.txt')

In [5]:
def add_node(g, n):
    if not n in g:
        g.add_node(n, count=0)

In [6]:
def add_edge(g, n1, n2):
    if n1 != n2 and not nx.has_path(g, n1, n2):
        g.add_edge(n1, n2)

In [7]:
graph_syn = nx.Graph()
techniques_root = 'data/techniques/'
for e in os.listdir(techniques_root):
    file_path = techniques_root + e
    if os.path.isfile(file_path):
        if is_english_techniques_file(e):
            with open(file_path) as f:
                for line in f:
                    syn_set = set()
                    techs1 = line.strip()
                    for tech in techs1.split(' or '):
                        syn_set.add(tech)
                        add_node(graph_syn, tech)
                    syn_set = list(syn_set)
                    i1 = syn_set[0]
                    for i2 in syn_set[1:]:
                        add_edge(graph_syn, i1, i2)

In [8]:
len(graph_syn)

404

In [9]:
graph_syn.number_of_edges()

120

In [10]:
nx.number_connected_components(graph_syn)

284

In [11]:
nx.write_gexf(graph_syn, 'data/english_techniques_lexicon_1.gexf')

In [12]:
graph_syn = nx.read_gexf('data/english_techniques_lexicon_1.gexf')

# POS tagging

In [13]:
def technique_tagger(x):
    result = []
    tokens = nltk.word_tokenize(x)
    tags = pos_tagger(tokens)
    for token, tag in tags:
        result.append((token, tag.lower()))
    if len(result) == 1:
        tag = result[0][1]
        if tag not in ['noun', 'verb']:
            tag = 'noun'
        result = [(x, tag)]
    return result

In [14]:
%%time

with open('data/english_techniques_postags.csv', 'w') as f:
    writer = csv.writer(
        f,
        delimiter=',',
        quotechar='"',
        quoting=csv.QUOTE_MINIMAL
    )
    for tech in graph_syn.nodes_iter():
        pos_tag = ' '.join(tag for token, tag in technique_tagger(tech))
        row = [tech, pos_tag]
        writer.writerow(row)

CPU times: user 352 ms, sys: 20 ms, total: 372 ms
Wall time: 369 ms


In [15]:
postags = {}
with open('data/english_techniques_postags.csv') as f:
    reader = csv.reader(
        f,
        delimiter=',',
    )
    for row in reader:
        postags[row[0]] = row[1]

In [16]:
def get_postags(x):
    try:
        tags = postags[x]
    except:
        postags[x] = ' '.join(tag for token, tag in technique_tagger(x))
        tags = postags[x]
    return list(zip(nltk.word_tokenize(x),nltk.word_tokenize(tags)))

# Example
get_postags('deep fry')

[('deep', 'adj'), ('fry', 'noun')]

# wordnet synonyms

In [17]:
nouns_and_verbs = set()
for tech in graph_syn.nodes_iter():
    tags = get_postags(tech)
    for token, tag in tags:
        if tag in ['noun', 'verb']:
            nouns_and_verbs.add(token)

In [18]:
len(nouns_and_verbs)

368

In [19]:
%%time

with open('data/wordnet_techniques_synonyms.csv', 'w') as f:
    writer = csv.writer(
        f,
        delimiter=',',
        quotechar='"',
        quoting=csv.QUOTE_MINIMAL
    )
    for x in nouns_and_verbs:
        if x in graph_syn:
            syns = []
            for ss in wn.synsets(x):
                if ss.name().startswith(x) and ss.pos() in ['n', 'v']:
                    syns.extend(map(lambda y: y.replace('_', ' ').lower(), ss.lemma_names()))
            if syns:
                row = [x] + syns
                writer.writerow(row)

CPU times: user 2.51 s, sys: 64 ms, total: 2.58 s
Wall time: 2.57 s


In [20]:
wordnet_syns = {}
with open('data/wordnet_techniques_synonyms.csv') as f:
    reader = csv.reader(
        f,
        delimiter=',',
    )
    for row in reader:
        wordnet_syns[row[0]] = row[1:]

In [21]:
len(wordnet_syns)

126

In [22]:
wordnet_graph = nx.Graph()
for k in wordnet_syns:
    syns = wordnet_syns[k]
    for syn in syns:
        wordnet_graph.add_edge(k, syn)

In [23]:
nx.number_connected_components(wordnet_graph)

117

In [24]:
def is_kn_complete(g):
    complete = True
    for n1 in g:
        for n2 in g:
            if n1 != n2 and not g.has_edge(n1, n2):
                complete = False
                break
        if not complete:
            break
    return complete

In [25]:
kn_complete_graphs = []
for subg in nx.connected_component_subgraphs(wordnet_graph):
    if is_kn_complete(subg):
        kn_complete_graphs.append(subg)

In [26]:
len(kn_complete_graphs)

62

In [27]:
syns_found = 0
for g in kn_complete_graphs:
    syn_set = g.nodes()
    i1 = syn_set[0]
    add_node(graph_syn, i1)
    for i2 in syn_set[1:]:
        add_node(graph_syn, i2)
        add_edge(graph_syn, i1, i2)
        syns_found += 1
syns_found

20

In [28]:
len(graph_syn)

424

In [29]:
graph_syn.number_of_edges()

140

In [30]:
nx.number_connected_components(graph_syn)

284

In [31]:
nx.write_gexf(graph_syn, 'data/english_techniques_lexicon_2.gexf')

In [32]:
graph_syn = nx.read_gexf('data/english_techniques_lexicon_2.gexf')

# Infinitive, gerund, participle, and noun

In [33]:
stemmer = LancasterStemmer()
lemmatizer = WordNetLemmatizer()

In [34]:
def is_infinitivable_word(word_tag):
    tag = word_tag[1]
    return tag in ['noun', 'verb']

def is_infinitivable_technique(technique):
    return any(map(is_infinitivable_word, get_postags(technique)))

def naive_infinitive_noun(word):
    return word

def infinitive_verb(word):
    return lemmatizer.lemmatize(word, pos='v')

def infinitive_word(word_tag):
    word = word_tag[0]
    tag = word_tag[1]
    inf = word
    if word.isalpha():
        if tag == 'noun':
            inf = naive_infinitive_noun(word)
        elif tag == 'verb':
            inf = infinitive_verb(word)
    return inf

def infinitive_technique(technique):
    infs = map(infinitive_word, get_postags(technique))
    return ' '.join(infs)

# Example
infinitive_technique('deep fried')

'deep fry'

In [35]:
def is_gerundable_word(word_tag):
    tag = word_tag[1]
    return tag in ['noun', 'verb']

def is_gerundable_technique(technique):
    return any(map(is_gerundable_word, get_postags(technique)))

def naive_gerund_noun(word):
    lemma = lemmatizer.lemmatize(word)
    ger = lemma + 'ing'
    if ger not in words.words():
        ger = word
    return ger

def gerund_verb(word):
    if word.endswith('ing'):
        return word
    
    def count_vowels(s):
        c = 0
        for v in 'aeiou':
            c += s.count(v)
        return c
    
    def is_vowel(x):
        return x in 'aeiou'
    
    def is_consonant(x):
        return not is_vowel(x)
    
    lemma = lemmatizer.lemmatize(word, pos='v')
    ger = lemma + 'ing'
    if lemma.endswith('ie'): # lie -> lying
        ger = lemma[:-2] + 'ying'
    elif lemma.endswith('e') and lemma != 'be': # make -> making
        ger = lemma[:-1] + 'ing'
    elif len(lemma) >= 3 and is_consonant(word[-3]) and is_vowel(word[-2]) and is_consonant(word[-1]):
        if word[-1] not in 'wxy':
            if count_vowels(word) == 1: # sit -> sitting
                ger = lemma + lemma[-1] + 'ing'
    return ger

def gerund_word(word_tag):
    word = word_tag[0]
    tag = word_tag[1]
    ger = word
    if word.isalpha():
        if tag == 'noun':
            ger = naive_gerund_noun(word)
        elif tag == 'verb':
            ger = gerund_verb(word)
    return ger

def gerund_technique(technique):
    gers = map(gerund_word, get_postags(technique))
    return ' '.join(gers)

# Example
gerund_technique('deep fried')

'deep frying'

In [36]:
def is_participlable_word(word_tag):
    tag = word_tag[1]
    return tag in ['noun', 'verb']

def is_participlable_technique(technique):
    return any(map(is_participlable_word, get_postags(technique)))

def naive_participle_noun(word):
    lemma = lemmatizer.lemmatize(word)
    par = lemma + 'ed'
    if par not in words.words():
        par = word
    return par

def participle_verb(word):
    if word.endswith('ed'):
        return word
    
    def count_vowels(s):
        c = 0
        for v in 'aeiou':
            c += s.count(v)
        return c
    
    def is_vowel(x):
        return x in 'aeiou'
    
    def is_consonant(x):
        return not is_vowel(x)
    
    lemma = lemmatizer.lemmatize(word, pos='v')
    par = lemma + 'ed'
    if lemma.endswith('e'): # live -> lived
        par = lemma + 'd'
    elif len(lemma) >= 3 and is_consonant(word[-3]) and is_vowel(word[-2]) and is_consonant(word[-1]):
        if word[-1] not in 'wxy':
            if count_vowels(word) == 1: # stop -> stopped
                par = lemma + lemma[-1] + 'ed'
    if par not in words.words():
        par = word
    return par

def participle_word(word_tag):
    word = word_tag[0]
    tag = word_tag[1]
    par = word
    if word.isalpha():
        if tag == 'noun':
            par = naive_participle_noun(word)
        elif tag == 'verb':
            par = participle_verb(word)
    return par

def participle_technique(technique):
    pars = map(participle_word, get_postags(technique))
    return ' '.join(pars)

# Example
participle_technique('deep fry')

'deep fry'

In [38]:
%%time

for tech in graph_syn.nodes():
    if is_infinitivable_technique(tech):
        inf = infinitive_technique(tech)
        add_node(graph_syn, inf)
        add_edge(graph_syn, tech, inf)
    if is_gerundable_technique(tech):
        ger = gerund_technique(tech)
        add_node(graph_syn, ger)
        add_edge(graph_syn, tech, ger)
    if is_participlable_technique(tech):
        par = participle_technique(tech)
        add_node(graph_syn, par)
        add_edge(graph_syn, tech, par)

CPU times: user 36.6 s, sys: 5.32 s, total: 41.9 s
Wall time: 41.9 s


In [39]:
len(graph_syn)

582

In [40]:
graph_syn.number_of_edges()

322

In [41]:
nx.number_connected_components(graph_syn)

260

In [42]:
nx.write_gexf(graph_syn, 'data/english_techniques_lexicon_3.gexf')

In [43]:
graph_syn = nx.read_gexf('data/english_techniques_lexicon_3.gexf')

# Normalization

In [44]:
# Numbers
def numbers(x):
    return x.replace(' 1 ', ' one ') \
            .replace(' 2 ', ' two ') \
            .replace(' 3 ', ' three ') \
            .replace(' 4 ', ' four ') \
            .replace(' 5 ', ' five ') \
            .replace(' 6 ', ' six ') \
            .replace(' 7 ', ' seven ') \
            .replace(' 8 ', ' eight ') \
            .replace(' 9 ', ' nine ')
    
# Dashes (-)
def dashes1(x):
    return x.replace('-', ' ')

def dashes2(x):
    return x.replace('-', '')

# POS tags
# ADJETIVOS .... A ADJ ...... X
# ADVERBIOS .... R ADV
# DETERMINANTES  D DET
# NOMBRES ...... N NOUN ..... X
# VERBOS ....... V VERB ..... X
# PRONOMBRES ... P PRON
# CONJUNCIONES . C CONJ
# INTERJECCIONES I INTERJ
# PREPOSICIONES  S PREP
# PUNTUACIÓN ... F PUNTUATION
# NUMERALES .... Z NUM ...... X
# FECHAS Y HORAS W DATE-TIME
def pos_tags(x):
    tags = get_postags(x)
    filtered = [token
                for token, tag in tags
                if tag in ['adj', 'noun', 'verb', 'num']
               ]
    return ' '.join(filtered)

def itself(x):
    return x

funcs = [itself, pos_tags, numbers, dashes1, dashes2]
combinations = []
for i in range(1, len(funcs) + 1):
    combinations.append(list(itertools.combinations(funcs, i)))
combinations = [c for comb in combinations for c in comb]

# def normalize(technique): # original time consuming version
#     result = set()
#     for c in combinations:
#         x = technique
#         for f in c:
#             x = f(x)
#         result.add(x)
#     return result

def normalize(technique): # dynamic programming version
    result = set()
    for c in combinations:
        x = technique
        for f in c:
            if not x in d[f.__name__]:
                d[f.__name__][x] = f(x)
            x = d[f.__name__][x]
        result.add(x)
    return result

In [45]:
len([list(map(lambda x: x.__name__, c)) for c in combinations])

31

In [46]:
d = defaultdict(dict)

# or

# with open('data/english_techniques_normalization.pickle', 'rb') as f:
#     d = pickle.load(f)

In [47]:
%%time

for tech in graph_syn.nodes():
    if len(nltk.word_tokenize(tech)) < 4:
        norms = normalize(tech)
        for norm in norms:
            add_node(graph_syn, norm)
            add_edge(graph_syn, tech, norm)

d = dict(d)

CPU times: user 308 ms, sys: 0 ns, total: 308 ms
Wall time: 309 ms


In [48]:
with open('data/english_techniques_normalization.pickle', 'wb') as f:
    pickle.dump(d, f)

In [49]:
len(graph_syn)

655

In [50]:
graph_syn.number_of_edges()

399

In [51]:
nx.number_connected_components(graph_syn)

256

In [52]:
nx.write_gexf(graph_syn, 'data/english_techniques_lexicon_4.gexf')

In [53]:
graph_syn = nx.read_gexf('data/english_techniques_lexicon_4.gexf')

# Synonyms

In [54]:
def my_ngrams(technique):
    ngrms = []
    tokens = nltk.word_tokenize(technique)
    for i in range(1, len(tokens) + 1):
        ngrms.extend(ngrams(tokens, i))
    return list(map(lambda x: ' '.join(x), ngrms))

In [55]:
lengths = defaultdict(int)
for tech in graph_syn.nodes_iter():
    lengths[len(nltk.word_tokenize(tech))] += 1
lengths = dict(lengths)

In [56]:
lengths

{1: 444, 2: 170, 3: 37, 4: 4}

In [57]:
def ngram_combinations(technique):
    combs = []
    ngram_list = my_ngrams(technique)
    for i in range(1, len(ngram_list) + 1):
        combs.extend(permutations(ngram_list, i))
    combs = [list(c) for c in combs if ' '.join(c) == technique]
    return combs

# Example
ngram_combinations('green pepper')

[['green pepper'], ['green', 'pepper']]

In [58]:
def comb_syns(expr, syn_dict):
    res = set()
    combs = ngram_combinations(expr)
    for ngrms in combs:
        syn_list = [syn_dict[ngrm] for ngrm in ngrms]
        syn_comb = list(product(*syn_list))
        for sc in syn_comb:
            res.add(' '.join(sc))
    return list(res)

# Example
expr= 'green pepper'
syn_dict = {
    'green': ['green'],
    'pepper': ['pepper', 'peppers'],
    'green pepper': ['green pepper'],
}
comb_syns(expr, syn_dict)

['green pepper', 'green peppers']

In [59]:
def create_syn_dict(ngrms):
    d = {}
    for ngrm in ngrms:
        d[ngrm] = set([ngrm])
    return d

create_syn_dict(my_ngrams('green pepper'))

{'green': {'green'}, 'green pepper': {'green pepper'}, 'pepper': {'pepper'}}

In [60]:
def max_size_synset(synset):
    return len(nltk.word_tokenize(sorted(synset, key=lambda x: len(nltk.word_tokenize(x)), reverse=True)[0]))

In [61]:
%%time

for syns1 in list(nx.connected_components(graph_syn)):
    max_size = max_size_synset(syns1)
    if max_size < 4:
        for tech in syns1:
            ngrms = my_ngrams(tech)
            syn_dict = create_syn_dict(ngrms)
            for ngrm in ngrms:
                if ngrm in graph_syn and ngrm not in syns1:
                    syns2 = nx.node_connected_component(graph_syn, ngrm)
                    syn_dict[ngrm] = syn_dict[ngrm].union(syns2)
            syn_combs = comb_syns(tech, syn_dict)
            for syn_tech in syn_combs:
                add_node(graph_syn, syn_tech)
                add_edge(graph_syn, tech, syn_tech)

CPU times: user 316 ms, sys: 4 ms, total: 320 ms
Wall time: 318 ms


In [62]:
len(graph_syn)

1416

In [63]:
graph_syn.number_of_edges()

1160

In [64]:
nx.number_connected_components(graph_syn)

256

In [65]:
nx.write_gexf(graph_syn, 'data/english_techniques_lexicon_5.gexf')