In [588]:
import spacy
from spacy.tokenizer import Tokenizer
from spacy.vocab import Vocab
from spacy.language import Language
from spacy.lang.en import English
import os
from os import listdir
from collections import defaultdict
import nltk
from nltk.collocations import *
from nltk.corpus import stopwords
import re
import string
from lxml import etree

In [356]:
# Parse every chunk file of the crawl and collect the resulting sentence spans.
annotated_sentences = []

# NOTE(review): the "en" shortcut name only resolves on spaCy 2.x installs;
# spaCy 3 needs the full package name (e.g. "en_core_web_sm") -- confirm the
# target environment before re-running.
nlp=spacy.load("en")

# os.listdir() returns entries in arbitrary order; sorting keeps the sentence
# order (and everything derived from it) reproducible between runs.
for text in sorted(os.listdir("python-crawl/Chunks/")):
    # skip the macOS Finder metadata file
    if text == ".DS_Store":
        continue

    path = os.path.join("python-crawl/Chunks/", text)

    # the context manager closes the handle even if reading fails
    with open(path, "r") as handle:
        raw_text = handle.read()

    # the text is lower-cased *before* parsing, so every token downstream is lower-case
    doc = nlp(raw_text.lower())

    annotated_sentences.extend(doc.sents)

In [426]:
# flatten the sentence spans into one running list of tokens
annotated_words = []
for sent in annotated_sentences:
    annotated_words.extend(sent)

In [521]:
# coarse POS tags that disqualify a token from the dictionary word list
not_allowed = [
    "PROPN",
    "NUM",
    "PUNCT",
    "SYM",
    "SPACE",
]

In [527]:
# keep purely alphabetic tokens whose POS tag is not on the exclusion list
filter_words = list(
    filter(
        lambda token: token.pos_ not in not_allowed and token.orth_.isalpha(),
        annotated_words,
    )
)

In [528]:
# alphabetical order by surface form; the sort is stable, so equal forms keep corpus order
sorted_words = list(filter_words)
sorted_words.sort(key=lambda tok: tok.orth_)

In [542]:
# drop repeated (surface form, POS) pairs, keeping the first occurrence of each
visited = defaultdict(int)
sorted_Dict_words = []

for word in sorted_words:
    signature = (word.orth_, word.pos_)
    if visited[signature]:
        # this form/POS combination was already recorded
        continue
    visited[signature] = 1
    sorted_Dict_words.append(word)

In [547]:
# Peek at ten consecutive entries (indices 110-119) of the de-duplicated word list.
# The same surface form can appear twice because duplicates are keyed on (form, POS).
sorted_Dict_words[110:120]

[abled,
 abnegation,
 abnormal,
 abnormalities,
 abnormality,
 abnormally,
 aboard,
 aboard,
 abode,
 abolish]

In [578]:
# shared configuration for the collocation search

# POS tags a token needs in order to be considered as part of a collocation
allowed = [
    "NOUN",
    "VERB",
    "ADJ",
    "ADV",
    "X",
    "ADP",
]

# English stop-word list from NLTK.
# NOTE: this rebinds the name `stopwords` that the import cell took from nltk.corpus.
stopwords = nltk.corpus.stopwords.words("english")

# association measures (PMI is the one used for ranking candidate bigrams)
bigram_measures = nltk.collocations.BigramAssocMeasures()

# helper used to detect bigrams that are already present in a result list
def contains_bigram(container, bigram):
    """Tell whether `container` already holds a pair spelled like `bigram`.

    Two pairs match when the `orth_` of their first members are equal and the
    `orth_` of their second members are equal; order within the pair matters.

    container -- iterable of 2-item sequences whose items expose `orth_`
    bigram    -- 2-item sequence whose items expose `orth_`

    Returns True on the first match, False if nothing matches (including an
    empty container).
    """
    return any(
        pair[0].orth_ == bigram[0].orth_ and pair[1].orth_ == bigram[1].orth_
        for pair in container
    )

# every pair of neighbouring tokens in the flattened corpus; because the token
# list is flat, pairs are also formed across sentence boundaries
nltk_bigrams = [pair for pair in nltk.bigrams(annotated_words)]
    

In [549]:
def get_bigrams(tok):
    """Return up to six distinct collocation bigrams for the token `tok`.

    The function scans the module-level `nltk_bigrams` for pairs in which one
    member has the same surface form (`orth_`) as `tok`, keeps the members
    whose POS tag is in `allowed`, feeds them to a `BigramCollocationFinder`
    and ranks the resulting bigrams by PMI.

    tok -- token-like object exposing `orth_` and `pos_`

    Returns a list of 2-tuples of tokens, best-scoring first, with pairs that
    repeat an already returned (orth_, orth_) spelling removed. Returns an
    empty list when `tok.orth_` is a stop word.
    """
    if tok.orth_ in stopwords:
        return []

    # Gather the POS-eligible members of every corpus bigram that mentions this surface form.
    candidates = []
    for left, right in nltk_bigrams:
        if tok.orth_ in (left.orth_, right.orth_):
            if left.pos_ in allowed:
                candidates.append(left)
            if right.pos_ in allowed:
                candidates.append(right)

    bigram_finder = BigramCollocationFinder.from_words(candidates)

    try:
        # An ngram filter removes every bigram for which the callable returns True:
        # first discard bigrams that do not contain the target word ...
        bigram_finder.apply_ngram_filter(
            lambda w1, w2: tok.orth_ not in (w1.orth_, w2.orth_)
        )

        # ... then discard POS patterns that are not whitelisted for this word's POS.
        # NOTE(review): `allowed_phrases` is not defined in the cells shown here;
        # if it is missing, or has no entry for tok.pos_, this filter is skipped.
        bigram_finder.apply_ngram_filter(
            lambda w1, w2: (w1.pos_, w2.pos_) not in allowed_phrases[tok.pos_]
        )
    except Exception:
        # Filtering stays best-effort as before, but a bare `except:` would also
        # swallow KeyboardInterrupt/SystemExit and make the long dictionary loop
        # impossible to interrupt.
        pass

    ranked = bigram_finder.nbest(bigram_measures.pmi, 6)

    # Keep the first occurrence of each (orth_, orth_) spelling, preserving rank order.
    seen_spellings = set()
    unique = []
    for first, second in ranked:
        spelling = (first.orth_, second.orth_)
        if spelling not in seen_spellings:
            seen_spellings.add(spelling)
            unique.append((first, second))

    return unique

In [580]:
# lemma -> POS tag -> record; a record is a flat list laid out as
#   [0]  word form that introduced the lemma/POS pair
#   [1]  sentence in which that word form occurs
#   [2:] ((lemma1, lemma2), example sentence) for each collocation found
dictionary = defaultdict(lambda: defaultdict(list))

count = 0

for count, token in enumerate(sorted_Dict_words, start=1):
    record = dictionary[token.lemma_][token.pos_]

    # only the first word form seen for a lemma/POS pair fills the record
    if not record:
        record.append(token.orth_)
        record.append(token.sent.text.capitalize())
        record.extend(
            ((first.lemma_, second.lemma_), first.sent.text.capitalize())
            for first, second in get_bigrams(token)
        )

    # progress report every 200 processed tokens
    if count % 200 == 0:
        print("Currently done with " + str(count) + " entries.")

    # stop after the first 6000 tokens of the sorted word list
    if count == 6000:
        break
                                                  
        

Currently done with 200 entries.
Currently done with 400 entries.
Currently done with 600 entries.
Currently done with 800 entries.
Currently done with 1000 entries.
Currently done with 1200 entries.
Currently done with 1400 entries.
Currently done with 1600 entries.
Currently done with 1800 entries.
Currently done with 2000 entries.
Currently done with 2200 entries.
Currently done with 2400 entries.
Currently done with 2600 entries.
Currently done with 2800 entries.
Currently done with 3000 entries.
Currently done with 3200 entries.
Currently done with 3400 entries.
Currently done with 3600 entries.
Currently done with 3800 entries.
Currently done with 4000 entries.
Currently done with 4200 entries.
Currently done with 4400 entries.
Currently done with 4600 entries.
Currently done with 4800 entries.
Currently done with 5000 entries.
Currently done with 5200 entries.
Currently done with 5400 entries.
Currently done with 5600 entries.
Currently done with 5800 entries.
Currently done wit

In [581]:
# creating the dictionary
#
# Resulting XML layout:
#   <data>
#     <entry>
#       <form type="lemma"><orth>LEMMA</orth></form>
#       <gramGrp><pos>TAG</pos><cit>EXAMPLE</cit><colloc>COLLOCATIONS</colloc></gramGrp>
#     </entry>
#   </data>

root = etree.Element("data")

for lemma, pos_entries in dictionary.items():
    # A complete record is [word form, example sentence, (collocation, example), ...].
    # Shorter records are placeholders that a bare lookup on the nested
    # defaultdict leaves behind; they used to raise IndexError here.
    populated = [(tag, record) for tag, record in pos_entries.items() if len(record) >= 2]
    if not populated:
        # nothing usable for this lemma -- previously this re-appended a stale
        # gramGrp element, which lxml moves out of the preceding entry
        continue

    # NOTE(review): only the first POS of each lemma is exported, matching the
    # `break` in the earlier version of this loop -- confirm this is intended.
    pos_name, record = populated[0]

    example_sentence = record[1]
    collocations = record[2:]

    # record[0] (the word form) is intentionally not written out
    collocation_text = [cols[0] for cols in collocations]

    # Fall back to the first collocation's example when the sentence is empty
    # (the earlier code referenced an undefined name here and raised NameError).
    if example_sentence == "" and collocations:
        example_sentence = collocations[0][1]

    entry = etree.SubElement(root, "entry")

    form = etree.SubElement(entry, "form", type="lemma")
    orth = etree.SubElement(form, "orth")
    orth.text = lemma

    gramGrp = etree.SubElement(entry, "gramGrp")

    pos_tag = etree.SubElement(gramGrp, "pos")
    pos_tag.text = pos_name

    cit = etree.SubElement(gramGrp, "cit")
    cit.text = example_sentence

    # collocations are stored as the repr of the tuple list without the outer brackets
    colloc = etree.SubElement(gramGrp, "colloc")
    colloc.text = str(collocation_text).strip('[]')

# The context manager flushes and closes the file; UTF-8 is stated explicitly
# because the sentences contain non-ASCII characters and the serialised XML
# carries no encoding declaration.
with open("python-crawl/dict-teilex15-22.xml", "w", encoding="utf-8") as writing_file:
    writing_file.write(etree.tostring(root, encoding=str, pretty_print=True))

1465588

In [551]:
# Number of distinct (word form, POS) pairs left after duplicate removal.
len(sorted_Dict_words)

63830

In [622]:
# Inspect the record stored for the lemma "diagnostic" tagged NOUN.
# `dictionary` is a nested defaultdict, so looking up a missing lemma or POS
# inserts an empty record as a side effect.
# NOTE(review): the saved output starts directly with a collocation tuple,
# unlike the [word form, sentence, ...] layout built by the dictionary loop --
# it appears to come from a different kernel state; re-run to confirm.
dictionary["diagnostic"]["NOUN"]

[(('medical', 'diagnostic'),
  'Some don’t find out until they are older , when they run into trouble trying to have children and , through medical diagnostics , find out that their sex is more complicated than they expected .')]

In [623]:
# Same inspection for the ADJ reading of "diagnostic"; a lookup of a missing
# key on the nested defaultdict creates an empty record.
dictionary["diagnostic"]["ADJ"]

[(('diagnostic', 'standard'),
  'Until 1973 , homosexuality was a psychological disorder justified in the diagnostic and statistical manual of mental disorders ; the current edition , the dsm-5 , still considers transgender people disabled .')]