In [1]:
import pandas as pd

In [2]:
def searchItem(line, item):
    """Return True when ``item`` occurs in ``line``, otherwise False.

    This is a thin wrapper around the ``in`` operator, so ``line`` may be a
    string (substring test) or any other container (membership test).
    """
    found = item in line
    return found

In [3]:
def getDocLen(filename):
    """Count the lines in ``filename``.

    The file is opened in binary mode and iterated line by line, so the
    content is never decoded.
    """
    with open(filename, 'rb') as handle:
        return sum(1 for _ in handle)

In [11]:
from nltk.tokenize import sent_tokenize, word_tokenize

def joinTerm(list_to_join):
    """Glue a sequence of tokens back together into a single term string.

    Rules:
      * The first token is kept exactly as given (case preserved, no filtering).
      * A later token that starts with an apostrophe (e.g. ``'s``) is
        lower-cased and attached directly to the preceding text.
      * A later token that is purely alphabetic or purely numeric is
        lower-cased and attached after a single space.
      * Any other later token (punctuation, hyphenated words, empty strings,
        ...) is dropped.

    Args:
        list_to_join: Any iterable of string tokens (list, tuple, generator).
            It is not modified.

    Returns:
        The joined term, or ``''`` when no tokens are supplied.
    """
    # list() accepts any iterable; the original `.copy()` only worked on
    # objects that happen to expose a copy() method (i.e. not tuples).
    components = list(list_to_join)
    if not components:
        return ''

    pieces = [components[0]]
    for component in components[1:]:
        if component.startswith("'"):
            # Clitic such as "'s": no separating space.
            pieces.append(component.lower())
        elif component.isalpha() or component.isnumeric():
            pieces.append(' ' + component.lower())

    return ''.join(pieces)


def getTerm(line):
    """Extract the leading term from one dictionary line.

    The line is tokenized with ``word_tokenize``; the term is everything
    before the first ``(`` token, or before the first ``,`` token if that
    comes earlier. The selected tokens are re-joined with ``joinTerm``.

    Args:
        line: One raw line of text.

    Returns:
        The extracted term, or ``''`` when the tokenized line contains no
        ``(`` token.
    """
    line_list = word_tokenize(line)

    # Without an opening parenthesis token there is nothing to extract.
    if '(' not in line_list:
        return ''
    term_end_position = line_list.index('(')

    # Check the *tokens* for a comma, not the raw text: a comma can be present
    # in the line without being a standalone token (e.g. when the tokenizer
    # keeps it inside a number such as "1,000"). Testing the raw line and then
    # calling .index(',') on the tokens raised ValueError in that case and
    # silently discarded an otherwise valid term.
    if ',' in line_list:
        term_end_position = min(term_end_position, line_list.index(','))

    return joinTerm(line_list[:term_end_position])

In [5]:
# Quick sanity check of joinTerm on a hand-written token list.
print(joinTerm(['ada', "s", "Pies"]))

ada s pies


In [6]:
# Path of the raw dictionary text file and its total number of lines.
filename = 'meddict_raw.txt'
doclen = getDocLen(filename)
print(doclen)

226870


In [9]:
# Build term list!
def buildList(filename):
    """Collect the term from every line of ``filename`` that contains '('.

    Each qualifying line is passed to ``getTerm``; whatever it returns
    (including an empty string) is appended to the result.

    The file is iterated directly until EOF instead of reading a fixed
    number of lines taken from a global counter. That counter was computed
    in a different cell (and in binary mode), so a stale or mismatched value
    could silently truncate the result.

    Args:
        filename: Path of the text file to scan.

    Returns:
        A list of extracted terms, in file order.
    """
    terms = []
    with open(filename, 'r') as f:
        for line in f:
            if searchItem(line=line, item='('):
                terms.append(getTerm(line))
    return terms

In [13]:
# Extract the raw (not yet de-duplicated) list of terms from the dictionary file.
terms = buildList(filename)

In [15]:
# Spot-check a slice of 20 extracted terms, plus the total number of terms.
print(terms[400:420], len(terms))

['acrasia', 'acratia', 'acraturesis', 'Acree-Rosenheim test', 'acremoniosis', 'aciidine', 'acriflavine', 'acrisia', 'acritochromacy', 'acroaesthesia', 'acroanesthesia', 'acroarthritis', 'acroasphyzia', 'acroatazia', 'acrobystiolith', 'acrobystitis', 'acrocarpous', 'acrochordon', 'acrocinesia', 'acrocyanosis'] 24736


In [27]:
# Not bad.  24,000 terms should be a big enough sample.
# Next: calculate lexical difficulty / word complexity and get everything into a dataframe.

# De-duplicate first (order of first appearance is kept), then wrap the unique
# values in a fresh single-column frame.
unique_terms = pd.DataFrame(terms, columns=['terms'])['terms'].unique()
df_terms = pd.DataFrame(unique_terms, columns=['terms'])
print('Unique terms: ', df_terms.count())
df_terms.head()

Unique terms:  terms    21056
dtype: int64


Unnamed: 0,terms
0,Aachen
1,abaca
2,abactio
3,Abadie's sign
4,abaissement


In [30]:
# A robust method is proposed here: https://www.aclweb.org/anthology/D18-1410 - they had people rank 15000 words
# According to some researchers at Stanford, the length of words reflects their complexity https://langcog.stanford.edu/papers_new/lewis-2015-underrev.pdf
# We could also try to detect whether a word is Latin- or Greek-derived

# Use the character count of each term as a first, simple complexity measure.
df_terms['word_len'] = df_terms['terms'].str.len()
# Show the shortest entries first.
df_terms.sort_values(by='word_len').head()

Unnamed: 0,terms,word_len
18,,0
12081,m,1
1647,H,1
7258,f,1
17562,5,1


In [35]:
# The shortest entries are nonsense, so keep only terms of 4 or more characters for now.
df_terms = df_terms.loc[df_terms['word_len'].gt(3)]
df_terms.sort_values('word_len').head(100)

Unnamed: 0,terms,word_len
2099,balm,4
10911,peel,4
10915,pint,4
8545,heal,4
8532,Hata,4
10922,side,4
15395,tive,4
18857,taon,4
5909,ysis,4
10937,1000,4


In [55]:
# We're getting somewhere.  There are some terms that are useless, though.
# Some patterns to find and drop:
#   - number+anything can go
#   - "of number" is likely all useless
#   - acronyms of the form "letter.letter"
#   - anything with a special character
#   - "letter+letter "
# And a few more.

# Diving into some regex.
# Raw strings keep escapes such as \. \W \w \d literal for the regex engine;
# in plain string literals they are invalid escape sequences and trigger a
# warning on modern Python. The pattern text itself is unchanged.
patterns = (r'[0-9]+.',
            r'[0-9]+-.',
            r'of .',
            r'.\..',
            r'gr .',
            r'\W.+',
            r'\w+-',
            r'\w+\W',
            r'\w+-\d+',
           )
clean_d = df_terms.copy()

for pattern in patterns:
    # Flag terms that match this pattern at their start; the flag column is
    # named after the pattern itself.
    clean_d[pattern] = clean_d['terms'].str.match(pat=pattern)
    # Keep only the non-matching rows. The explicit .copy() makes the result an
    # independent frame, so the column assignment on the next iteration does
    # not raise SettingWithCopyWarning.
    clean_d = clean_d[clean_d[pattern] == False].copy()

clean_d.head()

Unnamed: 0,terms,word_len,[0-9]+.,[0-9]+-.,of .,.\..,gr .,\W.+,\w+-,\w+\W,\w+-\d+
0,Aachen,6,False,False,False,False,False,False,False,False,False
1,abaca,5,False,False,False,False,False,False,False,False,False
2,abactio,7,False,False,False,False,False,False,False,False,False
4,abaissement,11,False,False,False,False,False,False,False,False,False
5,abalienated,11,False,False,False,False,False,False,False,False,False


In [56]:
# Inspect the shortest remaining terms.
clean_d.sort_values(by='word_len').head(100)

Unnamed: 0,terms,word_len,[0-9]+.,[0-9]+-.,of .,.\..,gr .,\W.+,\w+-,\w+\W,\w+-\d+
12708,Tine,4,False,False,False,False,False,False,False,False,False
19030,icie,4,False,False,False,False,False,False,False,False,False
19036,ulum,4,False,False,False,False,False,False,False,False,False
16058,flea,4,False,False,False,False,False,False,False,False,False
5221,year,4,False,False,False,False,False,False,False,False,False
19081,aism,4,False,False,False,False,False,False,False,False,False
5277,derm,4,False,False,False,False,False,False,False,False,False
3277,caul,4,False,False,False,False,False,False,False,False,False
2338,tion,4,False,False,False,False,False,False,False,False,False
3273,ture,4,False,False,False,False,False,False,False,False,False


In [57]:
# Looks a lot better.  Let's ditch the regex columns and save the dictionary out to file.
clean_d = clean_d[['terms', 'word_len']]
# index=False is the documented way to omit the row index; the previous
# index=None only worked because None happens to be falsy.
clean_d.to_csv('meddict_clean.csv', header=True, index=False)