In [2]:
import pandas as pd
import pickle

In [3]:
words_df = pd.read_csv('../../norsk_ordvev_nob/dat/words.tab',sep='\t', header=None)
words_df.columns = ['word_id', 'word', 'pos']

In [4]:
words_df = words_df.drop_duplicates()

In [4]:
words_df[words_df['word_id'] == 12698]

Unnamed: 0,word_id,word,pos
29583,12698,inventar,Noun


In [5]:
words_df.shape

(189525, 3)

In [5]:
synset_df = pd.read_csv('../../norsk_ordvev_nob/dat/synsets.tab',sep='\t', header=None)
synset_df.columns = ['synset_id', 'ontological']

In [6]:
synset_df = synset_df.drop_duplicates()

In [7]:
synset_df

Unnamed: 0,synset_id,ontological
0,1,Underspecified
1,10,Artifact+Object+Group
2,100,Human+Object+Group
3,1000,Natural+Group
4,10000,Human+Object+Group
...,...,...
306670,99995,LanguageRepresentation+Artifact+Object
306671,99996,Human+Object
306672,99997,LanguageRepresentation+Artifact+Object
306673,99998,Human+Object


In [8]:
wordsenses_df = pd.read_csv('../../norsk_ordvev_nob/dat/wordsenses.tab',sep='\t', header=None)
wordsenses_df.columns = ['wordsense_id', 'word_id', 'synset_id', 'register']

In [9]:
wordsenses_df = wordsenses_df[['word_id','synset_id']].drop_duplicates()

In [10]:
wordsenses_df[wordsenses_df['word_id'] == 58346]['synset_id'].values

array([65865, 65866], dtype=int64)

In [11]:
relations_df = pd.read_csv('../../norsk_ordvev_nob/dat/relations.tab',sep='\t', header=None)
relations_df.columns = ['synset_id', 'name', 'name2', 'target_synset_id', 'taxonomic', 'inheritance']

In [12]:
relations_df['name2'].unique()

array(['domain', 'has_hyperonym', 'has_mero_member', 'is_instance_of',
       'concerns', 'near_synonym', 'eq_has_synonym', 'made_by',
       'has_mero_part', 'has_holo_part', 'used_for', 'has_holo_member',
       'role_agent', 'used_for_object', 'has_holo_location',
       'involved_agent', 'has_mero_madeof', 'has_mero_location',
       'xpos_near_synonym', 'near_antonym', 'involved_instrument',
       'role_patient', 'involved_patient', 'has_hypernym',
       'used_for_qualby', 'has_holo_madeof', 'usedFor', 'has_holonym'],
      dtype=object)

In [13]:
realtion_types = ['domain', 'has_hyperonym', 'is_instance_of', 'near_synonym', 'xpos_near_synonym']

In [14]:
relations_df = relations_df[relations_df['name2'].isin(realtion_types)]

In [15]:
relations_df = relations_df[['synset_id','target_synset_id']].drop_duplicates()

In [16]:
relations_df[relations_df['synset_id'] == 15991]

Unnamed: 0,synset_id,target_synset_id
72809,15991,28777
72810,15991,6660


In [17]:
synset_mapping = {}

for index, row in relations_df[['synset_id', 'target_synset_id']].iterrows():
    if row['synset_id'] not in synset_mapping:
        synset_mapping[row['synset_id']] = []
    synset_mapping[row['synset_id']].append(row['target_synset_id'])

In [None]:
synset_mapping[45004]

In [None]:
synset_word_mapping = {}

for index, row in wordsenses_df[['word_id', 'synset_id']].iterrows():
    if row['synset_id'] not in synset_word_mapping:
        synset_word_mapping[row['synset_id']] = []
    synset_word_mapping[row['synset_id']].append(row['word_id'])

In [None]:
synset_word_mapping[134]

In [None]:
synset_word_literal_mapping = {}

for synset_id, word_ids in synset_word_mapping.items():
    synset_word_literal_mapping[synset_id] = []
    for word_id in word_ids:
        try:
            synset_word_literal_mapping[synset_id].append(words_df[words_df['word_id'] == word_id]['word'].values[0])
        except:
            continue

In [None]:
synset_word_literal_mapping[1311]

In [None]:
word_synonym_mapping = {}

for index, row in words_df.iterrows():
    word = str(row['word']).lower()
    if word not in word_synonym_mapping:
        word_synonym_mapping[word] = []
    synset_ids = wordsenses_df[wordsenses_df['word_id'] == row['word_id']]['synset_id'].values
    for synset_id in synset_ids:
        word_synonym_mapping[word] += [str(x).lower() for x in synset_word_literal_mapping[synset_id]]
    word_synonym_mapping[word] = list(set(word_synonym_mapping[word]))

word_synonym_mapping

In [None]:
word_synonym_mapping_filtered = {}

for word in word_synonym_mapping:
    if len(word_synonym_mapping[word]) > 1:
        word_synonym_mapping_filtered[word] = word_synonym_mapping[word]

word_synonym_mapping_filtered

In [None]:
for word in word_synonym_mapping_filtered['adferd']:
    print(word_synonym_mapping_filtered[word])

In [None]:
with open('../../data/synset_word_literal_mapping.pkl', 'wb') as f:
    pickle.dump(synset_word_literal_mapping, f, pickle.HIGHEST_PROTOCOL)

In [22]:
with open('../../data/synset_word_literal_mapping.pkl', 'rb') as f:
    synset_word_literal_mapping = pickle.load(f)

In [None]:
ontologies = {}
for index, row in synset_df.iterrows():
    for ontology in row['ontological'].replace('(','').replace(')','').split('+'):
        if ontology not in ontologies:
            ontologies[ontology] = [False for _ in range(len(words_df))]
            
ontologies.keys()

In [None]:
for o in ontologies:
    ontologies[o] = 0
ontologies

In [None]:
with open('../../data/ontologies.pkl', 'wb') as f:
    pickle.dump(ontologies, f, pickle.HIGHEST_PROTOCOL)

In [23]:
with open('../../data/ontologies.pkl', 'rb') as f:
    ontologies = pickle.load(f)

In [24]:
words_pos_ontologies_df = pd.DataFrame.from_dict(ontologies)

In [25]:
words_pos_ontologies_df['word_id'] = words_df['word_id']
words_pos_ontologies_df['word'] = words_df['word']
words_pos_ontologies_df['pos'] = words_df['pos']

In [None]:
for index, row in synset_df.iterrows():
    try:
        for word in synset_word_literal_mapping[row['synset_id']]:
            for ontology in row['ontological'].replace('(','').replace(')','').split('+'):
                words_pos_ontologies_df[words_pos_ontologies_df['word'] == word][ontology] = True
    except:
        continue

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [None]:
words_pos_ontologies_df[words_pos_ontologies_df['word'] == 'fremtreden']['Object']

In [7]:
labeled_data = pd.read_pickle('../../data/labeled_data.pkl')

In [8]:
word_pos_mapping = {}

for index, row in words_df.iterrows():
    word_pos_mapping[str(row['word']).lower()] = row['pos']
    
#word_pos_mapping

{'druesaft': 'Noun',
 'sabotasjegruppe': 'Noun',
 'operascene': 'Noun',
 'navnesøster': 'Noun',
 'gederø': 'Noun',
 'gebauer': 'Noun',
 'gawrys': 'Noun',
 'gavrilova': 'Noun',
 'gavey': 'Noun',
 'gautvedt': 'Noun',
 'gast': 'Noun',
 'gasior': 'Noun',
 'garåsen': 'Noun',
 'garrod': 'Noun',
 'navneskilt': 'Noun',
 'garpe': 'Noun',
 'gardøl': 'Noun',
 'garcha': 'Noun',
 'ganji': 'Noun',
 'gangvik': 'Noun',
 'gangmark': 'Noun',
 'gange': 'Noun',
 'galtrud': 'Noun',
 'galligani': 'Noun',
 'gallegos': 'Noun',
 'navneendring': 'Noun',
 'gallala': 'Noun',
 'gajhede': 'Noun',
 'gajek': 'Noun',
 'gai': 'Noun',
 'gagnon': 'Noun',
 'gadeholt': 'Noun',
 'gad': 'Noun',
 'gabow': 'Noun',
 'gaasrud': 'Noun',
 'føsund': 'Noun',
 'navneplate': 'Noun',
 'føli': 'Noun',
 'færstaul': 'Noun',
 'fæmundshytten': 'Noun',
 'fålun': 'Noun',
 'furrebøe': 'Noun',
 'fureid': 'Noun',
 'funke': 'Verb',
 'frøysaker': 'Noun',
 'frøvig': 'Noun',
 'frøstad': 'Noun',
 'substantiv': 'Noun',
 'frydnes': 'Noun',
 'froyn': 'N

In [9]:
set([word_pos_mapping[x] for x in word_pos_mapping])

{'Adjective', 'None', 'Noun', 'Verb'}

In [None]:
word_ontological_mapping = {}

for index, row in words_df.iterrows():
    lowered = str(row['word']).lower()
    word_ontological_mapping[lowered] = []
    
    synset_ids = wordsenses_df[wordsenses_df['word_id'] == row['word_id']]['synset_id'].values
    for synset_id in synset_ids:
        try:
            word_ontological_mapping[lowered].append(synset_df[synset_df['synset_id'] == synset_id]['ontological'].values[0])
        except:
            continue
word_ontological_mapping

In [None]:
word_ontological_mapping_cleaned = {}
for key, value in word_ontological_mapping.items():
    temp = []
    for o in value:
        temp += o.replace('(','').replace(')','').split('+')
    word_ontological_mapping_cleaned[key] = list(set(temp))

word_ontological_mapping_cleaned

In [None]:
with open('../../data/word_ontological_mapping.pkl', 'wb') as f:
    pickle.dump(word_ontological_mapping_cleaned, f, pickle.HIGHEST_PROTOCOL)

In [10]:
with open('../../data/word_ontological_mapping.pkl', 'rb') as f:
    word_ontological_mapping_cleaned = pickle.load(f)

In [11]:
interesting_ontologies = ['GeopoliticalPlace',
                          'Group',
                          'BoundedEvent', 
                          'Communication', 
                          'Social', 
                          '3rdOrderEntity', 
                          'Occupation', 
                          'MoneyRepresentation', 
                          'Purpose', 
                          'Vehicle']

In [12]:
counts = {
    'Adjective': [],
    'Verb': [],
    'Noun': [],
    'Underspecified': [],
    'Artifact': [],
    'Object': [],
    'Group': [],
    'Human': [],
    'Natural': [],
    'LanguageRepresentation': [],
    'Living': [],
    'GeopoliticalPlace': [],
    'BodyPart': [],
    'Instrument': [],
    'Place': [],
    '3rdOrderEntity': [],
    'Mental': [],
    'Purpose': [],
    'Social': [],
    'Institution': [],
    'Plant': [],
    'Imagerepresentation': [],
    'Creature': [],
    'Animal': [],
    'Comestible': [],
    'Quantity': [],
    'Building': [],
    'Substance': [],
    'Part': [],
    'Property': [],
    'BoundedEvent': [],
    'Agentive': [],
    'Communication': [],
    'Garment': [],
    'Furniture': [],
    'Vehicle': [],
    '1stOrderEntity': [],
    'Covering': [],
    'Liquid': [],
    'Time': [],
    'UnboundedEvent': [],
    'Physical': [],
    'Dynamic': [],
    'Domain': [],
    'Existence': [],
    'Location': [],
    'Manner': [],
    'Container': [],
    'Condition': [],
    'Static': [],
    '2ndOrderEntity': [],
    'Phenomenal': [],
    'MoneyRepresentation': [],
    'Experience': [],
    'Relation': [],
    'Form': [],
    'Representation': [],
    'Stimulating': [],
    'Colour': [],
    'Cause': [],
    'Occupation': [],
    'Possession': [],
    'Artwork': [],
    'Software': [],
    'None': []
}

for index, value in labeled_data['Raw'].items():
    
    count = {
        'Adjective': 0,
        'Verb': 0,
        'Noun': 0,
        'Underspecified': 0,
        'Artifact': 0,
        'Object': 0,
        'Group': 0,
        'Human': 0,
        'Natural': 0,
        'LanguageRepresentation': 0,
        'Living': 0,
        'GeopoliticalPlace': 0,
        'BodyPart': 0,
        'Instrument': 0,
        'Place': 0,
        '3rdOrderEntity': 0,
        'Mental': 0,
        'Purpose': 0,
        'Social': 0,
        'Institution': 0,
        'Plant': 0,
        'Imagerepresentation': 0,
        'Creature': 0,
        'Animal': 0,
        'Comestible': 0,
        'Quantity': 0,
        'Building': 0,
        'Substance': 0,
        'Part': 0,
        'Property': 0,
        'BoundedEvent': 0,
        'Agentive': 0,
        'Communication': 0,
        'Garment': 0,
        'Furniture': 0,
        'Vehicle': 0,
        '1stOrderEntity': 0,
        'Covering': 0,
        'Liquid': 0,
        'Time': 0,
        'UnboundedEvent': 0,
        'Physical': 0,
        'Dynamic': 0,
        'Domain': 0,
        'Existence': 0,
        'Location': 0,
        'Manner': 0,
        'Container': 0,
        'Condition': 0,
        'Static': 0,
        '2ndOrderEntity': 0,
        'Phenomenal': 0,
        'MoneyRepresentation': 0,
        'Experience': 0,
        'Relation': 0,
        'Form': 0,
        'Representation': 0,
        'Stimulating': 0,
        'Colour': 0,
        'Cause': 0,
        'Occupation': 0,
        'Possession': 0,
        'Artwork': 0,
        'Software': 0,
        'None': 0
    }

    for word in value.lower().split():
        if word in word_pos_mapping:
            count[word_pos_mapping[word]] += 1
        if word in word_ontological_mapping_cleaned:
            for o in word_ontological_mapping_cleaned[word]:
                count[o] += 1
    for key in count:
        counts[key].append(count[key])

counts

{'Adjective': [4,
  7,
  3,
  4,
  9,
  12,
  16,
  2,
  13,
  3,
  17,
  36,
  19,
  4,
  0,
  11,
  32,
  8,
  2,
  6,
  6,
  4,
  2,
  5,
  1,
  7,
  11,
  2,
  1,
  6,
  2,
  4,
  9,
  14,
  4,
  1,
  2,
  1,
  6,
  8,
  5,
  3,
  4,
  5,
  4,
  1,
  15,
  16,
  9,
  0,
  14,
  10,
  0,
  10,
  1,
  23,
  4,
  3,
  18,
  0,
  3,
  53,
  3,
  9,
  29,
  8,
  0,
  2,
  9,
  8,
  49,
  1,
  1,
  17,
  27,
  2,
  0,
  8,
  5,
  3,
  6,
  17,
  13,
  34,
  3,
  17,
  4,
  1,
  1,
  10,
  3,
  11,
  9,
  1,
  15,
  23,
  51,
  28,
  22,
  4,
  3,
  10,
  8,
  2,
  3,
  9,
  8,
  2,
  1,
  11,
  1,
  2,
  26,
  70,
  14,
  1,
  13,
  2,
  11,
  11,
  1,
  3,
  12,
  1,
  2,
  4,
  13,
  2,
  5,
  61,
  7,
  2,
  8,
  3,
  12,
  0,
  3,
  10,
  2,
  0,
  3,
  2,
  13,
  7,
  9,
  7,
  40,
  2,
  2,
  11,
  21,
  16,
  8,
  13,
  7,
  1,
  5,
  0,
  3,
  9,
  6,
  5,
  21,
  33,
  5,
  3,
  6,
  17,
  4,
  3,
  15,
  3,
  0,
  0,
  1,
  2,
  2,
  18,
  7,
  18,
  2,
  4,
  0,
  18,
  1,
  2

In [13]:
for key, value in counts.items():
    labeled_data[key] = value

labeled_data.to_pickle('../../data/labeled_data_pos_ont.pkl')

In [None]:
labeled_data.head(5)['Raw'].values