# Model 1: Attention GET DATA

In [4]:
from importlib import reload  

import pandas as pd
import matplotlib.pyplot as plt
import pickle

import help_functions
import data_processor

## STRUCTURE
1. Add the different word senses to the texts.
2. Build a vocabulary.

## 0. BASIC HELP FUNCTIONS

In [3]:
def save_data_to_file(data):
    print("Given data with head:")
    print(data.head())
    should_save = input("Do you wish to save it? (y/n): ")
    if should_save == "y":
        filename = input("Specify the filename to save to: ")
        data.to_csv(filename, index=False)
        print("Saved data!")

In [2]:
def load_data_from_file():
    should_load = input("Do you wish to load data from a file? (y/n): ")
    if should_load == "y":
        filename = input("Specify the filename to load from: ")
        data = pd.read_csv(filename)
        return data

## 1. Add the different word senses to the texts.

In [8]:
data = load_data_from_file()

In [9]:
filename = "/Users/lovhag/Projects/dl4nlp_assignment_1/a1_data/wsd_train.txt"
data = pd.read_table(filename,header=None,names=['sense_key', 'lemma', 'word_position', 'text'])
#data = data.iloc[0:10]
data.head()

Unnamed: 0,sense_key,lemma,word_position,text
0,keep%2:42:07::,keep.v,15,Action by the Committee In pursuance of its ma...
1,national%3:01:00::,national.a,25,A guard of honour stood in formation in honour...
2,build%2:31:03::,build.v,38,The principle that statistics should be timely...
3,place%1:04:00::,place.n,36,"Again , he appealed for additional support for..."
4,position%1:04:01::,position.n,76,"Also , the IAEA has the lowest number of women..."


In [10]:
processor = data_processor.DataProcessor(data.text.to_list(), data.lemma.to_list(), data.word_position.to_list(), data.sense_key.to_list())
processor.get_data().head()

Unnamed: 0,text,lemma,word_pos,sense_key
0,Action by the Committee In pursuance of its ma...,keep.v,15,keep%2:42:07::
1,A guard of honour stood in formation in honour...,national.a,25,national%3:01:00::
2,The principle that statistics should be timely...,build.v,38,build%2:31:03::
3,"Again , he appealed for additional support for...",place.n,36,place%1:04:00::
4,"Also , the IAEA has the lowest number of women...",position.n,76,position%1:04:01::


In [11]:
processor.fix_period_spaces_and_word_index_in_data()
processor.fix_quotations_and_word_index_in_data()
processor.get_data().head()

Unnamed: 0,text,lemma,word_pos,sense_key
0,Action by the Committee In pursuance of its ma...,keep.v,15,keep%2:42:07::
1,A guard of honour stood in formation in honour...,national.a,25,national%3:01:00::
2,The principle that statistics should be timely...,build.v,38,build%2:31:03::
3,"Again , he appealed for additional support for...",place.n,37,place%1:04:00::
4,"Also , the IAEA has the lowest number of women...",position.n,76,position%1:04:01::


In [12]:
processor.lemmatize_text_in_data()
processor.get_data().head()

Unnamed: 0,text,lemma,word_pos,sense_key,lemmatized_text
0,Action by the Committee In pursuance of its ma...,keep.v,15,keep%2:42:07::,action by the Committee in pursuance of -PRON-...
1,A guard of honour stood in formation in honour...,national.a,25,national%3:01:00::,a guard of honour stand in formation in honour...
2,The principle that statistics should be timely...,build.v,38,build%2:31:03::,the principle that statistic should be timely ...
3,"Again , he appealed for additional support for...",place.n,37,place%1:04:00::,"again , -PRON- appeal for additional support f..."
4,"Also , the IAEA has the lowest number of women...",position.n,76,position%1:04:01::,"also , the IAEA have the low number of woman p..."


In [9]:
def check_lemmas_matching(row, text_col, word_pos_col):
    text_splitted = row[text_col].split(' ')
    lemma_in_text = text_splitted[row[word_pos_col]]
    return lemma_in_text == row.lemma[:-2]

In [10]:
lemmas_matching_col = processor.get_data().apply(lambda x: check_lemmas_matching(x, 'lemmatized_text', 'word_pos'), axis=1)

In [11]:
lemmas_matching_col[lemmas_matching_col == False]

45       False
61       False
93       False
172      False
215      False
         ...  
75902    False
75903    False
75958    False
75967    False
76048    False
Length: 1979, dtype: bool

In [12]:
def look_at_data_example(data, show_index):
    row = data.iloc[show_index]
    print(f"lemma: {row.lemma}")
    print("")
    print(f"original: {row.text}")
    print("")
    print(f"lemmatized: {row.lemmatized_text}")
    print("")
    text_splitted = row.text.split(' ')
    lemmatized_text_splitted = row.lemmatized_text.split(' ')
    print(f"lemma in original: {text_splitted[row.word_pos]}")
    print(f"lemma in lemmatized: {lemmatized_text_splitted[row.word_pos]}")

In [13]:
look_at_data_example(processor.get_data(), 45)

lemma: time.n

original: To begin with , South-North dialogue encompasses a broad field , from political and diplomatic exchanges through trade , poverty eradication , investment , technology , industrialization , capacity-building and financing for development to the empowering of people . A recent cover story in Time magazine featured our esteemed Secretary-General . It called him a dreamer .

lemmatized: to begin with , south-north dialogue encompass a broad field , from political and diplomatic exchange through trade , poverty eradication , investment , technology , industrialization , capacity-building and financing for development to the empowering of people . a recent cover story in Time magazine feature -PRON- esteemed secretary-general . -PRON- call -PRON- a dreamer .

lemma in original: Time
lemma in lemmatized: Time


#### Save lemmatized data

In [15]:
save_data_to_file(processor.get_data())

Given data with head:
                                                text       lemma  word_pos  \
0  Action by the Committee In pursuance of its ma...      keep.v        15   
1  A guard of honour stood in formation in honour...  national.a        25   
2  The principle that statistics should be timely...     build.v        38   
3  Again , he appealed for additional support for...     place.n        37   
4  Also , the IAEA has the lowest number of women...  position.n        76   

            sense_key                                    lemmatized_text  
0      keep%2:42:07::  action by the Committee in pursuance of -PRON-...  
1  national%3:01:00::  a guard of honour stand in formation in honour...  
2     build%2:31:03::  the principle that statistic should be timely ...  
3     place%1:04:00::  again , -PRON- appeal for additional support f...  
4  position%1:04:01::  also , the IAEA have the low number of woman p...  
Saved data!


#### Get sense encoded text

In [14]:
processor.sense_encode_text_in_data('lemmatized_text')
processor.get_data().head()

Unnamed: 0,text,lemma,word_pos,sense_key,lemmatized_text,sensed_lemma,sense_encoded_text
0,Action by the Committee In pursuance of its ma...,keep.v,15,keep%2:42:07::,action by the Committee in pursuance of -PRON-...,keep_1,"[action, by, the, Committee, in, pursuance, of..."
1,A guard of honour stood in formation in honour...,national.a,25,national%3:01:00::,a guard of honour stand in formation in honour...,national_1,"[a, guard, of, honour, stand, in, formation, i..."
2,The principle that statistics should be timely...,build.v,38,build%2:31:03::,the principle that statistic should be timely ...,build_1,"[the, principle, that, statistic, should, be, ..."
3,"Again , he appealed for additional support for...",place.n,37,place%1:04:00::,"again , -PRON- appeal for additional support f...",place_1,"[again, ,, -PRON-, appeal, for, additional, su..."
4,"Also , the IAEA has the lowest number of women...",position.n,76,position%1:04:01::,"also , the IAEA have the low number of woman p...",position_1,"[also, ,, the, IAEA, have, the, low, number, o..."


In [15]:
data = processor.get_data()

Potentially check some of the less frequent lemmas.

In [18]:
data.head()

Unnamed: 0,text,lemma,word_pos,sense_key,lemmatized_text,sensed_lemma,sense_encoded_text
0,Action by the Committee In pursuance of its ma...,keep.v,15,keep%2:42:07::,action by the Committee in pursuance of -PRON-...,keep_1,"[action, by, the, Committee, in, pursuance, of..."
1,A guard of honour stood in formation in honour...,national.a,25,national%3:01:00::,a guard of honour stand in formation in honour...,national_1,"[a, guard, of, honour, stand, in, formation, i..."
2,The principle that statistics should be timely...,build.v,38,build%2:31:03::,the principle that statistic should be timely ...,build_1,"[the, principle, that, statistic, should, be, ..."
3,"Again , he appealed for additional support for...",place.n,37,place%1:04:00::,"again , -PRON- appeal for additional support f...",place_1,"[again, ,, -PRON-, appeal, for, additional, su..."
4,"Also , the IAEA has the lowest number of women...",position.n,76,position%1:04:01::,"also , the IAEA have the low number of woman p...",position_1,"[also, ,, the, IAEA, have, the, low, number, o..."


In [19]:
data.iloc[0].sense_encoded_text

['action',
 'by',
 'the',
 'Committee',
 'in',
 'pursuance',
 'of',
 '-PRON-',
 'mandate',
 ',',
 'the',
 'Committee',
 'will',
 'continue',
 'to',
 'keep_1',
 'under',
 'review',
 'the',
 'situation',
 'relate',
 'to',
 'the',
 'question',
 'of',
 'Palestine',
 'and',
 'participate',
 'in',
 'relevant',
 'meeting',
 'of',
 'the',
 'General',
 'Assembly',
 'and',
 'the',
 'Security',
 'Council',
 '.',
 'the',
 'Committee',
 'will',
 'also',
 'continue',
 'to',
 'monitor',
 'the',
 'situation',
 'on',
 'the',
 'ground',
 'and',
 'draw',
 'the',
 'attention',
 'of',
 'the',
 'international',
 'community',
 'to',
 'urgent',
 'development',
 'in',
 'the',
 'occupied',
 'Palestinian',
 'Territory',
 ',',
 'include',
 'East',
 'Jerusalem',
 ',',
 'require',
 'international',
 'action',
 '.']

In [276]:
data = processor.get_data()
data[data.sensed_lemma=="force_4"].iloc[2].text

'Reiterating its full support for the efforts of the Secretary-General , the African Union and regional actors to find solutions to armed conflicts in the region , Reaffirming that any attempt at destabilization through violent means or seizing power by force is unacceptable , Reaffirming its resolutions 1325 ( 2000 ) and 1820 ( 2008 ) on women , peace and security , 1502 ( 2003 ) on the protection of humanitarian and United Nations personnel , and 1674 ( 2006 ) on the protection of civilians in armed conflict ,'

In [72]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 76049 entries, 0 to 76048
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   text                76049 non-null  object
 1   lemma               76049 non-null  object
 2   word_pos            76049 non-null  int64 
 3   sense_key           76049 non-null  object
 4   lemmatized_text     76049 non-null  object
 5   sensed_lemma        76049 non-null  object
 6   sense_encoded_text  76049 non-null  object
dtypes: int64(1), object(6)
memory usage: 4.1+ MB


In [21]:
save_data_to_file(processor.get_data())

Given data with head:
                                                text       lemma  word_pos  \
0  Action by the Committee In pursuance of its ma...      keep.v        15   
1  A guard of honour stood in formation in honour...  national.a        25   
2  The principle that statistics should be timely...     build.v        38   
3  Again , he appealed for additional support for...     place.n        37   
4  Also , the IAEA has the lowest number of women...  position.n        76   

            sense_key                                    lemmatized_text  \
0      keep%2:42:07::  action by the Committee in pursuance of -PRON-...   
1  national%3:01:00::  a guard of honour stand in formation in honour...   
2     build%2:31:03::  the principle that statistic should be timely ...   
3     place%1:04:00::  again , -PRON- appeal for additional support f...   
4  position%1:04:01::  also , the IAEA have the low number of woman p...   

  sensed_lemma                                 sense

## 2. Build the vocabulary

In [5]:
data = load_data_from_file()

In [6]:
voc = help_functions.Vocabulary(min_word_freq=22, include_unknown=True, lower=True, character=False)
voc.build(data.sense_encoded_text)
len(voc)

97

In [23]:
voc.stoi['line_9']

8312

## 2.* Create data fitting to the model

In [76]:
def pad_sequence(sequence, max_sequence_len):
    return sequence+[voc.get_pad_idx()]*(max_sequence_len-len(sequence))

In [77]:
def create_training_data(data, max_sequence_len=None):
    def build_X_elem(X):
        X_elem = voc.encode([X])[0]
        if max_sequence_len:
            return pad_sequence(X_elem, max_sequence_len)
        return X_elem

    sense_dict = help_functions.build_sense_dict(data.lemma.to_list(), data.sense_key.to_list())
    X_data = []
    y_data = []
    for index, row in data.iterrows():
        X_data.append(build_X_elem(row.sense_encoded_text))
        y_data.append([1])

        # append faulty sense examples
        available_senses = list(sense_dict[row.lemma].keys())
        available_senses.remove(row.sense_key)
        for sense in available_senses:
            faulty_text = row.sense_encoded_text.copy()
            #print(faulty_text)
            faulty_text[row.word_pos] = row.lemma[:-2]+"_"+str(sense_dict[row.lemma][sense])
            X_data.append(build_X_elem(faulty_text.copy()))
            y_data.append([0])
    return X_data, y_data

In [189]:
import numpy as np
v = [0,1,2]
np.random.choice(len(v),1)[0]

1

In [190]:
def create_equal_training_data(data, max_sequence_len=None):
    def build_X_elem(X):
        X_elem = voc.encode([X])[0]
        if max_sequence_len:
            return pad_sequence(X_elem, max_sequence_len)
        return X_elem

    sense_dict = help_functions.build_sense_dict(data.lemma.to_list(), data.sense_key.to_list())
    X_data = []
    y_data = []
    for index, row in data.iterrows():
        X_data.append(build_X_elem(row.sense_encoded_text))
        y_data.append([1])

        # append one faulty sense example
        available_senses = list(sense_dict[row.lemma].keys())
        available_senses.remove(row.sense_key)
        faulty_sense = available_senses[np.random.choice(len(v), 1)[0]]
        faulty_text = row.sense_encoded_text.copy()
        faulty_text[row.word_pos] = row.lemma[:-2]+"_"+str(sense_dict[row.lemma][faulty_sense])
        X_data.append(build_X_elem(faulty_text.copy()))
        y_data.append([0])
    return X_data, y_data

In [73]:
def find_max_sequence_length(sequence_list):
    max_sequence_len = 0
    for sequence in sequence_list:
        if len(sequence) > max_sequence_len:
            max_sequence_len = len(sequence)
    return max_sequence_len

Max sequence length of data seems to be 283.

In [110]:
max_sequence_length = 283

In [74]:
find_max_sequence_length(X_data)

283

In [87]:
X_data, y_data = create_training_data(data, 283)

In [191]:
X_data_eq, y_data_eq = create_equal_training_data(data, 283)
print(f'Number of samples: {len(y_data_eq)}')
print(f'Sequence length per sample: {len(X_data_eq[0])}')

Number of samples: 152098
Sequence length per sample: 283


In [99]:
def save_data_with_pickle(data_dict):
    pre_filename = input("Specify which prefix filename you wish to save X_data and y_data to: ")
    if pre_filename:
        for key, value in data_dict.items():
            filename = pre_filename+"_"+key+".pickle"
            with open(filename, "wb") as fp:   #Pickling
                pickle.dump(value, fp)

In [100]:
save_data_with_pickle({"X_data": X_data, "y_data": y_data})

In [192]:
save_data_with_pickle({"X_data_eq": X_data_eq, "y_data_eq": y_data_eq})

Split into train and validation set.

In [97]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.33, random_state=42)

In [101]:
save_data_with_pickle({"X_train": X_train, "X_test": X_test, "y_train": y_train, "y_test": y_test})

In [193]:
X_train_eq, X_test_eq, y_train_eq, y_test_eq = train_test_split(X_data_eq, y_data_eq, test_size=0.33, random_state=42)
save_data_with_pickle({"X_train_eq": X_train_eq, "X_test_eq": X_test_eq, "y_train_eq": y_train_eq, "y_test_eq": y_test_eq})

In [109]:
save_data_with_pickle({"voc": voc})