In [247]:
import csv
import numpy as np

In [248]:
class Wordpair:
    """
    Ordered pair of words, usable as a dictionary key.

    Two pairs of the same class holding the same words in the same order
    compare equal and hash alike, so a freshly built pair finds the entry
    that was stored under an earlier, equal pair.  As a consequence, a
    mapping keyed by pairs keeps a single entry per distinct
    (word1, word2) combination.

    """
    word1 = None
    word2 = None

    def __init__(self, word1, word2):
        """Store the two words in the given order."""
        self.word1 = word1
        self.word2 = word2

    def __eq__(self, other):
        """Return True for an object of the same class with the same words."""
        # Exact class match: a pair must not equal an instance of a
        # different (sub)class that merely carries the same two strings.
        if type(other) is not type(self):
            return NotImplemented
        return (self.word1, self.word2) == (other.word1, other.word2)

    def __hash__(self):
        """Hash of the ordered word tuple, consistent with __eq__."""
        return hash((self.word1, self.word2))

    def __repr__(self):
        return "{}({!r}, {!r})".format(
            type(self).__name__, self.word1, self.word2)

    def inverse(self):
        """Return a new pair of the same class with the two words swapped."""
        # type(self) instead of a hard-coded class name: the class is
        # spelled `Wordpair`, and subclasses get an instance of their own.
        return type(self)(self.word2, self.word1)

    def to_list_of_strings(self):
        """Return the pair as the two-element list [word1, word2]."""
        return [self.word1, self.word2]

In [249]:
class Labelpair(Wordpair):
    """
    Pair of labels, stored the same way a word pair is stored.

    The labels are expected to be POS (part of speech) tags.

    """
    def __init__(self, label1, label2):
        # The parent keeps the two values; nothing label-specific is added.
        super(Labelpair, self).__init__(label1, label2)

In [250]:
class Data:
    """
    In-memory store that maps every word pair to its similarity value
    and to the label pair recorded together with it.

    """
    data_similarity = None
    data_labels = None

    def __init__(self, similarities=None, labels=None):
        # Every instance gets its own fresh dict unless one is handed in.
        self.data_similarity = {} if similarities is None else similarities
        self.data_labels = {} if labels is None else labels

    def add(self, wordpair, labelpair, value):
        """Record (or overwrite) the value and labels kept for wordpair."""
        self.data_similarity[wordpair] = value
        self.data_labels[wordpair] = labelpair

    def get(self, wordpair):
        """Return the similarity value stored for wordpair."""
        return self.data_similarity[wordpair]

    def get_with_labels(self, wordpair):
        """Return the two-element list [label pair, similarity value]."""
        label_pair = self.data_labels[wordpair]
        score = self.data_similarity[wordpair]
        return [label_pair, score]

    def to_list(self):
        """
        Flatten the store into one row per word pair:
        [word1, word2, label1, label2, similarity as float]

        """
        rows = []

        for pair, score in self.data_similarity.items():
            row = pair.to_list_of_strings()
            row = row + self.data_labels[pair].to_list_of_strings()
            row = row + [float(score)]
            rows.append(row)

        return rows

In [251]:
class Dataset:
    """
    Common ancestor of all datasets.

    It only remembers the path it was given; reading the data is left
    to subclasses.

    """
    path = None

    def __init__(self, path):
        self.path = path

    def load_data_to_memory(self):
        """Loading hook; the base class provides no implementation."""
        raise NotImplementedError()

In [257]:
class GoldenStandartDataset(Dataset):
    """
    Describes an arbitrary golden standard.

    The data is loaded as soon as the object is constructed.  This class
    itself only reads the standardized CSV layout, and it does so when
    the path contains `standartized_label`.

    """
    standartized_label = "standartized"
    data = None

    def __init__(self, path):
        super().__init__(path)
        self.data = Data()
        self.load_data_to_memory()

    def load_data_to_memory(self):
        """
        Load the file at self.path into self.data.

        Raises NotImplementedError when the path does not contain
        `standartized_label`, since no other format is handled here.

        """
        if self.standartized_label in self.path:
            self.load_data_to_memory_standartized()
        else:
            raise NotImplementedError(
                "no loader for {!r}: the path does not contain {!r}".format(
                    self.path, self.standartized_label))

    def load_data_to_memory_standartized(self):
        """
        Read data from path in standardized form:
        word1, word2, label1, label2, similarity value

        labels are expected to be POS (part of speech)

        Blank lines are skipped, and so is every row whose similarity
        value does not parse as a float (this is how the header row
        produced by write_data_to_file is dropped).

        """
        separator = ','

        # newline='' is what the csv module asks for, so that the reader
        # does its own line-ending handling.
        with open(self.path, newline='') as csv_file:
            reader = csv.reader(csv_file, delimiter=separator)
            for row in reader:
                # csv.reader yields [] for a blank line (e.g. a trailing
                # one); indexing it below would raise IndexError.
                if not row:
                    continue

                words = Wordpair(row[0], row[1])
                labels = Labelpair(row[2], row[3])
                sim_value = row[4]

                try:
                    float(sim_value)
                except ValueError:
                    continue

                # The value is stored as read (a string); Data.to_list
                # converts it to float on export.
                self.data.add(words, labels, sim_value)

    def write_data_to_file(self, filepath):
        """
        Write data to filepath in standardized form:
        word1, word2, label1, label2, similarity value

        labels are expected to be POS (part of speech)

        A header row is written first; an existing file is overwritten.

        """
        separator = ','

        # Plain 'w': the file is only written, never read back here.
        with open(filepath, 'w', newline='') as csv_file:
            writer = csv.writer(csv_file, delimiter=separator)
            writer.writerow(["word1", "word2", "label1", "label2", "sim_value"])
            writer.writerows(self.data.to_list())

In [258]:
class SimLex999Dataset(GoldenStandartDataset):
    """
    Loader for the SimLex-999 golden standard.

    Reads either the raw tab-separated file (the default path) or a
    standardized CSV, depending on the path.

    """

    def __init__(self, path="./SimLex-999.txt"):
        super().__init__(path)

    def load_data_to_memory(self):
        """
        Load self.path into self.data.

        A path containing `standartized_label` is delegated to the parent
        loader.  Any other path is read as tab-separated text: the first
        row is skipped as a header, columns 0 and 1 are the words,
        column 2 is used as the label of both words and column 3 is the
        similarity value.

        """
        # Substring test, as in GoldenStandartDataset.load_data_to_memory.
        # An endswith() test misses exported files such as
        # "./SimLex-999-standartized.csv" because of the extension.
        if self.standartized_label in self.path:
            super().load_data_to_memory()
            return

        separator = '\t'

        with open(self.path, newline='') as csv_file:
            reader = csv.reader(csv_file, delimiter=separator)
            next(reader, None)  # drop the header row; no-op on an empty file

            for row in reader:
                if not row:
                    # blank line: nothing to index into
                    continue

                words = Wordpair(row[0], row[1])
                labels = Labelpair(row[2], row[2])
                sim_value = row[3]

                self.data.add(words, labels, sim_value)

In [259]:
class WordSim353Dataset(GoldenStandartDataset):
    """
    Loader for the WordSim-353 golden standard.

    Reads either the raw comma-separated file (the default path) or a
    standardized CSV, depending on the path.

    """

    def __init__(self, path="./combined.csv"):
        super().__init__(path)

    def load_data_to_memory(self):
        """
        Load self.path into self.data.

        A path containing `standartized_label` is delegated to the parent
        loader.  Any other path is read as comma-separated text: the
        first row is skipped as a header, columns 0 and 1 are the words
        and column 2 is the similarity value.  No label column is read;
        every pair is labelled ("n", "n").

        """
        # Substring test, as in GoldenStandartDataset.load_data_to_memory.
        # An endswith() test misses "./WordSim-353-standartized.csv", and
        # the raw branch would then store its label column as similarity.
        if self.standartized_label in self.path:
            super().load_data_to_memory()
            return

        separator = ','

        with open(self.path, newline='') as csv_file:
            reader = csv.reader(csv_file, delimiter=separator)
            next(reader, None)  # drop the header row; no-op on an empty file

            for row in reader:
                if not row:
                    # blank line: nothing to index into
                    continue

                words = Wordpair(row[0], row[1])
                labels = Labelpair("n", "n")
                sim_value = row[2]

                self.data.add(words, labels, sim_value)

In [273]:
class MENDataset(GoldenStandartDataset):
    """
    Loader for the MEN golden standard (lemma form).

    Reads either the raw space-separated file (the default path) or a
    standardized CSV, depending on the path.

    """

    def __init__(self, path="./MEN_dataset_lemma_form_full"):
        super().__init__(path)

    def load_data_to_memory(self):
        """
        Load self.path into self.data.

        A path containing `standartized_label` is delegated to the parent
        loader.  Any other path is read as space-separated text without
        a header: columns 0 and 1 are tagged words and column 2 is the
        similarity value.

        """
        # Substring test, as in GoldenStandartDataset.load_data_to_memory.
        # An endswith() test misses exported files such as
        # "./MEN-standartized.csv" because of the extension.
        if self.standartized_label in self.path:
            super().load_data_to_memory()
            return

        separator = ' '

        with open(self.path, newline='') as csv_file:
            reader = csv.reader(csv_file, delimiter=separator)

            for row in reader:
                if not row:
                    # blank line: nothing to index into
                    continue

                # The last character of each token is taken as its label
                # and the last two characters are cut off the word.
                # NOTE(review): presumably tokens look like "sun-n"
                # (one separator + one-letter tag) - confirm against the file.
                words = Wordpair(row[0][:-2], row[1][:-2])
                labels = Labelpair(row[0][-1], row[1][-1])
                sim_value = row[2]

                self.data.add(words, labels, sim_value)

In [274]:
# Parse the raw SimLex-999 file from the loader's default path "./SimLex-999.txt".
simlex = SimLex999Dataset()

In [275]:
# Export the loaded pairs as a standardized CSV (header row + one row per pair).
simlex.write_data_to_file("./SimLex-999-standartized.csv")

In [276]:
# Read the export back with the generic loader; the path contains
# "standartized", so the standardized CSV reader is used.
simlex_std = GoldenStandartDataset("./SimLex-999-standartized.csv")

In [277]:
# Show every re-loaded row as [word1, word2, label1, label2, similarity].
simlex_std.data.to_list()

[['old', 'new', 'A', 'A', 1.58],
 ['smart', 'intelligent', 'A', 'A', 9.2],
 ['hard', 'difficult', 'A', 'A', 8.77],
 ['happy', 'cheerful', 'A', 'A', 9.55],
 ['hard', 'easy', 'A', 'A', 0.95],
 ['fast', 'rapid', 'A', 'A', 8.75],
 ['happy', 'glad', 'A', 'A', 9.17],
 ['short', 'long', 'A', 'A', 1.23],
 ['stupid', 'dumb', 'A', 'A', 9.58],
 ['weird', 'strange', 'A', 'A', 8.93],
 ['wide', 'narrow', 'A', 'A', 1.03],
 ['bad', 'awful', 'A', 'A', 8.42],
 ['easy', 'difficult', 'A', 'A', 0.58],
 ['bad', 'terrible', 'A', 'A', 7.78],
 ['hard', 'simple', 'A', 'A', 1.38],
 ['smart', 'dumb', 'A', 'A', 0.55],
 ['insane', 'crazy', 'A', 'A', 9.57],
 ['happy', 'mad', 'A', 'A', 0.95],
 ['large', 'huge', 'A', 'A', 9.47],
 ['hard', 'tough', 'A', 'A', 8.05],
 ['new', 'fresh', 'A', 'A', 6.83],
 ['sharp', 'dull', 'A', 'A', 0.6],
 ['quick', 'rapid', 'A', 'A', 9.7],
 ['dumb', 'foolish', 'A', 'A', 6.67],
 ['wonderful', 'terrific', 'A', 'A', 8.63],
 ['strange', 'odd', 'A', 'A', 9.02],
 ['happy', 'angry', 'A', 'A', 1.2

In [278]:
# Parse the raw WordSim-353 file from the loader's default path "./combined.csv".
wordsim = WordSim353Dataset()

In [279]:
# Export the loaded pairs as a standardized CSV (header row + one row per pair).
wordsim.write_data_to_file("./WordSim-353-standartized.csv")

In [280]:
# Read the export back with the generic loader; the path contains
# "standartized", so the standardized CSV reader is used.
wordsim_std = GoldenStandartDataset("./WordSim-353-standartized.csv")

In [281]:
# Show every re-loaded row as [word1, word2, label1, label2, similarity].
wordsim_std.data.to_list()

[['love', 'sex', 'n', 'n', 6.77],
 ['tiger', 'cat', 'n', 'n', 7.35],
 ['tiger', 'tiger', 'n', 'n', 10.0],
 ['book', 'paper', 'n', 'n', 7.46],
 ['computer', 'keyboard', 'n', 'n', 7.62],
 ['computer', 'internet', 'n', 'n', 7.58],
 ['plane', 'car', 'n', 'n', 5.77],
 ['train', 'car', 'n', 'n', 6.31],
 ['telephone', 'communication', 'n', 'n', 7.5],
 ['television', 'radio', 'n', 'n', 6.77],
 ['media', 'radio', 'n', 'n', 7.42],
 ['drug', 'abuse', 'n', 'n', 6.85],
 ['bread', 'butter', 'n', 'n', 6.19],
 ['cucumber', 'potato', 'n', 'n', 5.92],
 ['doctor', 'nurse', 'n', 'n', 7.0],
 ['professor', 'doctor', 'n', 'n', 6.62],
 ['student', 'professor', 'n', 'n', 6.81],
 ['smart', 'student', 'n', 'n', 4.62],
 ['smart', 'stupid', 'n', 'n', 5.81],
 ['company', 'stock', 'n', 'n', 7.08],
 ['stock', 'market', 'n', 'n', 8.08],
 ['stock', 'phone', 'n', 'n', 1.62],
 ['stock', 'CD', 'n', 'n', 1.31],
 ['stock', 'jaguar', 'n', 'n', 0.92],
 ['stock', 'egg', 'n', 'n', 1.81],
 ['fertility', 'egg', 'n', 'n', 6.69],
 

In [282]:
# Parse the raw MEN file from the loader's default path "./MEN_dataset_lemma_form_full".
men = MENDataset()

In [283]:
# Export the loaded pairs as a standardized CSV (header row + one row per pair).
men.write_data_to_file("./MEN-standartized.csv")

In [284]:
# Read the export back with the generic loader; the path contains
# "standartized", so the standardized CSV reader is used.
men_std = GoldenStandartDataset("./MEN-standartized.csv")

In [285]:
# Show every re-loaded row as [word1, word2, label1, label2, similarity].
men_std.data.to_list()

[['sun', 'sunlight', 'n', 'n', 50.0],
 ['automobile', 'car', 'n', 'n', 50.0],
 ['river', 'water', 'n', 'n', 49.0],
 ['stair', 'staircase', 'n', 'n', 49.0],
 ['morning', 'sunrise', 'n', 'n', 49.0],
 ['rain', 'storm', 'n', 'n', 49.0],
 ['cat', 'kitten', 'n', 'n', 49.0],
 ['dance', 'dancer', 'n', 'n', 49.0],
 ['camera', 'photography', 'n', 'n', 49.0],
 ['cat', 'feline', 'n', 'j', 48.0],
 ['sunny', 'sunshine', 'j', 'n', 48.0],
 ['pregnancy', 'pregnant', 'n', 'j', 48.0],
 ['beach', 'sand', 'n', 'n', 48.0],
 ['bakery', 'bread', 'n', 'n', 48.0],
 ['flower', 'garden', 'n', 'n', 48.0],
 ['grass', 'lawn', 'n', 'n', 48.0],
 ['copper', 'metal', 'n', 'n', 48.0],
 ['photo', 'photography', 'n', 'n', 47.0],
 ['cemetery', 'graveyard', 'n', 'n', 47.0],
 ['gravestone', 'graveyard', 'n', 'n', 47.0],
 ['sun', 'sunshine', 'n', 'n', 47.0],
 ['black', 'dark', 'j', 'j', 47.0],
 ['cathedral', 'church', 'n', 'n', 47.0],
 ['frozen', 'ice', 'j', 'n', 47.0],
 ['station', 'subway', 'n', 'n', 47.0],
 ['child', 'kid',