In [6]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import numpy as np
import pandas as pd
import time
import spacy
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# Load the small English spaCy pipeline once, at module level; the module-level
# preprocess_sentence_spacy() function reads this global when tokenizing.
nlp = spacy.load('en_core_web_sm')

def preprocess_sentence_spacy(sentence, tokenize=True):
    """Lower-case a sentence and split it into tokens.

    Args:
        sentence: The raw sentence string.
        tokenize: When true, tokens come from the module-level spaCy pipeline
            ``nlp``; otherwise the lower-cased text is split on whitespace.

    Returns:
        A list of token strings.
    """
    lowered = sentence.lower()
    if not tokenize:
        return lowered.split()
    return [tok.text for tok in nlp(lowered)]
    
# InferSent class definition, including its own preprocess_sentence_spacy method
class InferSent(nn.Module):
    """Bidirectional-LSTM sentence encoder with a linear similarity-regression head.

    Words are looked up in ``self.word_vec`` (filled by ``build_vocab`` or
    ``build_vocab_k_words``), run through a one-layer BiLSTM and pooled
    ("mean" or "max") into a vector of size ``2 * enc_lstm_dim``.
    ``regress_similarity`` maps the concatenated encodings of a sentence pair
    to one score per pair.

    Batches handled by this class are batch-first:
    ``(batch_size, seq_len, word_emb_dim)``.

    Expected ``config`` keys: ``bsize``, ``word_emb_dim``, ``enc_lstm_dim``,
    ``pool_type``, ``dpout_model`` and, optionally, ``version`` (1 or 2,
    default 1).
    """

    # Process-wide cache for the spaCy pipeline. Loading the model is expensive,
    # so it must happen at most once rather than once per tokenized sentence.
    _nlp = None

    def __init__(self, config):
        super(InferSent, self).__init__()
        self.bsize = config['bsize']
        self.word_emb_dim = config['word_emb_dim']
        self.enc_lstm_dim = config['enc_lstm_dim']
        self.pool_type = config['pool_type']
        self.dpout_model = config['dpout_model']
        self.version = 1 if 'version' not in config else config['version']

        self.enc_lstm = nn.LSTM(
            input_size=self.word_emb_dim,
            hidden_size=self.enc_lstm_dim,
            num_layers=1,
            bidirectional=True,
            dropout=self.dpout_model
        )

        assert self.version in [1, 2]
        if self.version == 1:
            self.bos = '<s>'
            self.eos = '</s>'
            self.max_pad = True
            self.moses_tok = False
        elif self.version == 2:
            self.bos = '<p>'
            self.eos = '</p>'
            self.max_pad = False
            self.moses_tok = True

        # Regression layer for similarity: its input is the concatenation of the
        # premise and hypothesis encodings, each of size 2 * enc_lstm_dim.
        self.regressor = nn.Linear(2 * self.enc_lstm_dim * 2, 1)

        # Shared spaCy pipeline (loaded once per process, then reused).
        self.nlp = self._get_nlp()

    @classmethod
    def _get_nlp(cls):
        """Return the shared spaCy pipeline, loading it on first use only."""
        if InferSent._nlp is None:
            InferSent._nlp = spacy.load('en_core_web_sm')
        return InferSent._nlp

    @staticmethod
    def preprocess_sentence_spacy(sentence, tokenize=True):
        """Lower-case ``sentence`` and split it into a list of token strings.

        With ``tokenize=True`` the cached spaCy pipeline does the splitting;
        otherwise the text is split on whitespace and spaCy is not touched.
        """
        lowered = sentence.lower()
        if not tokenize:
            return lowered.split()
        return [token.text for token in InferSent._get_nlp()(lowered)]

    def is_cuda(self):
        """Return True when the model parameters live on a CUDA device."""
        return next(self.parameters()).is_cuda

    def forward(self, sent_tuple):
        """Encode a padded batch into one pooled vector per sentence.

        Args:
            sent_tuple: ``(sent, sent_len)`` where ``sent`` is a float tensor of
                shape (batch_size, seq_len, word_emb_dim) and ``sent_len`` holds
                the true length of every sentence, shape (batch_size,).

        Returns:
            Tensor of shape (batch_size, 2 * enc_lstm_dim), in input order.

        Raises:
            ValueError: if ``pool_type`` is neither "mean" nor "max".
        """
        sent, sent_len = sent_tuple
        # Keep the lengths on the same device as the batch so index_select works.
        sent_len = torch.as_tensor(sent_len, device=sent.device)

        # Sort by decreasing length, as required for packing with enforce_sorted=True.
        len_sorted, idx_sort = torch.sort(sent_len, descending=True)
        sent_sorted = sent.index_select(0, idx_sort)

        # Pack, run the BiLSTM, and unpack to (batch_size, max_len, 2*enc_lstm_dim).
        packed = nn.utils.rnn.pack_padded_sequence(
            sent_sorted, len_sorted.cpu(), batch_first=True, enforce_sorted=True)
        packed_out, _ = self.enc_lstm(packed)
        sent_output, _ = nn.utils.rnn.pad_packed_sequence(packed_out, batch_first=True)

        # Restore the caller's original sentence order.
        _, idx_unsort = torch.sort(idx_sort, descending=False)
        sent_output = sent_output.index_select(0, idx_unsort)

        if self.pool_type == "mean":
            # Padded time steps are zero after unpacking, so the sum over time
            # divided by the true length is the mean over real tokens.
            emb = torch.sum(sent_output, dim=1) / sent_len.float().unsqueeze(1)
        elif self.pool_type == "max":
            if not self.max_pad:
                # Push exact zeros (the padding value) far below any real
                # activation so they cannot win the max.
                sent_output[sent_output == 0] = -1e9
            emb, _ = torch.max(sent_output, dim=1)
        else:
            raise ValueError('Unsupported pool_type: %r (expected "mean" or "max")' % (self.pool_type,))

        return emb

    def encode_sentence_pair(self, premise_tuple, hypothesis_tuple):
        """Encode both sentences of each pair and concatenate the two encodings.

        Returns:
            Tensor of shape (batch_size, 4 * enc_lstm_dim).
        """
        encoded_premise = self.forward(premise_tuple)
        encoded_hypothesis = self.forward(hypothesis_tuple)
        return torch.cat((encoded_premise, encoded_hypothesis), dim=1)

    def regress_similarity(self, combined):
        """Map concatenated pair encodings to one score each, shape (batch_size,)."""
        return self.regressor(combined).squeeze(1)

    def set_w2v_path(self, w2v_path):
        """Remember the path of the text file holding the pretrained word vectors."""
        self.w2v_path = w2v_path

    def _to_tokens(self, sentence, tokenize):
        """Return ``sentence`` as a list of tokens.

        Strings are tokenized with ``self.tokenize`` (``tokenize=True``) or split
        on whitespace; anything else is treated as an already tokenized sequence.
        """
        if isinstance(sentence, str):
            return self.tokenize(sentence) if tokenize else sentence.split()
        return list(sentence)

    def get_word_dict(self, sentences, tokenize=True):
        """Build the set of words (as dict keys) occurring in ``sentences``.

        The sentence delimiters ``self.bos`` and ``self.eos`` are always included.
        """
        word_dict = {}
        for sentence in sentences:
            for word in self._to_tokens(sentence, tokenize):
                if word not in word_dict:
                    word_dict[word] = ''
        word_dict[self.bos] = ''
        word_dict[self.eos] = ''
        return word_dict

    def get_w2v(self, word_dict):
        """Load the vectors of the words in ``word_dict`` from the w2v file.

        Each line is read as ``word v1 v2 ...`` separated by single spaces.
        """
        assert hasattr(self, 'w2v_path'), 'w2v path not set'
        word_vec = {}
        with open(self.w2v_path, encoding='utf-8') as f:
            for line in f:
                split_line = line.strip().split(' ')
                word = split_line[0]
                # Parse the floats only for words we actually need; converting
                # every vector in the file is by far the dominant cost.
                if word in word_dict:
                    word_vec[word] = np.array([float(val) for val in split_line[1:]], dtype='float32')
        print('Found %s/%s words with w2v vectors' % (len(word_vec), len(word_dict)))
        return word_vec

    def get_w2v_k(self, K):
        """Load the leading entries of the w2v file (while the counter is <= K).

        Reading continues past that point until both ``self.bos`` and
        ``self.eos`` have been found, or the file ends.
        """
        assert hasattr(self, 'w2v_path'), 'w2v path not set'
        k = 0
        word_vec = {}
        specials = [self.bos, self.eos]
        with open(self.w2v_path, encoding='utf-8') as f:
            for line in f:
                if k > K and all(w in word_vec for w in specials):
                    break
                split_line = line.strip().split(' ')
                word = split_line[0]
                if k <= K or word in specials:
                    word_vec[word] = np.array([float(val) for val in split_line[1:]], dtype='float32')
                    k += 1
        return word_vec

    def build_vocab(self, sentences, tokenize=True):
        """Fill ``self.word_vec`` with the vectors of the words in ``sentences``."""
        assert hasattr(self, 'w2v_path'), 'w2v path not set'
        word_dict = self.get_word_dict(sentences, tokenize)
        self.word_vec = self.get_w2v(word_dict)
        print('Vocab size : %s' % (len(self.word_vec)))

    def build_vocab_k_words(self, K):
        """Fill ``self.word_vec`` from the leading entries of the w2v file."""
        assert hasattr(self, 'w2v_path'), 'w2v path not set'
        self.word_vec = self.get_w2v_k(K)
        print('Vocab size : %s' % (len(self.word_vec)))

    def update_vocab(self, sentences, tokenize=True):
        """Add vectors for words in ``sentences`` that are not yet in the vocabulary."""
        assert hasattr(self, 'w2v_path'), 'warning : w2v path not set'
        assert hasattr(self, 'word_vec'), 'build_vocab before updating it'
        word_dict = self.get_word_dict(sentences, tokenize)

        # Keep only the words that are new.
        new_words = {word: '' for word in word_dict if word not in self.word_vec}
        if new_words:
            new_word_vec = self.get_w2v(new_words)
            self.word_vec.update(new_word_vec)
            print('New vocab size : %s (added %s words)'% (len(self.word_vec), len(new_word_vec)))
        else:
            print('No new words to add.')

    def get_batch(self, batch):
        """Turn a list of token lists into a zero-padded embedding tensor.

        Returns:
            Float32 tensor of shape (len(batch), longest sentence, word_emb_dim).
            Words missing from ``self.word_vec`` use the '<unk>' vector when it
            exists and zeros otherwise.
        """
        batch_size = len(batch)
        max_len = max(len(s) for s in batch)

        embed = np.zeros((batch_size, max_len, self.word_emb_dim), dtype=np.float32)

        # Resolve the fallback vector once instead of once per word.
        fallback = self.word_vec.get('<unk>')
        if fallback is None:
            fallback = np.zeros(self.word_emb_dim, dtype=np.float32)

        for i, sentence in enumerate(batch):
            for j, word in enumerate(sentence):
                embed[i, j, :] = self.word_vec.get(word, fallback)

        return torch.from_numpy(embed)

    def tokenize(self, s):
        """Tokenize a raw sentence string with the shared spaCy pipeline."""
        return self.preprocess_sentence_spacy(s)

    def prepare_samples(self, sentences, bsize, tokenize, verbose):
        """Delimit, filter and length-sort sentences ahead of encoding.

        Args:
            sentences: Raw strings and/or already tokenized word lists.
            bsize: Unused here; kept for interface compatibility.
            tokenize: For raw strings, use spaCy (True) or whitespace split (False).
            verbose: Print statistics about the kept words.

        Returns:
            ``(sentences_sorted, lengths_sorted, idx_sort)``: the token lists by
            decreasing length, their lengths, and the indices that produced this
            order (both as numpy arrays).
        """
        # Add the bos/eos delimiters, tokenizing raw strings on the way.
        sentences = [
            [self.bos] + self._to_tokens(s, tokenize) + [self.eos] for s in sentences
        ]
        n_w = np.sum([len(x) for x in sentences])

        # Drop the words that have no w2v vector.
        for i in range(len(sentences)):
            s_f = [word for word in sentences[i] if word in self.word_vec]
            if not s_f:
                import warnings
                warnings.warn('No words in "%s" (idx=%s) have w2v vectors. Replacing by "</s>"..' % (sentences[i], i))
                s_f = [self.eos]
            sentences[i] = s_f

        lengths = np.array([len(s) for s in sentences], dtype=np.int64)
        n_wk = np.sum(lengths)
        if verbose:
            # max(..., 1) only guards the empty-input case against dividing by zero.
            print('Nb words kept : %s/%s (%.1f%%)' % (n_wk, n_w, 100.0 * n_wk / max(n_w, 1)))

        # Sort the sentences by decreasing length.
        lengths_sorted, idx_sort = torch.sort(torch.tensor(lengths), descending=True)
        sentences_sorted = [sentences[i] for i in idx_sort]

        if verbose:
            print(f"First 5 sorted sentence lengths: {lengths_sorted[:5].tolist()}")
            print(f"First 5 sorted sentences: {sentences_sorted[:5]}")

        return sentences_sorted, lengths_sorted.numpy(), idx_sort.numpy()

    def encode(self, sentences, bsize=64, tokenize=True, verbose=False):
        """Encode sentences into a numpy array of shape (n, 2 * enc_lstm_dim).

        Rows follow the order of ``sentences``. Gradients are not tracked.
        An empty input yields an array with zero rows.
        """
        if len(sentences) == 0:
            return np.zeros((0, 2 * self.enc_lstm_dim), dtype=np.float32)

        tic = time.time()
        sentences_sorted, lengths_sorted, idx_sort = self.prepare_samples(
                        sentences, bsize, tokenize, verbose)
        use_cuda = self.is_cuda()

        embeddings = []
        for stidx in range(0, len(sentences_sorted), bsize):
            batch = self.get_batch(sentences_sorted[stidx:stidx + bsize])
            lengths = torch.tensor(lengths_sorted[stidx:stidx + bsize])
            if use_cuda:
                batch = batch.cuda()
                lengths = lengths.cuda()
            with torch.no_grad():
                batch_output = self.forward((batch, lengths))
            embeddings.append(batch_output.cpu().numpy())
        embeddings = np.vstack(embeddings)

        # Undo the length sort so that rows line up with the input sentences.
        idx_unsort = np.argsort(idx_sort)
        embeddings = embeddings[idx_unsort]

        if verbose:
            print('Speed : %.1f sentences/s (%s mode, bsize=%s)' % (
                    len(embeddings)/(time.time()-tic),
                    'gpu' if use_cuda else 'cpu', bsize))
        return embeddings

    def visualize(self, sent, tokenize=True):
        """Plot how often each word supplies the maximum over time of a feature.

        The sentence is run through the BiLSTM without pooling; for each of the
        ``2 * enc_lstm_dim`` output features the time step holding the maximum
        is recorded, and the bar chart shows each word's share of those maxima.

        Returns:
            ``(output, idxs)``: the per-feature maxima (tensor of shape
            (1, 2 * enc_lstm_dim)) and the word position that produced each of
            them (numpy array of the same shape).
        """
        tokens = self._to_tokens(sent, tokenize)
        sent = [self.bos] + [word for word in tokens if word in self.word_vec] + [self.eos]

        if len(sent) == 2 and sent[0] == self.bos and sent[1] == self.eos:
            import warnings
            warnings.warn('No words in "%s" have w2v vectors. Replacing by "%s %s"..' % (sent, self.bos, self.eos))

        batch = self.get_batch([sent])
        if self.is_cuda():
            batch = batch.cuda()

        with torch.no_grad():
            # enc_lstm is not batch_first, so feed it (seq_len, 1, word_emb_dim)
            # and take the max over the time axis. forward() cannot be used
            # here: it already pools, which discards the per-word positions.
            output = self.enc_lstm(batch.transpose(0, 1).contiguous())[0]
            output, idxs = torch.max(output, dim=0)

        idxs = idxs.cpu().numpy()
        argmaxs = [np.sum((idxs == k)) for k in range(len(sent))]

        # Bar chart of the per-word share (in percent).
        x = range(len(sent))
        y = [100.0 * n / np.sum(argmaxs) for n in argmaxs]
        plt.xticks(x, sent, rotation=45)
        plt.bar(x, y)
        plt.ylabel('%')
        plt.title('Visualisation of words importance')
        plt.show()

        return output, idxs


In [7]:
import pandas as pd
import spacy
import os

# NOTE: preprocess_sentence_spacy and the InferSent class must already be defined
# when this cell runs.

# Load the dataset (tab-separated); make sure the path matches your machine.
data_path = r'C:\facultate an 3\projects-simquery\data\sts_train.csv'
data = pd.read_csv(data_path, sep='\t')

# Discard rows where either sentence or the similarity score is missing.
data = data.dropna(subset=['sent_1', 'sent_2', 'sim'])

# Pull the sentence pairs and their scores out as plain Python lists.
sentences_1, sentences_2, similarities = (
    data[column].tolist() for column in ('sent_1', 'sent_2', 'sim')
)

# Model configuration.
config = dict(
    bsize=64,
    word_emb_dim=300,
    enc_lstm_dim=2048,
    pool_type='mean',  # or 'max'
    dpout_model=0.0,
    version=1,
)

# Instantiate the model.
model = InferSent(config)

# Point the model at the pretrained word vectors (replace with the correct path).
model.set_w2v_path(r'C:\facultate an 3\projects-simquery\data\word2vec.txt')

# Build the vocabulary from every sentence of both columns.
model.build_vocab(sentences_1 + sentences_2, tokenize=True)

# Tokenize the sentences.
sentences_1 = list(map(model.tokenize, sentences_1))
sentences_2 = list(map(model.tokenize, sentences_2))

print("Tokenization successful using spaCy!")


KeyboardInterrupt: 