In [1]:
#!pip install gensim --upgrade
#!pip install numpy --upgrade
#!pip install torch --upgrade

In [2]:
import math
import gensim
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader,Dataset
import gensim.downloader as api
from typing import Optional
import os


In [3]:
# If we want to save the model to a local file
glove_path = api.load('glove-wiki-gigaword-300', return_path=True)  
print(glove_path)
try:
    glove_model = gensim.models.keyedvectors.load_word2vec_format(glove_path)
except:
    glove_model = gensim.models.keyedvectors.load_word2vec_format(glove_path, binary=True)
#glove_model = api.load('word2vec-ruscorpora-300')#  If we want to load without saving localy 

C:\Users\doron.z/gensim-data\glove-wiki-gigaword-300\glove-wiki-gigaword-300.gz


In [4]:
weights = torch.FloatTensor(glove_model.vectors)
embedding = nn.Embedding.from_pretrained(weights)
embedding.requires_grad_(False)
print(embedding)

Embedding(400000, 300)


In [5]:
def read_reviews(data_dir,label):
    words_lost = 0 #counter of lost words from the reviews
    total_words_from_files = 0
    all_reviews_array_with_labels = []
    #data_dir = r'C:\Users\doron.z\Desktop\General Stuff\university\deep learning 2\train_data\train\neg'
    for (roots,dirs,files) in os.walk(data_dir):
        for file_dir in files: #for each file in the folder
            file = open(roots+f'/{file_dir}','r',encoding='utf-8')
            sentences = file.readlines() #each file has 1 sentence/review
            sentences = [sen.strip().lower() for sen in sentences]
            sentences = [sen.split() for sen in sentences if sen]
            #print(sentences)
            #print(len(sentences[0]))
            representation = []        
            for word in sentences[0]: #for every word in the review from the file

                if word not in glove_model.key_to_index:
                    words_lost +=1
                    total_words_from_files+=1
                    continue
                else:
                    word_id = torch.tensor(glove_model.key_to_index[word])  # ID of the word in the embedding
                    total_words_from_files+=1

                vec = glove_model[word] 
                representation.append(vec)
                #print(torch.equal(torch.Tensor(vec),embedding(word_id)))
                
            representation = np.asarray(representation) #np array of the embedded words of the sentence
            # the shape of the representation is (#of words in the sentence,length of the embedding vector=300)
            #print(representation.shape)
            
            all_reviews_array_with_labels.append(representation)
        all_reviews_array_with_labels = np.asarray(all_reviews_array_with_labels)
    return all_reviews_array_with_labels,words_lost,total_words_from_files
            #print("words_lost = "+str(words_lost))
            
            #print(representation[0])

    

In [6]:
neg_reviews,neg_words_lost,neg_total_words_from_files = read_reviews(r'C:\Users\doron.z\Desktop\General Stuff\university\deep learning 2\train_data\train\neg',0)  
print(neg_reviews.shape)
print(neg_reviews[0])
print(neg_reviews[0].shape)
print("The number of words dropped from the negative reviews is- "+str(neg_words_lost)+" which is "+str(format(neg_words_lost/neg_total_words_from_files*100,".2f"))+"% of all words in from reviews.")

(18750,)
[[-0.20664   -0.1377    -0.11119   ...  0.27927   -0.0069871  0.13851  ]
 [-0.076947  -0.021211   0.21271   ...  0.18351   -0.29183   -0.046533 ]
 [-0.29712    0.094049  -0.096662  ...  0.059717  -0.22853    0.29602  ]
 ...
 [-0.51102    0.61752   -0.35497   ... -0.71145   -0.17716   -0.18386  ]
 [-0.33848    0.42841   -0.10284   ... -0.79888   -0.41967   -0.14039  ]
 [-0.094833   0.24367    0.18525   ... -0.11331   -0.047248  -0.11424  ]]
(100, 300)
The number of words dropped from the negative reviews is- 709154 which is 16.52% of all words in from reviews.


  all_reviews_array_with_labels = np.asarray(all_reviews_array_with_labels)


In [7]:
pos_reviews,pos_words_lost,pos_total_words_from_files = read_reviews(r'C:\Users\doron.z\Desktop\General Stuff\university\deep learning 2\train_data\train\pos',1)  
print(pos_reviews.shape)
print(pos_reviews[0])
print(pos_reviews[0].shape)
print("The number of words dropped from the positive reviews is- "+str(pos_words_lost)+" which is "+str(format(pos_words_lost/pos_total_words_from_files*100,".2f"))+"% of all words from those reviews.")

(18750,)
[[-0.62976    0.26793    0.1808    ...  0.32447    0.37457   -0.3564   ]
 [-0.33112    0.53214    0.22707   ... -0.82863   -0.24082    0.0065358]
 [-0.1749     0.22956    0.24924   ... -0.24131   -0.40402    0.054744 ]
 ...
 [ 0.035287  -0.11865    0.28912   ...  0.071298   0.34976   -0.1023   ]
 [-0.18256    0.49851   -0.1639    ... -0.27224   -0.19107   -0.094104 ]
 [ 0.033284  -0.040754  -0.048377  ... -0.15408    0.17806   -0.19683  ]]
(118, 300)
The number of words dropped from the positive reviews is- 702107 which is 15.99% of all words from those reviews.


  all_reviews_array_with_labels = np.asarray(all_reviews_array_with_labels)


In [8]:
#x = np.concatenate((neg_reviews,pos_reviews),axis =0)
#print(x.shape)
#print(len(neg_reviews))
#train_x_neg = neg_reviews
#train_y_neg
y_neg = np.full_like(neg_reviews,1.0)
y_neg

array([1.0, 1.0, 1.0, ..., 1.0, 1.0, 1.0], dtype=object)

In [9]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cpu')

In [10]:
def split_reviews_array_to_train_val_test(reviews_array,wanted_y_label,split_frac = 0.8):
    #y = np.full_like(reviews_array,wanted_y_label,dtype='float32')
    
    ## split data into training, validation, and test data (x and y)
    split_idx = int(len(reviews_array)*split_frac)
    train_x, remaining_x = reviews_array[:split_idx], reviews_array[split_idx:]
    
    #train_y, remaining_y = y[:split_idx], y[split_idx:]
    train_y = np.full_like(train_x,wanted_y_label,dtype='float32')
    
    test_idx = int(len(remaining_x)*0.5)
    val_x, test_x = remaining_x[:test_idx], remaining_x[test_idx:]
    #val_y, test_y = remaining_y[:test_idx], remaining_y[test_idx:]
    
    val_y = np.full_like(val_x,wanted_y_label,dtype='float32') 
    test_y = np.full_like(test_x,wanted_y_label,dtype='float32')

    # print out the shapes of your resultant feature data
    print("For reviews with the label:"+str(wanted_y_label))
    print("\t\t\tFeature Shapes:")
    print("Train set: \t\t{}".format(train_x.shape),
          "\nValidation set: \t{}".format(val_x.shape),
          "\nTest set: \t\t{}".format(test_x.shape),
          "\nwith first object shape: "+str(train_x[0].shape)#+
          #"\nwith first object shape: "+str(train_y.shape)
          )
    return train_x,train_y, val_x, test_x, val_y, test_y

In [11]:
train_neg_x, train_neg_y, val_neg_x, test_neg_x, val_neg_y, test_neg_y = split_reviews_array_to_train_val_test(neg_reviews,0.0)
print("\n")
train_pos_x, train_pos_y, val_pos_x, test_pos_x, val_pos_y, test_pos_y = split_reviews_array_to_train_val_test(pos_reviews,1.0)

#Merging the neg and pos to the final sets
train_x = np.concatenate((train_neg_x,train_pos_x),axis = 0)
train_y = np.concatenate((train_neg_y,train_pos_y),axis = 0)
val_x = np.concatenate((val_neg_x,val_pos_x),axis = 0)
val_y = np.concatenate((val_neg_y,val_pos_y),axis = 0)
test_x = np.concatenate((test_neg_x,test_pos_x),axis = 0)
test_y = np.concatenate((test_neg_y,test_pos_y),axis = 0)

#print(type(train_x[0][0]))

For reviews with the label:0.0
			Feature Shapes:
Train set: 		(15000,) 
Validation set: 	(1875,) 
Test set: 		(1875,) 
with first object shape: (100, 300)


For reviews with the label:1.0
			Feature Shapes:
Train set: 		(15000,) 
Validation set: 	(1875,) 
Test set: 		(1875,) 
with first object shape: (118, 300)


In [12]:
print(type(train_x[0][0][0]))
print(train_x[0][0][0])
print(train_y.shape)
print(val_x.shape)
print(val_y.shape)
print(test_x.shape)
print(test_y.shape)

<class 'numpy.float32'>
-0.20664
(30000,)
(3750,)
(3750,)
(3750,)
(3750,)


In [13]:
print(train_x[0][0].shape) #word
print(train_x[0].shape) #sentence
print(train_x.shape) # all sentences
print(train_x.shape[0])

(300,)
(100, 300)
(30000,)
30000


In [14]:
"""
a = train_x[1]
print(a.shape)
y = np.zeros(((2155 - a.shape[0]), 300))
b = np.concatenate((a,y),axis=0)
print(b.shape)
"""

'\na = train_x[1]\nprint(a.shape)\ny = np.zeros(((2155 - a.shape[0]), 300))\nb = np.concatenate((a,y),axis=0)\nprint(b.shape)\n'

In [15]:
max_sen = 250

for i in range(0,train_x.shape[0]):
    if train_x[i].shape[0] < max_sen:
        #padding = [[0]]* (max_sen - sentence.shape[0])
        padding = np.zeros(((max_sen - train_x[i].shape[0]), 300))
        train_x[i] = np.concatenate((train_x[i],padding), axis=0)
    else:
        train_x[i] = train_x[i][:250]

        
print(train_x[0].shape) #sentence
print(train_x[120].shape) #sentence


(250, 300)
(250, 300)


In [16]:
max_sen = 250

for i in range(0,test_x.shape[0]):
    if test_x[i].shape[0] < max_sen:
        #padding = [[0]]* (max_sen - sentence.shape[0])
        padding = np.zeros(((max_sen - test_x[i].shape[0]), 300))
        test_x[i] = np.concatenate((test_x[i],padding), axis=0)
    else:
        test_x[i] = test_x[i][:250]

        
print(test_x[0].shape) #sentence
print(test_x[120].shape) #sentence

(250, 300)
(250, 300)


In [17]:
max_sen = 250

for i in range(0,val_x.shape[0]):
    if val_x[i].shape[0] < max_sen:
        #padding = [[0]]* (max_sen - sentence.shape[0])
        padding = np.zeros(((max_sen - val_x[i].shape[0]), 300))
        val_x[i] = np.concatenate((val_x[i],padding), axis=0)
    else:
        val_x[i] = val_x[i][:250]

        
print(val_x[0].shape) #sentence
print(val_x[120].shape) #sentence

(250, 300)
(250, 300)


In [18]:
def dataset_padding(dataset,max_sen = 250):
    for i in range(0,dataset.shape[0]):
        if dataset[i].shape[0] < max_sen:
            #padding = [[0]]* (max_sen - sentence.shape[0])
            padding = np.zeros(((max_sen - dataset[i].shape[0]), 300))
            dataset[i] = np.concatenate((dataset[i],padding), axis=0)
        else:
            dataset[i] = train_x[i][:250]
    return dataset

In [19]:
train_x = dataset_padding(train_x)
#val_x = dataset_padding(val_x)
#test_x = dataset_padding(test_x)

In [20]:
class MyDataSet(Dataset):
    def __init__(self, X, y):
        self.data = X
        self.target = y
        #self.length = [np.sum(1 - np.equal(x,0)) for x in X]
        
    def __getitem__(self, index):
        #x = self.data[index]
        
        x = torch.from_numpy(self.data[index])
        y = self.target[index]
        #x_len = self.length[index]
        
        return x, y#, x_len
    
    def __len__(self):
        return len(self.data)

In [21]:
# Create Tensor datasets by concatinating Xs and Ys
#train_data = TensorDataset(torch.from_numpy(train_x),torch.from_numpy(train_y))
#valid_data = TensorDataset(val_x, torch.from_numpy(val_y))
#test_data = TensorDataset(test_x, torch.from_numpy(test_y))
#print(train_data.tensors)

train_dataset = MyDataSet(train_x, train_y)
val_dataset = MyDataSet(val_x, val_y)
test_dataset = MyDataSet(test_x, test_y)

BATCH_SIZE = 5

In [22]:
train_dataset[0]

(tensor([[-0.2066, -0.1377, -0.1112,  ...,  0.2793, -0.0070,  0.1385],
         [-0.0769, -0.0212,  0.2127,  ...,  0.1835, -0.2918, -0.0465],
         [-0.2971,  0.0940, -0.0967,  ...,  0.0597, -0.2285,  0.2960],
         ...,
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]],
        dtype=torch.float64),
 0.0)

In [23]:
train_loader= DataLoader(train_dataset, batch_size = BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size = int(val_x.shape[0]), shuffle=True)
test_loader = DataLoader(test_dataset, batch_size = int(val_x.shape[0]), shuffle=True)

In [24]:
#Check if it worrked?
for x,y in train_loader:
    print(x.shape)
    print(y.shape)
    break
for x,y in val_loader:
    print(x.shape)
    print(y.shape)
    break
for x,y in test_loader:
    print(x.shape)
    print(y.shape)
    break

torch.Size([5, 250, 300])
torch.Size([5])
torch.Size([3750, 250, 300])
torch.Size([3750])
torch.Size([3750, 250, 300])
torch.Size([3750])


In [126]:
class NaiveCustomLSTM(nn.Module):
    def __init__(self, input_sz: int, hidden_sz: int):
        super().__init__()
        self.input_size = input_sz
        self.hidden_size = hidden_sz
        
        #i_t
        self.U_i = nn.Parameter(torch.Tensor(input_sz, hidden_sz))
        self.V_i = nn.Parameter(torch.Tensor(hidden_sz, hidden_sz))
        self.b_i = nn.Parameter(torch.Tensor(hidden_sz))
        
        #f_t
        self.U_f = nn.Parameter(torch.Tensor(input_sz, hidden_sz))
        self.V_f = nn.Parameter(torch.Tensor(hidden_sz, hidden_sz))
        self.b_f = nn.Parameter(torch.Tensor(hidden_sz))
        
        #c_t
        self.U_c = nn.Parameter(torch.Tensor(input_sz, hidden_sz))
        self.V_c = nn.Parameter(torch.Tensor(hidden_sz, hidden_sz))
        self.b_c = nn.Parameter(torch.Tensor(hidden_sz))
        
        #o_t
        self.U_o = nn.Parameter(torch.Tensor(input_sz, hidden_sz))
        self.V_o = nn.Parameter(torch.Tensor(hidden_sz, hidden_sz))
        self.b_o = nn.Parameter(torch.Tensor(hidden_sz))
        
        self.init_weights()
    
    #initializing the weights
    def init_weights(self):
        stdv = 1.0 / math.sqrt(self.hidden_size)
        for weight in self.parameters():
            weight.data.uniform_(-stdv, stdv)
            
    def forward(self, x, init_states = None):               
       # assumes x.shape represents (batch_size, sequence_size, input_size)       
        print("AAAAAAAAA")
        print(x.size())
        print("BBBBBBBBBBBB")
        bs, seq_sz,_ = x.size()
        print("bs=")
        print(bs)
        print("seq_sz=")
        print(seq_sz)
        print("x[0]=")
        print(x[0])
        print(x[0].size())
        hidden_seq = []
        
        if init_states is None:
            h_t, c_t = (
                torch.zeros(bs, self.hidden_size).to(x.device),
                torch.zeros(bs, self.hidden_size).to(x.device),
                        )
        else:
            h_t, c_t = init_states
            
        for t in range(seq_sz):
            x_t = x[:, t]
            
            i_t = torch.sigmoid(x_t * self.U_i + h_t * self.V_i + self.b_i)
            f_t = torch.sigmoid(x_t * self.U_f + h_t * self.V_f + self.b_f)
            g_t = torch.tanh(x_t * self.U_c + h_t * self.V_c + self.b_c)
            o_t = torch.sigmoid(x_t * self.U_o + h_t * self.V_o + self.b_o)
            c_t = f_t * c_t + i_t * g_t
            h_t = o_t * torch.tanh(c_t)
            
            hidden_seq.append(h_t.unsqueeze(0))
        
        #reshape hidden_seq p/ retornar
        hidden_seq = torch.cat(hidden_seq, dim=0)
        hidden_seq = hidden_seq.transpose(0, 1).contiguous()
        return hidden_seq, (h_t, c_t)

In [127]:
# Define Optimizer and Loss Function
LSTM_model = NaiveCustomLSTM(input_sz= 250, hidden_sz= 300)
optimizer = torch.optim.SGD(LSTM_model.parameters(), lr=0.05)
loss_func = torch.nn.MSELoss()
print(optimizer)
print(loss_func)

SGD (
Parameter Group 0
    dampening: 0
    lr: 0.05
    maximize: False
    momentum: 0
    nesterov: False
    weight_decay: 0
)
MSELoss()


In [128]:
def calc_loss(inputs, labels, device, model):
    # if training on gpu
    inputs, labels = inputs.to(device), labels.to(device)
        
    # zero accumulated gradients
    model.zero_grad()

    # get the output from the model
    # x.size() -> [batch_size]
    batch_size = inputs.size(0)
        
    # IMPORTANT - change the dimensions of x before it enters the NN, batch size must always be first
   # x = inputs.unsqueeze(0)         # x.size() -> [1, batch_size]
    #x = x.view(batch_size, -1)      # x.size() -> [batch_size, 1]
    predictions = model(x)

    loss = loss_func(predictions.squeeze(), labels.float())
    
    return loss, predictions

In [129]:
def train_model(epochs, train_loader, model, optimizer):
    model.to(device)
    counter = 0
    for e in range(epochs):
        # batch loop
        for inputs, labels in train_loader:
            optimizer.zero_grad()
            
            loss, predictions = calc_loss(inputs, labels, device, model)
            # Try not clipping
            loss.backward()

            # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
            nn.utils.clip_grad_norm_(model.parameters(), clip)

            # Doing the optimizer step
            optimizer.step()

            # Actually training the net
            
            # Print weights
            #print(model[0].weight)
            # loss stats
            if counter % print_every == 0:
                fig, mean_loss = plot_predictions(valid_loader, model)
                print(f"Epoch: {e + 1}/{epochs}...\
                        \nStep: {counter}...\
                        \nVal Loss: {mean_loss}")         
                fig.show()
            counter += batch_size


In [130]:
def plot_predictions(valid_loader ,model):
    val_losses = []
    fig = plt.figure()
    ax = fig.add_subplot()
    
    # Get validation loss
    model.eval()
    for inputs, labels in valid_loader:
        # Creating new variables for the hidden state, otherwise
        # we'd backprop through the entire training history
        order = np.argsort(inputs[:, 0])
        sorted_inputs = inputs[order]
        sorted_labels = labels[np.argsort(labels)]
        val_loss, val_predictions = calc_loss(inputs, labels, device, model)
                
        val_losses.append(val_loss.item())
        
        # plot and show learning process
        ax.cla()
        ax.scatter(sorted_inputs[:,0].cpu().data.numpy(), sorted_labels.cpu().data.numpy())
        ax.plot(sorted_inputs[:,0].cpu().data.numpy(), val_predictions[order].cpu().data.numpy().squeeze(), 'r-')
        plt.pause(0.1)
    ax.text(0.5, 0, 'Loss=%.4f' % np.mean(val_losses), fontdict={'size': 10, 'color':  'red'})
    return fig, np.mean(val_losses)

In [131]:
# Define training params
epochs = 4
counter = 0
print_every = 2500
clip = 1000 # gradient clipping

In [132]:
#train the model
train_model(epochs, train_loader, LSTM_model, optimizer)

AAAAAAAAA
torch.Size([3750, 250, 300])
BBBBBBBBBBBB
bs=
3750
seq_sz=
250
x[0]=
tensor([[-0.1329,  0.1699, -0.1436,  ..., -0.2378,  0.1477,  0.6290],
        [-0.0983,  0.5568,  0.5399,  ...,  0.2752, -0.1221,  0.0906],
        [-0.2044,  0.1643,  0.0418,  ..., -0.3401, -0.0771, -0.0841],
        ...,
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]],
       dtype=torch.float64)
torch.Size([250, 300])


RuntimeError: The size of tensor a (3750) must match the size of tensor b (250) at non-singleton dimension 0