In [17]:
import torch
import numpy as np
import torch.nn as nn
import math
import os
import pandas as pd
import torch.optim as optim
import torch.nn.functional as F
import json

In [18]:
class NGramLanguageModeler(nn.Module):
    """Feed-forward n-gram language model that learns word embeddings.

    Every context word id is looked up in an embedding table, the context's
    embeddings are flattened into a single vector, passed through a 128-unit
    hidden layer with ReLU, and projected to log-probabilities over the
    vocabulary.

    Args:
        vocab_size: Number of rows in the embedding table and width of the
            output distribution.
        embedding_dim: Length of each word's embedding vector.
        context_size: Number of context words supplied per example.

    Attributes:
        tokens: Word -> id mapping. Starts as an empty dict; the model itself
            never reads it, it is only a place for the caller to attach the
            vocabulary.
    """

    def __init__(self, vocab_size, embedding_dim, context_size):
        super(NGramLanguageModeler, self).__init__()
        # The embedding layer looks each word up independently; words are not
        # mixed with each other until the linear layers below.
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        # linear1 consumes the embeddings of all context words laid end to end.
        self.linear1 = nn.Linear(context_size * embedding_dim, 128)
        self.linear2 = nn.Linear(128, vocab_size)
        self.tokens = {}

    def forward(self, inputs):
        """Return next-word log-probabilities for one context or a batch.

        Args:
            inputs: Integer tensor of word ids, either 1-D (one context of
                ``context_size`` ids) or 2-D ``(batch, context_size)``.

        Returns:
            Log-softmax scores of shape ``(1, vocab_size)`` for a 1-D input,
            or ``(batch, vocab_size)`` for a 2-D input.
        """
        embeds = self.embeddings(inputs)
        # A 1-D input is a single example and keeps the original (1, -1)
        # flattening. A 2-D input is flattened row by row so that each example
        # keeps its own context vector instead of the whole batch being merged
        # into one vector that linear1 cannot accept.
        rows = inputs.shape[0] if inputs.dim() > 1 else 1
        embeds = embeds.view((rows, -1))
        # ReLU non-linearity on the hidden layer.
        out = F.relu(self.linear1(embeds))
        out = self.linear2(out)
        # Log-probabilities over the whole vocabulary, one row per example.
        log_probs = F.log_softmax(out, dim=1)
        return log_probs

    


In [19]:
# Path to mental_health.csv, expected in the current user's Downloads folder.
file_path = os.path.join(os.path.expanduser("~"), "Downloads", "mental_health.csv")
# Read the CSV with pandas and keep only the resulting NumPy array of rows.
orig_dataset = pd.read_csv(file_path).to_numpy()
print("Dataset size: " + str(orig_dataset.shape[0]))
# Only the first 100 rows are used for training.
dataset = orig_dataset[:100]
print("Train Dataset size: " + str(dataset.shape[0]))
# minFreq counts how many times each whitespace-separated word has been seen so far.
minFreq = {}
# dictionary maps every sufficiently frequent word to its integer id.
dictionary = {}
# Ids are handed out starting at 2, so ids 0 and 1 are never assigned to a word here.
index = 2
for example in range(len(dataset)):
    # Only the first two columns of each row are scanned.
    for cont_response in range(2):
        text = dataset[example][cont_response]
        # Skip every cell that is not text. The previous `type(...) == float` test
        # only caught plain-float NaN and let other non-string values reach .split().
        if not isinstance(text, str):
            continue
        for word in text.split():
            seen = minFreq.get(word, 0)
            # A word receives its id when it shows up after 3 earlier sightings,
            # i.e. on its 4th occurrence; ids follow the order in which words
            # cross that threshold.
            if seen == 3:
                dictionary[word] = index
                index += 1
            minFreq[word] = seen + 1

# NOTE(review): this figure is hard-coded, not computed in this cell -- presumably
# taken from a run over the full dataset; confirm it is still accurate.
print( "Total Dictionary Size: 10,489")
# `index` is the next free id, i.e. len(dictionary) + 2 because ids start at 2.
print("Training Dictionary Size: " + str(index))

CONTEXT_SIZE = 3  # number of preceding words used to predict the current word
EMBEDDING_DIM = 252  # length of each word's embedding vector
# Every entry is (context, target): context holds the CONTEXT_SIZE words that
# precede the target, nearest word first, e.g. (['through', 'going', "I'm"], 'some').
all_ngrams = []
for example in range(len(dataset)):
    for cont_response in range(2):  # first two columns of the row: context, then response
        text = dataset[example][cont_response]
        # Skip every cell that is not text. The previous `type(...) == float` test
        # only caught plain-float NaN and let other non-string values reach .split().
        if not isinstance(text, str):
            continue
        cur_Sentence = text.split()  # whitespace tokenisation
        # Sentences with CONTEXT_SIZE words or fewer yield an empty range and
        # therefore contribute no n-grams.
        ngrams = [
            ([cur_Sentence[i - j - 1] for j in range(CONTEXT_SIZE)], cur_Sentence[i])
            for i in range(CONTEXT_SIZE, len(cur_Sentence))
        ]
        all_ngrams.extend(ngrams)
# Seed PyTorch's RNG so the embedding and linear-layer weights are initialised
# identically on every Restart-and-Run-All.
torch.manual_seed(0)
loss_function = nn.NLLLoss()  # negative log-likelihood; the model already outputs log_softmax
# `index` is one past the highest id handed out, so it is used as the vocabulary size.
model = NGramLanguageModeler(index, EMBEDDING_DIM, CONTEXT_SIZE)
optimizer = optim.Adam(model.parameters(), lr=0.0001)
# Attach the word -> id mapping to the model (same dict object, not a copy).
model.tokens = dictionary
# Throttle threshold: once the tracked context words of an n-gram have all reached
# this count within an epoch, further n-grams with those words are skipped.
maxFreq = 3
for epoch in range(10):
    total_loss = 0
    print("Epoch: "+ str(epoch))
    # Per-epoch counters for the throttle; reset at the start of every epoch.
    maxFreqDict = {}
    for context, target in all_ngrams:
        # Skip the example if the target or any context word has no id. Checking
        # every context word (instead of indexing positions 0..2) keeps this
        # correct for any context length.
        if target not in dictionary or not all(w in dictionary for w in context):
            continue

        # Only the first two context words (the two nearest the target) are
        # counted by the throttle.
        tracked = context[:2]
        for w in tracked:
            maxFreqDict.setdefault(w, 1)
        # Skip when every tracked word has already reached maxFreq this epoch.
        if all(maxFreqDict[w] >= maxFreq for w in tracked):
            continue
        for w in tracked:
            maxFreqDict[w] += 1

        # Convert the context words to their ids, wrapped in a tensor for the model.
        context_idxs = torch.tensor([dictionary[w] for w in context], dtype=torch.long)

        # Gradients accumulate across backward() calls, so clear them first.
        model.zero_grad()

        # Forward pass: log-probabilities over the vocabulary for the next word.
        log_probs = model(context_idxs)

        # Negative log-likelihood of the true target word.
        loss = loss_function(log_probs, torch.tensor([dictionary[target]], dtype=torch.long))

        # Backward pass and parameter update.
        loss.backward()
        optimizer.step()
        # .item() extracts the Python number from the 1-element loss tensor.
        total_loss += loss.item()
    print(f"Total_Loss: {total_loss}")





Dataset size: 3512
Train Dataset size: 100
Total Dictionary Size: 10,489
Training Dictionary Size: 586
Epoch: 0
Total_Loss: 5931.0312304496765
Epoch: 1
Total_Loss: 4418.207268476486
Epoch: 2
Total_Loss: 3554.61175686121
Epoch: 3
Total_Loss: 2969.3809459507465
Epoch: 4
Total_Loss: 2433.111342936754
Epoch: 5
Total_Loss: 1901.7123607248068
Epoch: 6
Total_Loss: 1392.3067227303982
Epoch: 7
Total_Loss: 951.2950777262449
Epoch: 8
Total_Loss: 616.0986571870744
Epoch: 9
Total_Loss: 386.4205076172948


In [20]:
# Persist only the model's parameters (its state_dict) to the file "embedding_model".
torch.save(obj=model.state_dict(), f="embedding_model")

In [23]:
# Show the embedding row for id 1. Word ids are handed out starting at 2, so id 1
# does not correspond to any word in the dictionary.
print(model.embeddings(torch.tensor([1], dtype=torch.long)))

tensor([[-0.3156, -1.6006, -0.6120,  2.2860, -1.5039,  0.4343,  2.2365, -1.1154,
          2.8599,  1.9770, -0.6094, -0.4991, -1.1724,  1.1033,  0.7778, -0.9378,
         -1.7616,  0.7905,  0.3753,  0.3964, -0.1820,  0.4403,  1.5512, -0.3941,
          0.3050,  0.3628, -0.3021, -0.1019, -0.3420,  0.3081,  0.8020,  0.2249,
          1.8731, -1.4247, -0.6094, -0.3284,  0.6729, -0.3771, -0.4615,  0.1513,
          0.1155,  1.7498,  1.5085, -0.9044, -1.3142,  0.3005, -1.0747,  1.7163,
         -0.3107, -0.7557,  0.9453, -0.2776, -0.7360,  0.9223, -1.0980, -1.8236,
         -0.3535,  1.6464, -0.6826,  0.0778, -0.5421, -0.3312,  1.6674, -0.5891,
         -1.8353, -1.5515,  0.6684,  0.6346,  0.8659,  0.1948,  0.6650,  0.6090,
          0.0421, -0.9157, -0.7937,  0.1100, -0.4822, -0.0822, -1.2967,  0.2717,
         -0.8447,  0.4285,  0.4587, -1.6312,  1.5715, -0.0614, -0.0248,  1.4613,
         -0.2493, -0.5492, -0.6738, -0.1331, -0.1685,  1.0556, -1.2171,  1.0258,
         -0.7952, -1.0754, -

In [22]:
# Write the word -> id mapping as JSON. The context manager guarantees the file is
# flushed and closed, which the bare open(...) call did not.
with open("tokens.txt", 'w') as tokens_file:
    json.dump(model.tokens, tokens_file)