In [6]:
import torch
import numpy as np
import torch.nn as nn
import math
import os
import pandas as pd
import torch.optim as optim
import torch.nn.functional as F
import json

In [8]:
class NGramLanguageModeler(nn.Module):
    """Feed-forward n-gram language model built on learned word embeddings.

    Each context word id is looked up independently in the embedding table;
    the resulting vectors are laid side by side and passed through a
    128-unit ReLU hidden layer and a linear layer that scores every
    vocabulary entry.

    Args:
        vocab_size: number of rows in the embedding table and number of
            output scores.
        embedding_dim: length of each word's embedding vector.
        context_size: number of context word ids expected per example.
    """

    def __init__(self, vocab_size, embedding_dim, context_size):
        super(NGramLanguageModeler, self).__init__()
        self.embedding_dim = embedding_dim
        # The lookup handles every word id on its own; words are only
        # combined later, when their vectors are flattened for linear1.
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        # Input is the concatenation of all context embeddings.
        self.linear1 = nn.Linear(context_size * embedding_dim, 128)
        self.linear2 = nn.Linear(128, vocab_size)
        # word -> id mapping; starts empty and is meant to be assigned by the caller.
        self.tokens = {}

    def forward(self, inputs):
        """Return log-probabilities over the vocabulary for the next word.

        Args:
            inputs: LongTensor of word ids, either a single context of shape
                ``(context_size,)`` or a batch of shape
                ``(batch, context_size)``.

        Returns:
            Tensor of shape ``(1, vocab_size)`` for a single context, or
            ``(batch, vocab_size)`` for a batch.
        """
        # A 0-D/1-D input is one example; otherwise keep the leading batch
        # dimension instead of collapsing the whole batch into one row.
        batch_size = inputs.shape[0] if inputs.dim() > 1 else 1
        # Flatten each example's context embeddings into one vector.
        embeds = self.embeddings(inputs).view((batch_size, -1))
        out = F.relu(self.linear1(embeds))
        out = self.linear2(out)
        # Log-softmax over the vocabulary axis.
        log_probs = F.log_softmax(out, dim=1)
        return log_probs

    def multiply_embedding_weights(self):
        """Scale the embedding table in place by ``sqrt(embedding_dim)``.

        Every call multiplies again, so calling it twice scales the weights
        by ``embedding_dim``.
        """
        # In-place update without autograd tracking; avoids rebinding `.data`.
        with torch.no_grad():
            self.embeddings.weight.mul_(self.embedding_dim ** 0.5)
    


In [9]:
# Path to the mental-health conversations CSV, expected in ~/Downloads.
file_path = os.path.join(os.path.expanduser("~"), "Downloads", "mental_health.csv")
orig_dataset = pd.read_csv(file_path)  # load the CSV as a DataFrame
orig_dataset = orig_dataset.to_numpy()  # one array of cell values per row
print("Dataset size: "+ str(len(orig_dataset)))
# Draw 1100 training rows. NOTE: replace=True samples WITH replacement, so the
# same row can appear more than once, and no seed is set, so the subset (and
# everything derived from it) differs between runs.
dataset = orig_dataset[np.random.choice(orig_dataset.shape[0], size=1100, replace=True)]
print("Train Dataset size: "+ str(len(dataset)))
minFreq = {}  # word -> number of times seen so far in the training rows
dictionary = {}  # word -> integer id, only for words seen often enough
index = 5  # next id to hand out; ids 0-4 are never assigned to a word here
for row in dataset:
    for column in range(2):  # only the first two columns are read
        text = row[column]
        # Skip anything that is not text. Missing cells arrive as float NaN;
        # checking for str also protects .split() from other non-text values.
        if not isinstance(text, str):
            continue
        for word in text.split():
            seen_before = minFreq.get(word, 0)
            # A word is admitted on its 4th occurrence, i.e. once it has
            # already been seen exactly 3 times.
            if seen_before == 3:
                dictionary[word] = index
                index += 1
            minFreq[word] = seen_before + 1

# NOTE: this total is a hard-coded figure; it is not computed in this cell.
print( "Total Dictionary Size: 10,489")
# `index` is the next free id, i.e. len(dictionary) + 5.
print("Training Dictionary Size: " + str(index))

CONTEXT_SIZE = 3  # number of preceding words used to predict the current word
EMBEDDING_DIM = 252  # length of each word's embedding vector
# Each entry is (context_words, target_word), with the context listed nearest
# word first, e.g. (['through', 'going', "I'm"], 'some').
all_ngrams = []
for example in range(len(dataset)):
    for cont_response in range(2):  # first two columns of each row
        text = dataset[example][cont_response]
        # Skip anything that is not text. Missing cells arrive as float NaN;
        # checking for str also protects .split() from other non-text values.
        if not isinstance(text, str):
            continue
        words = text.split()  # whitespace tokenisation
        # One n-gram per position that has CONTEXT_SIZE words before it;
        # texts with CONTEXT_SIZE words or fewer contribute nothing.
        all_ngrams.extend(
            ([words[pos - back - 1] for back in range(CONTEXT_SIZE)], words[pos])
            for pos in range(CONTEXT_SIZE, len(words))
        )
loss_function = nn.NLLLoss()  # negative log-likelihood on the model's log-probabilities
# `index` (the next free word id) is used as the vocabulary size.
model = NGramLanguageModeler(index, EMBEDDING_DIM, CONTEXT_SIZE)
optimizer = optim.Adam(model.parameters(), lr=0.0001)
model.tokens = dictionary  # keep the word -> id mapping with the model
for epoch in range(25):
    total_loss = 0
    print("Epoch: "+ str(epoch))
    maxFreq = 3  # cap used to limit how often a context word is trained per epoch
    # Per-epoch counters for the words seen in context positions 0 and 1.
    maxFreqDict = {}
    for context, target in all_ngrams:
        # Skip n-grams containing any out-of-vocabulary word. Checking every
        # context word (rather than fixed positions 0..2) keeps this correct
        # if CONTEXT_SIZE is changed.
        if any(word not in dictionary for word in context):
            continue
        if target not in dictionary:
            continue
        # Counters start at 1 the first time a word is seen in position 0 or 1.
        if context[0] not in maxFreqDict:
            maxFreqDict[context[0]] = 1
        if context[1] not in maxFreqDict:
            maxFreqDict[context[1]] = 1
        # Skip only when BOTH of the first two context words have reached the cap.
        # NOTE(review): context[2] is not counted towards the cap - confirm intended.
        if maxFreqDict[context[0]] >= maxFreq and maxFreqDict[context[1]] >= maxFreq:
            continue
        # Record that these two context words are being trained once more.
        maxFreqDict[context[0]] += 1
        maxFreqDict[context[1]] += 1

        # Map the context words to ids and wrap them in a tensor for the model.
        context_idxs = torch.tensor([dictionary[w] for w in context], dtype=torch.long)

        # Gradients accumulate across backward() calls, so clear them first.
        model.zero_grad()

        # Forward pass: log-probabilities over the vocabulary for the next word.
        log_probs = model(context_idxs)

        # Loss against the id of the true target word.
        loss = loss_function(log_probs, torch.tensor([dictionary[target]], dtype=torch.long))

        # Backward pass and parameter update (one example per step).
        loss.backward()
        optimizer.step()
        # .item() extracts the Python number from the 1-element loss tensor.
        total_loss += loss.item()
    print(f"Total_Loss: {total_loss}")





Dataset size: 3512
Train Dataset size: 1100
Total Dictionary Size: 10,489
Training Dictionary Size: 4928
Epoch: 0
Total_Loss: 60804.26291155815
Epoch: 1
Total_Loss: 46571.5169621706
Epoch: 2
Total_Loss: 41252.384687781334
Epoch: 3
Total_Loss: 36316.487325042486
Epoch: 4
Total_Loss: 30948.29644267261
Epoch: 5
Total_Loss: 24833.90383963287
Epoch: 6
Total_Loss: 18685.234041979536
Epoch: 7
Total_Loss: 13621.824801164097
Epoch: 8
Total_Loss: 9793.968410769536
Epoch: 9
Total_Loss: 6935.741658010666
Epoch: 10
Total_Loss: 4771.109002284051
Epoch: 11
Total_Loss: 3156.6321763049145
Epoch: 12
Total_Loss: 1982.497338891295
Epoch: 13
Total_Loss: 1187.9729907933788
Epoch: 14
Total_Loss: 679.8280201242029
Epoch: 15
Total_Loss: 385.161126025614
Epoch: 16
Total_Loss: 229.17182286976077
Epoch: 17
Total_Loss: 161.37878057601776
Epoch: 18
Total_Loss: 112.5939684582404
Epoch: 19
Total_Loss: 106.8165105948903
Epoch: 20
Total_Loss: 89.95412244207483
Epoch: 21
Total_Loss: 84.07676995928219
Epoch: 22
Total_Los

In [10]:
# Scale the embedding table by sqrt(embedding_dim) exactly once per model.
# multiply_embedding_weights() multiplies on every call, so re-running this
# cell unguarded would scale the weights again and save a wrong checkpoint.
# The flag lives on the model instance, so a freshly built model is scaled anew.
if not getattr(model, "_embeddings_scaled", False):
    model.multiply_embedding_weights()
    model._embeddings_scaled = True
# Save all parameters (scaled embeddings included) to ./embedding_model.
torch.save(model.state_dict(), "embedding_model")

In [11]:
# Show the embedding vector stored for id 0 (result has shape (1, embedding_dim)).
# Word ids handed out while building `dictionary` start at 5, so id 0 does not
# correspond to any vocabulary word.
# NOTE(review): values reflect the sqrt(embedding_dim) scaling only if the
# scaling cell has already been run on this model - confirm execution order.
print(model.embeddings(torch.tensor([0])))

tensor([[-27.3289,   0.5755,   7.6977,  -1.7063,  11.6704, -16.0344, -13.8994,
          -5.5939,  23.0890,  -4.3854,  -8.6859,  10.1936,  24.3662, -28.8604,
         -10.1478, -15.4639, -33.8165, -29.0961, -15.7963,   4.3913, -10.6751,
          -5.4325,  -1.3452, -46.0790,  40.6227,   8.9821,  -8.2570,  -4.0520,
          -2.2921, -19.6806,  20.7959,  17.1168,  13.2371,  -6.3065, -14.8282,
          -6.2137,   1.8696,  25.3100,   1.8748,  -5.2926,  11.0353,  10.8461,
          -9.0835,  -9.5109,   4.8956,  11.1274, -32.6840, -25.0068,   3.6075,
          -4.2316, -31.4773, -15.6223,  13.5155,  -9.6214, -46.1658,  -3.8901,
          25.2988,  -2.0704,  13.7544, -32.0602,  -5.4711,  -6.2360,  -0.6747,
         -14.0538,   0.8529,  12.0244,  34.4057,  -2.3645,  11.3389,   3.8502,
          -0.3880,  14.1577,  15.6571,   1.5358,  -1.1155,  -5.0325,   2.5739,
          -3.1278,  14.8901,  15.7941,  19.9152,   4.4657,  27.4171,  12.8808,
           5.6889,   5.4184, -10.4300,  10.7177,   9

In [12]:
# Write the word -> id mapping held on the model to ./tokens.txt as JSON.
# The context manager guarantees the file is flushed and closed.
with open("tokens.txt", 'w') as tokens_file:
    json.dump(model.tokens, tokens_file)

In [13]:
import torch
import numpy as np
import torch.nn as nn
import math
import os
import pandas as pd
import torch.optim as optim
import torch.nn.functional as F
import json


avg_res = 0  # running total of words in every column after the first
avg_cont = 0  # running total of words in the first column
t =0  # number of rows visited (rows with missing text are still counted)

# Path to the mental-health conversations CSV, expected in ~/Downloads.
file_path = os.path.join(os.path.expanduser("~"), "Downloads", "mental_health.csv")
orig_dataset = pd.read_csv(file_path)  # load the CSV as a DataFrame
orig_dataset =  orig_dataset.to_numpy()
orig_dataset = orig_dataset.tolist()  # NOTE: rebinds orig_dataset to a plain list of rows
for example in orig_dataset:
    # Decide which total a cell belongs to by its column POSITION. Counting
    # only the non-missing cells would book the second column under the first
    # column's total whenever the first cell of a row is missing.
    for column, r_c in enumerate(example):
        # Skip anything that is not text; missing cells arrive as float NaN.
        if not isinstance(r_c, str):
            continue
        if column == 0:
            avg_cont+=len(r_c.split())
        else:
            avg_res+=len(r_c.split())
    t+=1
# Mean words per row for the first column, then for the remaining column(s);
# both divide by the total row count.
print(avg_cont/t)
print(avg_res/t)

55.18080865603645
177.001993166287


In [19]:
# Look up embeddings for a 2x3 grid of ids; the lookup keeps the index shape
# and appends the embedding axis, so the result is (2, 3, embedding_dim).
# Ids 0-4 are never assigned to a word when `dictionary` is built (ids start
# at 5), so only id 5 here can correspond to a vocabulary word.
print(model.embeddings(torch.tensor([[0, 1, 2], [3,4,5]])))


tensor([[-2.7329e+01,  5.7545e-01,  7.6977e+00, -1.7063e+00,  1.1670e+01,
         -1.6034e+01, -1.3899e+01, -5.5939e+00,  2.3089e+01, -4.3854e+00,
         -8.6859e+00,  1.0194e+01,  2.4366e+01, -2.8860e+01, -1.0148e+01,
         -1.5464e+01, -3.3817e+01, -2.9096e+01, -1.5796e+01,  4.3913e+00,
         -1.0675e+01, -5.4325e+00, -1.3452e+00, -4.6079e+01,  4.0623e+01,
          8.9821e+00, -8.2570e+00, -4.0520e+00, -2.2921e+00, -1.9681e+01,
          2.0796e+01,  1.7117e+01,  1.3237e+01, -6.3065e+00, -1.4828e+01,
         -6.2137e+00,  1.8696e+00,  2.5310e+01,  1.8748e+00, -5.2926e+00,
          1.1035e+01,  1.0846e+01, -9.0835e+00, -9.5109e+00,  4.8956e+00,
          1.1127e+01, -3.2684e+01, -2.5007e+01,  3.6075e+00, -4.2316e+00,
         -3.1477e+01, -1.5622e+01,  1.3515e+01, -9.6214e+00, -4.6166e+01,
         -3.8901e+00,  2.5299e+01, -2.0704e+00,  1.3754e+01, -3.2060e+01,
         -5.4711e+00, -6.2360e+00, -6.7468e-01, -1.4054e+01,  8.5287e-01,
          1.2024e+01,  3.4406e+01, -2.