In [39]:
import os
import pandas as pd
from pathlib import Path

import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

torch.manual_seed(1)

<torch._C.Generator at 0x2c39473ad90>

The following codebase implements a deep learning framework in PyTorch following Robert Guthrie's Pytorch tutorial at https://github.com/rguthrie3/DeepLearningForNLPInPytorch

In [40]:
#Read in train data.
data_folder = Path(os.getcwd()).parents[1].joinpath('data')
train_df = pd.read_csv(data_folder.joinpath('train_clean.csv'), index_col=0)
train_df.head()

Unnamed: 0,id,keyword,location,text,target,valid_location
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,0
1,4,,,Forest fire near La Ronge Sask. Canada,1,0
2,5,,,All residents asked to 'shelter in place' are ...,1,0
3,6,,,"13,000 people receive #wildfires evacuation or...",1,0
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,0


In [81]:
test_df = pd.read_csv(data_folder.joinpath('test.csv'), index_col=0)
test_df.reset_index(inplace=True)
test_df.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [42]:
#Preparing the data

# word_to_ix maps each word in the vocab to a unique integer, which will be its
# index into the Bag of words vector
word_to_ix = {}
for sent in train_df['text']:
    for word in sent.split():
        word = word.lower()
        if word.isalnum():
            if word not in word_to_ix:
                word_to_ix[word] = len(word_to_ix)

VOCAB_SIZE = len(word_to_ix)
NUM_LABELS = 2

In [70]:
list1 = ['a','b','x']
dict1 = {'a':1, 'b':2, 'c':3, 'd':4}
list(set(list1) & set(dict1.keys()))

['b', 'a']

In [75]:
def make_bow_vector(sentence, word_to_ix, mode='train'):
    vec = torch.zeros(len(word_to_ix))
    
    if mode=='train':
        word_list = sentence.split()
    else:
        word_list = [word.lower() for word in sentence.split()]
        word_list = list(set(word_to_ix.keys()) & set(word_list))
    for word in word_list:
        word = word.lower()
        if word.isalnum():
            vec[word_to_ix[word]] += 1
    return vec.view(1, -1)

def make_target(label, label_to_ix):
    return torch.LongTensor([label_to_ix[label]])

In [48]:
class BoWClassifier(nn.Module): # inheriting from nn.Module!
    
    def __init__(self, num_labels, vocab_size):
        # calls the init function of nn.Module.  Dont get confused by syntax,
        # just always do it in an nn.Module
        super(BoWClassifier, self).__init__()
        
        # Define the parameters that you will need.  In this case, we need A and b,
        # the parameters of the affine mapping.
        # Torch defines nn.Linear(), which provides the affine map.
        # Make sure you understand why the input dimension is vocab_size
        # and the output is num_labels!
        self.linear = nn.Linear(vocab_size, num_labels)
        
        # NOTE! The non-linearity log softmax does not have parameters! So we don't need
        # to worry about that here
        
    def forward(self, bow_vec):
        # Pass the input through the linear layer,
        # then pass that through log_softmax.
        # Many non-linearities and other functions are in torch.nn.functional
        return F.log_softmax(self.linear(bow_vec))

In [49]:
model = BoWClassifier(NUM_LABELS, VOCAB_SIZE)

# the model knows its parameters.  The first output below is A, the second is b.
# Whenever you assign a component to a class variable in the __init__ function of a module,
# which was done with the line
# self.linear = nn.Linear(...)
# Then through some Python magic from the Pytorch devs, your module (in this case, BoWClassifier)
# will store knowledge of the nn.Linear's parameters
for param in model.parameters():
    print(param)

Parameter containing:
tensor([[-8.7257e-05, -5.0000e-04,  6.6152e-04,  ..., -4.1416e-03,
         -1.7988e-03, -3.5482e-03],
        [-3.9349e-03, -3.2750e-03,  1.4352e-03,  ..., -1.2616e-03,
         -1.3984e-03,  5.2349e-05]], requires_grad=True)
Parameter containing:
tensor([ 0.0062, -0.0006], requires_grad=True)


In [50]:
# To run the model, pass in a BoW vector, but wrapped in an autograd.Variable
sample = train_df['text'][1]
bow_vector = make_bow_vector(sample, word_to_ix)
log_probs = model(autograd.Variable(bow_vector))
print(log_probs)

tensor([[-0.6805, -0.7059]], grad_fn=<LogSoftmaxBackward0>)


  return F.log_softmax(self.linear(bow_vec))


In [51]:
bow_vector.shape

torch.Size([1, 11792])

In [52]:
label_to_ix = { 0: 0, 1: 1 }

In [53]:
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

In [54]:
for epoch in range(100):
    for instance, label in zip(train_df['text'],train_df['target']):
        # Step 1. Remember that Pytorch accumulates gradients.  We need to clear them out
        # before each instance
        model.zero_grad()
    
        # Step 2. Make our BOW vector and also we must wrap the target in a Variable
        # as an integer.  For example, if the target is SPANISH, then we wrap the integer
        # 0.  The loss function then knows that the 0th element of the log probabilities is
        # the log probability corresponding to SPANISH
        bow_vec = autograd.Variable(make_bow_vector(instance, word_to_ix))
        target = autograd.Variable(make_target(label, label_to_ix))
        #target = label
        # Step 3. Run our forward pass.
        log_probs = model(bow_vec)
    
        # Step 4. Compute the loss, gradients, and update the parameters by calling
        # optimizer.step()
        loss = loss_function(log_probs, target)
        loss.backward()
        optimizer.step()

  return F.log_softmax(self.linear(bow_vec))


In [82]:
pred_list = []
for instance in test_df['text']:
    bow_vec = autograd.Variable(make_bow_vector(instance, word_to_ix, mode='test'))
    log_probs = model(bow_vec)
    pred_list.append((int(torch.argmax(log_probs))))

  return F.log_softmax(self.linear(bow_vec))


In [83]:
test_df['target'] = pred_list
test_df[['id','target']].to_csv(data_folder.joinpath('submission_26102024.csv'))