In [14]:
import re                                  # library for regular expression operations
import string                              # for string operations

import nltk
from nltk.tokenize import word_tokenize

from nltk.corpus import stopwords          # module for stop words that come with NLTK
from nltk.stem import PorterStemmer        # module for stemming
from nltk.tokenize import TweetTokenizer   # module for tokenizing strings

# Download the NLTK stop-word corpus (the call reports "already up-to-date" when it is present).
# NOTE(review): this notebook also calls word_tokenize, which presumably needs its own NLTK
# tokenizer resource ('punkt' / 'punkt_tab'); it is not downloaded here — confirm that the
# tokenization cells run in a fresh environment.
nltk.download('stopwords')
# One shared Porter stemmer instance, used by stem().
stemmer = PorterStemmer()


import numpy as np
from collections import defaultdict, Counter


import torch
import torch.nn as nn
import torch.optim as optim

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kolvi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [15]:
def wordTockenize(sentence):
    """Split ``sentence`` into tokens with NLTK's ``word_tokenize`` and return them."""
    return word_tokenize(sentence)

In [16]:
# Characters treated as punctuation. The backslash is escaped explicitly; the value is the
# same as before, but the literal no longer relies on the invalid escape sequence "\]".
# Note that the double quote character is not part of this set.
punct = "!#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"


def removePunctandStopWords(tokens):
    """Drop English stop words and punctuation-only tokens.

    Args:
        tokens: iterable of string tokens.

    Returns:
        A new list holding the surviving tokens in their original order and
        with their original casing (only the stop-word lookup is
        case-insensitive).
    """
    # Build both lookup sets once per call instead of re-reading the
    # stop-word list for every single token.
    stop_words = set(stopwords.words('english'))
    punct_chars = set(punct)

    kept = []
    for token in tokens:
        if token.lower() in stop_words:
            continue
        # Character-wise test: a token is dropped when it consists solely of
        # punctuation characters, so multi-character tokens such as "..." or
        # "--" are removed too (a substring test against `punct` missed them).
        # all() is True for an empty token, which is therefore dropped as well.
        if all(char in punct_chars for char in token):
            continue
        kept.append(token)

    return kept



In [17]:

def stem(tokens):
    """Replace every token by its stem, using the shared ``stemmer``.

    The sequence passed in is modified in place and is also returned.
    """
    for position in range(len(tokens)):
        tokens[position] = stemmer.stem(tokens[position])
    return tokens

In [18]:
def build_freqs(tokens):
    """Count how often each token occurs.

    Args:
        tokens: iterable of already tokenized / preprocessed words.

    Returns:
        A ``defaultdict`` created without a default factory (so a missing key
        still raises ``KeyError``) mapping each word to its number of
        occurrences, with keys in first-seen order.
    """
    counts = defaultdict()
    for token in tokens:
        counts[token] = counts.get(token, 0) + 1
    return counts



sentence = "String should be tokenized. String must be stemmed. String must not have punctuations"

# Run the demo sentence through the whole pipeline: tokenize -> filter -> stem
tokens = stem(removePunctandStopWords(wordTockenize(sentence)))

# Word -> occurrence count for the preprocessed demo sentence
freq = build_freqs(tokens)

# Last expression of the cell: shown as the cell output
freq.items()


dict_items([('string', 3), ('token', 1), ('must', 2), ('stem', 1), ('punctuat', 1)])

In [19]:
# Toy training corpus, positive class (12 sentences).
# NOTE(review): some sentences contain typos (e.g. "roomates", "footaball"); they are part of
# the training data as written, so correcting them would change the learned frequencies.
postiveSentences = ["I am happy", "Happiness is bliss", "I enjoy life", "I enjoy learning machine learning", "I like to build machine learning models", "I like to play basketball", "I like my colleagues at FEV", "It was a great day", "Live your life for the fullest", "I love my roomates", "I like something", "I like footaball"]

# Toy training corpus, negative class (11 sentences).
negativeSentences = ["I am sad", "I dont not like", "I hate to", "I a worried", "It is slow", "We should stop", 'I dont like machine learning', "I give up", "I hate it", "Sadness is not good", "He is upset"]

In [20]:
# Column vectors of ten 1s and ten 0s.
# NOTE(review): yp / yn are not referenced anywhere else in the visible notebook, and their
# hard-coded length (10) does not match the 12 positive / 11 negative sentences defined above;
# the labels actually used for training are built separately as train_y.
yp = np.ones((10,1))
yn = np.zeros((10,1))

def buildData(listVal):
    """Tokenize, filter and stem one sentence or a collection of sentences.

    Args:
        listVal: either a single sentence (``str``) or an iterable of
            sentences (list, tuple, ...).

    Returns:
        One flat list containing the preprocessed tokens of every sentence,
        in input order.
    """
    # A bare string is one sentence; iterating over it directly would walk
    # its characters, so wrap it. Any other iterable is taken as a collection
    # of sentences (previously only exact `list` instances were).
    sentences = [listVal] if isinstance(listVal, str) else listVal

    processed = []  # deliberately not called `all`, which would shadow the built-in
    for sentence in sentences:
        tokens = wordTockenize(sentence)
        tokens = removePunctandStopWords(tokens)
        processed.extend(stem(tokens))

    return processed



def buildFreqTable(tokens, y):
    """Build a table mapping ``(word, y)`` to the count of ``word`` in ``tokens``.

    Args:
        tokens: iterable of preprocessed words.
        y: label attached to every key of the table.

    Returns:
        A ``defaultdict`` (created without a default factory) keyed by
        ``(word, y)`` tuples.
    """
    word_counts = build_freqs(tokens)
    table = defaultdict()
    for word, count in word_counts.items():
        table[(word, y)] = count
    return table

def extractFeatures(words, freq):
    """Build the ``[bias, positive_count, negative_count]`` feature row for a token list.

    Args:
        words: iterable of preprocessed tokens; a token that occurs several
            times contributes its counts several times.
        freq: mapping from ``(word, label)`` to a count, where label ``1.0``
            is looked up for the positive total and ``0.0`` for the negative
            total. Pairs missing from the mapping contribute nothing.

    Returns:
        A float numpy array of shape ``(1, 3)``: the constant bias ``1``, the
        summed positive counts and the summed negative counts.
    """
    # Allocate the row with its batch dimension already in place.
    features = np.zeros((1, 3))
    features[0, 0] = 1  # bias term

    for word in words:
        features[0, 1] += freq.get((word, 1.0), 0)  # counts seen with the positive label
        features[0, 2] += freq.get((word, 0.0), 0)  # counts seen with the negative label

    assert features.shape == (1, 3)
    return features


In [21]:
# Preprocessed tokens of all positive / all negative sentences, flattened into one list each.
buildDataPos = buildData(postiveSentences)
buildDataNeg = buildData(negativeSentences)

# (word, label) -> count tables; 1.0 marks the positive class, 0.0 the negative class.
labelledDataPos = buildFreqTable(buildDataPos, 1.0)
labelledDataNeg = buildFreqTable(buildDataNeg, 0.0)


allWords = buildDataPos + buildDataNeg
# Merge both tables. Every key carries its label, so the two tables never share a key and the
# Counter addition acts as a union. Wrapping the result in defaultdict(int) makes pairs that
# were never seen read as 0. merged_dict is a second name bound to the same object.
allDictword = merged_dict = defaultdict(int, dict(Counter(labelledDataPos) + Counter(labelledDataNeg)))


# Training sentences (positives first, then negatives) and the matching labels as a column
# vector: one 1 per positive sentence followed by one 0 per negative sentence.
train_x = postiveSentences + negativeSentences
train_y = np.concatenate((np.ones(len(postiveSentences)), np.zeros(len(negativeSentences))), axis=0).reshape(-1, 1)
# NOTE(review): tmp (features of the whole vocabulary) is not used elsewhere in the visible
# notebook, and the trailing `pass` is a no-op — both look like leftovers; confirm before removing.
tmp = extractFeatures(allWords, allDictword)
pass

In [22]:
# Define the neural network class
class NeuralNetwork(nn.Module):
    """Small feed-forward network: 3 inputs -> 4 -> 4 -> 1 output.

    The two hidden layers are followed by ReLU and the single output unit by
    a sigmoid, so the forward pass yields a value between 0 and 1.
    """

    def __init__(self):
        super().__init__()

        # Sub-modules are registered in this order, which is also the order
        # in which print(model) lists them.
        self.fc1 = nn.Linear(in_features=3, out_features=4)  # input features -> first hidden layer
        self.relu = nn.ReLU()                                # activation shared by both hidden layers
        self.fc2 = nn.Linear(in_features=4, out_features=4)  # first hidden -> second hidden layer
        self.fc3 = nn.Linear(in_features=4, out_features=1)  # second hidden layer -> single output
        self.sigmoid = nn.Sigmoid()                          # squashes the output into (0, 1)

    def forward(self, x):
        """Run ``x`` (last dimension of size 3) through the network."""
        hidden = self.relu(self.fc1(x))
        hidden = self.relu(self.fc2(hidden))
        return self.sigmoid(self.fc3(hidden))

# Create the network instance that the training and prediction cells use
model = NeuralNetwork()

# Loss: mean squared error between the sigmoid output and the 0/1 label (a built-in loss).
# NOTE(review): binary cross-entropy (nn.BCELoss) is the more common pairing with a sigmoid
# output for binary classification — confirm that MSE is intended.
criterion = nn.MSELoss()

# Optimizer: plain stochastic gradient descent over all model parameters, learning rate 0.01
optimizer = optim.SGD(model.parameters(), lr=0.01)

# Print the model architecture
print(model)

NeuralNetwork(
  (fc1): Linear(in_features=3, out_features=4, bias=True)
  (relu): ReLU()
  (fc2): Linear(in_features=4, out_features=4, bias=True)
  (fc3): Linear(in_features=4, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)


In [23]:
# Stand-alone illustration of how data could be shuffled with a random permutation

# Small example tensor to shuffle
original_tensor = torch.tensor([1, 2, 3, 4, 5])

# Length of the tensor along its first dimension
num_elements = original_tensor.shape[0]

# Random ordering of the indices 0 .. num_elements - 1
random_indices = torch.randperm(num_elements)

# Index with the permutation to obtain the shuffled copy
shuffled_tensor = original_tensor[random_indices]

# Show the result
print(shuffled_tensor)

tensor([5, 1, 2, 4, 3])


In [24]:
# Feature matrix used for training: one row per training sentence, three columns
# (bias, summed positive counts, summed negative counts) as produced by extractFeatures.
X = np.zeros((len(train_x), 3))
for i in range(len(train_x)):
    # extractFeatures returns a (1, 3) array; the assignment broadcasts it into row i.
    X[i, :]= extractFeatures(buildData(train_x[i]), allDictword)

# Training labels corresponding to the rows of X (Y is another name for the train_y array, not a copy)
Y = train_y


[[1. 0. 0.]]


In [25]:
# Training the data Model
def trainingModel(X, Y, num_epochs=10000):
    """Train the notebook-level ``model`` one sample at a time.

    Every iteration performs a single-sample update with the notebook-level
    ``criterion`` and ``optimizer``. Samples are visited in order and the
    index wraps around to the first row after the last one, so an "epoch"
    here is one update step, not a full pass over the data.

    Args:
        X: indexable collection of feature rows; row ``i`` is fed to the model.
        Y: indexable collection of targets aligned with ``X``.
        num_epochs: number of single-sample update steps to run
            (defaults to 10000, the value that used to be hard-coded).

    Raises:
        ValueError: if ``X`` contains no samples.
    """
    num_samples = len(X)
    if num_samples == 0:
        raise ValueError("trainingModel needs at least one training sample")

    for epoch in range(num_epochs):
        # Wrap around on the length of X itself rather than on the
        # notebook-level train_x, so the function stays correct when it is
        # called with data of a different size.
        index = epoch % num_samples

        input_data = torch.tensor(X[index], dtype=torch.float32)
        target = torch.tensor(Y[index], dtype=torch.float32)

        # Forward pass and loss for this single sample
        predictions = model(input_data)
        loss = criterion(predictions, target)

        # Clear old gradients, backpropagate, update the parameters
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # The printed value is the loss of the current sample only, which is
        # why the log is noisy from one report to the next.
        if (epoch + 1) % 100 == 0:
            print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {loss.item():.4f}')


trainingModel(X, Y)

Epoch [100/10000], Loss: 0.2219
Epoch [200/10000], Loss: 0.2844
Epoch [300/10000], Loss: 0.2288
Epoch [400/10000], Loss: 0.2155
Epoch [500/10000], Loss: 0.2782
Epoch [600/10000], Loss: 0.2250
Epoch [700/10000], Loss: 0.2218
Epoch [800/10000], Loss: 0.2683
Epoch [900/10000], Loss: 0.2201
Epoch [1000/10000], Loss: 0.1820
Epoch [1100/10000], Loss: 0.4264
Epoch [1200/10000], Loss: 0.1221
Epoch [1300/10000], Loss: 0.1371
Epoch [1400/10000], Loss: 0.2189
Epoch [1500/10000], Loss: 0.0381
Epoch [1600/10000], Loss: 0.2083
Epoch [1700/10000], Loss: 0.1752
Epoch [1800/10000], Loss: 0.0455
Epoch [1900/10000], Loss: 0.5995
Epoch [2000/10000], Loss: 0.1157
Epoch [2100/10000], Loss: 0.0164
Epoch [2200/10000], Loss: 0.0825
Epoch [2300/10000], Loss: 0.0714
Epoch [2400/10000], Loss: 0.1250
Epoch [2500/10000], Loss: 0.0516
Epoch [2600/10000], Loss: 0.0982
Epoch [2700/10000], Loss: 0.0217
Epoch [2800/10000], Loss: 0.0301
Epoch [2900/10000], Loss: 0.0369
Epoch [3000/10000], Loss: 0.0684
Epoch [3100/10000],

In [26]:

my_sentence = 'i like'

# Preprocess the sentence, turn it into a (1, 3) feature row and score it with the trained model
x_calc = extractFeatures(buildData(my_sentence), allDictword)
y_hat = model(torch.tensor(x_calc, dtype=torch.float32))
print(y_hat)

# Scores above 0.5 are reported as positive, everything else as negative
print('Positive sentiment' if y_hat > 0.5 else 'Negative sentiment')


tensor([[0.8339]], grad_fn=<SigmoidBackward0>)
Positive sentiment
