In [1]:
import os
import re
import string
import numpy as np
from collections import Counter
from nltk.tokenize import word_tokenize
import pickle

In [2]:
# Regex for ASCII emoticons: one of ":", "=" or ";", then an optional "o", "O"
# or "-", then one closing character from the final class (D ) ] ( / \ O p P).
# NOTE(review): no other code visible in this notebook references this pattern.
# NOTE(review): the final class lists "\]" twice; the second one may have been
# meant to be "\[" — confirm before relying on this pattern.
emoticon_pattern = r'[:=;][oO\-]?[D\)\]\(\]/\\OpP]'

In [3]:
# Function to process text
def process_text(text):
    """Lowercase ``text``, strip HTML-like tags, and split it into tokens.

    Args:
        text: Raw document text.

    Returns:
        list[str]: Tokens in order of appearance. Each token is either a run
        of word characters or a single ASCII punctuation character; whitespace
        and any other characters are discarded.
    """
    text = text.lower()
    # Non-greedy match so each "<...>" span is removed on its own.
    text = re.sub(r'<.*?>', '', text)
    # Escape the punctuation before embedding it in a character class. The raw
    # string contains a backslash followed by "]", which the regex engine would
    # read as an escaped bracket, leaving the backslash itself unmatched.
    punctuation_class = '[' + re.escape(string.punctuation) + ']'
    return re.findall(r'\w+|' + punctuation_class, text)

In [4]:
# Function to create vocabulary
def create_vocab(directory):
    """Count token occurrences across every file under ``directory``.

    Args:
        directory: Root directory; it is traversed recursively with ``os.walk``.

    Returns:
        collections.Counter: Maps each token produced by ``process_text`` to
        the number of times it occurs across all files.
    """
    vocab = Counter()
    for path, _, files in os.walk(directory):
        for file in files:
            # Decode explicitly as UTF-8. Without ``encoding`` the decoding
            # depends on the platform's locale default, so the same corpus can
            # tokenize differently (or fail to decode) on another machine.
            with open(os.path.join(path, file), 'r', encoding='utf-8') as f:
                vocab.update(process_text(f.read()))
    return vocab

In [5]:
# Function to load documents
def load_documents(directory):
    """Read and tokenize every file under ``directory``.

    Args:
        directory: Root directory; it is traversed recursively with ``os.walk``.

    Returns:
        list[list[str]]: One token list per file, in the order ``os.walk``
        yields the files.
    """
    documents = []
    for path, _, files in os.walk(directory):
        for file in files:
            # Decode explicitly as UTF-8. Without ``encoding`` the decoding
            # depends on the platform's locale default, so the same corpus can
            # tokenize differently (or fail to decode) on another machine.
            with open(os.path.join(path, file), 'r', encoding='utf-8') as f:
                documents.append(process_text(f.read()))
    return documents

In [6]:
# Function to create n-grams
def create_ngrams(documents):
    """Collect every pair of adjacent tokens (bigram) from each document.

    Args:
        documents: Iterable of token lists.

    Returns:
        list[tuple]: ``(token, next_token)`` pairs, document by document and
        left to right. Documents with fewer than two tokens contribute nothing.
    """
    return [
        (left, right)
        for tokens in documents
        for left, right in zip(tokens, tokens[1:])
    ]

In [7]:
# Function to create negative n-grams
def create_negative_samples(positive_ngrams, vocab, num_negatives=2):
    """Build negative bigrams by pairing each first word with random vocabulary words.

    For every positive bigram, ``num_negatives`` bigrams are generated that keep
    its first word and replace the second with a word drawn uniformly from the
    keys of ``vocab`` using ``np.random.choice``. The only constraint on the
    replacement is that it differs from the first word, so a generated pair may
    still coincide with a bigram that really occurs in the corpus.

    Args:
        positive_ngrams: Iterable of ``(first_word, second_word)`` pairs.
        vocab: Mapping whose keys are the candidate replacement words.
        num_negatives: Number of negative samples generated per positive bigram.

    Returns:
        list[tuple]: ``(first_word, sampled_word)`` pairs, ``num_negatives`` per
        input pair, in input order.

    Raises:
        ValueError: If the single word in ``vocab`` equals a bigram's first
            word, so no valid replacement exists (this case used to loop
            forever). ``np.random.choice`` also raises ``ValueError`` when
            ``vocab`` is empty and there is at least one positive bigram.
    """
    # Materialise the candidate words once instead of rebuilding the key list
    # for every single draw.
    candidates = np.array(list(vocab.keys()))
    negative_ngrams = []
    for first_word, _ in positive_ngrams:
        if candidates.size == 1 and candidates[0] == first_word:
            raise ValueError(
                'vocab contains no word other than the first word of the n-gram'
            )
        for _ in range(num_negatives):
            second_word = np.random.choice(candidates)
            # Redraw until the replacement differs from the first word.
            while second_word == first_word:
                second_word = np.random.choice(candidates)
            negative_ngrams.append((first_word, second_word))
    return negative_ngrams

In [8]:
# Convert n-grams to feature vectors
def ngrams_to_features(ngrams, vocab):
    """Encode bigrams as dense two-hot vectors over the vocabulary.

    Args:
        ngrams: Iterable of ``(first_word, second_word)`` pairs.
        vocab: Mapping from word to its column index.

    Returns:
        numpy.ndarray: Float array of shape ``(n_kept, len(vocab))``. Each row
        has a 1 at the columns of both words (a single 1 when the two words
        are the same). Pairs containing a word that is not in ``vocab`` are
        skipped, so ``n_kept`` can be smaller than the number of input pairs.
        When no pair is kept the result has shape ``(0, len(vocab))`` so it can
        still be stacked with other feature matrices.
    """
    # Resolve the column indices first so the matrix can be allocated once.
    index_pairs = [
        (vocab[first_word], vocab[second_word])
        for first_word, second_word in ngrams
        if first_word in vocab and second_word in vocab
    ]
    X = np.zeros((len(index_pairs), len(vocab)))
    if index_pairs:
        columns = np.array(index_pairs)
        rows = np.arange(len(index_pairs))
        X[rows, columns[:, 0]] = 1
        X[rows, columns[:, 1]] = 1
    return X

In [9]:
# Define sigmoid and MSE functions
def sigmoid(x):
    """Logistic sigmoid ``1 / (1 + exp(-x))``, evaluated without overflow.

    Args:
        x: Scalar or array-like of real values.

    Returns:
        Element-wise sigmoid of ``x``: an array of the same shape for array
        input, a NumPy scalar for scalar input.
    """
    x = np.asarray(x)
    # exp() is only ever applied to non-positive values, so it cannot overflow
    # for large-magnitude inputs the way exp(-x) does when x is very negative.
    decay = np.exp(-np.abs(x))
    result = np.where(x >= 0, 1 / (1 + decay), decay / (1 + decay))
    # Indexing with an empty tuple turns a 0-d result back into a scalar and
    # leaves arrays of higher rank unchanged.
    return result[()]

def sigmoid_derivative(x):
    """Return ``x * (1 - x)``, the sigmoid's derivative written in terms of its output.

    The callers in this notebook pass values that are already sigmoid
    activations, not pre-activation inputs.
    """
    complement = 1 - x
    return x * complement

def mse(y_true, y_pred):
    """Mean of the squared element-wise differences between ``y_true`` and ``y_pred``."""
    residual = y_true - y_pred
    squared_error = np.square(residual)
    return np.mean(squared_error)

def mse_derivative(y_true, y_pred):
    """Gradient of the mean squared error with respect to ``y_pred``.

    Returns ``2 * (y_pred - y_true)`` divided by the number of elements in
    ``y_true``; the result has the same shape as the inputs.
    """
    residual = y_pred - y_true
    n_elements = y_true.size
    return 2 * residual / n_elements

In [10]:
# Define the Neural Network class
class NeuralNetwork:
    """Feed-forward network with one sigmoid hidden layer and a sigmoid output layer.

    Samples are laid out column-wise: ``X`` has shape
    ``(input_size, n_samples)`` and the network output has shape
    ``(output_size, n_samples)``. Training is full-batch gradient descent on
    the mean squared error.
    """

    def __init__(self, input_size, hidden_size, output_size):
        """Draw weights from a standard normal distribution; start biases at zero."""
        self.W1 = np.random.randn(hidden_size, input_size)
        self.b1 = np.zeros((hidden_size, 1))
        self.W2 = np.random.randn(output_size, hidden_size)
        self.b2 = np.zeros((output_size, 1))

    def forward(self, X):
        """Run a forward pass and cache the intermediate values for ``backward``.

        Args:
            X: Input array of shape ``(input_size, n_samples)``.

        Returns:
            numpy.ndarray: Output activations, shape ``(output_size, n_samples)``.
        """
        self.Z1 = np.dot(self.W1, X) + self.b1
        self.A1 = sigmoid(self.Z1)
        self.Z2 = np.dot(self.W2, self.A1) + self.b2
        self.A2 = sigmoid(self.Z2)
        return self.A2

    def backward(self, X, Y, output, learning_rate=0.0001):
        """Back-propagate the MSE loss and apply one gradient-descent step in place.

        Must be called after ``forward`` on the same ``X`` because it reads the
        cached hidden activations ``self.A1``.

        Args:
            X: Input array that produced ``output``.
            Y: Target array with the same shape as ``output``.
            output: Value returned by ``forward(X)``.
            learning_rate: Step size applied to every parameter update.
        """
        # Chain rule through the output sigmoid; the derivative helper takes
        # the activation itself.
        dZ2 = mse_derivative(Y, output) * sigmoid_derivative(output)
        dW2 = np.dot(dZ2, self.A1.T)
        db2 = np.sum(dZ2, axis=1, keepdims=True)
        # Propagate through W2 using its value from before this step's update.
        dZ1 = np.dot(self.W2.T, dZ2) * sigmoid_derivative(self.A1)
        dW1 = np.dot(dZ1, X.T)
        db1 = np.sum(dZ1, axis=1, keepdims=True)

        self.W1 -= learning_rate * dW1
        self.b1 -= learning_rate * db1
        self.W2 -= learning_rate * dW2
        self.b2 -= learning_rate * db2

    def train(self, X, Y, epochs, learning_rate=0.0001):
        """Run ``epochs`` full-batch gradient-descent steps, printing the loss each epoch.

        Args:
            X: Input array of shape ``(input_size, n_samples)``.
            Y: Target array matching the shape of the network output.
            epochs: Number of passes over the full data set.
            learning_rate: Step size forwarded to ``backward``. The default
                matches the value that was previously always used, so existing
                calls behave as before.
        """
        for epoch in range(epochs):
            output = self.forward(X)
            # The printed loss is measured before this epoch's weight update.
            loss = mse(Y, output)
            print(f'Epoch {epoch}, Loss: {loss}')
            self.backward(X, Y, output, learning_rate=learning_rate)

    def predict(self, X):
        """Return 0/1 predictions by thresholding the output activations at 0.5."""
        output = self.forward(X)
        return (output > 0.5).astype(int)

In [11]:
# Root directory of the training files (positive tweets, judging by the path).
# NOTE(review): absolute, user-specific path — this cell only works as-is on a
# machine with the same directory layout.
positive_train_dir = '/Users/amanmehmood/AIT 726/programming_assignment_2/data/tweet/train/positive'

In [12]:
# Tokenize every file under the training directory: one token list per file.
documents = load_documents(positive_train_dir)

In [13]:
# Token frequency counts over the same directory. This reads and tokenizes the
# files a second time, independently of `documents`.
vocab = create_vocab(positive_train_dir)

In [14]:
# Replace the frequency Counter with a word -> column-index mapping, numbering
# the words in the Counter's iteration order. The counts are discarded.
vocab = {word: position for position, word in enumerate(vocab)}

In [15]:
# Seed NumPy's global RNG before the first random draw so that negative
# sampling (and the later random weight initialisation) gives the same result
# every time the notebook is run from a fresh kernel.
np.random.seed(42)
# Observed bigrams are the positive examples; negatives keep the first word and
# swap in a randomly chosen vocabulary word.
positive_ngrams = create_ngrams(documents)
negative_ngrams = create_negative_samples(positive_ngrams, vocab)

In [16]:
# Encode each bigram as a dense vector of length len(vocab), one row per bigram.
positive_features = ngrams_to_features(positive_ngrams, vocab)
negative_features = ngrams_to_features(negative_ngrams, vocab)

In [17]:
# Column vectors of targets: 1 for observed bigrams, 0 for sampled ones. Sized
# from the feature matrices so labels and rows stay aligned.
positive_labels = np.ones((len(positive_features), 1))
negative_labels = np.zeros((len(negative_features), 1))

In [18]:
# Stack positives above negatives, then transpose so each column is one sample:
# X_train is (len(vocab), n_samples) and Y_train is (1, n_samples).
X_train = np.vstack((positive_features, negative_features)).T
Y_train = np.vstack((positive_labels, negative_labels)).T

In [19]:
# Layer sizes: one input unit per feature row of X_train, 20 hidden units and a
# single output unit.
input_size = X_train.shape[0]
hidden_size = 20
output_size = 1

In [20]:
# Build the network and train it on the full training matrix for 100 epochs;
# the learning rate used is the 0.0001 default.
# NOTE(review): in the recorded output below the loss falls by only about 6e-7
# per epoch, and the output visible here ends at epoch 28 rather than 99.
nn = NeuralNetwork(input_size, hidden_size, output_size)
nn.train(X_train, Y_train, epochs=100)

Epoch 0, Loss: 0.6296232039137326
Epoch 1, Loss: 0.6296225751883591
Epoch 2, Loss: 0.6296219464448214
Epoch 3, Loss: 0.6296213176831188
Epoch 4, Loss: 0.6296206889032507
Epoch 5, Loss: 0.6296200601052162
Epoch 6, Loss: 0.6296194312890148
Epoch 7, Loss: 0.6296188024546456
Epoch 8, Loss: 0.6296181736021083
Epoch 9, Loss: 0.6296175447314017
Epoch 10, Loss: 0.6296169158425255
Epoch 11, Loss: 0.6296162869354787
Epoch 12, Loss: 0.6296156580102608
Epoch 13, Loss: 0.6296150290668712
Epoch 14, Loss: 0.6296144001053089
Epoch 15, Loss: 0.6296137711255734
Epoch 16, Loss: 0.6296131421276638
Epoch 17, Loss: 0.6296125131115798
Epoch 18, Loss: 0.6296118840773206
Epoch 19, Loss: 0.6296112550248851
Epoch 20, Loss: 0.6296106259542729
Epoch 21, Loss: 0.6296099968654834
Epoch 22, Loss: 0.6296093677585157
Epoch 23, Loss: 0.6296087386333694
Epoch 24, Loss: 0.6296081094900432
Epoch 25, Loss: 0.6296074803285372
Epoch 26, Loss: 0.62960685114885
Epoch 27, Loss: 0.6296062219509813
Epoch 28, Loss: 0.62960559273493

In [25]:
# Persist the trained network object with pickle.
# NOTE(review): this is an absolute Windows-style path, while the data and
# report paths in this notebook are absolute macOS-style paths — confirm the
# intended output location.
# NOTE(review): this cell's execution count (25) is out of sequence with its
# position in the notebook.
with open('C:\\Users\\AMEHMOOD\\Documents\\Repos\\programming_assignment_2\\models\\trained_lm_nn.pkl', 'wb') as f:
    pickle.dump(nn, f)

In [21]:
# Root directory of the held-out test files (positive tweets, judging by the path).
# NOTE(review): absolute, user-specific path — this cell only works as-is on a
# machine with the same directory layout.
positive_test_dir = '/Users/amanmehmood/AIT 726/programming_assignment_2/data/tweet/test/positive'

In [22]:
# Tokenize every file under the test directory: one token list per file.
test_documents = load_documents(positive_test_dir)

In [23]:
# Build the evaluation set with the same steps used for training: bigrams from
# the test documents are the positives, and negatives are sampled per positive.
test_positive_ngrams = create_ngrams(test_documents)
# Replacement words are drawn from the training vocabulary.
test_negative_ngrams = create_negative_samples(test_positive_ngrams, vocab)

# Bigrams containing a word missing from the training vocabulary are skipped by
# ngrams_to_features, so these matrices can have fewer rows than there are bigrams.
test_positive_features = ngrams_to_features(test_positive_ngrams, vocab)
test_negative_features = ngrams_to_features(test_negative_ngrams, vocab)

# Labels are sized from the feature matrices (not the bigram lists) so they stay
# aligned with the rows that survived that filtering.
test_positive_labels = np.ones((len(test_positive_features), 1))
test_negative_labels = np.zeros((len(test_negative_features), 1))

# Transpose to the one-column-per-sample layout used for training.
X_test = np.vstack((test_positive_features, test_negative_features)).T
Y_test = np.vstack((test_positive_labels, test_negative_labels)).T

In [24]:
# Threshold the network outputs into 0/1 predictions and report the share that
# matches the labels, as a percentage.
predictions = nn.predict(X_test)
accuracy = np.mean(predictions == Y_test) * 100

In [26]:
# Write the accuracy to the evaluation log; mode 'w' replaces any existing file.
# NOTE(review): absolute, user-specific path — this cell only works as-is on a
# machine with the same directory layout.
with open('/Users/amanmehmood/AIT 726/programming_assignment_2/reports/lm_evaluation.log', 'w') as f:
    f.write(f'Accuracy: {accuracy}\n')