In [56]:
import os
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from collections import Counter
import math
import numpy as np
from sklearn.metrics import confusion_matrix, accuracy_score
import pickle

In [26]:
# Fetch the NLTK stop-word corpus that the next cell reads via stopwords.words('english').
# NOTE(review): the recorded output shows this download failing with a network error and
# returning False; presumably the corpus was already cached locally — confirm.
nltk.download('stopwords')

[nltk_data] Error loading stopwords: <urlopen error [Errno 11002]
[nltk_data]     getaddrinfo failed>


False

In [27]:
# Porter stemmer instance used by process_text.
stemmer = PorterStemmer()

# English stop words, held in a set for fast membership tests.
stop_words = set(stopwords.words('english'))

# Regex for simple emoticons: one of ':', '=' or ';', then an optional 'o', 'O' or '-',
# then one closing character from D ) ] ( / \ O p P.
# (The escaped ']' appears twice in the last character class; the duplicate is harmless.)
emoticon_pattern = r'[:=;][oO\-]?[D\)\]\(\]/\\OpP]'

In [28]:
def process_text(text):
    """Turn raw text into a list of stemmed tokens with stop words removed.

    Pipeline: strip HTML-like tags, remove emoticons, lowercase, split on
    non-word characters, drop punctuation-only/empty tokens and stop words,
    then stem what is left.

    The order matters:
    * Emoticons are removed from the raw string. Once the text has been split
      on ``\\W+`` no token can start with ':', '=' or ';', so filtering tokens
      against ``emoticon_pattern`` can never match; and the pattern contains
      upper-case letters, so it has to run before lowercasing.
    * Stop words are removed before stemming. The stop-word list holds
      unstemmed words, so a stemmed stop word (e.g. "this" -> "thi") would
      otherwise slip through.

    Args:
        text: Raw document text.

    Returns:
        list[str]: Stemmed tokens in their original order.
    """
    text = re.sub(r'<.*?>', '', text)
    # The lookahead keeps ':D' etc. from matching when it is merely the start of
    # a longer word (e.g. "note:Done"); matches are replaced by a space so the
    # neighbouring words are not glued together.
    text = re.sub(emoticon_pattern + r'(?!\w)', ' ', text)
    text = text.lower()

    tokens = re.split(r'\W+', text)
    # `in string.punctuation` is a substring test, so it also rejects the empty
    # strings that re.split yields at the edges of the text.
    tokens = [
        token for token in tokens
        if token not in string.punctuation and token not in stop_words
    ]

    return [stemmer.stem(token) for token in tokens]

In [29]:
def create_vocab(directory, encoding='utf-8'):
    """Count every processed token found in the files under ``directory``.

    Args:
        directory: Root directory; all files in it and its subdirectories are read.
        encoding: Text encoding used to read the files. Given explicitly because
            the default of ``open`` depends on the platform locale.
            NOTE(review): UTF-8 is assumed for the data files — confirm.

    Returns:
        collections.Counter: token -> number of occurrences over all files.
        Key order is the order in which tokens are first seen; files are
        visited in sorted order so that this order is stable between runs.
    """
    vocab = Counter()

    for path, dirs, files in os.walk(directory):
        # Sorting in place makes os.walk descend into subdirectories in a fixed order.
        dirs.sort()
        for file in sorted(files):
            # errors='replace' keeps a single undecodable byte from aborting the whole
            # scan; the replacement character is a non-word character and is dropped
            # by the tokenizer's split.
            with open(os.path.join(path, file), 'r', encoding=encoding, errors='replace') as f:
                text = f.read()
            vocab.update(process_text(text))

    return vocab

In [30]:
# Build the vocabulary (token -> count) from the training files.
# NOTE(review): absolute, machine-specific path; the notebook cannot run on another
# machine without editing it.
vocab = create_vocab('C:\\Users\\AMEHMOOD\\Documents\\Repos\\programming_assignment_2\\data\\raw\\tweet\\train')

In [31]:
def calculate_tf(document_words, vocab):
    """Compute the term frequency of every vocabulary term in one document.

    Args:
        document_words: Tokens of a single document.
        vocab: Iterable of vocabulary terms; its iteration order fixes the key
            order of the result.

    Returns:
        dict: term -> (occurrences in the document / number of tokens in the
        document) for terms present in the document, and 0 for all others.
    """
    total_terms = len(document_words)
    occurrences = Counter(document_words)

    # An empty document has no occurrences, so the division is never reached for it.
    return {
        term: (occurrences[term] / total_terms if term in occurrences else 0)
        for term in vocab
    }

In [32]:
def calculate_idf(documents, vocab):
    """Compute the inverse document frequency of every vocabulary term.

    idf(term) = log(N / df(term)), where N is the number of documents and
    df(term) the number of documents containing the term.

    Args:
        documents: Sequence of documents, each a list of tokens.
        vocab: Iterable of vocabulary terms; its iteration order fixes the key
            order of the result.

    Returns:
        dict: term -> idf as a float. A term that occurs in no document (which
        includes the case of an empty corpus) gets 0.0 instead of raising
        ZeroDivisionError.
    """
    total_docs = len(documents)
    doc_freq = dict.fromkeys(vocab, 0)

    for document in documents:
        # Walk the document's distinct tokens once instead of scanning the token
        # list for every vocabulary term; each document still counts at most
        # once per term.
        for word in set(document):
            if word in doc_freq:
                doc_freq[word] += 1

    return {
        word: (math.log(total_docs / count) if count else 0.0)
        for word, count in doc_freq.items()
    }

In [33]:
def calculate_tf_idf(tf, idf, vocab):
    """Multiply term frequency by inverse document frequency for each term.

    Args:
        tf: Mapping term -> term frequency for one document.
        idf: Mapping term -> inverse document frequency.
        vocab: Iterable of terms to score; every term must be a key of both
            ``tf`` and ``idf``. Its iteration order fixes the key order of the
            result.

    Returns:
        dict: term -> tf[term] * idf[term].
    """
    return {term: tf[term] * idf[term] for term in vocab}

In [34]:
def load_documents(directory, encoding='utf-8'):
    """Read and tokenize every file under ``directory``.

    Args:
        directory: Root directory; all files in it and its subdirectories are read.
        encoding: Text encoding used to read the files. Given explicitly because
            the default of ``open`` depends on the platform locale.
            NOTE(review): UTF-8 is assumed for the data files — confirm.

    Returns:
        list[list[str]]: One token list per file, as produced by process_text.
        Files are visited in sorted order so the document order is stable
        between runs and machines.
    """
    documents = []

    for path, dirs, files in os.walk(directory):
        # Sorting in place makes os.walk descend into subdirectories in a fixed order.
        dirs.sort()
        for file in sorted(files):
            # errors='replace' keeps a single undecodable byte from aborting the load.
            with open(os.path.join(path, file), 'r', encoding=encoding, errors='replace') as f:
                text = f.read()
            documents.append(process_text(text))

    return documents

In [35]:
# Tokenized training documents, one token list per file.
# NOTE(review): same absolute, machine-specific path as the vocabulary cell.
documents = load_documents('C:\\Users\\AMEHMOOD\\Documents\\Repos\\programming_assignment_2\\data\\raw\\tweet\\train')

In [36]:
# Per-document term frequencies over the full vocabulary.
tfs = [calculate_tf(document, vocab) for document in documents]

# Corpus-wide inverse document frequencies, computed once from the training documents.
idf = calculate_idf(documents, vocab)

# Per-document tf-idf dictionaries; each has one entry per vocabulary term.
tf_idfs = [calculate_tf_idf(tf, idf, vocab) for tf in tfs]

In [37]:
def sigmoid(x):
    """Logistic function 1 / (1 + exp(-x)), applied element-wise.

    The input is clipped to [-500, 500] before exponentiation so that large
    negative values do not overflow ``np.exp``. Inside that range the result
    is unchanged; outside it the function is already saturated (about 7e-218
    at the low end, exactly 1.0 at the high end).

    Args:
        x: Scalar or array-like of pre-activation values.

    Returns:
        NumPy scalar or array of the same shape with values in [0, 1].
    """
    return 1 / (1 + np.exp(-np.clip(x, -500, 500)))

def sigmoid_derivative(x):
    """Return x * (1 - x), element-wise.

    This equals the derivative of the logistic function only when ``x`` is
    already a sigmoid output (an activation), not a pre-activation value.
    """
    complement = 1 - x
    return complement * x

def mse(y_true, y_pred):
    """Mean of the squared element-wise differences between targets and predictions."""
    residual = y_true - y_pred
    squared_error = np.square(residual)
    return np.mean(squared_error)

def mse_derivative(y_true, y_pred):
    """Gradient of the mean squared error with respect to ``y_pred``.

    ``y_true`` must expose ``.size`` (e.g. a NumPy array); the gradient is
    scaled by the total number of target elements.
    """
    error = y_pred - y_true
    doubled = 2 * error
    return doubled / y_true.size

In [48]:
class NeuralNetwork:
    """Fully connected network with one hidden layer and sigmoid activations.

    Trained with full-batch gradient descent on mean squared error.

    Data layout: samples are columns. ``X`` has shape (input_size, n_samples)
    and ``Y`` has shape (output_size, n_samples).
    """

    def __init__(self, input_size, hidden_size, output_size, seed=None):
        """Initialise weights from a standard normal and biases to zero.

        Args:
            input_size: Number of input features.
            hidden_size: Number of hidden units.
            output_size: Number of output units.
            seed: Optional seed for reproducible weights. When None, weights
                are drawn from NumPy's global random state.
        """
        # np.random and RandomState both expose randn, so one code path serves both.
        rng = np.random if seed is None else np.random.RandomState(seed)
        self.W1 = rng.randn(hidden_size, input_size)
        self.b1 = np.zeros((hidden_size, 1))
        self.W2 = rng.randn(output_size, hidden_size)
        self.b2 = np.zeros((output_size, 1))

    def forward(self, X):
        """Run a forward pass and cache the intermediates used by backward.

        Args:
            X: Inputs of shape (input_size, n_samples).

        Returns:
            Output activations of shape (output_size, n_samples).
        """
        self.Z1 = np.dot(self.W1, X) + self.b1
        self.A1 = sigmoid(self.Z1)
        self.Z2 = np.dot(self.W2, self.A1) + self.b2
        self.A2 = sigmoid(self.Z2)
        return self.A2

    def backward(self, X, Y, output, learning_rate=0.0001):
        """Backpropagate the MSE loss and apply one gradient-descent step.

        Must be called after forward on the same ``X``, because it reads the
        cached hidden activations ``self.A1``.

        Args:
            X: Inputs of shape (input_size, n_samples).
            Y: Targets of shape (output_size, n_samples).
            output: Activations returned by forward for ``X``.
            learning_rate: Step size of the parameter update.
        """
        # sigmoid_derivative takes activations (not pre-activations), hence `output` and A1.
        dZ2 = mse_derivative(Y, output) * sigmoid_derivative(output)
        dW2 = np.dot(dZ2, self.A1.T)
        db2 = np.sum(dZ2, axis=1, keepdims=True)
        # Uses W2 before it is updated below.
        dZ1 = np.dot(self.W2.T, dZ2) * sigmoid_derivative(self.A1)
        dW1 = np.dot(dZ1, X.T)
        db1 = np.sum(dZ1, axis=1, keepdims=True)

        self.W1 -= learning_rate * dW1
        self.b1 -= learning_rate * db1
        self.W2 -= learning_rate * dW2
        self.b2 -= learning_rate * db2

    def train(self, X, Y, epochs, learning_rate=0.0001):
        """Train for a fixed number of full-batch epochs, printing the loss each epoch.

        Args:
            X: Inputs of shape (input_size, n_samples).
            Y: Targets of shape (output_size, n_samples).
            epochs: Number of gradient-descent steps.
            learning_rate: Step size passed to backward. The default matches
                backward's own default, so existing calls behave as before.
        """
        for epoch in range(epochs):
            output = self.forward(X)
            # The loss is measured before this epoch's update is applied.
            loss = mse(Y, output)
            print(f'Epoch {epoch}, Loss: {loss}')
            self.backward(X, Y, output, learning_rate)

    def predict(self, X):
        """Return hard 0/1 predictions by thresholding the output at 0.5.

        Args:
            X: Inputs of shape (input_size, n_samples).

        Returns:
            Integer array of shape (output_size, n_samples).
        """
        output = self.forward(X)
        return (output > 0.5).astype(int)

In [49]:
# Network sized to the vocabulary: one input per vocabulary term, 20 hidden units, one output.
nn = NeuralNetwork(input_size=len(vocab), hidden_size=20, output_size=1)

# One row per training document; columns follow the key order of each tf-idf dict.
X_train = np.array([list(tf_idf.values()) for tf_idf in tf_idfs])

# NOTE(review): these labels are drawn at random rather than read from the dataset, so the
# network has nothing real to fit and the loss printed below is not meaningful.
# TODO: replace with the true labels of the training documents.
Y_train = np.random.randint(0, 2, (1, len(tf_idfs)))

# Transposed so that samples are columns, the layout NeuralNetwork.forward multiplies against.
nn.train(X_train.T, Y_train, epochs=100)

Epoch 0, Loss: 0.2912259491932207
Epoch 1, Loss: 0.29122582938843217
Epoch 2, Loss: 0.29122570959089533
Epoch 3, Loss: 0.29122558980060953
Epoch 4, Loss: 0.29122547001757426
Epoch 5, Loss: 0.29122535024178886
Epoch 6, Loss: 0.2912252304732529
Epoch 7, Loss: 0.2912251107119658
Epoch 8, Loss: 0.29122499095792675
Epoch 9, Loss: 0.2912248712111354
Epoch 10, Loss: 0.29122475147159116
Epoch 11, Loss: 0.29122463173929347
Epoch 12, Loss: 0.29122451201424154
Epoch 13, Loss: 0.2912243922964351
Epoch 14, Loss: 0.29122427258587336
Epoch 15, Loss: 0.29122415288255593
Epoch 16, Loss: 0.2912240331864821
Epoch 17, Loss: 0.2912239134976513
Epoch 18, Loss: 0.29122379381606306
Epoch 19, Loss: 0.2912236741417166
Epoch 20, Loss: 0.2912235544746116
Epoch 21, Loss: 0.2912234348147475
Epoch 22, Loss: 0.29122331516212346
Epoch 23, Loss: 0.29122319551673903
Epoch 24, Loss: 0.2912230758785937
Epoch 25, Loss: 0.29122295624768685
Epoch 26, Loss: 0.29122283662401793
Epoch 27, Loss: 0.29122271700758634
Epoch 28, Los

In [50]:
# Tokenized test documents, one token list per file.
# NOTE(review): absolute, machine-specific path.
test_documents = load_documents('C:\\Users\\AMEHMOOD\\Documents\\Repos\\programming_assignment_2\\data\\raw\\tweet\\test')

In [51]:
# Term frequencies of the test documents over the training vocabulary.
test_tfs = [calculate_tf(document, vocab) for document in test_documents]


# Reuses the idf computed from the training documents; it is not recomputed on the test set.
test_tf_idfs = [calculate_tf_idf(tf, idf, vocab) for tf in test_tfs]

In [52]:
# One row per test document; columns follow the key order of each tf-idf dict.
X_test = np.array([list(tf_idf.values()) for tf_idf in test_tf_idfs])

# NOTE(review): test labels are drawn at random rather than read from the dataset, so any
# metric computed against them is not a real evaluation. TODO: load the true labels.
Y_test = np.random.randint(0, 2, (1, len(test_tf_idfs)))

In [53]:
# Transposed so that samples are columns; predict thresholds the network output at 0.5.
predictions = nn.predict(X_test.T)

In [54]:
# Imported under an alias because the result below is stored in a variable named
# `confusion_matrix`, which rebinds the function imported at the top of the notebook.
# Calling through the alias keeps this cell re-runnable without re-running the imports.
from sklearn.metrics import confusion_matrix as compute_confusion_matrix

# NOTE(review): Y_test is generated at random in the cell that builds X_test, so these
# figures are not a real evaluation of the model.
accuracy = accuracy_score(Y_test.flatten(), predictions.flatten())
confusion_matrix = compute_confusion_matrix(Y_test.flatten(), predictions.flatten())

In [58]:
# Write the accuracy and confusion matrix to a plain-text report, overwriting any existing file.
# NOTE(review): absolute, machine-specific path.
with open('C:\\Users\\AMEHMOOD\\Documents\\Repos\\programming_assignment_2\\reports\\sa_evaluation.log', 'w') as f:
    f.write(f'Accuracy: {accuracy}\n')
    f.write(f'Confusion Matrix:\n{confusion_matrix}\n')

In [57]:
# Serialize the trained network object with pickle.
# NOTE(review): only `nn` is saved here; the vocabulary and idf needed to build its input
# vectors are not persisted in the cells visible in this notebook. Loading the pickle also
# requires the NeuralNetwork class to be defined/importable in the loading process.
with open('C:\\Users\\AMEHMOOD\\Documents\\Repos\\programming_assignment_2\\models\\trained_sa_nn.pkl', 'wb') as f:
    pickle.dump(nn, f)