In [1]:
import os
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from collections import Counter
import math
import numpy as np

In [2]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\AMEHMOOD\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [3]:
stemmer = PorterStemmer()

stop_words = set(stopwords.words('english'))

emoticon_pattern = r'[:=;][oO\-]?[D\)\]\(\]/\\OpP]'

In [4]:
def process_text(text):
    text = text.lower()
    text = re.sub(r'<.*?>', '', text)
    
    words = re.split(r'\W+', text)
    words = [stemmer.stem(word) for word in words if word not in string.punctuation]
    words = [word for word in words if word not in stop_words and not re.match(emoticon_pattern, word)]

    return words

In [5]:
def create_vocab(directory):
    vocab = Counter()

    for path, _, files in os.walk(directory):
        for file in files:
            with open(os.path.join(path, file), 'r') as f:
                text = f.read()
                words = process_text(text)
                vocab.update(words)

    return vocab

In [7]:
vocab = create_vocab('C:\\Users\\AMEHMOOD\\Documents\\Repos\\programming_assignment_2\\data\\raw\\tweet\\train')

In [8]:
def calculate_tf(document_words, vocab):
    tf = dict.fromkeys(vocab, 0)
    word_count = Counter(document_words)

    for word, count in word_count.items():
        if word in vocab:
            tf[word] = count / len(document_words)

    return tf

In [9]:
def calculate_idf(documents, vocab):
    N = len(documents)
    idf = dict.fromkeys(vocab, 0)

    for document in documents:
        for word in vocab:
            if word in document:
                idf[word] += 1
    
    for word in vocab:
        idf[word] = math.log(N / float(idf[word]))
        
    return idf

In [10]:
def calculate_tf_idf(tf, idf, vocab):
    tf_idf = dict.fromkeys(vocab, 0)

    for word in vocab:
        tf_idf[word] = tf[word] * idf[word]
        
    return tf_idf

In [11]:
def load_documents(directory):
    documents = []
    
    for path, _, files in os.walk(directory):
        for file in files:
            with open(os.path.join(path, file), 'r') as f:
                text = f.read()
                words = process_text(text)
                documents.append(words)
                
    return documents

In [12]:
documents = load_documents('C:\\Users\\AMEHMOOD\\Documents\\Repos\\programming_assignment_2\\data\\raw\\tweet\\train')

In [13]:
tfs = [calculate_tf(document, vocab) for document in documents]

idf = calculate_idf(documents, vocab)

tf_idfs = [calculate_tf_idf(tf, idf, vocab) for tf in tfs]

In [14]:
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def sigmoid_derivative(x):
    return x * (1 - x)

def mse(y_true, y_pred):
    return np.mean(np.square(y_true - y_pred))

def mse_derivative(y_true, y_pred):
    return 2 * (y_pred - y_true) / y_true.size

In [15]:
class NeuralNetwork:
    def __init__(self, input_size, hidden_size, output_size):
        self.W1 = np.random.randn(hidden_size, input_size)
        self.b1 = np.zeros((hidden_size, 1))
        self.W2 = np.random.randn(output_size, hidden_size)
        self.b2 = np.zeros((output_size, 1))

    def forward(self, X):
        self.Z1 = np.dot(self.W1, X) + self.b1
        self.A1 = sigmoid(self.Z1)
        self.Z2 = np.dot(self.W2, self.A1) + self.b2
        self.A2 = sigmoid(self.Z2)
        return self.A2

    def backward(self, X, Y, output, learning_rate=0.0001):
        dZ2 = mse_derivative(Y, output) * sigmoid_derivative(output)
        dW2 = np.dot(dZ2, self.A1.T)
        db2 = np.sum(dZ2, axis=1, keepdims=True)
        dZ1 = np.dot(self.W2.T, dZ2) * sigmoid_derivative(self.A1)
        dW1 = np.dot(dZ1, X.T)
        db1 = np.sum(dZ1, axis=1, keepdims=True)
        
        self.W1 -= learning_rate * dW1
        self.b1 -= learning_rate * db1
        self.W2 -= learning_rate * dW2
        self.b2 -= learning_rate * db2

    def train(self, X, Y, epochs):
        for epoch in range(epochs):
            output = self.forward(X)
            loss = mse(Y, output)
            print(f'Epoch {epoch}, Loss: {loss}')
            self.backward(X, Y, output)

In [16]:
# Initialize the neural network
nn = NeuralNetwork(input_size=len(vocab), hidden_size=20, output_size=1)

# Convert tf-idf vectors to numpy arrays
X_train = np.array([list(tf_idf.values()) for tf_idf in tf_idfs])

# Define labels
Y_train = np.random.randint(0, 2, (1, len(tf_idfs)))

# Train
nn.train(X_train.T, Y_train, epochs=100)

Epoch 0, Loss: 0.3088886520144359
Epoch 1, Loss: 0.30888829530492456
Epoch 2, Loss: 0.3088879386150266
Epoch 3, Loss: 0.30888758194474114
Epoch 4, Loss: 0.30888722529406704
Epoch 5, Loss: 0.30888686866300313
Epoch 6, Loss: 0.30888651205154866
Epoch 7, Loss: 0.3088861554597023
Epoch 8, Loss: 0.30888579888746315
Epoch 9, Loss: 0.30888544233483006
Epoch 10, Loss: 0.3088850858018021
Epoch 11, Loss: 0.30888472928837807
Epoch 12, Loss: 0.30888437279455705
Epoch 13, Loss: 0.3088840163203379
Epoch 14, Loss: 0.30888365986571964
Epoch 15, Loss: 0.3088833034307012
Epoch 16, Loss: 0.30888294701528146
Epoch 17, Loss: 0.30888259061945944
Epoch 18, Loss: 0.30888223424323413
Epoch 19, Loss: 0.30888187788660443
Epoch 20, Loss: 0.3088815215495692
Epoch 21, Loss: 0.30888116523212755
Epoch 22, Loss: 0.30888080893427833
Epoch 23, Loss: 0.30888045265602054
Epoch 24, Loss: 0.30888009639735303
Epoch 25, Loss: 0.3088797401582749
Epoch 26, Loss: 0.308879383938785
Epoch 27, Loss: 0.3088790277388824
Epoch 28, Los