In [None]:
import os
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from collections import Counter
import math

In [None]:
# Fetch the NLTK 'stopwords' corpus; stopwords.words('english') in the setup cell reads it.
nltk.download('stopwords')

In [None]:
# Single Porter stemmer instance shared by every process_text call.
stemmer = PorterStemmer()

# English stop words, held in a set so membership tests in process_text are hash lookups.
stop_words = set(stopwords.words('english'))

# Emoticon regex: eyes (one of : = ;), an optional nose (o, O or -), then a
# mouth (one of D ) ] ( / \ O p P). `\]` appears twice in the mouth class,
# which is redundant but harmless.
emoticon_pattern = r'[:=;][oO\-]?[D\)\]\(\]/\\OpP]'

In [None]:
def process_text(text):
    """Turn raw text into a list of stemmed, stop-word-free tokens.

    Steps, in order:
      1. lower-case the text;
      2. delete ``<...>`` spans (shortest match, ``.`` does not cross newlines);
      3. split on runs of non-word characters;
      4. drop empty tokens and tokens that are a substring of
         ``string.punctuation`` (in practice ``''`` and ``'_'``);
      5. drop stop words, then stem what is left with the shared Porter stemmer.

    A token is discarded when either the raw token or its stem is in
    ``stop_words``. Checking the raw token is required because many stop words
    stem to something that is no longer in the list (e.g. "this" -> "thi",
    "was" -> "wa"), so a stem-only check lets them through. The stem check is
    kept so that everything previously removed is still removed.

    Args:
        text: The raw document as a single string.

    Returns:
        list[str]: stemmed tokens in their original order.
    """
    text = text.lower()
    text = re.sub(r'<.*?>', '', text)

    words = []
    for token in re.split(r'\W+', text):
        # `in string.punctuation` is a substring test, so it also rejects the
        # empty strings re.split yields at the edges of the text.
        if token in string.punctuation:
            continue
        # Stop-word test on the unstemmed token (see docstring).
        if token in stop_words:
            continue

        stem = stemmer.stem(token)
        if stem in stop_words:
            continue
        # NOTE(review): this check can never fire. Tokens produced by the
        # `\W+` split contain only word characters, while emoticon_pattern
        # must start with ':', '=' or ';'. Emoticons are already broken up by
        # the split (":D" leaves the token "d"). If emoticons are meant to be
        # stripped or kept as tokens, that has to happen before lower-casing
        # and splitting -- confirm the intended behavior.
        if re.match(emoticon_pattern, stem):
            continue
        words.append(stem)

    return words

In [None]:
def create_vocab(directory):
    """Count how often each processed token occurs in the files under ``directory``.

    Walks ``directory`` recursively with ``os.walk``, reads every file as
    text, runs it through ``process_text`` and accumulates the resulting
    tokens.

    Args:
        directory: Root folder to scan; every file in it and in its
            sub-folders is read.

    Returns:
        collections.Counter: token -> total number of occurrences.

    Raises:
        UnicodeDecodeError: if a file is not valid UTF-8.
    """
    vocab = Counter()

    for path, _, files in os.walk(directory):
        for file in files:
            # Pin the encoding: without it open() decodes with the locale's
            # preferred encoding, so the same corpus can load on one machine
            # and fail (or decode differently) on another.
            with open(os.path.join(path, file), 'r', encoding='utf-8') as f:
                vocab.update(process_text(f.read()))

    return vocab

In [None]:
# Token counts over the whole training set (a Counter keyed by processed token).
# NOTE(review): absolute, machine-specific path -- the notebook cannot run
# elsewhere without editing this string; consider a configurable data directory.
vocab = create_vocab('/Users/amanmehmood/AIT 726/programming_assignment_2/data/raw/tweet/train')

In [None]:
def calculate_tf(document_words, vocab):
    """Compute the relative term frequency of every vocabulary term in one document.

    Args:
        document_words: The document as a list of tokens.
        vocab: Iterable/container of vocabulary terms; defines the keys of
            the result.

    Returns:
        dict: term -> (occurrences in the document / document length).
        Terms absent from the document keep the value 0; tokens outside the
        vocabulary are ignored.
    """
    total_tokens = len(document_words)
    tf = {term: 0 for term in vocab}

    for term, occurrences in Counter(document_words).items():
        if term not in vocab:
            continue
        # Only reached for non-empty documents, so total_tokens is never 0 here.
        tf[term] = occurrences / total_tokens

    return tf

In [None]:
def calculate_idf(documents, vocab):
    """Compute the inverse document frequency of every vocabulary term.

    idf(term) = ln(N / df(term)), where N is the number of documents and
    df(term) is the number of documents that contain the term at least once.

    Args:
        documents: Sequence of documents, each a list of (hashable) tokens.
        vocab: Iterable of vocabulary terms; defines the keys of the result.

    Returns:
        dict: term -> idf as a float. A term that occurs in no document
        (including the case of an empty ``documents``) gets 0.0 instead of
        raising ZeroDivisionError; its term frequency is 0 in every one of
        these documents, so it contributes nothing to TF-IDF either way.
    """
    N = len(documents)
    doc_freq = dict.fromkeys(vocab, 0)

    for document in documents:
        # Visit each distinct token once and use a hash lookup, instead of
        # scanning the token list once per vocabulary term.
        for word in set(document):
            if word in doc_freq:
                doc_freq[word] += 1

    idf = {}
    for word, df in doc_freq.items():
        idf[word] = math.log(N / df) if df else 0.0

    return idf

In [None]:
def calculate_tf_idf(tf, idf, vocab):
    """Combine term frequencies and inverse document frequencies into TF-IDF weights.

    Args:
        tf: Mapping term -> term frequency for one document.
        idf: Mapping term -> inverse document frequency.
        vocab: Iterable of terms to score; each must be a key of both ``tf``
            and ``idf``.

    Returns:
        dict: term -> tf[term] * idf[term], with keys in ``vocab`` order.

    Raises:
        KeyError: if a vocabulary term is missing from ``tf`` or ``idf``.
    """
    return {term: tf[term] * idf[term] for term in vocab}

In [None]:
def load_documents(directory):
    """Read every file under ``directory`` and return it as a list of processed tokens.

    Walks ``directory`` recursively with ``os.walk`` and runs each file's
    text through ``process_text``. Documents are returned in the order
    ``os.walk`` yields the files.

    Args:
        directory: Root folder to scan; every file in it and in its
            sub-folders is read.

    Returns:
        list[list[str]]: one token list per file.

    Raises:
        UnicodeDecodeError: if a file is not valid UTF-8.
    """
    documents = []

    for path, _, files in os.walk(directory):
        for file in files:
            # Pin the encoding: without it open() decodes with the locale's
            # preferred encoding, so the same corpus can load on one machine
            # and fail (or decode differently) on another.
            with open(os.path.join(path, file), 'r', encoding='utf-8') as f:
                documents.append(process_text(f.read()))

    return documents

In [None]:
# Processed training documents, one token list per file.
# NOTE(review): same hard-coded absolute path as the create_vocab call, so every
# file is read and processed a second time here; the path is also machine-specific.
documents = load_documents('/Users/amanmehmood/AIT 726/programming_assignment_2/data/raw/tweet/train')

In [None]:
# Per-document term frequencies: one dict per document, each keyed by every vocab term.
tfs = [calculate_tf(document, vocab) for document in documents]

# Inverse document frequency of each vocab term, computed once over all documents.
idf = calculate_idf(documents, vocab)

# Per-document TF-IDF weights, aligned index-for-index with `documents`.
tf_idfs = [calculate_tf_idf(tf, idf, vocab) for tf in tfs]