# Exam Setup

In [1]:
import numpy as np
import os
import collections
from nltk.stem import PorterStemmer

## I. Reading Documents

In [2]:
PUNCT = "0123456789!#$%&()''*+-/.:;?@[]{}|^_`~<>=\"" # all punctuation we discard
TABLE = str.maketrans(PUNCT, " " * len(PUNCT)) # replace punctuation by space

def read_document(filename):
    f = open(filename, encoding="utf-8") # specify encoding to avoid unreadable documents
    text = f.read()
    f.close()
    
    text = text.lower() # all words to lowercase
    text = text.translate(TABLE)
    words = text.split() # separate the document into list of words
    
    return words

#### Example

In [3]:
for f in os.listdir("recipes/test"):
    path = "recipes/test/" + f
    words = read_document(path)
    
    print(words)
    break

['warm', 'cherry', 'bread', 'pudding', 'ingredients', 'inch', 'thick', 'slices', 'challah', 'or', 'other', 'egg', 'bread', 'about', 'ounces', 'cups', 'whipping', 'cream', 'cups', 'whole', 'milk', 'cups', 'sugar', 'large', 'eggs', 'cup', 'dark', 'rum', 'tablespoons', 'vanilla', 'extract', 'cups', 'dried', 'tart', 'cherries', 'purchased', 'caramel', 'sauce,', 'heated', 'preparation', 'preheat', 'oven', 'to', '°f', 'using', 'inch', 'diameter', 'round', 'cookie', 'cutter,', 'cut', 'round', 'from', 'each', 'bread', 'slice', 'and', 'arrange', 'on', 'baking', 'sheet', 'toast', 'bread', 'rounds', 'in', 'oven', 'until', 'golden', 'brown,', 'about', 'minutes', 'reduce', 'oven', 'temperature', 'to', '°f', 'bring', 'cream,', 'milk,', 'and', 'sugar', 'to', 'simmer', 'in', 'heavy', 'large', 'saucepan,', 'stirring', 'to', 'dissolve', 'sugar', 'whisk', 'eggs', 'in', 'large', 'bowl', 'to', 'blend', 'gradually', 'whisk', 'hot', 'cream', 'mixture', 'into', 'eggs', 'whisk', 'in', 'rum', 'and', 'vanilla', 

### Stemming and Removing Stopwords

In [4]:
os.listdir("recipes/")
sw_path = "recipes/stopwords.txt"
stopwords = read_document(sw_path)

In [5]:
def stemmed_document(filename):
    words = read_document(filename)
    
    # Removing Stopwords
    words_without_sw = [i for i in words if i not in stopwords]
    
    #Stemming
    ps = PorterStemmer()
    stemmed = [ps.stem(word) for word in words_without_sw]
    
    return stemmed

#### Example

In [6]:
for f in os.listdir("recipes/test"):
    path = "recipes/test/" + f
    words = stemmed_document(path)
    
    print(words)
    break

['warm', 'cherri', 'bread', 'pud', 'ingredi', 'inch', 'slice', 'challah', 'egg', 'bread', 'ounc', 'cup', 'whip', 'cream', 'cup', 'milk', 'cup', 'sugar', 'larg', 'egg', 'cup', 'dark', 'rum', 'tablespoon', 'vanilla', 'extract', 'cup', 'dri', 'tart', 'cherri', 'purchas', 'caramel', 'sauce,', 'heat', 'prepar', 'preheat', 'oven', '°f', 'use', 'inch', 'diamet', 'round', 'cooki', 'cutter,', 'cut', 'round', 'bread', 'slice', 'arrang', 'bake', 'sheet', 'toast', 'bread', 'round', 'oven', 'golden', 'brown,', 'minut', 'reduc', 'oven', 'temperatur', '°f', 'bring', 'cream,', 'milk,', 'sugar', 'simmer', 'heavi', 'larg', 'saucepan,', 'stir', 'dissolv', 'sugar', 'whisk', 'egg', 'larg', 'bowl', 'blend', 'gradual', 'whisk', 'hot', 'cream', 'mixtur', 'egg', 'whisk', 'rum', 'vanilla', 'butter', 'cup', 'soufflé', 'dish', 'arrang', 'rim', 'bake', 'sheet', 'place', 'bread', 'round', 'dish', 'heap', 'tablespoon', 'dri', 'cherri', 'cherri', 'dish', 'bread', 'round', 'heap', 'tablespoon', 'dri', 'cherri', 'cherr

## II. Vocabulary

In [24]:
vocabulary = collections.Counter()

for f in os.listdir("recipes/train"):
    path = "recipes/train/" + f
    words = stemmed_document(path)
    
    vocabulary.update(words) # count words and add to vocabulary
    
# Save vocabulary in "vocabulary.txt" file
f = open("vocabulary.txt", "w", encoding="utf-8")

for word, count in vocabulary.most_common(1000): # 1000 most common words
    print(word, file=f)
f.close()

In [8]:
def load_vocabulary(filename):
    f = open(filename, encoding="utf-8")
    text = f.read()
    f.close()
    words = text.split()
    
    # Create index for each word
    vocab = {}
    index = 0
    for word in words:
        vocab[word] = index
        index += 1
    
    return vocab

#### Example

In [9]:
vocabulary = load_vocabulary("vocabulary.txt")
list(vocabulary.items())[:5]

[('cup', 0), ('tablespoon', 1), ('minut', 2), ('teaspoon', 3), ('add', 4)]

## III. Bag of Words Representation

In [10]:
def bag_of_words(filename, vocab):
    words = stemmed_document(filename)
    
    # Bag of Words
    bow = np.zeros(len(vocab))
    for word in words:
        if word in vocab:
            index = vocab[word]
            bow[index] += 1
    
    return bow

#### Train Set

In [11]:
documents = []
labels = []

for f in os.listdir("recipes/train"):
    if "american" in f:
        final_path = "recipes/train/" + f
        bow = bag_of_words(final_path, vocabulary)
        documents.append(bow)
        labels.append(0)

for f in os.listdir("recipes/train"):
    if "asian" in f:
        final_path = "recipes/train/" + f
        bow = bag_of_words(final_path, vocabulary)
        documents.append(bow)
        labels.append(1)

for f in os.listdir("recipes/train"):
    if "french" in f:
        final_path = "recipes/train/" + f
        bow = bag_of_words(final_path, vocabulary)
        documents.append(bow)
        labels.append(2)

for f in os.listdir("recipes/train"):
    if "indian" in f:
        final_path = "recipes/train/" + f
        bow = bag_of_words(final_path, vocabulary)
        documents.append(bow)
        labels.append(3)

for f in os.listdir("recipes/train"):
    if "italian" in f:
        final_path = "recipes/train/" + f
        bow = bag_of_words(final_path, vocabulary)
        documents.append(bow)
        labels.append(4)

for f in os.listdir("recipes/train"):
    if "jewish" in f:
        final_path = "recipes/train/" + f
        bow = bag_of_words(final_path, vocabulary)
        documents.append(bow)
        labels.append(5)

for f in os.listdir("recipes/train"):
    if "mexican" in f:
        final_path = "recipes/train/" + f
        bow = bag_of_words(final_path, vocabulary)
        documents.append(bow)
        labels.append(6)

for f in os.listdir("recipes/train"):
    if "middle_eastern" in f:
        final_path = "recipes/train/" + f
        bow = bag_of_words(final_path, vocabulary)
        documents.append(bow)
        labels.append(7)


X = np.stack(documents)
Y = np.array(labels)

data = np.concatenate([X, Y[:, None]], 1)

np.savetxt("train_bow.txt.gz", data) # Save into a file / .gz compresses the file

#### Test Set

In [12]:
documents = []
labels = []

for f in os.listdir("recipes/test"):
    if "american" in f:
        final_path = "recipes/test/" + f
        bow = bag_of_words(final_path, vocabulary)
        documents.append(bow)
        labels.append(0)

for f in os.listdir("recipes/test"):
    if "asian" in f:
        final_path = "recipes/test/" + f
        bow = bag_of_words(final_path, vocabulary)
        documents.append(bow)
        labels.append(1)

for f in os.listdir("recipes/test"):
    if "french" in f:
        final_path = "recipes/test/" + f
        bow = bag_of_words(final_path, vocabulary)
        documents.append(bow)
        labels.append(2)

for f in os.listdir("recipes/test"):
    if "indian" in f:
        final_path = "recipes/test/" + f
        bow = bag_of_words(final_path, vocabulary)
        documents.append(bow)
        labels.append(3)

for f in os.listdir("recipes/test"):
    if "italian" in f:
        final_path = "recipes/test/" + f
        bow = bag_of_words(final_path, vocabulary)
        documents.append(bow)
        labels.append(4)

for f in os.listdir("recipes/test"):
    if "jewish" in f:
        final_path = "recipes/test/" + f
        bow = bag_of_words(final_path, vocabulary)
        documents.append(bow)
        labels.append(5)

for f in os.listdir("recipes/test"):
    if "mexican" in f:
        final_path = "recipes/test/" + f
        bow = bag_of_words(final_path, vocabulary)
        documents.append(bow)
        labels.append(6)

for f in os.listdir("recipes/test"):
    if "middle_eastern" in f:
        final_path = "recipes/test/" + f
        bow = bag_of_words(final_path, vocabulary)
        documents.append(bow)
        labels.append(7)


X = np.stack(documents)
Y = np.array(labels)

data = np.concatenate([X, Y[:, None]], 1)

np.savetxt("test_bow.txt.gz", data) # Save into a file / .gz compresses the file