# fastText supervised learning example

This notebook is inspired by the [Supervised Learning fastText tutorial](https://github.com/facebookresearch/fastText/blob/master/tutorials/supervised-learning.md)

In [1]:
def read_data(filename):
    """
    Lazily yield the lines of a text file, one at a time.

    Because this is a generator, only one line is handed out at a time,
    which makes it easier to process BIG text files. The file is opened
    when the first line is requested and closed when iteration ends (or
    when the generator is closed).

    :param filename: path of the text file to read
    :return: a generator over the lines, each keeping its line ending
    """
    # Named `source` rather than `input` so the builtin input() is not shadowed.
    with open(filename, 'r') as source:
        for line in source:
            yield line

In [2]:
def write_data(filename, data):
    """
    Append one record to a file, terminated by a newline.

    The file is opened in append mode, so whatever it already contains is
    kept and the new record is added at the end.

    :param filename: path of the file to append to
    :param data: the value to write; it is rendered through ``str.format``
    """
    with open(filename, "a") as sink:
        record = '{}\n'.format(data)
        sink.write(record)

In [3]:
from string import punctuation
from nltk.corpus import stopwords

# Cache for the English stopword set, filled on the first preprocess() call.
_ENGLISH_STOPWORDS = None


def _english_stopwords():
    """Return the NLTK English stopwords as a frozenset, loading them once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def preprocess(data):
    """
    Normalize a piece of text: lowercase every token and filter out
    stopwords and punctuation.

    :param data: the string data to be processed
    :return: the remaining lowercased tokens joined by single spaces
             (an empty string when no token is left)
    """
    # Pad punctuation with spaces on both sides so that each mark becomes a
    # token of its own once the text is split on whitespace.
    for char in ['.', '"', ',', '(', ')', '!', '?', ';', ':']:
        data = data.replace(char, ' ' + char + ' ')

    # stopwords.words() is called only once per session, and the frozenset
    # gives constant-time membership tests instead of scanning a list.
    sw = _english_stopwords()

    lowered_tokens = (token.lower() for token in data.split())
    # `punctuation` is a string, so `in` is a substring test: a token is
    # dropped when it appears as a contiguous run inside string.punctuation.
    kept_tokens = (
        token for token in lowered_tokens
        if token not in punctuation and token not in sw
    )
    return " ".join(kept_tokens)

In [4]:
from itertools import islice

def pipeline(input_filename, output_filename, limit=None):
    """
    Preprocess the rows of a text file and write the results to another file.

    Every row read from the input goes through ``preprocess``; rows that
    come back empty are skipped and every other result is written on its
    own line. Any previous content of the output file is discarded.

    :param input_filename: name of the input filename
    :param output_filename: name of the output filename
    :param limit: get the first N rows (``None`` processes every row)
    """
    rows = read_data(input_filename)
    try:
        # Mode 'w' truncates a previous output file, and keeping the handle
        # open avoids re-opening the file in append mode for every row.
        with open(output_filename, 'w') as output:
            for row in islice(rows, limit):
                data = preprocess(row)
                if data:
                    output.write('{}\n'.format(data))
    finally:
        # When `limit` stops the iteration early the generator is still
        # suspended inside its `with` block; closing it releases the input
        # file right away.
        rows.close()

In [5]:
def test_model(model, test_data):
    """
    Run ``model.test`` on the given data and print the resulting metrics.

    :param model: object exposing a ``test`` method whose result has the
                  attributes ``precision``, ``recall`` and ``nexamples``
    :param test_data: passed unchanged to ``model.test`` (the calls in this
                      notebook pass the path of a test file)
    """
    outcome = model.test(test_data)
    # (caption, attribute) pairs, printed in this order.
    report = (
        ('Precision@1:', 'precision'),
        ('Recall@1:', 'recall'),
        ('Number of examples:', 'nexamples'),
    )
    for caption, attribute in report:
        print(caption, getattr(outcome, attribute))

In [6]:
from os import path

# path.dirname() is applied to the string literal "__file__" and returns '',
# so data_dir is the relative path 'data' (resolved against the working
# directory).
data_dir = path.join(path.dirname("__file__"), 'data')

# Raw files and the normalized copies that pipeline() writes next to them.
cooking_input, cooking_input_norm, cooking_test, cooking_test_norm = (
    path.join(data_dir, name)
    for name in (
        'cooking.train',
        'cooking.train_norm',
        'cooking.test',
        'cooking.test_norm',
    )
)

In [7]:
# Write the normalized copy of the training set.
pipeline(input_filename=cooking_input, output_filename=cooking_input_norm)

### Using fasttext

In [8]:
import fasttext as ft

# Info to save the model.
# path.dirname() is applied to the string literal "__file__" and returns '',
# so model_dir is the relative path 'models' (resolved against the working
# directory).
model_dir = path.join(path.dirname("__file__"), 'models')
cooking_output = path.join(model_dir, 'cooking')

### Not normalized input

In [None]:
# Train a supervised fastText model on the raw (not normalized) training file.
# NOTE(review): cooking_output is presumably the path prefix under which
# fastText stores the trained model — confirm against the fasttext API.
cooking_model = ft.supervised(cooking_input, cooking_output, lr=1.0, epoch=10, silent=0)

In [None]:
# Evaluate the model trained on raw text against the raw test set.
test_model(model=cooking_model, test_data=cooking_test)

### Normalized input

In [None]:
# Same training call as for the raw data, but fed with the normalized
# training file.
# NOTE(review): cooking_output is reused here; if fastText saves the model
# under that path, this run presumably overwrites the model trained on the raw
# data — confirm, and use a distinct output name if both files are needed.
cooking_norm_model = ft.supervised(cooking_input_norm, cooking_output, lr=1.0, epoch=10, silent=0)

In [None]:
# The normalized model is evaluated on a test set that went through the same
# preprocessing as its training data.
pipeline(input_filename=cooking_test, output_filename=cooking_test_norm)
test_model(model=cooking_norm_model, test_data=cooking_test_norm)

### Load existing model

In [None]:
# cooking_output_filename = path.join(current_dir, 'test', 'model_cooking.bin')
# model = ft.load_model(cooking_output_filename)

### Predictions

In [None]:
# A single raw question to classify.
texts = ['Why not put knives in the dishwasher?']

# Predict with the model trained on the raw (not normalized) data.
labels = cooking_model.predict(texts)
print(labels)

In [None]:
# The normalized model was trained on preprocessed text, so the queries go
# through the same preprocess() step before prediction (just like the test
# set is normalized before evaluation).
labels = cooking_norm_model.predict([preprocess(text) for text in texts])
print(labels)