# fastText supervised learning example

This notebook is inspired by the [Supervised Learning fastText tutorial](https://github.com/facebookresearch/fastText/blob/master/tutorials/supervised-learning.md)

In [12]:
def read_data(filename):
    """
    Lazily yield the lines of a text file, one at a time.

    Being a generator, this never loads the whole file into memory, which
    keeps very large inputs cheap to process.

    :param filename: path of the text file to read
    :return: generator over the lines, each keeping its line ending
    """
    with open(filename, 'r') as handle:
        yield from handle

In [13]:
def write_data(filename, data):
    """
    Append one record to a file, followed by a newline.

    The file is opened in append mode, so existing content is kept and the
    file is created when it does not exist yet.

    :param filename: path of the file to append to
    :param data: the value to write; it is rendered through ``str.format``
    """
    with open(filename, "a") as sink:
        line = '{}\n'.format(data)
        sink.write(line)

In [14]:
from string import punctuation
from nltk.corpus import stopwords

# Punctuation marks that are padded with spaces so they split off as tokens.
_PADDED_PUNCTUATION = ('.', '"', ',', '(', ')', '!', '?', ';', ':')


def _english_stopwords():
    """Return the NLTK English stopwords as a frozenset, loading them once."""
    cached = getattr(_english_stopwords, '_cache', None)
    if cached is None:
        # Loading the corpus is the expensive part; keep the result on the
        # function object so repeated preprocess() calls reuse it.
        cached = frozenset(stopwords.words('english'))
        _english_stopwords._cache = cached
    return cached


def preprocess(data):
    """
    Normalise a piece of text into a space-separated string of tokens.

    The listed punctuation marks are padded with spaces, the text is split
    on whitespace and every token is lower-cased. Tokens are then dropped
    when they occur in ``string.punctuation`` (a substring test, so single
    punctuation characters are removed) or when they are English stopwords.

    :param data: the string data to be processed
    :return: the remaining tokens joined by single spaces; an empty string
        when nothing is left
    """
    # Pad punctuation with spaces on both sides
    for char in _PADDED_PUNCTUATION:
        data = data.replace(char, ' ' + char + ' ')
    sw = _english_stopwords()
    lowered_chunks = (item.lower() for item in data.split())
    kept_chunks = (
        chunk
        for chunk in lowered_chunks
        if chunk not in punctuation and chunk not in sw
    )
    return " ".join(kept_chunks)

In [142]:
from itertools import islice

def pipeline(input_filename, output_filename, limit=None):
    """
    Preprocess the rows of a text file and write the results to another file.

    Each input row goes through ``preprocess``; rows that come back empty
    are skipped, every other result is written on its own line. Any
    previous content of the output file is discarded.

    :param input_filename: name of the input filename
    :param output_filename: name of the output filename
    :param limit: get the first N rows (``None`` processes every row)
    """
    # Opening in 'w' mode truncates the previous output, and the single
    # handle is reused for the whole run instead of reopening per row.
    with open(output_filename, 'w') as output:
        for row in islice(read_data(input_filename), limit):
            data = preprocess(row)
            if data:
                output.write('{}\n'.format(data))

In [143]:
def test_model(model, test_data):
    result = model.test(test_data)
    print('Precision@1:', result.precision)
    print('Recall@1:', result.recall)
    print('Number of examples:', result.nexamples)

In [170]:
from os import path

# "__file__" is a quoted string here, not the module attribute, so
# dirname() returns '' and every path below is relative to the current
# working directory.
# NOTE(review): fairframe_test points at a file named 'fairframe.train' --
# confirm this is the intended evaluation set.
data_dir = path.join(path.dirname("__file__"), 'data')
enron_input, fairframe_test = (
    path.join(data_dir, name)
    for name in ('recipient_data.txt', 'fairframe.train')
)


In [171]:
#pipeline(cooking_input, cooking_input_norm)

### Using fastText

In [172]:
def print_results(N, p, r):
    """
    Print an evaluation summary as three tab-separated lines.

    :param N: value printed on the ``N`` line, converted with ``str``
    :param p: value printed on the ``P@1`` line with three decimals
    :param r: value printed on the ``R@1`` line with three decimals
    """
    print("N\t" + str(N))
    for template, score in (("P@{}\t{:.3f}", p), ("R@{}\t{:.3f}", r)):
        print(template.format(1, score))

In [173]:
def results(N, p, r):
    """
    Bundle the three evaluation values into a tuple, unchanged.

    :param N: first value of the returned tuple
    :param p: second value of the returned tuple
    :param r: third value of the returned tuple
    :return: the tuple ``(N, p, r)``
    """
    metrics = (N, p, r)
    return metrics
    

In [174]:
import fastText as ft
from fastText import train_supervised

# Info to save the model.
# "__file__" is a quoted string here, not the module attribute, so
# dirname() returns '' and both paths are relative to the current working
# directory.
# NOTE(review): gender_output is not referenced by any cell visible in this
# notebook -- confirm where the model is actually saved.
model_dir = path.join(path.dirname("__file__"), 'models')
gender_output = path.join(model_dir, 'gender_classification')

In [175]:
import numpy as np

In [176]:
import matplotlib.pyplot as plt

In [None]:
# Train one supervised model per wordNgrams value and record the test
# metrics, so the effect of the hyper-parameter can be compared on a plot.
ngram_values = list(range(5, 10))
N_val = []
precision = []
recall = []
for wordNgrams in ngram_values:
    model = train_supervised(input=enron_input,
                             epoch=50,  # tuned
                             lr=0.00001,
                             ws=2,  # tuned
                             wordNgrams=wordNgrams,  # drops when you increase it
                             verbose=10,
                             minCount=1,
                             dim=200)
    # model.test() is unpacked as (number of examples, precision, recall)
    N, p, r = results(*model.test(fairframe_test))
    N_val.append(N)
    precision.append(p)
    recall.append(r)

#print_results(*model.test(cooking_test))

# Plot against the real wordNgrams values (not the list index) and label
# the curves so precision and recall can be told apart.
plt.plot(ngram_values, precision, label='Precision@1')
plt.plot(ngram_values, recall, label='Recall@1')
plt.xlabel('wordNgrams')
plt.legend()
plt.show()

### Not normalized input

### Normalized input

### Load existing test

### Predictions