# Text Classification with fastText

In [144]:
# import modules
import pandas as pd
import numpy as np
import fastText

In [145]:
# read in file from csv
df = pd.read_csv('news-corpus-df.csv')

# drop articles from df that have less than 200 words of text
new_df = df.drop(df[df.text_len < 250].index)

# subset source bias, and article text
new_df = new_df.loc[:, ['bias', 'text']]
new_df.head()

Unnamed: 0,bias,text
0,Left,"Jared Bernstein, a former chief economist to V..."
1,Right,Liberals have opposed virtually every move Pre...
2,Center,CLOSE President Trump’s once bitter political ...
3,Center,"The attorneys for Michael Cohen, President Don..."
4,Left,Longtime Trump lawyer Michael Cohen is changin...


### Text Prep for fastText

In [146]:
# prepare to write txt file (only for fastText)
new_df = new_df.replace('Left ', 'Left')
new_df = new_df.replace('Right ', 'Right')
new_df = new_df.replace('Center ', 'Center')
new_df.head()

Unnamed: 0,bias,text
0,Left,"Jared Bernstein, a former chief economist to V..."
1,Right,Liberals have opposed virtually every move Pre...
2,Center,CLOSE President Trump’s once bitter political ...
3,Center,"The attorneys for Michael Cohen, President Don..."
4,Left,Longtime Trump lawyer Michael Cohen is changin...


In [147]:
# get the shape of the df
num_rows = new_df.shape[0]

# get the number of words in the corpus
corpus_size = 0
for i in range(0, num_rows):
    corpus_size += len(new_df.iloc[i][1])

print('There are '+str(num_rows)+' articles and '+str(corpus_size)+' words in the dataset.')

There are 3162 articles and 14233550 words in the dataset.


In [148]:
# create function to quickly clean text

# import modules from nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
import string

# define stop lists
letters = list(string.ascii_lowercase)
numbers = [str(i) for i in range(0, 1000)]
boilerplate = ['000', '’s', '―', '/', 'playback', 'get', 'news', 'report', 'unsubscribe', 'they', 'must', 'share', 'that', 'view', 'hide', 'copy', 'something', 'enlarge', 'reprint', 'read', '_', 'videos', 'autoplay', 'watched', 'press', '’ve', 'toggle', 'around', 'the', 's.', 'said', 'here©', 'ad', '#', 'andhis', 'click', 'r', 'device', 'contributed', 'advertisement', 'the washington', '&', 'follow', 'copyright', 'mrs.', 'photo', 'to', 'also', 'times', 'for', 'however', 'fox', 'this', 'copyright ©', 'ofs', 'just', 'wait', 'n’t', 'told', 'unsupported', 'i', 'caption', 'ms.', '’m', 'paste', '’re', 'replay', 'photos', 'mr.', '©', 'skip', 'watch', '2018', 'cut', 'llc', 'more', 'post', 'embed', 'blog', 'b.', 'associated', 'permission']
stop_list = set(stopwords.words('english') + boilerplate + numbers + letters)

# create a tokenizer
re_tokenizer = RegexpTokenizer(r'\w+')
    
# simple preprocessing 
def clean(text):
    
    collection = []
    
    for i in range(0,len(text)):

        article = text[i].lower()
        
        test_tokens = re_tokenizer.tokenize(article)

        clean_tokens = []
        for t in test_tokens:
            if t not in stop_list:
                clean_tokens.append(t)
                
        collection.append(clean_tokens)
    
    #clean_string = (' ').join(collection)
    return collection

In [149]:
'''
rewrite the dataframe such that: 
-clean text
-option to convert training examples to paragraphs rather than documents

'''
def convert_text(option):
    new_docset = []

    # iterate through all the rows of the df
    for i in range(0, num_rows):
        # extract label and text information
        label = new_df.iloc[i][0]
        text = new_df.iloc[i][1]
        
        if option=='doc':
            clean_text = ' '.join([' '.join(line) for line in clean(text.splitlines())])
            new_docset.append([label, clean_text])
            
        elif option=='para':
            # segment texts by paragraph and treat as individual documents
            clean_text = [' '.join(line) for line in clean(text.splitlines())]
            for line in clean_text:
                new_docset.append([label, line])
                
    return new_docset

doc_text = convert_text('doc')
para_text = convert_text('para')

In [150]:
# examine properties of new corpus
def corpus_info(corpus, name):
    right_ct = 0
    left_ct = 0
    center_ct = 0
    
    for doc in corpus:
        if doc[0] == 'Left':
            left_ct +=1
        elif doc[0] == 'Right':
            right_ct +=1
        else:
            center_ct += 1
    
    print(name)
    print('Size:', len(corpus))
    print('Left:', '{0:.2f}'.format(left_ct/len(corpus)))
    print('Right:', '{0:.2f}'.format(right_ct/len(corpus)))
    print('Center:', '{0:.2f}'.format(center_ct/len(corpus)))

In [151]:
corpus_info(doc_text, 'Original Corpus')

Original Corpus
Size: 3162
Left: 0.42
Right: 0.33
Center: 0.26


In [152]:
corpus_info(para_text, 'Paragraph Corpus')

Paragraph Corpus
Size: 64309
Left: 0.45
Right: 0.28
Center: 0.27


In [142]:
# shuffle corpus
# from random import shuffle
# shuffle(para_text)

# partition into train and test sets
#train = para_text[0:51447]
#test = para_text[51447:64309]

# partition into train and test sets
train = doc_text[0:2532]
test = doc_text[2532:3162]

# evaluate training and test sets
corpus_info(train, 'Train')
corpus_info(test, 'Test')

Train
Size: 2532
Left: 0.40
Right: 0.33
Center: 0.28
Test
Size: 630
Left: 0.49
Right: 0.34
Center: 0.17


In [154]:
# function to write corpus data to text file
from os import path
fastText_dir = path.join(path.dirname("__file__"), 'fastText')

def write_file(corpus, label):
    f = open('fastText/'+label+'.txt', 'wt', encoding='utf-8')
    for instance in corpus:
        label = instance[0]
        text = instance[1]
        f.write('%s\t%s\n' % (label, text))

write_file(para_text, 'para')
write_file(doc_text, 'doc')
# create training and test files
#write_file(train, 'train')
#write_file(test, 'test')

In [129]:
X, y = [], []
with open('fastText/doc.train.txt', "r") as infile:
    for line in infile:
        label, text = line.split("\t")
        X.append(text.split())
        y.append(label)

In [119]:
# function to get model results
def test_model(model, test_data):
    result = model.test(test_data)
    print('Precision@1:', result[1])
    print('Recall@1:', result[2])
    print('Number of examples:', result[0])

In [120]:
# get file information
news_input = path.join(fastText_dir, 'train.txt')
news_test = path.join(fastText_dir, 'test.txt')

In [121]:
# import fastText
import fastText as ft

# Info to save the model
model_dir = path.join(path.dirname("__file__"), 'models')
news_output = path.join(model_dir, 'news')

In [69]:
news_model = ft.train_supervised(news_input, lr=0.1, dim=100, ws=5, 
                              epoch=5, minCount=1, minCountLabel=0, minn=0, maxn=0, 
                              neg=5, wordNgrams=1, loss='softmax', bucket=2000000, 
                              thread=12, lrUpdateRate=100, t=0.0001, label='__label__', 
                                 verbose=2, pretrainedVectors='')

In [70]:
test_model(news_model, news_test)

Precision@1: 0.4746031746031746
Recall@1: 0.4746031746031746
Number of examples: 630


In [None]:
import argparse
import numpy as np
from sklearn.metrics import confusion_matrix

def parse_labels(path):
    with open(path) as f:
        return np.array(list(map(lambda x: int(x[9:]), f.read().split())))

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Display confusion matrix.')
    parser.add_argument('test', help='Path to test labels')
    parser.add_argument('predict', help='Path to predictions')
    args = parser.parse_args()
    test_labels = parse_labels(args.test)
    pred_labels = parse_labels(args.predict)
    eq = test_labels == pred_labels
    print("Accuracy: " + str(eq.sum() / len(test_labels)))
    print(confusion_matrix(test_labels, pred_labels))

In [48]:
help(fastText.FastText)

Help on module fastText.FastText in fastText:

NAME
    fastText.FastText

DESCRIPTION
    # Copyright (c) 2017-present, Facebook, Inc.
    # All rights reserved.
    #
    # This source code is licensed under the BSD-style license found in the
    # LICENSE file in the root directory of this source tree. An additional grant
    # of patent rights can be found in the PATENTS file in the same directory.

FUNCTIONS
    load_model(path)
        Load a model given a filepath and return a model object.
    
    tokenize(text)
        Given a string of text, tokenize it and return a list of tokens
    
    train_supervised(input, lr=0.1, dim=100, ws=5, epoch=5, minCount=1, minCountLabel=0, minn=0, maxn=0, neg=5, wordNgrams=1, loss='softmax', bucket=2000000, thread=12, lrUpdateRate=100, t=0.0001, label='__label__', verbose=2, pretrainedVectors='')
        Train a supervised model and return a model object.
        
        input must be a filepath. The input text does not need to be tokenized