In [1]:
import pandas as pd
import numpy as np
import pickle
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.decomposition import TruncatedSVD
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from keras.preprocessing.text import Tokenizer
from collections import Counter
import six

Using TensorFlow backend.


Load the raw 10-K file and output textual features for classification with sklearn and Keras.  
Inputs: 
- data/10k/10k_raw.pickle: a pickled pandas DataFrame containing gvkey, fyear, and the MD&A section text

Outputs:
- data/10k/10k_index.csv: an index file (gvkey, fyear) that preserves the row order of the corpus
- data/10k/X_keras_unigram.npy: unigram word-id sequences for Keras
- data/10k/X_tfidf.npy: TF-IDF document-term matrix built from the unigram sequences
- data/10k/word_map.pickle: the tokenizer's word-to-id dictionary

# load 10k raw text

## 2017-04-18: Use the full 10-K sample (including 10-KSB etc.)  
Inputs:
- mda_raw_corpus.pickle: generated by the `process_10k` file in the culture project

In [2]:
# Load the pickled MD&A corpus (columns used below: gvkey, fyear, mda_text).
# NOTE: read_pickle unpickles arbitrary Python objects - only load files from a trusted source.
sec_10k = pd.read_pickle("/shared/data/10k_2017/processed_corpus/mda_raw_corpus.pickle")

In [3]:
# Sanity check: (number of filings, number of columns) in the loaded corpus.
sec_10k.shape

(234701, 3)

In [None]:
# Keep only the firm/year identifiers, in corpus row order, and write them out so that
# rows of the feature arrays saved later can be matched back to filings.
sec10k_index = sec_10k.loc[:, ['gvkey', 'fyear']]
sec10k_index.to_csv(path_or_buf="/shared/data/10k_2017/processed_corpus/10k_index.csv")

In [None]:
# Drop filings whose MD&A text is the empty string.
# NOTE(review): rows whose mda_text is None/NaN (if any exist) also compare unequal
# to "" and are therefore kept - confirm that "" is the only missing-value marker.
sec_10k_no_NA = sec_10k.loc[sec_10k['mda_text'].ne("")]

In [None]:
# Sanity check: how many filings remain after removing empty MD&A sections.
sec_10k_no_NA.shape

In [17]:
# Persist the filtered corpus (filings with non-empty MD&A text) next to the raw one.
sec_10k_no_NA.to_pickle("/shared/data/10k_2017/processed_corpus/mda_raw_corpus_no_none.pickle")

In [18]:
# Same identifier index as for the full corpus, but restricted to the filings that
# survived the empty-MD&A filter; row order follows sec_10k_no_NA.
sec10k_index_no_NA = sec_10k_no_NA.loc[:, ['gvkey', 'fyear']]
sec10k_index_no_NA.to_csv(path_or_buf="/shared/data/10k_2017/processed_corpus/10k_index_no_none.csv")

# use keras to pre-process text

In [4]:
# Vocabulary cap handed to the Keras tokenizer further down.
nb_words = 10000
# Pull the MD&A column out as a plain Python list (one entry per filing, in corpus
# row order), then unbind the DataFrame name so the frame can be garbage-collected
# once nothing else references it.
mda_text_list = sec_10k.loc[:, 'mda_text'].tolist()
del sec_10k

In [None]:
# Sanity check: number of MD&A documents that will be tokenized.
len(mda_text_list)

In [5]:
# Load the previously fitted Keras tokenizer instead of re-fitting it on the corpus.
# The context manager guarantees the file handle is closed even if unpickling fails.
# NOTE: pickle.load can execute arbitrary code - only load a tokenizer file you produced yourself.
with open("data/10k/tokenizer.pickle", 'rb') as tokenizer_file:
    tokenizer = pickle.load(tokenizer_file)
# Apply the vocabulary cap chosen in this notebook to the loaded tokenizer.
tokenizer.nb_words = nb_words

In [6]:
# Vocabulary size of the loaded tokenizer before any filtering.
len(tokenizer.word_counts)

876768

In [7]:
import re
def filter_tokenizer_word_index(word_index, word_counts, rare_threshold = 5, most_common = 100):
    '''Filter a tokenizer vocabulary down to the words to include in the corpus.

    A word is removed from both mappings when any of the following holds:
      * it contains no alphabetic character, or has fewer than 3 / more than 18 characters;
      * it is one of the hard-coded HTML/CSS artefacts (e.g. 'div', 'nbsp'), or it
        starts with a CSS-style size such as '10pt', '12px' or '1in';
      * its count in ``word_counts`` is below ``rare_threshold`` (a word that is
        missing from ``word_counts`` is treated as having count 0);
      * it is among the ``most_common`` most frequent words that survived the
        filters above.

    Parameters
    ----------
    word_index : dict
        Maps word -> integer id.
    word_counts : dict
        Maps word -> number of occurrences.
    rare_threshold : int
        Minimum count a word needs in order to be kept.
    most_common : int
        How many of the most frequent remaining words to drop (0 drops none).

    Returns
    -------
    (word_index, word_counts) : tuple of dict
        New, filtered dictionaries. The inputs are not modified and the ids in
        ``word_index`` are returned unchanged (they are not renumbered).

    Side effects
    ------------
    Prints the set of most-common words that were removed.
    '''
    # html codes and other words
    to_filter = set(['div', 'align', 'border', 'color', 'font', 'left', 'colspan', 'roman', 'roman\'','valign', 'family', 'hidden', 'bottom', 'times', 'padding', 'rowspan', 'background', 'class', 'cceeff',
                    'style', 'text', 'medium', 'vertical', 'nbsp', 'width', 'nowrap', 'serif', 'indent', 'height', 'inherit', 'offset','msonormal', 'weight',
                    'top', 'right', 'none'])
    # Raw string: "\d" inside a normal string literal is an invalid escape sequence.
    css_size = re.compile(r"\d+(pt|px|in)")

    def _is_candidate(word):
        # Must contain a letter, be 3-18 characters long, and not be markup residue.
        return (any(c.isalpha() for c in word)
                and 3 <= len(word) <= 18
                and not css_size.match(word)
                and word not in to_filter)

    candidate_index = {word: v for word, v in word_index.items() if _is_candidate(word)}
    candidate_counts = {word: v for word, v in word_counts.items() if _is_candidate(word)}

    # filter most common and rare words
    most_common_words = set(sorted(candidate_counts, key=candidate_counts.get, reverse=True)[:most_common])
    print(most_common_words)

    def _is_kept(word):
        # Default of 0 keeps the comparison valid for words that appear in
        # word_index but have no entry in word_counts (None >= int raises on Python 3).
        return candidate_counts.get(word, 0) >= rare_threshold and word not in most_common_words

    filtered_index = {word: v for word, v in candidate_index.items() if _is_kept(word)}
    filtered_counts = {word: v for word, v in candidate_counts.items() if _is_kept(word)}

    return filtered_index, filtered_counts

In [None]:
# Create a fresh, unfitted Keras tokenizer capped at nb_words.
# NOTE(review): this rebinds `tokenizer`, discarding the instance loaded from
# data/10k/tokenizer.pickle. The In [None] counter suggests this cell was skipped
# in the saved session - confirm before running the notebook top to bottom.
tokenizer = Tokenizer(nb_words=nb_words)

In [8]:
# Prune the tokenizer vocabulary (most_common=0: no frequent words are removed) and
# write both filtered dictionaries back onto the tokenizer in one step.
filtered_tokenizer_dicts = filter_tokenizer_word_index(
    tokenizer.word_index,
    tokenizer.word_counts,
    most_common=0,
)
tokenizer.word_index, tokenizer.word_counts = filtered_tokenizer_dicts

set()


In [9]:
# Vocabulary size after filtering.
len(tokenizer.word_counts)

241953

In [None]:
# Fit the tokenizer vocabulary on the MD&A documents.
# NOTE(review): the In [None] counter suggests this was not run in the saved session
# (the tokenizer was loaded from disk). Running it after the filtering cell would
# presumably rebuild the vocabulary from the updated counts - confirm the intended order.
tokenizer.fit_on_texts(mda_text_list)

In [10]:
# X_seq: one list of word ids per document. Documents differ in length, so this is
# a ragged list of lists rather than a rectangular matrix.
X_seq = tokenizer.texts_to_sequences(mda_text_list)
# Wrap the sequences in an object array. dtype=object is required because recent
# NumPy versions raise ValueError when asked to build an array from ragged nested
# lists without it. The saved .npy file stores the lists via pickle.
X = np.array(X_seq, dtype=object)

In [None]:
# save the tokenized sequence to disk
np.save("data/10k/X_keras_unigram.npy", X)
# np.save("data/10k/X_keras_unigram_20000.npy", X)
# save the word mapping to disk; the context manager closes (and flushes) the file
# so the pickle is complete on disk before any later cell or process reads it
with open("data/10k/word_map.pickle", 'wb') as word_map_file:
    pickle.dump(tokenizer.word_index, word_map_file)
# pickle.dump(tokenizer.word_index, file=open("data/10k/word_map_20000.pickle", 'wb'))

In [None]:
# save the trained tokenizer
# pickle.dump(tokenizer, file=open("data/10k/tokenizer.pickle", 'wb'))
# pickle.dump(tokenizer, file=open("data/10k/tokenizer_20000.pickle", 'wb'))

In [None]:
# Turn the word-id sequences into a TF-IDF weighted document-term matrix.
# NOTE(review): presumably a dense array with one row per document and one column
# per word id below nb_words - for the full corpus that is very large; confirm the
# machine has enough memory before running.
X_tfidf = tokenizer.sequences_to_matrix(X_seq, mode = 'tfidf')

In [None]:
# Ensure X_tfidf is an ndarray. asarray returns an existing ndarray unchanged rather
# than duplicating it (np.array would copy the whole matrix), and still converts
# any other array-like input.
X_tfidf = np.asarray(X_tfidf)

In [None]:
# Persist the TF-IDF matrix; rows follow the order of mda_text_list.
np.save("data/10k/X_tfidf.npy", X_tfidf)

In [28]:
# Peek at ten vocabulary words at positions 10000-10009 when the words are ordered
# by ascending id in the tokenizer's word_index.
sorted(tokenizer.word_index, key=lambda word: tokenizer.word_index[word])[10000:10010]

['reimburses',
 'costless',
 'reside',
 'elk',
 'ultrasound',
 'contemplation',
 'corps',
 'systemic',
 'qspe',
 'knee']