In [1]:
%load_ext autoreload
%autoreload 2
%run ../nb_config.py

In [2]:
import spacy
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim.models.word2vec import LineSentence
from gensim.models.phrases import Phrases, Phraser
from pathlib import Path

In [3]:
from src.nlp_quant.sec_filings import parse_sec_tenks
from src.nlp_quant.sec_filings import n_grams
from src.load_data import io_utils

In [5]:
# Spacy lang model
nlp = spacy.load('en_core_web_sm', disable=['ner'])
nlp.max_length = parse_sec_tenks.max_doc_length

In [6]:
# Parameters
sections = ['1', '1a', '7', '7a']
max_doc_length = 6000000
min_sentece_length = 5

phrases_args = {'min_count': 25,  # ignore terms with a lower count
                'threshold': 0.5,  # accept phrases with higher score
                'max_vocab_size': 40000000,  # prune of less common words to limit memory use
                'delimiter': b'_',  # how to join ngram tokens
                'progress_per': 50000,  # log progress every
                'scoring': 'npmi'}
max_ngram_length = 3

# Sample Pipeline

In [7]:
sec_path = Path(io_utils.raw_path, 'sec_filings', 'smpl') 
filing_path = sec_path / 'tenks'
sections_path = sec_path / 'tenks_sections' 
clean_path = sec_path / 'tenks_selected_sections'
lemma_path = sec_path / 'tenks_lemma_selected_sections'
ngram_path = sec_path / 'tenks_ngrams'
stats_path = sec_path / 'tenks_corpus_stats'

In [8]:
parse_sec_tenks.identify_sections(inpath=filing_path, outpath=sections_path)

In [9]:
parse_sec_tenks.parse_sections(nlp=nlp, sections=sections, text_col='text', item_col='item',
                   inpath=sections_path, outpath_sentences=clean_path, outpath_docs=lemma_path)

In [10]:
n_grams.create_unigrams(min_length=parse_sec_tenks.min_sentece_length,
                        inpath=clean_path, outpath_ngram=ngram_path, outpath_stats=stats_path)

0 

In [11]:
n_grams.create_ngrams(max_length=max_ngram_length, phrases_args=phrases_args,
                      ngram_path=ngram_path, stats_path=stats_path)

2 3 
	Duration:  00:00:06
	ngrams: 11,698

length
2    9563
3    2135
dtype: int64


In [12]:
corpus = [str(x) for x in lemma_path.glob('*.txt')]  

In [13]:
vectorizer = CountVectorizer(input='filename', max_df=0.95, min_df=.01)
X_bow = vectorizer.fit_transform(corpus)

In [14]:
pd.DataFrame(X_bow.todense(), columns=vectorizer.vocabulary_)

Unnamed: 0,safe,harbor,create,amend,securities,historical,fact,deem,word,seek,wording,expectation,looking,harm,undertake,revise,update,reflect,circumstance,sandisk,collectively,refer,corporation,delaware,subsidiary,...,marginal,variability,proportionate,spare,accrued,meritorious,argument,lump,pertinent,discounting,recognizable,concerned,lapsing,indemnifie,harmless,persuasive,collectibility,holdback,destination,fifo,identifiable,necessity,reassess,mercer,attribution
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5,0,6,4,4,0,0,...,1,0,0,16,0,0,0,0,1,0,0,0,27,12,0,5,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,1,0,0,0,0,0,10,5,1,0,0,1,0,0,0,1,0,0,...,0,0,0,2,0,0,0,0,0,0,0,0,0,1,0,0,2,0,0,0,0,0,0,0,0
2,0,1,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,4,0,2,0,...,0,0,0,17,4,0,0,0,0,0,0,0,0,40,1,0,42,0,0,0,0,0,4,0,0
3,0,0,3,0,1,1,0,0,0,0,0,0,0,0,0,10,0,0,5,0,0,3,0,0,0,...,2,0,0,4,0,0,0,0,0,0,0,0,0,1,0,0,3,0,0,0,4,5,0,0,0
4,0,0,0,1,0,2,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,5,0,0,0,...,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,5
5,0,0,0,0,0,1,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,...,1,0,0,10,0,0,0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,3,0,0,0,1,4,0,1,0,...,0,0,0,19,0,0,1,0,0,0,0,0,0,46,0,0,0,0,0,0,0,0,0,1,0
7,1,0,0,0,2,0,0,1,0,0,0,0,0,0,0,0,0,0,2,4,0,1,1,0,0,...,0,20,11,4,0,3,0,0,0,0,0,0,0,1,0,0,2,0,0,1,0,0,0,0,4
8,0,0,1,0,0,1,0,2,0,1,0,0,0,0,0,0,0,5,18,15,0,13,5,0,0,...,0,0,0,0,0,0,0,0,0,11,1,0,0,8,0,0,0,8,1,0,0,0,0,0,0
9,0,0,0,0,0,1,0,0,0,0,0,2,0,0,0,0,0,8,0,1,0,1,3,0,2,...,1,0,0,4,0,0,0,0,0,0,0,14,3,0,0,0,0,0,0,0,0,0,0,0,0


In [15]:
tfidf = TfidfVectorizer(input='filename', max_df=0.95, min_df=.01)
X_tfidf = tfidf.fit_transform(corpus)

In [16]:
pd.DataFrame(X_tfidf.todense(), columns=tfidf.vocabulary_)

Unnamed: 0,safe,harbor,create,amend,securities,historical,fact,deem,word,seek,wording,expectation,looking,harm,undertake,revise,update,reflect,circumstance,sandisk,collectively,refer,corporation,delaware,subsidiary,...,marginal,variability,proportionate,spare,accrued,meritorious,argument,lump,pertinent,discounting,recognizable,concerned,lapsing,indemnifie,harmless,persuasive,collectibility,holdback,destination,fifo,identifiable,necessity,reassess,mercer,attribution
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.006759,0.0,0.011611,0.003687,0.005407,0.0,0.0,...,0.001505,0.0,0.0,0.016169,0.0,0.0,0.0,0.0,0.002276,0.0,0.0,0.0,0.05225,0.012127,0.0,0.011382,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.002201,0.0,0.0,0.0,0.0,0.0,0.037069,0.018535,0.003151,0.0,0.0,0.002201,0.0,0.0,0.0,0.002201,0.0,0.0,...,0.0,0.0,0.0,0.003291,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001646,0.0,0.0,0.004902,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.001963,0.00146,0.0,0.0,0.0,0.001963,0.0,0.001669,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00318,0.0,0.003337,0.0,...,0.0,0.0,0.0,0.014813,0.007852,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.034855,0.001963,0.0,0.054514,0.0,0.0,0.0,0.0,0.0,0.006675,0.0,0.0
3,0.0,0.0,0.008539,0.0,0.003253,0.002273,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.032532,0.0,0.0,0.011363,0.0,0.0,0.004649,0.0,0.0,0.0,...,0.005061,0.0,0.0,0.006795,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001699,0.0,0.0,0.007591,0.0,0.0,0.0,0.015308,0.019135,0.0,0.0,0.0
4,0.0,0.0,0.0,0.005934,0.0,0.007047,0.0,0.0,0.0,0.0,0.0,0.0,0.005934,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.012014,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.005934,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.015132,0.0,0.025221
5,0.0,0.0,0.0,0.0,0.0,0.003214,0.0,0.003214,0.004601,0.0,0.005413,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.002192,0.0,0.0,0.0,...,0.003579,0.0,0.0,0.024028,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.019222,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.005035,0.0,0.0,0.0,0.001427,0.002718,0.0,0.001427,0.0,...,0.0,0.0,0.0,0.014154,0.0,0.0,0.001678,0.0,0.0,0.0,0.0,0.0,0.0,0.034269,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001678,0.0
7,0.003807,0.0,0.0,0.0,0.006473,0.0,0.0,0.002261,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.004521,0.011325,0.0,0.001542,0.002261,0.0,0.0,...,0.0,0.07614,0.041877,0.00676,0.0,0.011421,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00169,0.0,0.0,0.005035,0.0,0.0,0.003807,0.0,0.0,0.0,0.0,0.012945
8,0.0,0.0,0.001369,0.0,0.0,0.001093,0.0,0.002186,0.0,0.001841,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.007824,0.019676,0.020536,0.0,0.009691,0.005466,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.020249,0.001841,0.0,0.0,0.006537,0.0,0.0,0.0,0.014727,0.001841,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.00393,0.0,0.0,0.0,0.0,0.0,0.013235,0.0,0.0,0.0,0.0,0.0,0.045004,0.0,0.004922,0.0,0.00268,0.011789,0.0,0.013235,...,0.004376,0.0,0.0,0.01175,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.092645,0.016877,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Example

In [17]:
filing0 = list(filing_path.glob('*.txt'))[0].read_text()

In [18]:
filing0[:1000]

' \t \t UNITED STATES SECURITIES AND EXCHANGE COMMISSION WASHINGTON, D.C. 20549 FORM 10-K (Mark One) R ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934 For the fiscal year ended December 30, 2012 OR TRANSITION REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934 For the transition period from ________________ to ________________ Commission file number: 000-26734 SANDISK CORPORATION (Exact name of registrant as specified in its charter) Delaware 77-0191793 (State or other jurisdiction of (I.R.S. Employer incorporation or organization) Identification No.) 951 SanDisk Drive Milpitas, California 95035 (Address of principal executive offices) (Zip Code) (408) 801-1000 (Registrant s telephone number, including area code) Securities registered pursuant to Section 12(b) of the Act: Title of each class Name of each exchange on which registered Common Stock, $0.001 par value; Rights to Purchase Series A Junior Participating Preferred S

## Raw filling section identification

In [19]:
filing0_sections =  parse_sec_tenks.filing_section_identification(filing0)

In [20]:
filing0_sections

item
1     business this annual report on form 10-k conta...
1a    risk factors our operating results may fluctua...
1b                 unresolved staff comments none. 25 â
2     properties our corporate headquarters are loca...
3     legal proceedings see note 16, litigation, in ...
4     mine safety disclosures not applicable. 26 par...
5     market for registrant s common equity, related...
6     selected financial data fiscal years ended dec...
7     management s discussion and analysis of financ...
7a    quantitative and qualitative disclosures about...
8     financial statements and supplementary data th...
9     changes in and disagreements with accountants ...
9a    controls and procedures evaluation of disclosu...
9b      other information not applicable. 49 part iii â
10    directors, executive officers and corporate go...
11    executive compensation the information require...
12    security ownership of certain beneficial owner...
13    certain relationships and related tra

## Filter sections: Get sections sentences and lemmatized document

In [21]:
filling0_item1 = nlp(filing0_sections[1])

In [22]:
filling0_item1_clean, filling0_item1_lemma = parse_sec_tenks.preprocessor(filling0_item1)
filling0_item1_clean[:100]

'risk factors operating results fluctuate significantly harm financial condition stock price quarterl'

In [23]:
filling0_item1_lemma[:100]

'risk factor operating result fluctuate significantly harm financial condition stock price quarterly '

In [24]:
filling0_item1_sections, filling0_item1_lemma = parse_sec_tenks.doc_sentence_preprocessor(filling0_item1, 0)

In [25]:
filling0_item1_sections[:2]

sentence
0    risk factors operating results fluctuate signi...
1    quarterly annual operating results fluctuated ...
Name: text, dtype: object

In [26]:
filling0_item1_lemma[:2]

['risk factor operating result fluctuate significantly harm financial condition stock price',
 'quarterly annual operating result fluctuate significantly past expect continue fluctuate future']

In [27]:
filling0_item1_lemma_doc = " ".join(filling0_item1_lemma)
filling0_item1_lemma_doc[:100]

'risk factor operating result fluctuate significantly harm financial condition stock price quarterly '

## From section sentences to n-grams: Phrasers Dectection

In [28]:
sentences = LineSentence(ngram_path / 'ngrams_1.txt')
phrases = Phrases(sentences=sentences)

In [29]:
list(phrases.export_phrases(sentences))[0:10]

[(b'annual report', 147.101466844466),
 (b'forward looking', 501.9094890510949),
 (b'act amended', 105.58149496680232),
 (b'securities exchange', 27.131471728112892),
 (b'act amended', 105.58149496680232),
 (b'forward looking', 501.9094890510949),
 (b'forward looking', 501.9094890510949),
 (b'subject risks', 15.653250773993808),
 (b'actual results', 54.764500349406006),
 (b'differ materially', 353.0339539978094)]

In [30]:
bigrams = Phraser(phrases)
bigrams_sentences = bigrams[sentences]
bigrams_sentences = list(bigrams_sentences)
bigrams_sentences[:2]

[['annual_report',
  'form',
  'contains',
  'forward_looking',
  'statements',
  'future',
  'events',
  'future',
  'results',
  'subject',
  'safe',
  'harbors',
  'created',
  'securities',
  'act_amended',
  'securities_exchange',
  'act_amended'],
 ['statements',
  'statements',
  'historical',
  'fact',
  'statements',
  'deemed',
  'forward_looking',
  'statements']]

# Pipeline

In [None]:
sec_path = Path(io_utils.raw_path, 'sec_filings') 
filing_path = sec_path / 'tenks'
sections_path = sec_path / 'tenks_sections' 
clean_path = sec_path / 'tenks_selected_sections'
lemma_path = sec_path / 'tenks_lemma_selected_sections'
ngram_path = sec_path / 'tenks_ngrams'
stats_path = sec_path / 'tenks_corpus_stats'

In [None]:
parse_sec_tenks.identify_sections(inpath=filing_path, outpath=sections_path)

In [None]:
parse_sec_tenks.parse_sections(nlp=nlp, sections=sections, text_col='text', item_col='item',
                   inpath=sections_path, outpath_sentences=clean_path, outpath_docs=lemma_path)

In [None]:
n_grams.create_unigrams(min_length=min_sentece_length, inpath=clean_path, outpath_ngram=ngram_path, outpath_stats=stats_path,)

In [None]:
n_grams.create_ngrams(max_length=max_ngram_length, phrases_args=phrases_args, ngram_path=ngram_path, stats_path=stats_path)