In [129]:
import pandas as pd
import re
import os
import json
import pickle
from datetime import datetime, timedelta
from collections import Counter, defaultdict
from tqdm import tqdm

# Importing libraries you need to install
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from string import punctuation

# Import yfinance and pandas_datareader
from pandas_datareader import data as pdr
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer

# import yfinance as yf 

# Override function to store data we get
# yf.pdr_override()

# Import nltk for first step extracting words
import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

# Download the NLTK data packages this notebook relies on
for nltk_resource in ('stopwords', 'wordnet', 'punkt'):
    nltk.download(nltk_resource)

# English stop words from NLTK, extended with a few filing-specific words
stop_words = set(stopwords.words('english')).union(
    {'10-k', 'form', 'table', 'contents', 'united', 'states', 'securities', 'exchange', 'commission'}
)

# Lemmatizer instance shared by the preprocessing cells
lemmatizer = WordNetLemmatizer()

"""
this is where different from version 1
"""
#import libraries for n-gram counting
from sklearn.feature_extraction.text import CountVectorizer

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/luckywang/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/luckywang/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /Users/luckywang/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## now we have all the txts stored in the file:
'./data/10k/[cik]/rawtext/[cik]_[date]'

## we can make a dictionary to store all the data needed

In [5]:
# Ticker library: one table holding every known ticker
ticker_library_path = os.path.join("data", "tickers.csv")
ticker_library = pd.read_csv(ticker_library_path)

# S&P 500 components; the file has no header row, so the two columns are named
# here: 'name' is the company name and 'ticker' is the company's ticker
sp500_components_path = os.path.join("data", "SP500_component_stocks.csv")
ticker_selected = pd.read_csv(sp500_components_path, header = None)
ticker_selected.columns = ['name','ticker']


  exec(code_obj, self.user_global_ns, self.user_ns)


In [6]:
# Build ticker_cik_df, a dataframe pairing every selected ticker with its cik number.

# all the selected tickers
ticker_list = ticker_selected.ticker

# cik of each ticker, in the same order as ticker_list ('' when it cannot be found)
cik_list = []

for ticker in ticker_list:
    # secfilings entries of the ticker library rows that match this ticker
    matches = ticker_library.loc[ticker_library.ticker == ticker, 'secfilings']
    try:
        # the cik is taken as the last 10 characters of the first matching entry
        cik_list.append(matches.iloc[0][-10:])
    except (IndexError, TypeError):
        # IndexError: the ticker is not in the library;
        # TypeError: its secfilings value is missing (not a string).
        # Any other error is a real problem and is allowed to surface.
        cik_list.append('')

# write cik_list and ticker_list to the dataframe ticker_cik_df
ticker_cik_df = pd.DataFrame()
ticker_cik_df['cik'] = cik_list
ticker_cik_df['ticker'] = ticker_list

# drop the tickers for which no cik was found
ticker_cik_df = ticker_cik_df[ticker_cik_df['cik'] != '']

# display a sample of ticker_cik_df
ticker_cik_df.head()


Unnamed: 0,cik,ticker
0,1090872,A
1,6201,AAL
2,1158449,AAP
3,320193,AAPL
4,1551152,ABBV


In [122]:
# Reverse lookup from cik to ticker; if several rows share a cik, the last row wins
CIK2TICKER = dict(zip(ticker_cik_df["cik"], ticker_cik_df["ticker"]))

In [27]:
# Hand-picked tickers used as the working sample
listtickers = ['AMZN','BBY','BKNG','MCD','EBAY','F','HD','TGT','WHR','JPM','SIVB','CFG','C','ALL','IVZ','ETFC','MET','PFG','CBOE',
              'CTL','IPG','VIAC','NFLX','CHTR','FB','TWTR','NWSA','FOXA','AMD','INTC','AAPL','LRCX','MSFT','NLOK','CTSH','ADS',
              'WU','PAYC','ABT','CVS','PFE','JNJ','BIIB','INCY','HSIC','WAT','ALGN','EW']

# Rows of ticker_cik_df for the sampled tickers, kept in listtickers order.
# Tickers missing from ticker_cik_df simply contribute no row.
# A single pd.concat replaces the repeated DataFrame.append calls: append copies the
# whole frame on every call and no longer exists in pandas 2.0.
ticker_cik_sample = pd.concat(
    [ticker_cik_df[ticker_cik_df['ticker'] == ticker] for ticker in listtickers]
)
    

In [36]:
# ciks and tickers of the sampled companies, as two position-aligned arrays
cik_list, ticker_list = (ticker_cik_sample[column].values for column in ("cik", "ticker"))

In [28]:
# use a sample to analyze the overall program
# sample1 = ticker_cik_df.iloc[0:50]
# sample2 = ticker_cik_df.iloc[50:100]
# sample3 = ticker_cik_df.iloc[100:150]
# sample4 = ticker_cik_df.iloc[150:200]
# sample5 = ticker_cik_df.iloc[200:250]
# sample6 = ticker_cik_df.iloc[250:300]
# sample7 = ticker_cik_df.iloc[300:350]
# sample8 = ticker_cik_df.iloc[350:400]
# sample9 = ticker_cik_df.iloc[400:450]
# sample10 = ticker_cik_df.iloc[450:500]
# ticker_cik_sample = ['0000049826','0000816284']


In [37]:
# all_data maps each ticker to its cik plus two (initially empty) dicts that will
# hold the per-date 10-K and 10-Q information
all_data = {}

for cik, ticker in zip(cik_list, ticker_list):
    all_data[ticker] = {'cik': cik, '10ks': {}, '10qs': {}}


In [11]:
# Inspect the entry stored for a single ticker.
# NOTE(review): 'ITW' is not in listtickers, so this lookup raises KeyError when
# all_data was built from the ticker_cik_sample selection -- confirm which ticker
# should be inspected here.
all_data['ITW']

{'cik': '0000049826', '10ks': {}, '10qs': {}}

In [12]:
# Root folders of the 10-K and 10-Q data, relative to the working directory
dir_10k = './data/10k/'
dir_10q = './data/10q/'

In [38]:
# Print the length of the cik list, then of the ticker list
for sequence in (cik_list, ticker_list):
    print(len(sequence))

45
45


In [164]:
# Deletion table covering every character of string.punctuation. A translation table
# is used instead of a regex character class: when the raw punctuation string is
# dropped into "[...]" the sequence "\]" is read as an escaped "]", so backslashes
# were never removed.
_PUNCT_TABLE = str.maketrans("", "", punctuation)


def remove_punct(string):
    """Return `string` with every ASCII punctuation character removed.

    All occurrences are removed, wherever they appear in the string (not only at
    the end of a word). An empty string is returned unchanged.
    """
    return string.translate(_PUNCT_TABLE)


In [165]:
def filter_words(string):
    """Tell whether `string` is a word made only of lowercase a-z and apostrophes.

    Returns True for one or more such characters (the `$` anchor also tolerates a
    single trailing newline) and False otherwise, including for the empty string.
    """
    found = re.match(r'^[a-z\']+$', string)
    return found is not None


In [106]:
def aggregate_cik_texts(cik, filetype):
    """
    Collect all the texts related to given `cik` with given filetype and
    return a single string.

    Every file found in ``data/<filetype>/<cik>/rawtext`` is read as UTF-8 and
    lower-cased; the pieces are joined and punctuation is stripped with
    `remove_punct`.

    Parameters
    ----------
    cik : str
        Name of the company's folder under ``data/<filetype>``.
    filetype : str
        Either "10k" or "10q".

    Returns
    -------
    str
        The aggregated text, or "" when the rawtext directory cannot be listed
        (a "No such dir" message is printed in that case).

    Raises
    ------
    AssertionError
        If `filetype` is neither "10k" nor "10q".
    """

    assert filetype in ("10k", "10q")

    rawtext_dir = os.path.join("data", filetype, cik, "rawtext")

    # goes into the directory to find the path for txtfiles
    try:
        # sorted so the aggregated text does not depend on the listing order
        all_files = sorted(os.listdir(rawtext_dir))
    except OSError:
        # Without this early return `all_files` would be unbound below
        print("No such dir")
        return ""

    pieces = []
    for file in all_files:
        with open(os.path.join(rawtext_dir, file), encoding = "utf8") as f:
            pieces.append(f.read().lower())

    # A newline between files keeps the last word of one file from fusing with
    # the first word of the next one
    return remove_punct("\n".join(pieces))


In [107]:
def texts2counter(texts, cik_dir=None):
    """Count the meaningful tokens of `texts`.

    The text is tokenized with ``nltk.word_tokenize``; stop words, purely numeric
    tokens and tokens rejected by `filter_words` are dropped.

    Parameters
    ----------
    texts : str
        Text to tokenize.
    cik_dir : str, optional
        Folder of the company the text belongs to. When given, the counter is also
        pickled to ``<cik_dir>/pickle/token_counter.pkl``. The function used to read
        an undefined name `cik_dir`; it is now an explicit argument and nothing is
        written when it is omitted.

    Returns
    -------
    collections.Counter
        Token -> number of occurrences.
    """
    tokens = [
        token
        for token in nltk.word_tokenize(texts)
        if token not in stop_words and not token.isdigit() and filter_words(token)
    ]
    counter = Counter(tokens)

    if cik_dir is not None:
        pkl_path = os.path.join(cik_dir, "pickle")
        os.makedirs(pkl_path, exist_ok=True)

        with open(os.path.join(pkl_path, 'token_counter.pkl'), 'wb') as f:
            # Pickle the counter using the highest protocol available.
            pickle.dump(counter, f, pickle.HIGHEST_PROTOCOL)

    return counter
    

In [240]:
# './data/10k/[cik]/rawtext/[cik]_[date]'
# Build one document per cik (its 10-K texts followed by its 10-Q texts) and record
# the matching ticker at the same position in `tickers`.
docs = []
tickers = []

for cik in tqdm(cik_list):
    tickers.append(CIK2TICKER[cik])
    # Joined with a space so the last word of the 10-K text and the first word of
    # the 10-Q text are not fused into one token
    docs.append(" ".join(aggregate_cik_texts(cik, filetype) for filetype in ["10k", "10q"]))


100%|█████████████████████████████████████████████████████████████████████████████████████████████| 45/45 [00:37<00:00,  1.19it/s]


In [241]:
# Term counts per document. Terms present in more than 80% of the documents are
# ignored and the vocabulary is capped at 10000 terms.
# stop_words is handed over as a list: CountVectorizer documents this parameter as
# 'english', a list or None, and recent scikit-learn releases reject a set.
cv = CountVectorizer(max_df=0.8, stop_words=sorted(stop_words), max_features=10000)
word_count_vector = cv.fit_transform(docs)



In [242]:
def sort_coo(coo_matrix):
    """Return the (column index, value) pairs of `coo_matrix`, largest value first.

    Pairs with equal values are ordered by descending column index.
    """
    pairs = list(zip(coo_matrix.col, coo_matrix.data))
    pairs.sort(key=lambda pair: (pair[1], pair[0]), reverse=True)
    return pairs


In [243]:
def extract_topn_from_vector(feature_names, sorted_items, topn=10):
    """Map the first `topn` entries of `sorted_items` to {feature name: score}.

    `sorted_items` holds (feature index, score) pairs; each index is looked up in
    `feature_names` and each score is rounded to 3 decimals. The returned dict keeps
    the order of `sorted_items`.
    """
    return {
        feature_names[feature_idx]: round(score, 3)
        for feature_idx, score in sorted_items[:topn]
    }


In [244]:
# Fit the tf-idf weighting (idf enabled, with smoothing) on the count matrix of the
# whole corpus; the fitted transformer is reused for each document further down
tfidf_transformer = TfidfTransformer(smooth_idf=True, use_idf=True)
tfidf_transformer.fit(word_count_vector)
# Earlier experiments with the transform call, kept for reference:
# tf_idf_vector = tfidf_transformer.transform(word_count_vector)
# tf_idf_vector = tfidf_transformer.transform(cv.transform([docs[3]]))
# tf_idf_vector = tfidf_transformer.transform(cv.transform(docs[0]))

TfidfTransformer()

In [245]:
# Number of aggregated documents (one is appended to docs per cik)
len(docs)

45

In [246]:
# Top-30 tf-idf keywords of every document, in the same order as `docs`.

# The index -> term mapping only depends on the fitted vectorizer, so it is built
# once, outside the loop. get_feature_names() was removed from scikit-learn in
# favour of get_feature_names_out(); fall back to the old name on older releases.
if hasattr(cv, "get_feature_names_out"):
    feature_names = [str(name) for name in cv.get_feature_names_out()]
else:
    feature_names = cv.get_feature_names()

keywords = []

for doc in docs:
    tf_idf_vector = tfidf_transformer.transform(cv.transform([doc]))

    # (term index, score) pairs of this document, highest score first
    sorted_items = sort_coo(tf_idf_vector.tocoo())

    # keep only the top n; n here is 30
    keywords.append(extract_topn_from_vector(feature_names, sorted_items, 30))
    

In [247]:
# Column store for the keyword table: starts with the tickers column, every other
# column is created as an empty list on first access
dict_top_k = defaultdict(list, tickers=tickers)

In [248]:
# Spread each document's keywords over the word_<rank> / tfidf_<rank> columns
for keyword in keywords:
    for i, (word, tfidf_score) in enumerate(keyword.items()):
        dict_top_k["word_{}".format(i)].append(word)
        dict_top_k["tfidf_{}".format(i)].append(tfidf_score)

In [249]:
# Number of keyword dicts (one is appended per document in docs)
len(keywords)

45

In [250]:
# Build a DataFrame from dict_top_k; every key becomes a column
df_top_k_word = pd.DataFrame(dict_top_k)

In [251]:
# Save the keyword table as CSV without the index column
# (note: the target file name carries no ".csv" extension)
df_top_k_word.to_csv("data/45_companies_top_30", index=False)

Unnamed: 0,tickers,word_0,tfidf_0,word_1,tfidf_1,word_2,tfidf_2,word_3,tfidf_3,word_4,...,word_25,tfidf_25,word_26,tfidf_26,word_27,tfidf_27,word_28,tfidf_28,word_29,tfidf_29
0,AMZN,amazoncom,0.438,peacs,0.373,fulfillment,0.368,amazon,0.245,shipping,...,euros,0.059,card,0.059,wwwamazonca,0.056,wwwamazoncom,0.055,unearned,0.054
1,BBY,stores,0.641,store,0.378,sga,0.283,musicland,0.176,merchandise,...,largeformat,0.062,speakeasy,0.059,notebook,0.051,canadian,0.05,auctionrate,0.049
2,BKNG,hotel,0.42,pricelinecom,0.404,bookingcom,0.347,reservations,0.273,airline,...,search,0.068,airlines,0.067,otcs,0.067,expedia,0.066,braddock,0.065
3,MCD,restaurants,0.478,mcdonald,0.467,restaurant,0.383,companyoperated,0.348,mcdonalds,...,conventional,0.033,franchisee,0.032,japan,0.032,breakfast,0.032,chicken,0.032
4,EBAY,paypal,0.819,ebay,0.423,skype,0.144,marketplaces,0.106,paypals,...,halfcom,0.035,tpv,0.034,ebaycom,0.033,search,0.031,auction,0.03
5,F,ford,0.888,automotive,0.265,motor,0.154,vehicles,0.133,incomeloss,...,jaguar,0.033,nonconsumer,0.032,fcar,0.031,assetbacked,0.031,commodity,0.03
6,HD,depot,0.72,stores,0.44,store,0.298,hd,0.167,merchandise,...,chain,0.045,blake,0.042,shrink,0.041,lighting,0.041,gift,0.038
7,TGT,card,0.582,stores,0.355,guests,0.254,comparablestore,0.248,merchandise,...,minnesota,0.059,gift,0.058,douglas,0.057,cvs,0.055,3a,0.054
8,WHR,whirlpool,0.807,befiex,0.241,brazilian,0.193,maytag,0.186,indesit,...,forwardsoptions,0.046,oilrelated,0.044,monetized,0.041,hotpoint,0.041,compressors,0.04
9,JPM,jpmorgan,0.539,lendingrelated,0.276,mortgage,0.246,card,0.227,pages,...,msr,0.08,afs,0.079,tier,0.078,rfs,0.077,lending,0.076


In [92]:
# Load a token counter back from token_counter.pkl.
# NOTE(review): in the visible notebook pkl_path is only assigned inside
# texts2counter, so it has to be defined at notebook level before this cell can
# run -- confirm where it is supposed to come from.
with open(os.path.join(pkl_path, 'token_counter.pkl'), 'rb') as f:
    # Read the pickled object into c
    c = pickle.load(f)

In [16]:
"""
This is where different from version 1, adding a beta term to calculate excess return.
"""

#preprocess the 10-K filings and store them in all_data
for ticker in all_data.keys():

    #give the cik for this ticker
    cik = all_data[ticker]['cik']

    #directory holding this company's 10-K text files, built from dir_10k, the 10-K
    #root defined earlier in the notebook (the name pathname_10k used here before is
    #never defined, and the resulting NameError was hidden by a bare except)
    #NOTE(review): aggregate_cik_texts reads from "rawtext" while this cell reads
    #from "grabbed_text" -- confirm which sub-directory actually holds the filings
    text_dir_10k = dir_10k + cik + '/grabbed_text/'

    #goes into the directory to find the path for txtfiles
    try:
        files_10k = os.listdir(text_dir_10k)
    except OSError as err:
        #an unreadable directory only concerns this ticker: report it and go on
        #with the next one instead of abandoning all the remaining tickers
        print("skipping {}: cannot list {} ({})".format(ticker, text_dir_10k, err))
        continue

    #iterate through the 10k path and get the information and txt
    for file_10k in files_10k:

        #get the release time: the 10 characters in front of the last 4 characters
        #of the file name
        release_10k = file_10k[-14:-4]

        #get the 5 day excess return for the given release time; the market (SPY)
        #return is subtracted, which assumes beta = 1
        try:
            excess_return = Get_Ex_Ret(ticker, release_10k)
            market_return = Get_Ex_Ret('SPY', release_10k)
            excess_return = excess_return - market_return

        #exception may happen if we don't have the ticker name in yahoo finance, so
        #the rest of this ticker's filings is dropped -- the reason is reported
        #rather than silently swallowed
        except Exception as err:
            print("dropping {} from {} on: {!r}".format(ticker, release_10k, err))
            break

        #open the text file and read file as lower string
        with open(text_dir_10k + file_10k, encoding = "utf8") as f:
            string_temp = f.read().lower()

        #rule out all the stop words, then strip the punctuation of the kept words
        filtered_words = [remove_punct(word) for word in string_temp.split() if word not in stop_words]

        #keep only the words accepted by filter_words (this also drops empty strings)
        filtered_words = [word for word in filtered_words if filter_words(word)]

        #reduce word to its root form: first as a verb, then as a noun
        filtered_words = [
            lemmatizer.lemmatize(lemmatizer.lemmatize(word, pos = 'v'), pos = 'n')
            for word in filtered_words
        ]

        #store all the information needed in the all_data dict
        if filtered_words:
            all_data[ticker]['10ks'][release_10k] = {
                'ex_return': excess_return,
                'words': ' '.join(filtered_words),
            }


In [17]:
"""
This is where different from version 1, adding a beta term to calculate excess return.
"""

#preprocess the 10-Q filings and store them in all_data
for ticker in all_data.keys():

    #give the cik for this ticker
    cik = all_data[ticker]['cik']

    #directory holding this company's 10-Q text files, built from dir_10q, the 10-Q
    #root defined earlier in the notebook (the name pathname_10q used here before is
    #never defined, and the resulting NameError was hidden by a bare except)
    #NOTE(review): aggregate_cik_texts reads from "rawtext" while this cell reads
    #from "grabbed_text" -- confirm which sub-directory actually holds the filings
    text_dir_10q = dir_10q + cik + '/grabbed_text/'

    #goes into the directory to find the path for txtfiles
    try:
        files_10q = os.listdir(text_dir_10q)
    except OSError as err:
        #an unreadable directory only concerns this ticker: report it and go on
        #with the next one instead of abandoning all the remaining tickers
        print("skipping {}: cannot list {} ({})".format(ticker, text_dir_10q, err))
        continue

    #iterate through the 10q path and get the information and txt
    for file_10q in files_10q:

        #get the release time: the 10 characters in front of the last 4 characters
        #of the file name
        release_10q = file_10q[-14:-4]

        #get the 5 day excess return for the given release time; the market (SPY)
        #return is subtracted, which assumes beta = 1
        try:
            excess_return = Get_Ex_Ret(ticker, release_10q)
            #the market return must be taken at the 10-Q release date; this line
            #used release_10k, a leftover from the 10-K cell
            market_return = Get_Ex_Ret('SPY', release_10q)
            excess_return = excess_return - market_return

        #exception may happen if we don't have the ticker name in yahoo finance, so
        #the rest of this ticker's filings is dropped -- the reason is reported
        #rather than silently swallowed
        except Exception as err:
            print("dropping {} from {} on: {!r}".format(ticker, release_10q, err))
            break

        #open the text file and read file as lower string
        with open(text_dir_10q + file_10q, encoding = "utf8") as f:
            string_temp = f.read().lower()

        #rule out all the stop words, then strip the punctuation of the kept words
        filtered_words = [remove_punct(word) for word in string_temp.split() if word not in stop_words]

        #keep only the words accepted by filter_words (this also drops empty strings)
        filtered_words = [word for word in filtered_words if filter_words(word)]

        #reduce word to its root form: first as a verb, then as a noun
        filtered_words = [
            lemmatizer.lemmatize(lemmatizer.lemmatize(word, pos = 'v'), pos = 'n')
            for word in filtered_words
        ]

        #store all the information needed in the all_data dict
        if filtered_words:
            all_data[ticker]['10qs'][release_10q] = {
                'ex_return': excess_return,
                'words': ' '.join(filtered_words),
            }


### have a look at the data structure now

In [18]:
#all_data['NKE']

## Analyze 10K word with tfidf and bag-of-words

In [19]:
"""
This part is to compute the tf values for the words in 10ks and collect the overall word list for computing idf in the next step
"""
#number of 10K documents seen
document_num_10k = 0

#document frequency for 10k: word -> number of 10K documents that contain it
word_list_10k = defaultdict(int)

for ticker, filings in all_data.items():
    for date, filing in filings['10ks'].items():

        #one more document
        document_num_10k += 1

        #term frequencies of this document
        term_counts = Counter(filing['words'].split())
        filing['tf'] = term_counts

        #every distinct word of the document counts once
        for word in term_counts:
            word_list_10k[word] += 1
        

In [20]:
#compute idf value for the word in 10ks
#idf of every 10k word: log(number of documents / (1 + documents containing the word))
idf_10k = {
    word: np.log(document_num_10k / (1 + doc_freq))
    for word, doc_freq in word_list_10k.items()
}

In [21]:
"""
This part is new in the version 2. It's using for n-gram preparation.
"""

#we already have the number of all 10k files in document_num_10k

#word pair list for 10k: pair -> number of 10k documents that contain it
pair_list_10k = defaultdict(int)

#iterate through the tickers
for ticker in all_data:

    #for a given ticker, iterate through date
    for date in all_data[ticker]['10ks']:
        filing = all_data[ticker]['10ks'][date]

        #count the word pairs (2-grams) of this single document
        vectorizer = CountVectorizer(analyzer = 'word', ngram_range = (2, 2))
        try:
            fitted = vectorizer.fit_transform([filing['words']])
        except ValueError:
            #CountVectorizer raises ValueError when its vocabulary ends up empty,
            #e.g. for a text too short to contain a single pair
            filing['tf_pair'] = {}
            continue

        #get_feature_names() was removed from scikit-learn in favour of
        #get_feature_names_out(); fall back to the old name on older releases
        if hasattr(vectorizer, 'get_feature_names_out'):
            pair_names = vectorizer.get_feature_names_out()
        else:
            pair_names = vectorizer.get_feature_names()

        #plain str/int values keep the dict JSON-serializable, and ravel() also
        #copes with a document that holds exactly one pair
        pair_counts = fitted.toarray().ravel()
        filing['tf_pair'] = {str(name): int(count) for name, count in zip(pair_names, pair_counts)}

        #every distinct pair of the document counts once
        for pair in filing['tf_pair']:
            pair_list_10k[pair] += 1
        

In [22]:
# Display the pair -> document count mapping of the 10-K filings (whole dict)
pair_list_10k

defaultdict(int, {})

In [23]:
"""
This part is new in the version 2. It's using for n-gram preparation.
"""

#idf of every 10k pair: log(number of documents / (1 + documents containing the pair))
idf_10k_pair = {
    pair: np.log(document_num_10k / (1 + doc_freq))
    for pair, doc_freq in pair_list_10k.items()
}

In [24]:
"""
This part is new in the version 2. It's using for n-gram preparation.
"""

#we already have the number of all 10k files in document_num_10k

#word triple list for 10k: triple -> number of 10k documents that contain it
triple_list_10k = defaultdict(int)

#iterate through the tickers
for ticker in all_data:

    #for a given ticker, iterate through date
    for date in all_data[ticker]['10ks']:
        filing = all_data[ticker]['10ks'][date]

        #count the word triples (3-grams) of this single document
        vectorizer = CountVectorizer(analyzer = 'word', ngram_range = (3, 3))
        try:
            fitted = vectorizer.fit_transform([filing['words']])
        except ValueError:
            #CountVectorizer raises ValueError when its vocabulary ends up empty,
            #e.g. for a text too short to contain a single triple
            filing['tf_triple'] = {}
            continue

        #get_feature_names() was removed from scikit-learn in favour of
        #get_feature_names_out(); fall back to the old name on older releases
        if hasattr(vectorizer, 'get_feature_names_out'):
            triple_names = vectorizer.get_feature_names_out()
        else:
            triple_names = vectorizer.get_feature_names()

        #plain str/int values keep the dict JSON-serializable, and ravel() also
        #copes with a document that holds exactly one triple
        triple_counts = fitted.toarray().ravel()
        filing['tf_triple'] = {str(name): int(count) for name, count in zip(triple_names, triple_counts)}

        #every distinct triple of the document counts once
        for triple in filing['tf_triple']:
            triple_list_10k[triple] += 1

#new in version 2 (n-gram preparation): idf of every 10k triple,
#log(number of documents / (1 + documents containing the triple))
idf_10k_triple = {}

#iterate through all the triples in triple_list_10k
for triple in triple_list_10k:

    #compute idf value
    idf_10k_triple[triple] = np.log(document_num_10k / (1 + triple_list_10k[triple]))

### doing the same to 10qs

In [25]:
#also do the same to 10q files

"""
This part is to compute the tf values for the words in 10qs and collect the overall word list for computing idf in the next step
"""
#number of 10Q documents seen
document_num_10q = 0

#document frequency for 10q: word -> number of 10Q documents that contain it
word_list_10q = defaultdict(int)

for ticker, filings in all_data.items():
    for date, filing in filings['10qs'].items():

        #one more document
        document_num_10q += 1

        #term frequencies of this document
        term_counts = Counter(filing['words'].split())
        filing['tf'] = term_counts

        #every distinct word of the document counts once
        for word in term_counts:
            word_list_10q[word] += 1
               

In [26]:
#compute idf value for the word in 10qs
#idf of every 10q word: log(number of documents / (1 + documents containing the word))
idf_10q = {
    word: np.log(document_num_10q / (1 + doc_freq))
    for word, doc_freq in word_list_10q.items()
}
    

In [27]:
"""
This part is new in the version 2. It's using for n-gram preparation.
"""

#we already have the number of all 10q files in document_num_10q

#word pair list for 10q: pair -> number of 10q documents that contain it
pair_list_10q = defaultdict(int)

#iterate through the tickers
for ticker in all_data:

    #for a given ticker, iterate through date
    for date in all_data[ticker]['10qs']:
        filing = all_data[ticker]['10qs'][date]

        #count the word pairs (2-grams) of this single document
        vectorizer = CountVectorizer(analyzer = 'word', ngram_range = (2, 2))
        try:
            fitted = vectorizer.fit_transform([filing['words']])
        except ValueError:
            #CountVectorizer raises ValueError when its vocabulary ends up empty,
            #e.g. for a text too short to contain a single pair
            filing['tf_pair'] = {}
            continue

        #get_feature_names() was removed from scikit-learn in favour of
        #get_feature_names_out(); fall back to the old name on older releases
        if hasattr(vectorizer, 'get_feature_names_out'):
            pair_names = vectorizer.get_feature_names_out()
        else:
            pair_names = vectorizer.get_feature_names()

        #plain str/int values keep the dict JSON-serializable, and ravel() also
        #copes with a document that holds exactly one pair
        pair_counts = fitted.toarray().ravel()
        filing['tf_pair'] = {str(name): int(count) for name, count in zip(pair_names, pair_counts)}

        #every distinct pair of the document counts once
        for pair in filing['tf_pair']:
            pair_list_10q[pair] += 1

In [28]:
"""
This part is new in the version 2. It's using for n-gram preparation.
"""

#idf of every 10q pair: log(number of documents / (1 + documents containing the pair))
idf_10q_pair = {
    pair: np.log(document_num_10q / (1 + doc_freq))
    for pair, doc_freq in pair_list_10q.items()
}

In [29]:
"""
This part is new in the version 2. It's using for n-gram preparation.
"""

#we already have the number of all 10q files in document_num_10q

#word triple list for 10q: triple -> number of 10q documents that contain it
triple_list_10q = defaultdict(int)

#iterate through the tickers
for ticker in all_data:

    #for a given ticker, iterate through date
    for date in all_data[ticker]['10qs']:
        filing = all_data[ticker]['10qs'][date]

        #count the word triples (3-grams) of this single document; the only
        #adaptation from pair to triple is the ngram_range (TODO: factor the
        #shared code out)
        vectorizer = CountVectorizer(analyzer = 'word', ngram_range = (3, 3))
        try:
            fitted = vectorizer.fit_transform([filing['words']])
        except ValueError:
            #CountVectorizer raises ValueError when its vocabulary ends up empty,
            #e.g. for a text too short to contain a single triple
            filing['tf_triple'] = {}
            continue

        #get_feature_names() was removed from scikit-learn in favour of
        #get_feature_names_out(); fall back to the old name on older releases
        if hasattr(vectorizer, 'get_feature_names_out'):
            triple_names = vectorizer.get_feature_names_out()
        else:
            triple_names = vectorizer.get_feature_names()

        #plain str/int values keep the dict JSON-serializable, and ravel() also
        #copes with a document that holds exactly one triple
        triple_counts = fitted.toarray().ravel()
        filing['tf_triple'] = {str(name): int(count) for name, count in zip(triple_names, triple_counts)}

        #every distinct triple of the document counts once
        for triple in filing['tf_triple']:
            triple_list_10q[triple] += 1

#new in version 2 (n-gram preparation): idf of every 10q triple,
#log(number of documents / (1 + documents containing the triple))
idf_10q_triple = {}

#iterate through all the triples in triple_list_10q
for triple in triple_list_10q:

    #compute idf value
    idf_10q_triple[triple] = np.log(document_num_10q / (1 + triple_list_10q[triple]))

### have a look at the data structure

In [30]:
# Display the word -> idf mapping of the 10-K filings (whole dict)
idf_10k

{}

In [31]:
"""
This is new in version 2, containing idfs for word pairs
"""

# Display the word pair -> idf mapping of the 10-Q filings (whole dict)
idf_10q_pair

{}

In [32]:
"""
This is new in version 2, containing idfs for word triples
"""

# Display the word triple -> idf mapping of the 10-Q filings (whole dict)
idf_10q_triple

{}

## Store the data for future use 

In [33]:
"""
this part is for storing the data for future use
"""
#drop the cleaned text ('words') of every filing before storage
#pop(..., None) instead of del keeps the cell re-runnable: a second run finds the
#key already gone and does nothing instead of raising KeyError
for ticker in all_data:

    #10-Q filings first, then 10-K filings
    for filing_kind in ('10qs', '10ks'):

        #for a given ticker, iterate through date
        for date in all_data[ticker][filing_kind]:
            all_data[ticker][filing_kind][date].pop('words', None)


In [34]:
#write all_data to a json file
#write all_data and the single-word idf tables to their json files
word_level_outputs = (
    ('all_data.json', all_data),
    ('idf_10k.json', idf_10k),
    ('idf_10q.json', idf_10q),
)

for json_name, payload in word_level_outputs:
    with open(json_name, 'w') as json_file:
        json_file.write(json.dumps(payload))
    

In [35]:
"""
This is new in version 2, storing files for word pairs and word triples
"""
#write the idf tables of the word pairs and word triples to their json files
ngram_outputs = (
    ('idf_10k_pair.json', idf_10k_pair),
    ('idf_10q_pair.json', idf_10q_pair),
    ('idf_10k_triple.json', idf_10k_triple),
    ('idf_10q_triple.json', idf_10q_triple),
)

for json_name, idf_table in ngram_outputs:
    with open(json_name, 'w') as json_file:
        json_file.write(json.dumps(idf_table))
