# Script to calculate the IDF vector for words in the dataset

### Input: 
    Crunchbase and PitchBook descriptions of companies
### Output:
    A pandas Series (column vector) whose index is each word and whose values are the corresponding IDF values, sorted in descending order. It can also be written to CSV.

#### Notes: 
 - This solution currently has time-complexity issues when calculating the IDF values for all 650,000 companies in the entire dataset. I have isolated the inefficiency to the numDocsContaining function: it scans every document separately for each individual word, so it needs a better method, which I will work on.
 
 - Since the format will be the same, I calculated this IDF vector only for the training set of companies, so that we can get the graph-building code working first.
 
 - As is, this script takes a long time to run even for only ~3000 companies (using both the cb and pb descriptions).
  

In [2]:
import pandas as pd
import numpy as np

# Load the category-labeled training data and pull out the two description columns
###################### Training Data Test #############################
training_categories_df = pd.read_csv("../data/category_training_labeled_fixed.csv",  encoding = "ISO-8859-1")
# `.ix` has been removed from pandas; `.loc` is the label-based equivalent
# (all rows, the Crunchbase and PitchBook description columns).
mydoclist = training_categories_df.loc[:, ['cb_desc', 'pb_desc']].values
#####################

#################### Full Data - Not working currently, need to make IDF calculation more efficient #########################
#training_categories_df = pd.read_csv("../data/raw_data_fixed.csv",  encoding = "ISO-8859-1", usecols=['domain'\
#, 'tx_industry', 'cb_category', 'tx_category', 'cb_desc', 'pb_desc', 'pb_category'])
#mydoclist = training_categories_df.loc[:, 'cb_desc'].values

mydoclist


array([[ 'ConferenceCloud provides state-of-the-art live communications and interactivity for hybrid conferences, lectures and meetings.',
        'Provider of an online conferencing platform. The company provides an online platform which allows conducting of video conferences.'],
       [ 'Terminus is a platform that seamlessly integrate salesforce CRM and build segments of best fit accounts.',
        "Developer of a B2B advertising platform. The company enables B2B marketers to simplify account-based marketing to reach and engage targeted accounts across all stages of the buyer's journey."],
       ['Next generation card processing platform',
        'Provider of payment processing services. The company provides pre-paid visa, mastercard and bill payment processing services and also offers its clients business intelligence, analytics and program management services.'],
       ..., 
       [ 'Kupu, based in Hawaii, aims to empower the youth to serve the community through character bu

In [None]:
import string
import nltk
from nltk.corpus import stopwords


def build_lexicon(corpus, stop_words=None):
    """Build the vocabulary (set of distinct cleaned words) of a corpus.

    Each document is converted with ``str()``, split on whitespace, and every
    token is stripped, lower-cased and has its punctuation removed. Stop
    words, the literal token ``'nan'`` (what a missing description turns into
    after ``str()``) and empty tokens are then discarded.

    Args:
        corpus: Sized iterable of documents; each is passed through ``str()``.
        stop_words: Optional iterable of words to exclude. When omitted,
            NLTK's English stop-word list is used.

    Returns:
        A set of cleaned words.
    """
    if stop_words is None:
        # Fetch the list once; calling stopwords.words() per word is very slow.
        stop_words = stopwords.words('english')
    strip_punct = str.maketrans('', '', string.punctuation)
    stop_words = set(stop_words)
    # Tokens have their punctuation removed, so stop words such as "don't"
    # must also be matched in their stripped form ("dont").
    stop_words |= {word.translate(strip_punct) for word in stop_words}

    lexicon = set()
    for iteration, doc in enumerate(corpus, start=1):
        # This line is where you need to clean up the words
        lexicon.update(
            word.strip().lower().translate(strip_punct)
            for word in str(doc).split()
        )
        # `==`, not `is`: identity comparison on ints is implementation-dependent.
        if iteration % 100000 == 0:
            print('Iteration {} out of {}'.format(iteration, len(corpus)))
    # Now filter out all of the stop words
    print("Out of the first loop, into the filter stages")
    filtered = lexicon - stop_words
    print("Finished filtering stop words, now into filtering Nan's")
    # Last, filter out the nan values and any blank tokens that made it
    # through ('' is what a punctuation-only token such as "--" becomes).
    filtered_final = filtered - {'nan', ' ', ''}
    return filtered_final


# Build the vocabulary (a set of cleaned, distinct words) from every document in the corpus
vocabulary = build_lexicon(mydoclist)


def numDocsContaining(word, doclist):
    """Count the documents in ``doclist`` that contain ``word``.

    Documents are tokenised the same way the vocabulary is cleaned
    (lower-cased, punctuation removed, split on whitespace). Without this a
    cleaned word such as ``"platform"`` never matches raw tokens like
    ``"Platform."`` and the document count is badly underestimated.

    Args:
        word: The word to look for; it is normalised the same way.
        doclist: Iterable of documents; each is passed through ``str()``.

    Returns:
        The number of documents containing ``word`` at least once.
    """
    strip_punct = str.maketrans('', '', string.punctuation)
    target = str(word).strip().lower().translate(strip_punct)
    # Only presence matters, so a membership test per document is enough;
    # summing the booleans counts the matching documents.
    return sum(
        target in str(doc).lower().translate(strip_punct).split()
        for doc in doclist
    )

def freq(term, document):
    """Return how many times ``term`` occurs in ``document``.

    The document is passed through ``str()``, lower-cased, stripped of
    punctuation and split on whitespace, mirroring how vocabulary words are
    cleaned, so that ``"platform"`` also counts ``"Platform."``.

    Args:
        term: The word to count; it is normalised the same way.
        document: The document to search.

    Returns:
        The number of matching tokens (0 if there are none).
    """
    strip_punct = str.maketrans('', '', string.punctuation)
    target = str(term).strip().lower().translate(strip_punct)
    return str(document).lower().translate(strip_punct).split().count(target)

def idf(word, doclist):
    """Return the smoothed inverse document frequency of ``word``.

    Computes ``log(N / (1 + df))`` where ``N`` is ``len(doclist)`` and ``df``
    is the number of documents containing ``word``. The ``+ 1`` keeps the
    denominator non-zero for words that appear in no document.

    Args:
        word: The word to score.
        doclist: Sized iterable of documents.

    Returns:
        The IDF value as a NumPy float.
    """
    n_samples = len(doclist)
    df = numDocsContaining(word, doclist)
    # The denominator must be parenthesised: `n_samples / 1+df` parses as
    # `(n_samples / 1) + df`, which grows with df instead of shrinking.
    return np.log(n_samples / (1 + df))

print("Finished building the lexicon, now into the idf calculations...")
# One idf() call per vocabulary word; each call scans every document in mydoclist,
# so the cost grows with (vocabulary size x number of documents).
my_idf_vector = [idf(word, mydoclist) for word in vocabulary]
print("Done!!!")
#print('Our vocabulary vector is [' + ', '.join(list(vocabulary)) + ']')
#print('The inverse document frequency vector is [' + ', '.join(format(freq, 'f') for freq in my_idf_vector) + ']')

In [1]:

# Pair each IDF value with its word; my_idf_vector was built by iterating over
# vocabulary, so the two line up position by position.
idf_vector = pd.Series(my_idf_vector, index=vocabulary)

# Largest IDF values first
sorted_idf_vector = idf_vector.sort_values(ascending=False)

print(sorted_idf_vector)

NameError: name 'pd' is not defined

In [None]:
# Write the sorted IDF vector (word index plus IDF value) to CSV
sorted_idf_vector.to_csv("../data/initial_idf_vector.csv")

----------------------------------- This is the end of the code to calculate the idf vector and write to CSV
The code below generates a matrix whose rows are the documents and whose columns are the words; each entry is the word's IDF value if the word is present in the document and 0 otherwise (not sure whether this will be useful or not).

In [None]:
def build_idf_matrix(idf_vector):
    """Return a square float matrix with ``idf_vector`` on its main diagonal.

    Every off-diagonal entry is zero; the matrix has one row and one column
    per element of ``idf_vector``.
    """
    size = len(idf_vector)
    diagonal_matrix = np.zeros((size, size))
    # Write the values straight onto the (i, i) positions.
    diagonal_matrix[np.diag_indices(size)] = idf_vector
    return diagonal_matrix

# Despite the name "term frequency", this is a presence indicator: it gives 1 when the
# term occurs in the document at least once and 0 otherwise
def tf(term, document):
    occurrences = freq(term, document)
    return 1 if occurrences >= 1 else 0

# We now build the doc_term_matrix: one row per document, with a 1 in each column whose
# vocabulary word is present in that document and a 0 otherwise
doc_term_matrix = []

for doc in mydoclist:
    # Format instead of concatenating: a doc is not guaranteed to be a plain str (rows of
    # mydoclist are arrays of descriptions and presumably may hold missing values), so
    # `str + doc` can raise TypeError or concatenate element-wise.
    print('The doc is "{}"'.format(doc))
    tf_vector = [tf(word, doc) for word in vocabulary]
    tf_vector_string = ', '.join(format(flag, 'd') for flag in tf_vector)
    #print('The tf vector for Document %d is [%s]' % ((mydoclist.index(doc)+1), tf_vector_string))
    doc_term_matrix.append(tf_vector)

doc_term_matrix = np.matrix(doc_term_matrix)
print(doc_term_matrix)

my_idf_matrix = build_idf_matrix(my_idf_vector) # This is a diagonal matrix, where the diagonal is the idf vector found previously



In [None]:
# Now create an idf matrix, where each column is a term and each row is a document. An entry holds the
# term's idf value if the term is present in the document, and 0 otherwise: doc_term_matrix is 0/1 and
# my_idf_matrix is diagonal, so the product scales each column by that term's idf.

idf_matrix = np.dot(doc_term_matrix, my_idf_matrix)
idf_matrix