In [1]:
import nltk
import time
import unidecode
import ftfy
import re
import time
from nltk.corpus import stopwords
import pandas as pd
import numpy as np
# for reading in the articles from the dataframe
import ast
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

### Load in the cleaned data / clean the data

In [2]:
# load in the corpus, stop_words, and cleaned companies
sentences = pd.read_csv("all_sentences.csv")
sentences = [x[0] for x in sentences.values if x is not None]
stop_words = set(stopwords.words('english'))
companies = pd.read_csv("all/companies.csv", encoding = "latin-1", header = -1)[0].tolist()
# remove duplicate sentences
companies = list(set(companies))
# articles
articles = pd.read_csv("articles_sentences.csv")
list_of_articles = [ast.literal_eval(x) for x in articles['Article'].values]

# load in CEOs for negative samples?
ceos = pd.read_csv("all/ceo.csv", encoding = "latin-1", header = -1).replace(np.nan, '', regex = True)
ceos[2] = (ceos[0].str.rstrip() + " " + ceos[1].str.rstrip()).str.rstrip()
cleaned_ceos = list(set(ceos[2].tolist()))

print("Number of Articles: {}".format(len(list_of_articles)))
print("Number of Total Sentences: {}".format(len(sentences)))
print("Number of Unique Companies: {}".format(len(companies)))
print("Number of Unique Ceos: {}".format(len(cleaned_ceos)))

Number of Articles: 730
Number of Total Sentences: 708281
Number of Unique Companies: 2592
Number of Unique Ceos: 1291


## Building Feature Vectors

Features include:
* Number of words in the sentence
* Number of characters in the sentence
* Number of characters in the target word/phrase
* Number of capitals in the target word
* Number of capitals in the sentence
* Starting position of the word in the sentence
* Number of words in the article
* Number of sentences in the article
* Number of times the word appears in the article
* 1 if the word contains a keyword
* number of words in the potential company name

In [3]:
keywords = set(['Inc', 'inc', 'Corp', 'corp', 'corporation', 'Co', 'company', 'Company', 'Group', 'Ltd', 'ltd', 'Capital', 'capital',
               'management', 'Management', 'Financial', 'financial', 'consulting', 'Consulting', 'Depot'])

def build_feature_vector(word_tuple):
    '''
    word_tuple: (word, sentence_index, article_index)
    '''
    word = word_tuple[0]
    sentence_index = word_tuple[1]
    article_index = word_tuple[2]
    
    sentence = list_of_articles[article_index][sentence_index]
    list_of_sentences = list_of_articles[article_index]
    # number of words in the sentence
    number_of_words = len(sentence.split(' '))
    # number of characters in the sentence
    number_of_chars_sentence = len(sentence)
    # number of characters in the target
    number_of_chars_candidate = len(word)
    # number of capitals in the target word
    number_capitals_word = sum(1 for x in word if x.isupper())
    # starting position in the sentence
    starting_index = sentence.find(word)
    # number of capitals sentence
    number_capitals_sentence = sum(1 for x in sentence if x.isupper())
    # number of words in the article
    sum_article = 0
    for x in list_of_sentences:
        sum_article += len(x.split(' '))
    number_of_words_article = sum_article
    # number of sentences in the article
    number_of_sentences_article = len(list_of_sentences)
    # number of times the word appears in the article
    total_string = ' '.join(list_of_sentences)
    appearance_count = total_string.count(word)
    # 1 if the word contains a keyword
    keyword_appear = 0
    for subword in word.split(' '):
        if subword in keywords:
            keyword_appear = 1
            break
    
    # number of words in the target word
    candidate_word_count = len(word.split(' '))
    
    vector = [number_of_words, number_of_chars_sentence, number_of_chars_candidate,
             number_capitals_word, starting_index, number_capitals_sentence, number_of_words_article,
             number_of_sentences_article, appearance_count, keyword_appear, candidate_word_count]
    
    return vector

# Identify training data from corpus

Here we use company names as positive samples, and CEO names as negative samples

In [4]:
# find positive and negative matches to train against
pos_matches = []
neg_matches = []
start_time = time.time()
# find matches with word, sentence_index, article_index
for article_index, article in enumerate(list_of_articles):
    if article_index % 70 == 0:
        print("Percentage Completed: {0:.0%}".format(article_index / len(list_of_articles)))
    for sentence_index, sentence in enumerate(article):
        for company in companies:
            if company in sentence:
                pos_matches.append((company, sentence_index, article_index))
        for ceo in cleaned_ceos:
            if ceo in sentence:
                neg_matches.append((ceo, sentence_index, article_index))
                
print("Time to complete: {}".format(time.time() - start_time))

Percentage Completed: 0%
Percentage Completed: 10%
Percentage Completed: 19%
Percentage Completed: 29%
Percentage Completed: 38%
Percentage Completed: 48%
Percentage Completed: 58%
Percentage Completed: 67%
Percentage Completed: 77%
Percentage Completed: 86%
Percentage Completed: 96%
Time to complete: 395.7308769226074


### Build feature vectors for positive and negative samples

In [5]:
# Add Labels
print("Starting Positive samples...")
start_time = time.time()
pos_data = list(map(lambda x: build_feature_vector(x), pos_matches))
for vector in pos_data:
    vector.append(1)
print("Number of Positive samples: {}".format(len(pos_data)))
print("Time to finish positive samples: {}".format(time.time() - start_time))

start_time = time.time()
print("Starting Negative samples...")
neg_data = list(map(lambda x: build_feature_vector(x), neg_matches))
for vector in neg_data:
    vector.append(0)
print("Number of Negative samples: {}".format(len(neg_data)))
print("Time to finish negative samples: {}".format(time.time() - start_time))

Starting Positive samples...
Number of Positive samples: 191661
Time to finish positive samples: 335.9750978946686
Starting Negative samples...
Number of Negative samples: 38739
Time to finish negative samples: 69.47800898551941


## Random Forest

Split the data into training and testing data, tune parameters

In [6]:
# combine the data
combined_data = pd.DataFrame(pos_data + neg_data)
x_values = combined_data.drop(11, axis = 1)
y_values = combined_data[11]

x_train, x_test, y_train, y_test = train_test_split(x_values, y_values, test_size=0.33)

rfc = RandomForestClassifier(n_estimators = 30)
rfc.fit(x_train, y_train)
rf_fit_values = rfc.predict(x_train)
confus = confusion_matrix(y_train, rf_fit_values)
print(confus)
accuracy = (confus[0,0] + confus[1,1]) / sum(sum(confus))
precision = (confus[0,0] / (confus[0,0] + confus[0,1]))
recall = (confus[0,0] / (confus[0,0] + confus[1,0]))
print("Accuracy: {}".format(accuracy))
print("Precision: {}".format(precision))
print("Recall: {}".format(recall))

[[ 25658    304]
 [   285 128121]]
Accuracy: 0.996184442371476
Precision: 0.9882905785378631
Recall: 0.9890143776741317


#### Testing data

In [7]:
rf_fit_values = rfc.predict(x_test)
confus = confusion_matrix(y_test, rf_fit_values)
print(confus)
accuracy = (confus[0,0] + confus[1,1]) / sum(sum(confus))
precision = (confus[0,0] / (confus[0,0] + confus[0,1]))
recall = (confus[0,0] / (confus[0,0] + confus[1,0]))
print("Accuracy: {}".format(accuracy))
print("Precision: {}".format(precision))
print("Recall: {}".format(recall))

[[ 7773  5004]
 [ 2815 60440]]
Accuracy: 0.8971617213804713
Precision: 0.6083587696642404
Recall: 0.7341329807329052


### Identify potential companies

We use a regex that follows this idea:

* This Matches The Regex

It gathers all the words that have capital letters to start in a sequence

For instance:

* "This would only match the first word" would return "This" as a match

In [8]:
# identify which sentences may have a company in it
def identify_potential_sentence(sentence):
    matches = re.findall(r"(?:(?:[A-Z]+[a-z]*) ?)+", sentence)
    if matches:
        return matches
    else:
        return False

# removes the stop words from a found match
def remove_stop_words(word):
    cleaned_word = ' '.join([x for x in word.split(' ') if x.lower() not in stop_words]).rstrip()
    return cleaned_word

# find positive and negative matches to train against
potential_matches = []
start_time = time.time()
# find matches with word, sentence_index, article_index
# only select matches that are their own word
for article_index, article in enumerate(list_of_articles):
    if article_index % 70 == 0:
        print("Percentage Completed: {0:.0%}".format(article_index / len(list_of_articles)))
    for sentence_index, sentence in enumerate(article):
        matches = identify_potential_sentence(sentence) 
        if matches:
            filtered_matches = []
            # filter out the matches with annoying APBloomberg or AP stuff in it
            for match in matches:
                if " " + match + " " in sentence:
                    filtered_matches.append(match)
                elif sentence[:len(match)+1 == match + " "]:
                    filtered_matches.append(match)                 
                elif sentence[-len(match)+1:] == " " + match:
                    filtered_matches.append(match)
            cleaned_matches = [remove_stop_words(x) for x in filtered_matches]
            remove_empty_words = [x for x in cleaned_matches if x != '']
            for match in remove_empty_words:
                potential_matches.append((match, sentence_index, article_index))

potential_matches = list(set(potential_matches))
print("Number of potential matches: {}".format(len(potential_matches)))
print("Time to complete: {}".format(time.time() - start_time))

Percentage Completed: 0%
Percentage Completed: 10%
Percentage Completed: 19%
Percentage Completed: 29%
Percentage Completed: 38%
Percentage Completed: 48%
Percentage Completed: 58%
Percentage Completed: 67%
Percentage Completed: 77%
Percentage Completed: 86%
Percentage Completed: 96%
Number of potential matches: 15301
Time to complete: 6.928769826889038


## Build feature vectors for the potential samples

In [9]:
# Add Labels
print("Starting classification of potential samples...")
start_time = time.time()
new_data = list(map(lambda x: build_feature_vector(x), potential_matches))
print("Number of samples: {}".format(len(new_data)))
print("Time to finish: {}".format(time.time() - start_time))

Starting classification of potential samples...
Number of samples: 15301
Time to finish: 27.088132858276367


### Classify the potential samples based on the model and output

In [10]:
rf_fit_values = rfc.predict(new_data)
classified_companies = []
for index, classification in enumerate(rf_fit_values):
    if classification == 1:
        classified_companies.append(potential_matches[index][0])

In [11]:
found_companies = set(classified_companies)
output = set(list(found_companies) + list(companies))
pd.Series(list(output)).to_csv("found_companies.csv", header = False, index = False)