# CEOs

In [1]:
# import necessary packages
import nltk
import time
import unidecode
import ftfy
import re
import numpy as np
import ast

from nltk.corpus import stopwords

import pandas as pd
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

### Load in the data, clean the training CEO data set

We clean the CEO data by stripping away any of the extra white space, and combining first and last names into one columns

In [2]:
# load in the corpus, stop_words, and cleaned companies
sentences = pd.read_csv("all_sentences.csv")
sentences = [x[0] for x in sentences.values if x is not None]
stop_words = set(stopwords.words('english'))
companies = pd.read_csv("all/companies.csv", encoding = "latin-1", header = -1)[0].tolist()
# remove duplicate sentences
companies = list(set(companies))
# articles
articles = pd.read_csv("articles_sentences.csv")
list_of_articles = [ast.literal_eval(x) for x in articles['Article'].values]

# load in CEOs for negative samples?
ceos = pd.read_csv("all/ceo.csv", encoding = "latin-1", header = -1).replace(np.nan, '', regex = True)
ceos[2] = (ceos[0].str.rstrip() + " " + ceos[1].str.rstrip()).str.rstrip()
cleaned_ceos = list(set(ceos[2].tolist()))

print("Number of Articles: {}".format(len(list_of_articles)))
print("Number of Total Sentences: {}".format(len(sentences)))
print("Number of Unique Companies: {}".format(len(companies)))
print("Number of Unique Ceos: {}".format(len(cleaned_ceos)))

Number of Articles: 730
Number of Total Sentences: 708281
Number of Unique Companies: 2592
Number of Unique Ceos: 1291


### Build the feature vector
* number of words in the sentence
* number of characters in the sentence
* number of characters in the potential CEO
* number of capitals in the potential CEO
* number of capitals in the sentence
* starting position of the word in the sentence
* number of words in the article
* number of sentences in the article
* number of times the word appears in the article
* 1 if the article contains a keyword from the first set of keywords
* length of the first and last name
* number of times the first and last time appear in the article
* number of times the second set of keywords appear in the sentence and article
* number of words in the potential CEO name

In [3]:
keywords = set(['Inc', 'inc', 'Corp', 'corp', 'corporation', 'Co', 'company', 'Company', 'Group', 'Ltd', 'ltd', 'Capital', 'capital',
               'management', 'Management', 'Financial', 'financial', 'consulting', 'Consulting', 'Depot'])

keywords2 = set(['CEO', 'Chief', 'Executive', 'executive', 'ceo', 'Officer', 'Company', 'company'])

def build_feature_vector(word_tuple):
    '''
    word_tuple: (word, sentence_index, article_index)
    '''
    word = word_tuple[0]
    sentence_index = word_tuple[1]
    article_index = word_tuple[2]
    
    sentence = list_of_articles[article_index][sentence_index]
    list_of_sentences = list_of_articles[article_index]
    # number of words in the sentence
    number_of_words = len(sentence.split(' '))
    # number of characters in the sentence
    number_of_chars_sentence = len(sentence)
    # number of characters in the target
    number_of_chars_candidate = len(word)
    # number of capitals in the target word
    number_capitals_word = sum(1 for x in word if x.isupper())
    # starting position in the sentence
    starting_index = sentence.find(word)
    # number of capitals sentence
    number_capitals_sentence = sum(1 for x in sentence if x.isupper())
    # number of words in the article
    sum_article = 0
    for x in list_of_sentences:
        sum_article += len(x.split(' '))
    number_of_words_article = sum_article
    # number of sentences in the article
    number_of_sentences_article = len(list_of_sentences)
    # number of times the word appears in the article
    total_string = ' '.join(list_of_sentences)
    appearance_count = total_string.count(word)
    # 1 if the word contains a keyword
    keyword_appear = 0
    for subword in word.split(' '):
        if subword in keywords:
            keyword_appear = 1
            break
    
    # len of first and last name
    split_words = word.split(' ')
    first_name = split_words[0]
    
    if len(split_words) > 1:
        last_name = split_words[1]
    else:
        last_name = ''
    
    # first_name appearance count
    first_name_count = total_string.count(" " + first_name + " ")
    
    if last_name == '':
        last_name_count = 0
    else:
        last_name_count = total_string.count(" " + last_name + " ")
    
    sentence_keyword = 0
    article_keyword = 0
    for keyword in keywords2:
        sentence_keyword += sentence.count(keyword)
        article_keyword += total_string.count(keyword)
    
    # number of words in the target word
    candidate_word_count = len(word.split(' '))
    
    vector = [number_of_words, number_of_chars_sentence, number_of_chars_candidate,
             number_capitals_word, starting_index, number_capitals_sentence, number_of_words_article,
             number_of_sentences_article, appearance_count, keyword_appear, candidate_word_count, first_name_count, last_name_count,
             sentence_keyword, article_keyword]
    
    return vector

### Identify candidate names 

Here we identify candidate names by selecting patterns of two words with capital letters:
* Test Word

would satisfy the candidate

We then clean the word by removing stop words and also filtering out any potential words if they are in the bad keyword list. This was refined after visually inspecing potential keywords

In [4]:
# filter keywords
bad_keywords = set(['Inc', 'inc', 'Corp', 'corp', 'corporation', 'Co', 'company', 'Company', 'Group', 'Ltd', 'ltd', 'Capital', 'capital',
               'management', 'Management', 'Financial', 'financial', 'consulting', 'Consulting', 'Depot', 'China', 'USA', 'Asia', 'North America',
                   'Administration', 'Department', 'Business', 'Industry', 'Institute', 'United', 'States', 'Asia', 'Europe', 'New', 'York', 'Chicago',
                   'Houston', 'Los', 'Angeles', 'National', 'President', 'Representative', 'House', 'Representatives', 'Senator', 'CFO', 
                   'Mojave', 'Desert', 'Olympics', 'Obama', 'Secretary', 'General', 'Inspector', 'Advisor', 'Economic', 'Atlantic', 'Gulf', 'Pacific', 'Ocean', 
                   'Finance', 'Wall Street', 'Wall', 'Street', 'Federal', 'Affordable', 'Republicans', 'Democrats', 'Congressional', 'Aviation', 'Internet', 'Hong', 
                   'Kong', 'Beijing', 'Africa', 'Russia', 'Government', 'Research', 'Council', 'Public', 'Service', 'Mobility', 'Bitcoin', 'Economy', 'Commodity',
                   'Prices', 'Presentation', 'Citi', 'Navy', 'Jewish', 'Muslim', 'Journal', 'British', 'Zillow', 'Egypt', 'Congo', 'Kitchen', 'Thrift', 'Savings', 
                   'Director', 'Iraq', 'Iran', 'War', 'Saudi', 'Arabia', 'Oil', 'Turkey', 'Greece', 'Investment', 'Production', 'User', 'Experience', 'Western',
                   'Eastern', 'Bank', 'Access', 'Debt', 'Growth', 'Resources', 'Brazil', 'Mexico', 'Canada', 'Canadian', 'American', 'English', 'Chinese', 
                   'Dangerous'])


# identify which sentences may have a company in it
def identify_potential_sentence(sentence):
    matches = re.findall(r"(?=([A-Z][a-z]+ [A-Z][a-z]+))", sentence)
    if matches:
        return matches
    else:
        return False

# removes the stop words from a found match
def remove_stop_words(word):
    cleaned_word = ' '.join([x for x in word.split(' ') if x.lower() not in stop_words]).rstrip()
    return cleaned_word

# find positive and negative matches to train against
potential_matches = []
start_time = time.time()
# find matches with word, sentence_index, article_index
# only select matches that are their own word
for article_index, article in enumerate(list_of_articles):
    if article_index % 70 == 0:
        print("Percentage Completed: {0:.0%}".format(article_index / len(list_of_articles)))
    for sentence_index, sentence in enumerate(article):
        matches = identify_potential_sentence(sentence) 
        if matches:
            filtered_matches = []
            # filter out the matches with annoying APBloomberg or AP stuff in it
            for match in matches:
                if " " + match + " " in sentence:
                    filtered_matches.append(match)
                elif sentence[:len(match)+1 == match + " "]:
                    filtered_matches.append(match)                 
                elif sentence[-len(match)+1:] == " " + match:
                    filtered_matches.append(match)
            
            filtered_2 = []
            for match in filtered_matches:
                split_word = match.split(' ')
                if split_word[0] not in bad_keywords and split_word[1] not in bad_keywords:
                    filtered_2.append(match)
                    
            cleaned_matches = [remove_stop_words(x) for x in filtered_2]
            remove_empty_words = [x for x in cleaned_matches if x != '']
            for match in remove_empty_words:
                potential_matches.append((match, sentence_index, article_index))

potential_matches = list(set(potential_matches))
print("Time to complete: {}".format(time.time() - start_time))

Percentage Completed: 0%
Percentage Completed: 10%
Percentage Completed: 19%
Percentage Completed: 29%
Percentage Completed: 38%
Percentage Completed: 48%
Percentage Completed: 58%
Percentage Completed: 67%
Percentage Completed: 77%
Percentage Completed: 86%
Percentage Completed: 96%
Time to complete: 4.2578887939453125


## Build positive and negative matches from the corpus from the training set

Here, we use positive samples as the training CEO data set, and the negative as the training company data set

In [5]:
# find positive and negative matches to train against
pos_matches = []
neg_matches = []
start_time = time.time()
# find matches with word, sentence_index, article_index
for article_index, article in enumerate(list_of_articles):
    if article_index % 70 == 0:
        print("Percentage Completed: {0:.0%}".format(article_index / len(list_of_articles)))
    for sentence_index, sentence in enumerate(article):
        for company in companies:
            if company in sentence:
                neg_matches.append((company, sentence_index, article_index))
        for ceo in cleaned_ceos:
            if ceo in sentence:
                pos_matches.append((ceo, sentence_index, article_index))
                
print("Time to complete: {}".format(time.time() - start_time))

Percentage Completed: 0%
Percentage Completed: 10%
Percentage Completed: 19%
Percentage Completed: 29%
Percentage Completed: 38%
Percentage Completed: 48%
Percentage Completed: 58%
Percentage Completed: 67%
Percentage Completed: 77%
Percentage Completed: 86%
Percentage Completed: 96%
Time to complete: 413.20392179489136


### Build feature vectors for positive and negative samples

In [6]:
# Add Labels
print("Starting Positive samples...")
start_time = time.time()
pos_data = list(map(lambda x: build_feature_vector(x), pos_matches))
for vector in pos_data:
    vector.append(1)
print("Number of Positive samples: {}".format(len(pos_data)))
print("Time to finish positive samples: {}".format(time.time() - start_time))

start_time = time.time()
print("Starting Negative samples...")
neg_data = list(map(lambda x: build_feature_vector(x), neg_matches))
for vector in neg_data:
    vector.append(0)
print("Number of Negative samples: {}".format(len(neg_data)))
print("Time to finish negative samples: {}".format(time.time() - start_time))

Starting Positive samples...
Number of Positive samples: 38739
Time to finish positive samples: 109.19758582115173
Starting Negative samples...
Number of Negative samples: 191661
Time to finish negative samples: 557.5668721199036


### Build feature vectors for potential ceo samples

In [7]:
# Add Labels
print("Starting classification of potential samples...")
start_time = time.time()
new_data = list(map(lambda x: build_feature_vector(x), potential_matches))
print("Number of samples: {}".format(len(new_data)))
print("Time to finish: {}".format(time.time() - start_time))

Starting classification of potential samples...
Number of samples: 189502
Time to finish: 540.3816339969635


### Train the model, Random Forest

Split the given data into training and testing set, 2/3 - 1/3 split. We then train a random forest and tune parameters using the test set.

In [8]:
# combine the data
combined_data = pd.DataFrame(pos_data + neg_data)
x_values = combined_data.drop(15, axis = 1)
y_values = combined_data[15]

x_train, x_test, y_train, y_test = train_test_split(x_values, y_values, test_size=0.33)

rfc = RandomForestClassifier(n_estimators = 40)
rfc.fit(x_train, y_train)
rf_fit_values = rfc.predict(x_train)
confus = confusion_matrix(y_train, rf_fit_values)
print(confus)
accuracy = (confus[0,0] + confus[1,1]) / sum(sum(confus))
precision = (confus[0,0] / (confus[0,0] + confus[0,1]))
recall = (confus[0,0] / (confus[0,0] + confus[1,0]))
print("Accuracy: {}".format(accuracy))
print("Precision: {}".format(precision))
print("Recall: {}".format(recall))

[[128071    280]
 [   261  25756]]
Accuracy: 0.9964953876451078
Precision: 0.9978184821310313
Recall: 0.9979662126359754


### Evaluate the test data

In [9]:
rf_fit_values = rfc.predict(x_test)
confus = confusion_matrix(y_test, rf_fit_values)
print(confus)
accuracy = (confus[0,0] + confus[1,1]) / sum(sum(confus))
precision = (confus[0,0] / (confus[0,0] + confus[0,1]))
recall = (confus[0,0] / (confus[0,0] + confus[1,0]))
print("Accuracy: {}".format(accuracy))
print("Precision: {}".format(precision))
print("Recall: {}".format(recall))

[[61687  1623]
 [ 4793  7929]]
Accuracy: 0.9156144781144782
Precision: 0.974364239456642
Recall: 0.9279031287605295


### Run the model on the samples pulled from the corpus, classify and output

In [10]:
rf_fit_values = rfc.predict(new_data)
classified_ceos = []
for index, classification in enumerate(rf_fit_values):
    if classification == 1:
        classified_ceos.append(potential_matches[index][0])
found_ceos = set(classified_ceos)

In [11]:
output = list(set(list(found_ceos) + cleaned_ceos))
pd.Series(output).to_csv("found_ceos.csv", index = False, header = False)