In [None]:
# Reads in raw text (ocr-ed pdfs of Wall Street Journal and New York Times articles about CEO announcements), 
#      cleans text, constructs features (tfidf scores of ngrams), tunes hyperparameters for SVM and RF classifiers,
#      lists most important features for each classifier (and for a combined feature-importance score) for the total
#      sample period (1950-2015) and for three sub-periods: prior to 1990 when outside CEO hire rates were low,
#      1990-2000 when outside CEO hiring surged at large public US corporations during the 1990s economic boom, 
#      and 2001-2015 when outside CEO hiring leveled off during a period of relative economic volatility.

In [1]:
import os 
import pandas as pd
import numpy as np
import string
import math
import re
import random
import matplotlib.pyplot as plt
import sklearn.metrics as met   
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.svm import LinearSVC, SVC
from sklearn.ensemble import RandomForestClassifier
from nltk import word_tokenize, sent_tokenize  
from spacy.lang.en.stop_words import STOP_WORDS

# Prepare texts

In [2]:
# IMPORT DATA
df_prelim = pd.read_csv("ceo_articles.csv")
df_prelim = df_prelim.rename(columns = {"text": "raw_text"}) # preserve raw text for reference

In [3]:
# REMOVE UNWANTED MATERIAL FROM BEGINNING AND END OF ARTICLES

def clip_front_junk(string):
    """Read in article as string and remove unwanted material at beginning (headers, proquest info, etc)."""
    s_check = string.lower()
    if "abstract" in s_check:
        string = string[s_check.index("abstract") + 8:]
        s_check = s_check[s_check.index("abstract") + 8:]
        return string[s_check.index("full text") + 9:]
    elif "full text" in s_check:
        return string[s_check.index("full text") + 9:]
    elif "proquest" in s_check:
        return string[s_check.index("proquest") + 8:]
    elif "wall street journal" in s_check:
        return string[s_check.index("wall street journal") + 19:]
    elif "new york times" in s_check:
        return string[s_check.index("new york times") + 19:]
    else:
        return string
    
def clip_end_junk(string):
    """Read in article as string and remove unwanted material at end (copywrite notice, proquest info, etc)."""
    s_check = string.lower()
    if "credit:" in s_check:
        string = string[:s_check.index("credit:")]
    if "subject:" in s_check:
        string = string[:s_check.index("subject:")]
    if "details subject" in s_check:
        string = string[:s_check.index("details subject")]
    if "issn" in s_check:
        string = string[:s_check.index("issn")]
    return string

df_prelim['text'] = df_prelim.raw_text.apply(clip_front_junk).apply(clip_end_junk)

#won't clip front material if it cuts out most of the article (usually due to bad OCR)
df_prelim['len_text'] = df_prelim.text.map(str.split).map(len)
df_prelim.loc[df_prelim.len_text < 25, 'text'] = df_prelim.raw_text
df_prelim = df_prelim.drop(['len_text'], axis=1)

In [4]:
# MORE CLEANING

def clean(text):
    """General cleaning of text, mostly replacements that need to be made."""
    # remove hyphens at end of line
    text = text.replace("- ", "")
    text = text.replace("-", "")
    # remove line at end of every photocopied page
    text = text.replace("Reproduced with permission of the copyright owner. Further reproduction prohibited without permission.", "")
    # some prominent typos
    text = text.replace("gen eral", "general")
    text = text.replace("chatrman", "chairman")
    text = text.replace("sald", "said")
    text = text.replace("ta", "to")
    text = text.replace("retoiling", "retailing")
    text = text.replace("stotement", "statement")
    return text.strip()

df_prelim['text'] = df_prelim.text.map(clean)   

def turnaround(text):   
    """Based on close readings, I know there are a lot of ways to say this. Basically a corpus-specific lemma."""
    tokens = text.split()
    for idx, word in enumerate(tokens.copy()[:-1]):
        if word.startswith("turn") and tokens[idx + 1] == "around":
            tokens[idx] = "turnaround"
    text = ' '.join(tokens)
    text = text.replace("turnaround around", "turnaround")
    return text.strip()

def losangeles(text):   
    """Another corpus-specific lemma. Here particularly wanted to avoid multiple occurences of uni/bigrams."""
    tokens = text.split()
    for idx, word in enumerate(tokens.copy()[:-1]):
        if tokens[idx] == "Los" and tokens[idx + 1] == "Angeles":
            tokens[idx] = "losangeles"
    text = ' '.join(tokens)
    text = text.replace("losangeles Angeles", "losangeles")
    return text.strip()

df_prelim['text'] = df_prelim.text.apply(clean).apply(turnaround).apply(losangeles)

In [5]:
# ISOLATE SENTENCES ABOUT INCOMING CEO: FIRST SPLIT INTO SENTENCES AND GET CEO LAST NAME

df_prelim['sentences'] = df_prelim.text.map(sent_tokenize)

def extract_CEO_lastname(string):
    name = string.split()
    if name[-1] == "Jr." or name[-1] == "Jr" or name[-1] == "II" or name[-1] == "III":
        if name[-2][-1] == ",":
            return name[-2][:-1]
        else:
            return name[-2]
    else:
        return name[-1]

df_prelim['CEO_lastname'] = df_prelim.CEO.map(extract_CEO_lastname)  

In [6]:
# NOW IDENTIFY CEO SENTENCES

def CEO_match_yn(sentence, ceo_lastname):
    if ceo_lastname in sentence:
        return True
    for w in sentence.split():
        char_match = 0
        w2 = w[:]
        for c in ceo_lastname:
            if c in w2:
                char_match += 1
                idx = w2.find(c)
                w2 = w2[:idx] + w2[idx + 1:]
        match_pct = char_match / len(ceo_lastname)
        if match_pct >= .5 and ceo_lastname[0] == w[0] and ceo_lastname[-1] == w[-1]:
            return True
    else:
        return False

def CEO_sentences(sentences, ceo_lastname):
    ceo_sentences = []
    for i, sent in enumerate(sentences):
        if CEO_match_yn(sent, ceo_lastname):
            ceo_sentences.append(sent)
        elif re.search("(^|[^a-z])he[^a-z]", sent) and ceo_lastname in sentences[i - 1]:
            ceo_sentences.append(sent)
        elif re.search("(^|[^a-z])him[^a-z]", sent) and ceo_lastname in sentences[i - 1]:
            ceo_sentences.append(sent)
        elif re.search("(^|[^a-z])his[^a-z]", sent) and ceo_lastname in sentences[i - 1]:
            ceo_sentences.append(sent)
        elif re.search("(^|[^a-z])she[^a-z]", sent) and ceo_lastname in sentences[i - 1]:
            ceo_sentences.append(sent)
        elif re.search("(^|[^a-z])her[^a-z]", sent) and ceo_lastname in sentences[i - 1]:
            ceo_sentences.append(sent)
    return ceo_sentences

df_prelim['ceo_sentences_list'] = list(map(CEO_sentences, df_prelim.sentences, df_prelim.CEO_lastname))

In [8]:
# CONCATENATE CEO SENTENCES AND REMOVE MISSING DATA

df_prelim['text_ceo'] = df_prelim.ceo_sentences_list.map(' '.join)

#print(len(df_prelim[df_prelim.text_ceo == ""])) #14 records to drop
df = df_prelim[df_prelim.text_ceo != ""].reset_index(drop=True)

# Train-test split

In [9]:
# ADD VARIABLES FOR PERIOD

df['year10'] = df.year.div(10).map(np.floor).map(lambda x: x*10).map(int)

df['year_cat'] = df.year10
df.year_cat = df.year_cat.replace(1950, 1980)
df.year_cat = df.year_cat.replace(1960, 1980)
df.year_cat = df.year_cat.replace(1970, 1980)
df.year_cat = df.year_cat.replace(2010, 2000)
df.at[df.index[df.year == 2000], 'year_cat'] = 1990

df['period_out'] = 1
df.at[df.index[(df.outside_hire == 1) & (df.year_cat == 1980)], 'period_out'] = 2
df.at[df.index[(df.outside_hire == 0) & (df.year_cat == 1990)], 'period_out'] = 3
df.at[df.index[(df.outside_hire == 1) & (df.year_cat == 1990)], 'period_out'] = 4
df.at[df.index[(df.outside_hire == 0) & (df.year_cat == 2000)], 'period_out'] = 5
df.at[df.index[(df.outside_hire == 1) & (df.year_cat == 2000)], 'period_out'] = 6

In [10]:
# SPLIT INTO TRAINING AND TESTING SETS
X_train, X_test, Y_train, Y_test = train_test_split(df.text_ceo,
                                                    df.outside_hire,
                                                    test_size=0.2,
                                                    random_state=0,
                                                    stratify=df.period_out)
print('Size of Training Data ', X_train.shape[0])
print('Size of Test Data ', X_test.shape[0])

Size of Training Data  1104
Size of Test Data  276


# Feature construction

In [14]:
# STOP WORDS

basic_stopwords = STOP_WORDS # from spacy
basic_stopwords.add("going") # only shows up as synonym with "will"
#print(len(basic_stopwords)) # 327

# remove ceo and firm names
ceo_names = set([item for sublist in df.CEO.apply(str.lower).apply(str.split) for item in sublist])
firm_names = set([item for sublist in df.Company_ID.apply(str.lower).apply(str.split) for item in sublist])

# add back in some important words removed as part of firm names
keep_names = set(['bank', 'bankers', 'banking', 'insurance', 'financial', 'good', 
                    'business', 'company', 'companies', 'commercial', 'computer', 'information', 
                    'industries', 'international', 'manufacturing', 'manufacturers', 
                    'natural', 'new', 'producing', 'products','regulator', 'solutions', 'trust', 'young'])

added_stopwords = ceo_names.union(firm_names)
added_stopwords.add("mr") # to avoid, for instance, "said mr" in addition to "said"
added_stopwords.add(("proquest", "historical", "newspapers")) # meaningless proquest info
added_stopwords = set([w for w in added_stopwords if w not in keep_names])
#print(len(added_stopwords)) # 1248

In [16]:
# TFIDF SCORES

def my_tokenizer(doc):
    NGRAM_RANGE = (1, 4)    
    unigrams = word_tokenize(remove_punctuation(doc))
    tokens = unigrams
    for n in range((NGRAM_RANGE[0] + 1), (NGRAM_RANGE[1] + 1)):
        ngrams = [' '.join(tuple) for tuple in zip(*[unigrams[i:] for i in range(n)])]
        tokens = tokens + ngrams
    final_tokens = []
    for t in tokens:
        t_split = t.split()
        if t_split[0] not in basic_stopwords:
            if t_split[-1] not in basic_stopwords:
                if len(added_stopwords.intersection(t_split)) == 0:
                    final_tokens.append(t)
    return final_tokens

def remove_punctuation(s):
    s = s.replace("'s", "")
    s = s.replace("`s", "")
    s = s.replace("’s", "")
    return s.translate(str.maketrans('', '', string.punctuation + '”“‘’—'))

tfidf_sklearn = TfidfVectorizer(tokenizer=my_tokenizer, min_df=25, max_df=0.7) # high min_df to drop firm or ceo names
tfidf_scores = tfidf_sklearn.fit_transform(df.text_ceo)
#tfidf_scores.shape # 1,106 tokens

In [17]:
# DEFINE PERIOD SUBSETS FOR TRAIN/TEST SETS 

#train
X_train_tf = tfidf_sklearn.transform(X_train)

tf_train = pd.DataFrame(X_train_tf.toarray(), columns = tfidf_sklearn.get_feature_names_out())
X_train_df = pd.DataFrame(X_train)
X_train_df['year_cat'] = df.year_cat # matches based on index
tf_train['year_cat'] = X_train_df.reset_index()['year_cat']
tf_train['outside_hire'] = Y_train.reset_index()['outside_hire']

X_train_pre1990 = tf_train[tf_train.year_cat==1980].drop(['year_cat', 'outside_hire'], axis=1)
Y_train_pre1990 = tf_train[tf_train.year_cat==1980].outside_hire
X_train_1990s = tf_train[tf_train.year_cat==1990].drop(['year_cat', 'outside_hire'], axis=1)
Y_train_1990s = tf_train[tf_train.year_cat==1990].outside_hire
X_train_post2000 = tf_train[tf_train.year_cat==2000].drop(['year_cat', 'outside_hire'], axis=1)
Y_train_post2000 = tf_train[tf_train.year_cat==2000].outside_hire


#test
X_test_tf = tfidf_sklearn.transform(X_test)

tf_test = pd.DataFrame(X_test_tf.toarray(), columns = tfidf_sklearn.get_feature_names_out())
X_test_df = pd.DataFrame(X_test)
X_test_df['year_cat'] = df.year_cat # matches based on index
tf_test['year_cat'] = X_test_df.reset_index()['year_cat']
tf_test['outside_hire'] = Y_test.reset_index()['outside_hire']

X_test_pre1990 = tf_test[tf_test.year_cat==1980].drop(['year_cat', 'outside_hire'], axis=1)
Y_test_pre1990 = tf_test[tf_test.year_cat==1980].outside_hire
X_test_1990s = tf_test[tf_test.year_cat==1990].drop(['year_cat', 'outside_hire'], axis=1)
Y_test_1990s = tf_test[tf_test.year_cat==1990].outside_hire
X_test_post2000 = tf_test[tf_test.year_cat==2000].drop(['year_cat', 'outside_hire'], axis=1)
Y_test_post2000 = tf_test[tf_test.year_cat==2000].outside_hire

# Hyperparameter Tuning

In [18]:
# BALANCE DATA THROUGH UNDERSAMPLING
# (Note: SMOTE oversampling is less appropriate for sparse data like text ngrams)

N_train = Y_train[Y_train == 1].sum() 
df_temp = pd.DataFrame(X_train_tf.toarray())
df_temp['outside_hire'] = Y_train.reset_index()['outside_hire']
df_temp_outs = df_temp[df_temp.outside_hire == 1]
df_temp_ins = df_temp[df_temp.outside_hire == 0].sample(N_train, random_state=0)
df_balanced = pd.concat([df_temp_outs, df_temp_ins])
X_tr_bal = df_balanced.drop(['outside_hire'], axis=1)
Y_tr_bal = df_balanced.outside_hire

In [19]:
# SVM HYPERPARAMETER TUNING USING GRID SEARCH

param_grid = {
    'C': [0.1, 1, 10, 100], 
    'kernel': ['linear']
}
grid = GridSearchCV(SVC(random_state=0), param_grid, refit=False, cv=5, scoring='accuracy')
grid.fit(X_tr_bal, Y_tr_bal)

best_params = grid.best_params_
print("Best alpha parameter identified by grid search ", best_params)

pd.set_option('display.max_colwidth', None)
gridsearch_results = pd.DataFrame(grid.cv_results_)
gridsearch_results[['rank_test_score', 'mean_test_score',
                    'params']].sort_values(by=['rank_test_score'])

Best alpha parameter identified by grid search  {'C': 1, 'kernel': 'linear'}


Unnamed: 0,rank_test_score,mean_test_score,params
1,1,0.80302,"{'C': 1, 'kernel': 'linear'}"
2,2,0.759466,"{'C': 10, 'kernel': 'linear'}"
3,2,0.759466,"{'C': 100, 'kernel': 'linear'}"
0,4,0.633217,"{'C': 0.1, 'kernel': 'linear'}"


In [20]:
# RF HYPERPARAMETER TUNING USING GRID SEARCH

param_grid = {
    'n_estimators': [50, 100, 200],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [None, 3, 5, 10]
}
grid = GridSearchCV(RandomForestClassifier(random_state=0), param_grid, refit=False, cv=5, scoring='accuracy')
grid.fit(X_tr_bal, Y_tr_bal)

best_params = grid.best_params_
print("Best alpha parameter identified by grid search ", best_params)

gridsearch_results = pd.DataFrame(grid.cv_results_)
gridsearch_results[['rank_test_score', 'mean_test_score',
                    'params']].sort_values(by=['rank_test_score'])[:12]

Best alpha parameter identified by grid search  {'max_depth': None, 'max_features': 'sqrt', 'n_estimators': 50}


Unnamed: 0,rank_test_score,mean_test_score,params
0,1,0.778862,"{'max_depth': None, 'max_features': 'sqrt', 'n_estimators': 50}"
1,2,0.769338,"{'max_depth': None, 'max_features': 'sqrt', 'n_estimators': 100}"
13,3,0.759582,"{'max_depth': 5, 'max_features': 'sqrt', 'n_estimators': 100}"
2,4,0.75935,"{'max_depth': None, 'max_features': 'sqrt', 'n_estimators': 200}"
12,5,0.750174,"{'max_depth': 5, 'max_features': 'sqrt', 'n_estimators': 50}"
6,6,0.750058,"{'max_depth': 3, 'max_features': 'sqrt', 'n_estimators': 50}"
19,6,0.750058,"{'max_depth': 10, 'max_features': 'sqrt', 'n_estimators': 100}"
4,8,0.749942,"{'max_depth': None, 'max_features': 'log2', 'n_estimators': 100}"
8,9,0.745412,"{'max_depth': 3, 'max_features': 'sqrt', 'n_estimators': 200}"
5,10,0.745064,"{'max_depth': None, 'max_features': 'log2', 'n_estimators': 200}"


# Train Classifiers and Bootstrap

In [39]:
##### SELECT INPUT DATA (TOTAL PERIOD, PRE-1990, 1990s, POST-2000) #####

#X_input_train = pd.DataFrame(X_train_tf.toarray())
#Y_input_train = Y_train
#X_input_test = pd.DataFrame(X_test_tf.toarray())
#Y_input_test = Y_test

#X_input_train = X_train_pre1990
#Y_input_train = Y_train_pre1990
#X_input_test = X_test_pre1990
#Y_input_test = Y_test_pre1990

#X_input_train = X_train_1990s
#Y_input_train = Y_train_1990s
#X_input_test = X_test_1990s
#Y_input_test = Y_test_1990s

#X_input_train = X_train_post2000
#Y_input_train = Y_train_post2000
#X_input_test = X_test_post2000
#Y_input_test = Y_test_post2000

In [40]:
# ONE-HOT CLASSIFIER

X_train_oh = X_input_train.reset_index(drop=True).astype(bool)
X_train_oh['outside_hire'] = Y_input_train.reset_index()['outside_hire']
oh_scores = X_train_oh[X_train_oh.outside_hire == 1].mean(axis=0) - X_train_oh[X_train_oh.outside_hire == 0].mean(axis=0)
oh_scores = oh_scores.drop(['outside_hire'])

In [41]:
# SET RANDOM SEEDS TO MAKE BOOTSTRAPPING REPLICABLE

#seeds = [random.randint(0,1000000) for _ in range(100)]
#print(seeds)
seeds = [68205, 944716, 279319, 900751, 717488, 230310, 700563, 35648, 
        197274, 27083, 662368, 333356, 675971, 533482, 958546, 567334, 
        872298, 230929, 259618, 98065, 892905, 655327, 991298, 9426, 
        130288, 933858, 127716, 856209, 172686, 293803, 758790, 510045, 
        534425, 296243, 832123, 616404, 867188, 544862, 219853, 104131, 
        763319, 941494, 402384, 183838, 569661, 669144, 349585, 844692, 
        188662, 254168, 523485, 665488, 659273, 744117, 165509, 75766, 
        442257, 235121, 298426, 282483, 812921, 911418, 865758, 319980, 
        347712, 290291, 789076, 224440, 195829, 879757, 648964, 433458, 
        473982, 81785, 98720, 168945, 520238, 3127, 511821, 702424, 65407, 
        101516, 824661, 585764, 836377, 442649, 690597, 500534, 959700, 
        117599, 662650, 562916, 89803, 818856, 803400, 622335, 350794, 135974, 282290, 752710]

In [42]:
# FIT CLASSIFIERS USING UNDERSAMPLING AND BOOTSTRAPPING
# (Note: SMOTE oversampling is less appropriate for sparse data like text ngrams)

coef_svm = []
coef_rf = []
acc_svm = []
acc_rf = []
acc_oh = []

N_train = Y_input_train[Y_input_train == 1].sum()
N_test = Y_input_test[Y_input_test == 1].sum()

for seed in seeds:
    
    # balance training data
    df_temp_train = X_input_train.reset_index(drop=True)
    df_temp_train['outside_hire'] = Y_input_train.reset_index()['outside_hire']
    df_temp_train_outs = df_temp_train[df_temp_train.outside_hire == 1]
    df_temp_train_ins = df_temp_train[df_temp_train.outside_hire == 0].sample(N_train, random_state=seed)
    df_balanced_train = pd.concat([df_temp_train_outs, df_temp_train_ins])
    df_balanced_train = df_balanced_train.sample(frac=1, random_state=seed) 
    X_bal_train = df_balanced_train.drop(['outside_hire'], axis=1)
    Y_bal_train = df_balanced_train.outside_hire

    # balance testing set
    df_temp_test = X_input_test.reset_index(drop=True)
    df_temp_test['outside_hire'] = Y_input_test.reset_index()['outside_hire']
    df_temp_test_outs = df_temp_test[df_temp_test.outside_hire == 1]
    df_temp_test_ins = df_temp_test[df_temp_test.outside_hire == 0].sample(N_test, random_state=seed)
    df_balanced_test = pd.concat([df_temp_test_outs, df_temp_test_ins])
    df_balanced_test = df_balanced_test.sample(frac=1, random_state=seed) 
    X_bal_test = df_balanced_test.drop(['outside_hire'], axis=1)
    Y_bal_test = df_balanced_test.outside_hire    
    
    # SVM
    svc = SVC(kernel="linear", C=1, random_state=seed)
    svc.fit(X_bal_train, Y_bal_train)
    Y_pred_svc = svc.predict(X_bal_test)
    acc_svm.append(met.accuracy_score(Y_bal_test, Y_pred_svc))
    coef_svm.append(list(svc.coef_[0]))
    
    # RF
    rf = RandomForestClassifier(n_estimators=50, max_features='sqrt', max_depth=None, random_state=seed)
    rf.fit(X_bal_train, Y_bal_train)
    Y_pred_rf = rf.predict(X_bal_test)
    acc_rf.append(met.accuracy_score(Y_bal_test, Y_pred_rf))
    coef_rf.append(list(rf.feature_importances_))
    
    # ONE-HOT
    Y_pred_oh = X_bal_test.to_numpy().dot(oh_scores) 
    oh_median_calcs = X_bal_train.to_numpy().dot(oh_scores) 
    oh_median = np.median(oh_median_calcs)
    Y_pred_oh = np.where(Y_pred_oh >= oh_median, 1, 0)
    acc_oh.append(met.accuracy_score(Y_bal_test, Y_pred_oh.transpose()))

In [43]:
# ACCURACIES AND FEATURE IMPORTANCE SCORES

#accuracies
acc = pd.DataFrame({'svm': acc_svm})
acc['rand_forests'] = acc_rf
acc['one_hot'] = acc_oh

#feature importances
vocabulary = tfidf_sklearn.get_feature_names_out()
coef_svm_df = pd.DataFrame(coef_svm, columns=vocabulary)
coef_rf_df = pd.DataFrame(coef_rf, columns=vocabulary)
coef = pd.DataFrame({'words': vocabulary}, index=vocabulary)
coef['svm'] = coef_svm_df.mean(axis=0).tolist()
coef['rf_prelim'] = coef_rf_df.mean(axis=0).tolist()
coef['one_hot'] = oh_scores.tolist()

#edit RF feature importances: separate those helping predict outside hires vs those helping predict inside hires
tf_scores = X_train_tf[Y_train==1].mean(axis=0) - X_train_tf[Y_train==0].mean(axis=0)
coef['tf_scores'] = tf_scores.tolist()[0]
coef['tf_scores2'] = 0
coef.loc[coef.tf_scores < 0, 'tf_scores2'] = -1
coef.loc[coef.tf_scores > 0, 'tf_scores2'] = 1
coef['rand_forests'] = coef.rf_prelim * coef.tf_scores2

#combined SVM and RF score (first make distributions similar (0,1))
def sigmoid(num):
    return 1 / (1 + math.exp(-num))
coef['svm_transf'] = (coef.svm - coef.svm.mean()) / coef.svm.std()
coef['svm_transf'] = coef.svm_transf.apply(sigmoid)
coef['rf_transf'] = (coef.rand_forests - coef.rand_forests.mean()) / coef.rand_forests.std()
coef['rf_transf'] = coef.rf_transf.apply(sigmoid)
coef['svm_rf_combined'] = coef.svm_transf * coef.rf_transf

In [44]:
##### SAVE RESULTS #####

#acc_total = acc.copy()
#acc_pre1990 = acc.copy()
#acc_1990s = acc.copy()
#acc_post2000 = acc.copy()

#coef_total = coef.copy()
#coef_pre1990 = coef.copy()
#coef_1990s = coef.copy()
#coef_post2000 = coef.copy()

# Display Results - Total Period

In [45]:
# ACCURACY SCORES FOR EACH CLASSIFIER

acc_total.describe()

Unnamed: 0,svm,rand_forests,one_hot
count,100.0,100.0,100.0
mean,0.81,0.776731,0.7975
std,0.044519,0.044905,0.041223
min,0.711538,0.653846,0.692308
25%,0.783654,0.75,0.769231
50%,0.807692,0.769231,0.788462
75%,0.846154,0.807692,0.826923
max,0.903846,0.884615,0.903846


In [46]:
# CORRELATIONS BETWEEN FEATURE IMPORTANCE SCORES FOR DIFFERENT CLASSIFIERS

coef_total[['svm', 'rand_forests', 'one_hot']].corr()

Unnamed: 0,svm,rand_forests,one_hot
svm,1.0,0.687119,0.745268
rand_forests,0.687119,1.0,0.814791
one_hot,0.745268,0.814791,1.0


In [47]:
# TOP FEATURES FOR EACH CLASSIFIER (PREDICTING OUTSIDE CEO HIRES)

top_svm = list(coef_total[['words', 'svm']].sort_values(by='svm', ascending=False)[:19]['words'])
top_rf = list(coef_total[['words', 'rand_forests']].sort_values(by='rand_forests', ascending=False)[:19]['words'])
top_oh = list(coef_total[['words', 'one_hot']].sort_values(by='one_hot', ascending=False)[:19]['words'])
top_svmrf = list(coef_total[['words', 'svm_rf_combined']].sort_values(by='svm_rf_combined', ascending=False)[:19]['words'])

top_outs_total = pd.DataFrame(list(zip(top_svm, top_rf, top_oh, top_svmrf)), 
                  columns=['svm', 'rand_forests', 'one_hot', 'svm_rf_combined'])
top_outs_total

Unnamed: 0,svm,rand_forests,one_hot,svm_rf_combined
0,director,experience,experience,experience
1,resigned,search,search,search
2,experience,said,board,president and chief executive
3,president and chief executive,board,million,resigned
4,search,president and chief executive,stock,director
5,associates,million,appointment,million
6,million,resigned,job,appointment
7,responsible,appointment,industry,board
8,international,new,president and chief executive,said
9,selection,director,big,recently


In [48]:
# TOP FEATURES FOR EACH CLASSIFIER (PREDICTING INSIDE CEO HIRES)

top_svm = list(coef_total[['words', 'svm']].sort_values(by='svm')[:19]['words'])
top_rf = list(coef_total[['words', 'rand_forests']].sort_values(by='rand_forests')[:19]['words'])
top_oh = list(coef_total[['words', 'one_hot']].sort_values(by='one_hot')[:19]['words'])
top_svmrf = list(coef_total[['words', 'svm_rf_combined']].sort_values(by='svm_rf_combined')[:19]['words'])

top_ins_total = pd.DataFrame(list(zip(top_svm, top_rf, top_oh, top_svmrf)), 
                  columns=['svm', 'rand_forests', 'one_hot', 'svm_rf_combined'])
top_ins_total

Unnamed: 0,svm,rand_forests,one_hot,svm_rf_combined
0,officer,officer,officer,officer
1,vice,vice,chief executive officer,vice
2,operating,executive officer,executive officer,executive officer
3,succession,chief executive officer,vice,chief executive officer
4,continue,chief operating officer,elected,operating officer
5,operating officer,operating officer,vice president,chief operating officer
6,operations,chief operating,chief operating officer,operating
7,bank,vice president,operating officer,chief operating
8,chief operating officer,operating,chief operating,vice president
9,chief operating,named,president and chief operating,joined


# Display results - decade-specific

In [49]:
# TOP FEATURES FOR EACH CLASSIFIER (PREDICTING OUTSIDE CEO HIRES)

top_pre1990 = list(coef_pre1990[['words', 'svm_rf_combined']].sort_values(by='svm_rf_combined', ascending=False)[:19]['words'])
top_1990s = list(coef_1990s[['words', 'svm_rf_combined']].sort_values(by='svm_rf_combined', ascending=False)[:19]['words'])
top_post2000 = list(coef_post2000[['words', 'svm_rf_combined']].sort_values(by='svm_rf_combined', ascending=False)[:19]['words'])

top_outs_decade = pd.DataFrame(list(zip(top_pre1990, top_1990s, top_post2000)), 
                  columns=['pre1990', '1990s', 'post2000'])
top_outs_decade

Unnamed: 0,pre1990,1990s,post2000
0,director,search,experience
1,new,stock,took
2,international,million,said
3,president and chief executive,retailing,businesses
4,resigned,investors,recently
5,experience,package,think
6,industry,experience,board
7,selection,said,september
8,early,months,turnaround
9,associates,news,plans


# Display results - decade comparisons

In [51]:
comp = coef_1990s.svm_rf_combined - coef_pre1990.svm_rf_combined
comp_90s_pre90 = list(comp.sort_values(ascending=False)[:19].index)
comp_pre90_90s = list(comp.sort_values()[:19].index)

comp = coef_post2000.svm_rf_combined - coef_1990s.svm_rf_combined
comp_post00_90s = list(comp.sort_values(ascending=False)[:19].index)
comp_90s_post00 = list(comp.sort_values()[:19].index)

comp_decades = pd.DataFrame(list(zip(comp_pre90_90s, comp_90s_pre90, comp_90s_post00, comp_post00_90s)), 
                  columns=['pre1990 vs 1990s', '1990s vs pre1990', '1990s vs post2000', 'post2000 vs 1990s'])
comp_decades

Unnamed: 0,pre1990 vs 1990s,1990s vs pre1990,1990s vs post2000,post2000 vs 1990s
0,new president,retailing,investors,businesses
1,university,investors,hard,replace
2,associates,performance,times,recently
3,held,package,known,recent
4,jobs,hard,customer,turnaround
5,statement,customer,news,february
6,international,stock,manager,including
7,products,billion,firm,52
8,partner,electronics,months,hit
9,early,news,electronics,responsible
