In [169]:
import datetime
import string
from collections import Counter
from functools import lru_cache

import numpy as np
import pandas as pd
# Visualisations
import matplotlib.pyplot as plt
# Exchange rates
from forex_python.converter import CurrencyRates
# NLP
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Suppress warning
pd.options.mode.chained_assignment = None

# Load Data

In [63]:
# Read the four raw CSV exports; each one stores its row index in the 'Unnamed: 0' column
csv_paths = [
    '../raw_data/enterprise_value_tidy.csv',
    '../raw_data/company_info_fmp.csv',
    '../raw_data/ROIC.csv',
    '../raw_data/company_IS.csv',
]
ev, df, roic, financials = (pd.read_csv(path, index_col='Unnamed: 0') for path in csv_paths)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


# Tidy 

In [64]:
# Combine ROIC figures with the income-statement rows that share the same symbol and date
financials = pd.merge(roic, financials, how='inner', on=['symbol', 'date'])

## Feature Engineering

### Growth Rate

In [65]:
# Keep only companies that appear in at least 5 rows
symbol_counts = financials['symbol'].value_counts()
has_enough_history = financials['symbol'].map(symbol_counts) >= 5
financials = financials.loc[has_enough_history]

In [66]:
# Calculate growth rate
# Take the revenue found four rows further down *within the same company*.
# Shifting the whole frame would pair the last rows of one company with the
# first rows of the next one and produce meaningless growth figures.
# NOTE(review): assumes rows are ordered newest-first within each symbol, one row per period — confirm.
financials['old_revenue'] = financials.groupby('symbol')['revenue'].shift(-4)

# Compound growth rate over the four periods separating the two revenue figures
financials['growth_rate'] = (financials['revenue'] / financials['old_revenue'])**0.25 - 1

In [67]:
# Keep only the rows dated within calendar year 2019
financials['date'] = pd.to_datetime(financials['date'])

in_2019 = (financials['date'] >= '2019-01-01') & (financials['date'] < '2020-01-01')
financials = financials[in_2019]

### EBITDA Margin

In [68]:
# EBITDA margin = EBITDA as a fraction of revenue
financials['ebitda_margin'] = financials['ebitda'].div(financials['revenue'])

In [69]:
# The helper column is no longer needed once the growth rate has been computed
financials = financials.drop(columns='old_revenue')

## Filter Continuous Variables

### Company Size

In [70]:
# Keep companies whose revenue exceeds 10 million
is_large_enough = financials['revenue'].gt(10_000_000)
financials = financials.loc[is_large_enough]
print(f'{len(financials)} rows remaining')

10066 rows remaining


### Absurd Values

In [71]:
# Keep companies whose growth rate is below 2 (rows without a growth rate are dropped as well)
has_plausible_growth = financials['growth_rate'].lt(2)
financials = financials.loc[has_plausible_growth]
print(f'{len(financials)} rows remaining')

9777 rows remaining


In [72]:
# Keep companies with an ebitda margin of at least 0.01 (drops negative and near-zero margins)
has_positive_margin = financials['ebitda_margin'].ge(0.01)
financials = financials.loc[has_positive_margin]
print(f'{len(financials)} rows remaining')

7355 rows remaining


In [73]:
# Keep companies whose ebitda margin does not exceed 1
has_sane_margin = financials['ebitda_margin'].le(1)
financials = financials.loc[has_sane_margin]
print(f'{len(financials)} rows remaining')

7193 rows remaining


In [74]:
# Join enterprise values, company profiles and 2019 financials on the ticker symbol,
# then keep only the columns used downstream
selected_columns = [
    'symbol', 'enterpriseValue', 'sector', 'currency',
    'country', 'description', 'returnOnCapitalEmployed',
    'revenue', 'grossProfit', 'ebitda', 'growth_rate', 'ebitda_margin',
]
ev_info = ev.merge(df, on='symbol', how='inner')
ev_info = ev_info.merge(financials, on='symbol', how='inner')
ev_info = ev_info[selected_columns]

In [75]:
# Keep companies with an enterprise value below 200 billion
below_size_cap = ev_info['enterpriseValue'].lt(200_000_000_000)
ev_info = ev_info.loc[below_size_cap]
print(f'{len(ev_info)} rows remaining')

6937 rows remaining


In [76]:
# Keep companies with a strictly positive enterprise value
has_positive_ev = ev_info['enterpriseValue'].gt(0)
ev_info = ev_info.loc[has_positive_ev]
print(f'{len(ev_info)} rows remaining')

5849 rows remaining


## Filter Categorical Variables

### Uncommon Values

In [77]:
# Keep rows whose country occurs more than 20 times
countries = ev_info['country'].value_counts()
is_common_country = ev_info['country'].map(countries) > 20
ev_info = ev_info.loc[is_common_country]
print(f'{len(ev_info)} rows remaining')

5680 rows remaining


In [78]:
# Keep rows whose currency occurs more than 20 times
currencies = ev_info['currency'].value_counts()
is_common_currency = ev_info['currency'].map(currencies) > 20
ev_info = ev_info.loc[is_common_currency]
print(f'{len(ev_info)} rows remaining')

5667 rows remaining


### Sectors

In [79]:
# Keep rows whose sector occurs at least 20 times
sectors = ev_info['sector'].value_counts()
common_sectors = sectors.loc[lambda counts: counts >= 20].index
ev_info = ev_info.loc[ev_info['sector'].isin(common_sectors)]
print(f'{len(ev_info)} rows remaining')

5469 rows remaining


In [80]:
# Exclude companies classified as financial, banking or real estate
financial_sectors = ['Financial Services', 'Banking', 'Real Estate']
is_financial = ev_info['sector'].isin(financial_sectors)
ev_info = ev_info.loc[~is_financial]
print(f'{len(ev_info)} rows remaining')

4784 rows remaining


In [81]:
# Discard rows lacking a sector or a description, then remove exact duplicate rows
ev_info = ev_info.dropna(subset=['sector', 'description'])
ev_info = ev_info.drop_duplicates()
print(f'{len(ev_info)} rows remaining')

4695 rows remaining


In [82]:
# Relabel every 'GBp' value in the frame as 'GBP'
# NOTE(review): only the label changes here, the monetary columns are not rescaled — confirm
# whether 'GBp' rows are quoted in pence. This also runs after the currency frequency filter,
# so 'GBp' and 'GBP' were counted separately there.
ev_info = ev_info.replace('GBp', 'GBP')

In [83]:
# Collapse alternative spellings of the same sector; unlisted sectors are left untouched
# NOTE(review): this runs after the sector frequency filter, so the variants were counted separately there.
sector_map = {'Health Care': 'Healthcare', 'Energy ': 'Energy', 'Building': 'Construction'}
ev_info['sector'] = ev_info['sector'].replace(sector_map)

### Missing Values

In [84]:
# Keep only the rows in which every column has a value
is_complete_row = ev_info.notna().all(axis=1)
ev_info = ev_info[is_complete_row]
print(f'{len(ev_info)} rows remaining')

4693 rows remaining


## Conversion

### Units

In [85]:
# Express the monetary columns in millions
money_columns = ['enterpriseValue', 'revenue', 'grossProfit', 'ebitda']
ev_info[money_columns] = ev_info[money_columns] / 1_000_000

### Exchange Rates

In [86]:
@lru_cache(maxsize=None)
def get_rate(foreign_currency, home_currency='USD', date_obj=datetime.datetime(2019, 12, 31)):
    """Return the exchange rate from `foreign_currency` to `home_currency`.

    Results are memoised per (foreign_currency, home_currency, date_obj), so
    calling this once per DataFrame row only performs one forex_python lookup
    per distinct currency instead of one per row.

    Parameters
    ----------
    foreign_currency : str
        Currency code to convert from.
    home_currency : str, default 'USD'
        Currency code to convert to.
    date_obj : datetime.datetime, default 2019-12-31
        Date of the rate, passed through to `CurrencyRates.get_rate`.

    Returns
    -------
    The rate returned by `CurrencyRates.get_rate` for that date.
    """
    return CurrencyRates().get_rate(foreign_currency, home_currency, date_obj=date_obj)

In [87]:
# Look each distinct currency up once and map the result onto the rows,
# rather than requesting a rate separately for every row.
rate_by_currency = {currency: get_rate(currency) for currency in ev_info['currency'].unique()}
ev_info['exchange_rate'] = ev_info['currency'].map(rate_by_currency)

In [88]:
# Multiply every monetary column by the row's exchange rate
columns = ['enterpriseValue', 'revenue', 'grossProfit', 'ebitda']
ev_info[columns] = ev_info[columns].mul(ev_info['exchange_rate'], axis=0)

In [89]:
# Currency information is no longer needed after the conversion
ev_info = ev_info.drop(columns=['currency', 'exchange_rate'])

# NLP Preprocessing

## Stop Words

In [None]:
# Create a list out of all words
# NOTE(review): no visible cell adds a 'clean_info' column to `ev_info`, so fall back
# to the raw descriptions when it is absent instead of failing with a KeyError.
text_column = 'clean_info' if 'clean_info' in ev_info.columns else 'description'

words = []
# Iterate over every row (the previous `[0:-1]` slice silently skipped the last one)
for info in ev_info[text_column]:
    # A string has to be split into words first; iterating it directly yields characters
    tokens = info.lower().split() if isinstance(info, str) else info
    words.extend(tokens)

In [144]:
# Count which words are the most common
word_counts = Counter(words)

# Print top 200 most common words with their counts, most frequent first
# (the earlier hand-rolled counter skipped the single most common word)
for w, count in word_counts.most_common(200):
    print(w, count)

In [172]:
# English stop words from NLTK, extended with words that are frequent in
# company descriptions but carry no information about the business itself
stop_words = set(stopwords.words('english')) | {
    'company', 'provides', 'offers', 'operates', 'well', 'segment', 'also', 'limited',
    'headquartered', 'founded', 'inc', 'management', 'sells', 'including', 'united',
    'segments', 'states', 'markets', 'various', 'engages', 'addition',
    'based', 'name', 'business', 'customers', 'formerly', 'known', 'corporation',
    'subsidiaries', 'group', 'changed', 'develops', 'approximately', 'primarily',
    'related', 'care', 'used', 'use', 'include', 'serves', 'incorporated', 'holdings',
    'together', 'companys', 'distributes', 'comprising', 'produces', 'support', 'two',
    'companies', 'sales', 'operations', 'ltd', 'involved', 'industry', 'subsidiary', 'owns',
    'sale', 'three', 'range', 'holding', 'businesses', 'firm', 'product', 'plc',
    'located', 'names',
}

## Language Cleaning

In [161]:
def cleaner(info):
    """Normalise a free-text description into a space-separated string of lemmas.

    Steps, in order: strip every character in `string.punctuation`, lower-case,
    drop digit characters, tokenize with `word_tokenize`, remove tokens found
    in the notebook-level `stop_words` set, lemmatize with `WordNetLemmatizer`.

    Parameters
    ----------
    info : object
        The text to clean; converted with `str()` first. `None` and NaN are
        treated as missing text.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces ('' for missing text).
    """
    # Missing descriptions would otherwise be turned into the literal tokens 'none' / 'nan'
    if info is None or (isinstance(info, float) and info != info):
        return ''
    # Remove punctuation in a single pass
    text = str(info).translate(str.maketrans('', '', string.punctuation))
    # Lower case
    text = text.lower()
    # Remove numbers
    text = ''.join(char for char in text if not char.isdigit())
    # Remove stop words; reuse the notebook-level `stop_words` set instead of
    # rebuilding the NLTK list and the custom additions on every call
    tokens = [token for token in word_tokenize(text) if token not in stop_words]
    # Lemmatize
    lemmatizer = WordNetLemmatizer()
    return ' '.join(lemmatizer.lemmatize(token) for token in tokens)

## Vectorizing

In [170]:
def vectorize(df, vectorizer='tfidf', context=2, max_df=0.85, min_df=0.05):
    """Replace the `description` column of `df` with bag-of-words features.

    The descriptions are cleaned with `cleaner`, vectorized, and the resulting
    term columns are joined back onto the remaining columns by index. The
    input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a `description` column.
    vectorizer : {'tfidf', 'count'} or vectorizer instance, default 'tfidf'
        Which vectorizer to use. An object that is not a string is used as-is
        and must provide `fit_transform` and a feature-name accessor.
    context : int, default 2
        Upper bound of the n-gram range; n-grams from 1 to `context` are used.
    max_df, min_df : float or int
        Passed through to the vectorizer.

    Returns
    -------
    pandas.DataFrame
        `df` without `description`, plus one column per extracted term.

    Raises
    ------
    ValueError
        If `vectorizer` is a string other than 'tfidf' or 'count'.
    """
    df_copy = df.copy()
    # Clean language columns
    df_copy['clean_info'] = df_copy['description'].apply(cleaner)

    # Vectorize
    if isinstance(vectorizer, str):
        vectorizer_classes = {'count': CountVectorizer, 'tfidf': TfidfVectorizer}
        if vectorizer not in vectorizer_classes:
            raise ValueError(
                f"Unknown vectorizer {vectorizer!r}; expected one of {sorted(vectorizer_classes)}"
            )
        model = vectorizer_classes[vectorizer](ngram_range=(1, context), max_df=max_df, min_df=min_df)
    else:
        model = vectorizer
    X = model.fit_transform(df_copy['clean_info'])

    # `get_feature_names` was removed from recent scikit-learn releases in favour of
    # `get_feature_names_out`; support both so the notebook runs on either.
    if hasattr(model, 'get_feature_names_out'):
        feature_names = model.get_feature_names_out()
    else:
        feature_names = model.get_feature_names()

    # Convert back to df
    vect_df = pd.DataFrame(X.toarray(), columns=feature_names, index=df_copy.index)
    # Drop the text columns before joining, so a term named 'description'
    # cannot rename them and break the drop.
    # NOTE(review): terms that equal a remaining column name (e.g. 'sector', 'country')
    # still get pandas' '_x'/'_y' suffixes, as before — confirm downstream code expects that.
    base_df = df_copy.drop(columns=['description', 'clean_info'])
    merged_df = base_df.merge(vect_df, left_index=True, right_index=True, how='left')
    return merged_df

In [171]:
# `nlp_preprocessing` is not defined in any visible cell, so call `vectorize`,
# the preprocessing function this notebook does define, with its default settings.
clean_df = vectorize(ev_info)
clean_df.head()

Unnamed: 0,symbol,enterpriseValue,sector_x,country_x,returnOnCapitalEmployed,revenue,grossProfit,ebitda,growth_rate,ebitda_margin,...,transmission,transport,transportation,treatment,unit,utility,vehicle,water,wholesale,worldwide
0,0001.HK,21597.256799,Industrials,HK,0.051154,38402.729002,21451.105507,12705.530323,0.157184,0.33085,...,0.0,0.0,0.075711,0.0,0.0,0.0,0.0,0.31099,0.0,0.065747
1,0002.HK,5136.485887,Utilities,HK,0.047146,10908.819796,3330.785385,2378.104992,0.023875,0.217998,...,0.249487,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0003.HK,4062.479373,Utilities,HK,0.091021,5217.793781,2417.570978,1608.588899,0.082469,0.308289,...,0.0,0.0,0.0,0.149108,0.0,0.075648,0.0,0.133238,0.0,0.0
4,0008.HK,6248.674105,Communication Services,HK,0.046134,4818.754518,2390.562528,1516.223337,-0.011602,0.31465,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0019.HK,4510.26998,Industrials,HK,0.036114,11000.132246,4086.713774,2147.961657,0.089073,0.195267,...,0.0,0.0,0.0,0.0,0.0,0.0,0.097778,0.0,0.0,0.0
