In [3]:
import pandas as pd
import numpy as np

In [4]:
# read csv
data = pd.read_csv('../data/combined_bloomberg_mostly_raw.csv')
data = data.dropna()
data = data.drop(columns=['Unnamed: 0'])

In [5]:
# clean numeric values
data['Issuer Ticker'] = data['Issuer Ticker'].str.split(' ').str[0]
data = data.set_index('Issuer Ticker')
data['Underpriced'] = (data['Offer To 1st Close'] > 0).astype(int)

In [6]:
# collect all the sectors, industry groups, industries, and sub-industries
sectors = data["Industry Sector"].unique()
industry_groups = []
industries = data["Industry Group"].unique()
sub_industries = data["Industry Subgroup"].unique()

print("no sectors:", len(sectors))
print("no industy_groups:", len(industry_groups))
print("no industries:", len(industries))
print("no sub_industries:", len(sub_industries))

no sectors: 10
no industy_groups: 0
no industries: 64
no sub_industries: 265


In [7]:
# replace all fields with a numeric value
data["Industry Sector"] = data["Industry Sector"].apply(lambda sector: np.where(sectors == sector)[0][0]).astype('float')
data["Industry Group"] = data["Industry Group"].apply(lambda industry: np.where(industries == industry)[0][0]).astype('float')
data["Industry Subgroup"] = data["Industry Subgroup"].apply(lambda sub_industry: np.where(sub_industries == sub_industry)[0][0]).astype('float')
data.dtypes

Issuer Name                     object
Sales - 1 Yr Growth            float64
Profit Margin                  float64
Return on Assets               float64
Offer Size (M)                 float64
Shares Outstanding (M)         float64
Offer Price                    float64
Offer To 1st Close             float64
Market Cap at Offer (M)        float64
Trade Date (US)                 object
cusip                           object
Cash Flow per Share            float64
Offer Size (M).1               float64
Shares Outstanding (M).1       float64
Instit Owner (% Shares Out)    float64
Instit Owner (Shares Held)     float64
Filing Term Price Range         object
Priced Range                    object
Industry Sector                float64
Industry Group                 float64
Industry Subgroup              float64
Underpriced                      int64
dtype: object

In [9]:
# remove duplicate labels
data.drop(columns=['Offer Size (M).1', 'Shares Outstanding (M).1'])

Unnamed: 0_level_0,Issuer Name,Sales - 1 Yr Growth,Profit Margin,Return on Assets,Offer Size (M),Shares Outstanding (M),Offer Price,Offer To 1st Close,Market Cap at Offer (M),Trade Date (US),cusip,Cash Flow per Share,Instit Owner (% Shares Out),Instit Owner (Shares Held),Filing Term Price Range,Priced Range,Industry Sector,Industry Group,Industry Subgroup,Underpriced
Issuer Ticker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
0911013D,Specialty Laboratories Inc,12.6,-9.6,-15.9,80.0000,20.1875,16.00,45.703125,323.000,12/8/2000,84749R100,-0.300000,0.001527,365.0,14 - 16,Priced Within Range,0.0,0.0,0.0,1
1121454D,StorageNetworks Inc,-23.2,-80.6,-11.6,243.0000,88.2610,27.00,234.259262,2383.050,6/29/2000,8.62E+107,-0.005532,0.006602,6567.0,23 - 25,Priced Above Range,1.0,1.0,1.0,1
1240716D,Targa Energy LP,69.4,-2.9,-0.8,82.8000,21.1000,23.00,0.000000,485.300,7/21/2006,04930A104,0.700000,0.007076,3681.0,23 - 25,Priced Within Range,2.0,2.0,2.0,0
1448479D,Campus Crest Communities Inc,15.9,-152.4,-3.1,407.2880,28.4283,12.50,0.080000,355.350,10/14/2010,13466Y105,1.000000,2.999850,1942600.0,12.50 - 14.50,Priced Within Range,3.0,3.0,3.0,1
1556442D,GigPeak Inc,45.4,3.8,2.1,41.7000,16.4847,6.95,-13.525178,114.570,7/23/2004,37518Q109,0.055422,0.004543,3073.0,6.50 - 7.50,Priced Within Range,1.0,4.0,4.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZSANQ,Zosano Pharma Corp,250.4,-3812.1,-119.7,50.7100,11.9261,11.00,0.090909,131.187,1/27/2015,98979H301,-8.600000,0.505746,24793.0,12-Oct,Priced Within Range,0.0,13.0,15.0,1
ZTS,Zoetis Inc,16.4,26.2,15.1,2574.3900,99.0150,26.00,19.269230,2574.390,2/1/2013,98978V103,4.600000,102.747000,478873000.0,22 - 25,Priced Above Range,0.0,13.0,15.0,1
ZUMZ,Zumiez Inc,19.5,10.0,7.9,64.6875,13.1803,18.00,38.166668,237.250,5/6/2005,989817101,5.500000,99.548100,19382400.0,15 - 17,Priced Above Range,5.0,17.0,92.0,1
ZUO,Zuora Inc,13.5,-28.6,-20.6,177.1000,12.6500,14.00,42.857143,1472.220,4/12/2018,98983V106,0.100000,79.494300,98413900.0,13-Nov,Priced Above Range,1.0,5.0,82.0,1


In [10]:
# add values to a csv
data.to_csv('../data/clean_bloomberg_with_sectors.csv')