## Class03

In [1]:
import pandas as pd
import numpy as np

### Read data and drop duplicates

In [2]:
# Directory holding the course data files (absolute path on the author's machine).
file_path = '/Users/ml/Google Drive/af/teaching/database/data/'

# Load the tab-separated IBES summary file, lower-case the column names, and
# keep only the first row for each (ticker, statpers) pair.
ibes_raw = (
    pd.read_csv(file_path + 'ibes_1976_1990_summ_both.txt', sep='\t', low_memory=False)
    .rename(columns=str.lower)
    .drop_duplicates(subset=['ticker', 'statpers'])
    .reset_index(drop=True)
)

In [3]:
# Preview: first 5 rows, first 13 columns.
ibes_raw.head(5).iloc[:, :13]

Unnamed: 0,ticker,cusip,oftic,cname,statpers,measure,fiscalp,fpi,estflag,curcode,numest,numup,numdown
0,A,2742010,A,AMERN MEDIC BLDG,19831020,EPS,ANN,1,P,USD,1,0,0
1,A,2742010,A,AMERN MEDIC BLDG,19831117,EPS,ANN,1,P,USD,1,0,0
2,A,2742010,A,AMERN MEDIC BLDG,19831215,EPS,ANN,1,P,USD,1,0,0
3,A,2742010,A,AMERN MEDIC BLDG,19860116,EPS,ANN,1,P,USD,1,0,0
4,A,2742010,A,AMERN MEDIC BLDG,19860220,EPS,ANN,1,P,USD,1,0,0


### Keep US firms
This file contains both US and non-US firms. The **usfirm** column distinguishes them: a firm is a US firm if **usfirm** equals 1 and a non-US (global) firm if **usfirm** equals 0.

In [4]:
# Number of non-missing ticker entries for each value of usfirm.
ibes_raw[['usfirm', 'ticker']].groupby('usfirm').count()

Unnamed: 0_level_0,ticker
usfirm,Unnamed: 1_level_1
0,228834
1,453924


In [5]:
# Keep only rows flagged as US firms (usfirm == 1). The copy makes ibes_us an
# independent frame, so columns added to it later do not affect ibes_raw.
ibes_us = ibes_raw.loc[ibes_raw['usfirm'].eq(1)].copy()

# Row count of the US-only sample.
ibes_us.shape[0]

453924

### Sample selection: keep firms with at least 60 months of numest

In [6]:
# Per-ticker count of non-missing numest values, broadcast back onto every row.
ibes_us['n_numest'] = ibes_us.groupby('ticker')['numest'].transform('count')

# Keep tickers with at least 60 such observations (the threshold named in the
# section heading), then order by ticker and statpers and renumber the rows.
ibes_us_1 = (
    ibes_us.loc[ibes_us['n_numest'].ge(60)]
    .sort_values(by=['ticker', 'statpers'])
    .reset_index(drop=True)
)

# Row count of the filtered sample.
ibes_us_1.shape[0]

344326

### Check number of unique firms

In [7]:
# Number of distinct tickers in ibes_us. dropna=False counts a missing ticker
# (if any) as one value, matching what len(Series.unique()) reports.
# NOTE(review): this counts firms in ibes_us, i.e. before the 60-observation
# filter; switch to ibes_us_1 if the filtered sample was intended -- confirm.
ibes_us['ticker'].nunique(dropna=False)

6951

### Basic summary statistics

In [8]:
ibes_us_1[['numest','meanest','stdev']].describe()

Unnamed: 0,numest,meanest,stdev
count,344326.0,344324.0,285761.0
mean,7.994543,167450.7,55512.47
std,7.634961,19224010.0,4449504.0
min,1.0,-988235300.0,0.0
25%,2.0,0.35,0.02
50%,5.0,0.89,0.06
75%,12.0,1.82,0.16
max,52.0,2861765000.0,817647100.0


In [9]:
# statpers appears to be a YYYYMMDD integer (e.g. 19831020 in the preview above),
# so dividing by 10000 and dropping the remainder leaves the calendar year.
ibes_us_1['year'] = (ibes_us_1['statpers'] // 10000).astype(int)

# Year-by-year summary statistics; transposed so years run across the columns.
(
    ibes_us_1
    .groupby('year')[['numest', 'meanest', 'stdev']]
    .agg(['mean', 'median', 'std', 'min', 'max'])
    .transpose()
)

Unnamed: 0,year,1976,1977,1978,1979,1980,1981,1982,1983,1984,1985,1986,1987,1988,1989,1990
numest,mean,5.692249,5.553678,5.795983,6.074493,6.324035,7.092612,7.719291,7.655167,7.715934,8.418741,8.942773,9.299843,9.70181,10.28249,10.33535
numest,median,4.0,3.0,3.0,4.0,4.0,5.0,5.0,5.0,5.0,5.0,6.0,6.0,7.0,7.0,7.0
numest,std,5.465563,5.440635,5.693274,5.721052,5.568066,6.070099,6.889808,7.023749,7.38533,8.110911,8.399494,8.514879,8.877915,9.26138,9.135582
numest,min,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
numest,max,29.0,28.0,28.0,28.0,28.0,31.0,34.0,36.0,39.0,44.0,52.0,44.0,48.0,51.0,50.0
meanest,mean,15.917728,14.478176,27.672631,30.469365,32.559812,31.528427,36.560658,35.041802,31.492565,92518.36,952834.9,853529.9,211412.1,-60422.36,21772.69
meanest,median,0.85,0.98,1.0,1.07,1.06,1.09,0.95,0.84,0.9,0.82,0.74,0.77,0.86,0.87,0.81
meanest,std,387.591476,381.469765,638.814774,706.432692,779.285898,781.877931,845.769494,807.799582,1014.431228,15906100.0,47687920.0,41057450.0,14697460.0,10763430.0,3591394.0
meanest,min,-4.33,-3.47,-100.0,-12.0,-87.91,-2330.0,-500.0,-5495.0,-18256.0,-22776.0,-4308.0,-540.0,-197058700.0,-988235300.0,-79411760.0
meanest,max,15282.35,19411.75,22323.52,23523.52,25665.97,27935.28,32929.39,29399.98,64411.76,2735294000.0,2861765000.0,2479412000.0,1214706000.0,500000000.0,352940900.0


### Percentile

In [10]:
# Add columns p10, p20, ..., p90: for each row, the given percentile of meanest
# among all rows in the same year.
meanest_by_year = ibes_us_1.groupby('year')['meanest']
for pct in range(10, 91, 10):
    # Bind the quantile level as a default argument so the callable does not
    # depend on the loop variable after this iteration.
    ibes_us_1[f'p{pct}'] = meanest_by_year.transform(
        lambda grp, level=pct / 100: grp.quantile(level)
    )

### Correlation

In [11]:
ibes_us_1[['numest','meanest','stdev']].corr()

Unnamed: 0,numest,meanest,stdev
numest,1.0,-0.002907,-0.006359
meanest,-0.002907,1.0,0.515212
stdev,-0.006359,0.515212,1.0
