# Explore split points

In [1]:
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Make the sibling ../modules directory importable so `common` can be found.
sys.path.append("../modules")
# NOTE(review): the wildcard import hides where later names come from
# (GC_Data_Processing and Aggregate_Transform are presumably defined in
# `common` — confirm and consider importing them explicitly).
from common import *

In [3]:
# Project and bucket identifiers handed to the helper classes below, and the
# bucket directory the input file is read from.
project = 'graydon-moving-indicator'
bucket_name = 'graydon-data'
dir_input_data = '03_aggregated'
# GC_Data_Processing / Aggregate_Transform are not defined in this notebook;
# presumably they come from the wildcard import of `common` — confirm.
gc_data = GC_Data_Processing(project, bucket_name)
# NOTE(review): dir_output_data is not assigned in any visible cell. Unless
# `common` exports it through the wildcard import, this line raises NameError
# on a fresh kernel — confirm and define it here if needed.
aggregate_transform = Aggregate_Transform(project, bucket_name, dir_input_data, dir_output_data)

## Reading data

In [4]:
# Path of the aggregated 2016 CSV inside the bucket: <bucket>/<input dir>/<file>.
file_data = "/".join((bucket_name, dir_input_data)) + "/2016_aggregated.csv"

In [5]:
# File-system style handle obtained from the project helper; it is used below
# through its .open() method to read the CSV. get_gc_fs() is defined outside
# this notebook — presumably it wraps a Google Cloud Storage client; confirm.
bucket_fs = gc_data.get_gc_fs()

In [16]:
# Load the aggregated CSV from the bucket into a DataFrame.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk; chunked inference can leave columns with mixed
# types and emits a DtypeWarning for them.
with bucket_fs.open(file_data) as f:
    df = pd.read_csv(f, low_memory=False)

  interactivity=interactivity, compiler=compiler, result=result)


# Companies with reported revenue

## Marking companies that report revenue

In [17]:
# 1 where a revenue figure is present, 0 where it is missing.
# Series.notna() replaces the `np.isnan(...) == False` comparison: it gives the
# same result on float columns and does not raise on non-float dtypes.
df['has_revenue'] = df['mean_amt_revenue'].notna().astype(int)

## Revenue

In [18]:
# Row counts by revenue flag (rows) and next-year relocation (columns).
pd.crosstab(index=df['has_revenue'], columns=df['has_relocated_next_year'])

has_relocated_next_year,False,True
has_revenue,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1740630,33899
1,77447,864


In [19]:
# Same table as a share of each revenue-flag row (every row sums to 1).
pd.crosstab(
    index=df['has_revenue'],
    columns=df['has_relocated_next_year'],
    normalize='index',
)

has_relocated_next_year,False,True
has_revenue,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.980897,0.019103
1,0.988967,0.011033


## Operating result

In [64]:
# 1 where an operating result is present, 0 where it is missing.
# Series.notna() replaces the `np.isnan(...) == False` comparison: it gives the
# same result on float columns and does not raise on non-float dtypes.
df['has_operating_result'] = df['mean_amt_operating_result'].notna().astype(int)

In [65]:
# Row counts by revenue flag (rows) and operating-result flag (columns).
pd.crosstab(index=df['has_revenue'], columns=df['has_operating_result'])

has_operating_result,0,1
has_revenue,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1758950,89673
1,26618,54861


## Consolidated revenue

In [58]:
# 1 where a consolidated revenue figure is present, 0 where it is missing.
# Series.notna() replaces the `np.isnan(...) == False` comparison: it gives the
# same result on float columns and does not raise on non-float dtypes.
df['has_consolidated_revenue'] = df['mean_amt_consolidated_revenue'].notna().astype(int)

In [59]:
# Row counts by revenue flag (rows) and consolidated-revenue flag (columns).
pd.crosstab(index=df['has_revenue'], columns=df['has_consolidated_revenue'])

has_consolidated_revenue,0,1
has_revenue,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1821920,26703
1,72670,8809


## Consolidated operating result

In [61]:
# 1 where a consolidated operating result is present, 0 where it is missing.
# Series.notna() replaces the `np.isnan(...) == False` comparison: it gives the
# same result on float columns and does not raise on non-float dtypes.
df['has_consolidated_operating_result'] = (
    df['mean_amt_consolidated_operating_result'].notna().astype(int)
)

In [62]:
# Row counts by consolidated-operating-result flag (rows) and
# consolidated-revenue flag (columns).
pd.crosstab(
    index=df['has_consolidated_operating_result'],
    columns=df['has_consolidated_revenue'],
)

has_consolidated_revenue,0,1
has_consolidated_operating_result,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1879592,7223
1,14998,28289


In [28]:
# Row counts by employee-count quintile (rows) and revenue flag (columns).
# The quintiles are computed here rather than read from df.ntile_qty_employees:
# in the visible notebook that column is only assigned in a later cell, so
# reading it at this point fails when the notebook is run top to bottom.
# The rename keeps the row-axis label identical to the column name.
pd.crosstab(
    pd.qcut(df.mean_qty_employees[df.mean_qty_employees > 1], 5).rename('ntile_qty_employees'),
    df.has_revenue,
)

has_revenue,0,1
ntile_qty_employees,Unnamed: 1_level_1,Unnamed: 2_level_1
"(1.082, 2.0]",193973,5231
"(2.0, 3.0]",60782,3018
"(3.0, 6.0]",79291,6118
"(6.0, 25.0]",82283,11839
"(25.0, 913374.0]",74746,34423


# Companies and number of employees

## n-Tiles

In [37]:
# Split the employee counts greater than 1 into five equal-frequency buckets.
# Rows outside that subset receive NaN through index alignment on assignment.
df['ntile_qty_employees'] = pd.qcut(
    df.loc[df['mean_qty_employees'] > 1, 'mean_qty_employees'],
    q=5,
)

In [70]:
# Boolean flag: True where the mean employee count is exactly 1.
df['has_1_employee'] = df['mean_qty_employees'].eq(1)

## Bins

In [47]:
# Largest mean employee count; it is used as the upper edge of the bins below.
df.mean_qty_employees.max()

913374.0

In [55]:
# Bin the mean employee counts on fixed edges; the last edge is the observed
# maximum, so the top bin is closed on the largest value.
# NOTE(review): bins are right-closed by default, so a value of exactly 0 falls
# outside (0, 1] and becomes NaN — confirm that is intended.
df['bin_qty_employees'] = pd.cut(
    x=df['mean_qty_employees'],
    bins=[0, 1, 3, 6, 25, 100, 250, df['mean_qty_employees'].max()],
)

In [56]:
# Row counts by employee-count bin (rows) and revenue flag (columns).
pd.crosstab(index=df['bin_qty_employees'], columns=df['has_revenue'])

has_revenue,0,1
bin_qty_employees,Unnamed: 1_level_1,Unnamed: 2_level_1
"(0.0, 1.0]",1064365,19078
"(1.0, 3.0]",254755,8249
"(3.0, 6.0]",79291,6118
"(6.0, 25.0]",82283,11839
"(25.0, 100.0]",31132,10931
"(100.0, 250.0]",10375,6268
"(250.0, 913374.0]",33239,17224


In [63]:
# Column labels of the frame, as a plain Python list.
df.columns.tolist()

['Unnamed: 0',
 'id_branch',
 'id_company',
 'date_month',
 'code_sbi_1',
 'has_relocated',
 'has_name_change',
 'delta_qty_employees',
 'delta_qty_issued_credit_reports',
 'delta_score_payment_assessment',
 'code_legal_form_has_changed',
 'SBI_has_changed',
 'company_age',
 'years_since_last_amt_consolidated_operating_result',
 'years_since_last_amt_consolidated_revenue',
 'years_since_last_amt_operating_result',
 'years_since_last_qty_employees',
 'years_since_last_amt_revenue',
 'years_in_current_location',
 'ratio_operating_result_consolidated_operating_result',
 'ratio_revenue_consolidated_revenue',
 'unique_id',
 'qty_green_flags',
 'qty_orange_flags',
 'qty_red_flags',
 'A',
 'AA',
 'AAA',
 'B',
 'BB',
 'BBB',
 'C',
 'CC',
 'CCC',
 'D',
 'NR',
 'code_legal_form_group_1',
 'code_legal_form_group_2',
 'SBI_group_1',
 'SBI_group_2',
 'is_discontinued_any',
 'has_financial_calamity',
 'mean_amt_consolidated_operating_result',
 'mean_amt_consolidated_revenue',
 'mean_amt_operating_re

## Combination

In [69]:
# Number of non-null id_branch values for each combination of the revenue and
# consolidated-revenue flags.
df.groupby(by=['has_revenue', 'has_consolidated_revenue'])['id_branch'].count()

has_revenue  has_consolidated_revenue
0            0                           1821920
             1                             26703
1            0                             72670
             1                              8809
Name: id_branch, dtype: int64