<a id='top'></a>


<a href=#inspect_data>Inspect Data</a>  

<a href=#logreg>Logistic Regression</a>

<a href=#subset>Subset Data</a>

<a href=#summary>Summary Statistics</a>

<a href=#subsetII>Subset for Summary Stats</a>

<a href=#correlation>Correlation</a>

<a href=#variance>Variance</a>

<a href=#ProbMoves>Probability that Company Moves</a>

<a href=#bottom>Bottom</a>


In [1]:
6794.92 - 2265.81

4529.110000000001

In [2]:
# Ignore 'dask' warning

import pandas as pd
import gcsfs
from google.cloud import storage
from pandas import DataFrame
from IPython.display import HTML
from google.cloud.storage import Blob
import datalab.storage as gcs_datalab
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.model_selection import train_test_split

from sklearn import preprocessing
import matplotlib.pyplot as plt 
import seaborn as sns
from scipy import stats

In [3]:
# Setting up constants. All required
project = 'graydon-moving-indicator'
bucket_name = 'graydon-data'

In [4]:
# Initializing the GCS handles used by the reader function below.
# `fs` (gcsfs) opens blobs as file objects; `bucket` (google-cloud-storage) lists them.
# The project id comes from the `project` constant so it is configured in one place only.
fs = gcsfs.GCSFileSystem(project=project)
gcs = storage.Client()
bucket = gcs.get_bucket(bucket_name)

In [5]:
selected_columns = ['date_month', 'id_company', 'id_branch',
       'is_discontinued',
       'financial_calamity_outcome',
       'qty_employees', 'year_qty_employees', 'id_company_creditproxy',
       'score_payment_assessment', 'amt_revenue',
       'year_revenue', 'amt_consolidated_revenue', 'year_consolidated_revenue',
       'amt_consolidated_operating_result',
       'year_consolidated_operating_result', 
       'perc_credit_limit_adjustment', 'color_credit_status', 'rat_pd',
       'score_pd','has_increased_risk',
       'is_sole_proprietor', 'code_SBI_2', 'code_SBI_1',
       'qty_address_mutations_total',
       'qty_address_mutations_month', 
       'has_relocated',
       'has_name_change',  'Vice President'
       ]

In [6]:
selected_columns_small = ['date_month', 'id_company', 'id_branch', 'date_established',
       'is_discontinued',
       'financial_calamity_outcome',
       'qty_employees', 'year_qty_employees', 'id_company_creditproxy',
       'score_payment_assessment', 'amt_revenue',
       'year_revenue', 'amt_consolidated_revenue', 'year_consolidated_revenue',
       'perc_credit_limit_adjustment', 'color_credit_status', 'rat_pd',
       'score_pd','has_increased_risk',
       'is_sole_proprietor', 'code_SBI_2', 'code_SBI_1', 'qty_address_mutations_total',
       'qty_address_mutations_month', 'has_relocated',
        'has_name_change'
       ]

In [7]:

def read_one_year_from_bucket_merged_csv(year, dir_prefix = ''):
    """Read every merged CSV of one year from the bucket into one DataFrame.

    Lists the blobs of the module-level ``bucket`` whose name starts with
    ``dir_prefix`` and reads each blob whose name contains ``year`` through
    the module-level ``fs`` file system. All matching files are combined;
    previously each matching file replaced the one read before it, so only
    the last match was returned.

    Parameters
    ----------
    year : str
        Substring looked for in the blob names, e.g. ``'2013'``.
    dir_prefix : str, optional
        Prefix handed to ``bucket.list_blobs``.

    Returns
    -------
    pandas.DataFrame
        Rows of all matching files in listing order, each file keeping the
        index read from its first column. An empty DataFrame is returned
        when no blob name contains ``year``.
    """
    print('Starting with year: ', year)
    print(dir_prefix)
    frames = []
    rows_so_far = 0
    blob_list = list(bucket.list_blobs(prefix=dir_prefix))
    for blob in blob_list:
        print("blob", blob.name)
        if year in blob.name:
            print('Processing file: ', blob.name)
            with fs.open('graydon-data/' + blob.name) as f:
                frame = pd.read_csv(f, sep=',', index_col=0)
            frames.append(frame)
            rows_so_far += frame.shape[0]
        # Progress line is printed for every listed blob, matching or not.
        print('The number of rows so far is: ', rows_so_far)
    if not frames:
        return pd.DataFrame()
    if len(frames) == 1:
        # Single file: hand back the frame exactly as it was read.
        return frames[0]
    return pd.concat(frames)

In [8]:
def _strip_and_fill_na(df, column, na_value):
    """Strip whitespace in a string column, then replace literal 'NA' entries with ``na_value``."""
    df[column] = df[column].str.strip()
    df.loc[df[column] == 'NA', column] = na_value


def _decimal_comma_to_float(df, column):
    """Convert a column written with decimal commas (e.g. '1,5') to float."""
    df[column] = df[column].astype(str).str.replace(',','.').astype(float)


def clean_data_per_year(df):
    """Normalise the raw columns of one year of data, modifying ``df`` in place.

    Steps, per column:
    * ``date_month`` is parsed to datetime.
    * ``financial_calamity_outcome``: missing values become -1.
    * ``qty_employees``: stripped, 'NA'/missing become 0, cast to int.
    * ``year_qty_employees``: stripped, 'NA' becomes None.
    * amount/score columns (``amt_revenue``, ``amt_consolidated_revenue``,
      ``amt_consolidated_operating_result``, ``score_pd``): stripped,
      'NA' becomes 0, decimal comma replaced by a point, cast to float.
    * year columns (``year_revenue``, ``year_consolidated_revenue``,
      ``year_consolidated_operating_result``): stripped, 'NA' becomes 0.
    * ``has_increased_risk``: cast to bool with missing values mapped to False.
    * ``code_sbi_2``: missing values become 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns listed above; the text columns must
        support the ``.str`` accessor.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after cleaning.
    """
    df['date_month'] = pd.to_datetime(df['date_month'])
    df['financial_calamity_outcome'] = df['financial_calamity_outcome'].fillna(-1)

    _strip_and_fill_na(df, 'qty_employees', 0)
    df['qty_employees'] = df['qty_employees'].fillna(0)
    df['qty_employees'] = df['qty_employees'].astype(str).astype(int)

    _strip_and_fill_na(df, 'year_qty_employees', None)

    for amount_col in ('amt_revenue',
                       'amt_consolidated_revenue',
                       'amt_consolidated_operating_result',
                       'score_pd'):
        _strip_and_fill_na(df, amount_col, 0)
        _decimal_comma_to_float(df, amount_col)

    for year_col in ('year_revenue',
                     'year_consolidated_revenue',
                     'year_consolidated_operating_result'):
        _strip_and_fill_na(df, year_col, 0)

    # A plain astype(bool) maps NaN to True, and comparing the result with
    # None never matches, so missing flags have to be masked out explicitly.
    risk = df['has_increased_risk']
    df['has_increased_risk'] = risk.notna() & risk.astype(bool)

    df.loc[df.code_sbi_2.isnull(), 'code_sbi_2'] = 0
    return df

In [9]:
def save_df_locally(df, dir_prefix, year, as_json= False):
    """Write ``df`` to ``<dir_prefix>/<year>_merged.json`` or ``<dir_prefix>/<year>_merged.csv``.

    The JSON variant is chosen when ``as_json`` is true, the CSV variant otherwise.
    """
    target_stem = dir_prefix + '/' + year + '_merged'
    if not as_json:
        df.to_csv(target_stem + '.csv')
        return
    df.to_json(target_stem + '.json')
        

#### Read one full year from already merged files

### 2013

In [10]:
df_one_year = read_one_year_from_bucket_merged_csv(dir_prefix= '02_cleaned', year = '2013')

Starting with year:  2013
02_cleaned
blob 02_cleaned/2013_merged.csv
Processing file:  02_cleaned/2013_merged.csv


  if (yield from self.run_code(code, result)):
  mask |= (ar1 == a)


The number of rows so far is:  20764754
blob 02_cleaned/2014_merged.csv
The number of rows so far is:  20764754
blob 02_cleaned/2015_merged.csv
The number of rows so far is:  20764754
blob 02_cleaned/2016_merged.csv
The number of rows so far is:  20764754
blob 02_cleaned/2017_merged.csv
The number of rows so far is:  20764754


In [11]:
pd.crosstab(df_one_year.is_sole_proprietor, df_one_year.has_relocated, normalize = "index")

has_relocated,False,True
is_sole_proprietor,Unnamed: 1_level_1,Unnamed: 2_level_1
False,0.995015,0.004985


In [12]:
df_one_year = read_one_year_from_bucket_merged_csv(dir_prefix= '02_cleaned', year = '2014')

Starting with year:  2014
02_cleaned
blob 02_cleaned/2013_merged.csv
The number of rows so far is:  0
blob 02_cleaned/2014_merged.csv
Processing file:  02_cleaned/2014_merged.csv
The number of rows so far is:  21393752
blob 02_cleaned/2015_merged.csv
The number of rows so far is:  21393752
blob 02_cleaned/2016_merged.csv
The number of rows so far is:  21393752
blob 02_cleaned/2017_merged.csv
The number of rows so far is:  21393752


In [13]:
pd.crosstab(df_one_year.is_sole_proprietor, df_one_year.has_relocated, normalize = "index")

has_relocated,False,True
is_sole_proprietor,Unnamed: 1_level_1,Unnamed: 2_level_1
False,0.995039,0.004961


In [14]:
df_one_year = read_one_year_from_bucket_merged_csv(dir_prefix= '02_cleaned', year = '2015')

Starting with year:  2015
02_cleaned
blob 02_cleaned/2013_merged.csv
The number of rows so far is:  0
blob 02_cleaned/2014_merged.csv
The number of rows so far is:  0
blob 02_cleaned/2015_merged.csv
Processing file:  02_cleaned/2015_merged.csv
The number of rows so far is:  21834713
blob 02_cleaned/2016_merged.csv
The number of rows so far is:  21834713
blob 02_cleaned/2017_merged.csv
The number of rows so far is:  21834713


In [15]:
pd.crosstab(df_one_year.is_sole_proprietor, df_one_year.has_relocated, normalize = "index")

has_relocated,False,True
is_sole_proprietor,Unnamed: 1_level_1,Unnamed: 2_level_1
False,0.994726,0.005274


In [16]:
df_one_year = read_one_year_from_bucket_merged_csv(dir_prefix= '02_cleaned', year = '2016')

Starting with year:  2016
02_cleaned
blob 02_cleaned/2013_merged.csv
The number of rows so far is:  0
blob 02_cleaned/2014_merged.csv
The number of rows so far is:  0
blob 02_cleaned/2015_merged.csv
The number of rows so far is:  0
blob 02_cleaned/2016_merged.csv
Processing file:  02_cleaned/2016_merged.csv


  if (yield from self.run_code(code, result)):


The number of rows so far is:  12735659
blob 02_cleaned/2017_merged.csv
The number of rows so far is:  12735659


In [17]:
pd.crosstab(df_one_year.is_sole_proprietor, df_one_year.has_relocated, normalize = "index")

has_relocated,False,True
is_sole_proprietor,Unnamed: 1_level_1,Unnamed: 2_level_1
False,0.99445,0.00555


In [18]:
df_one_year = read_one_year_from_bucket_merged_csv(dir_prefix= '02_cleaned', year = '2017')

Starting with year:  2017
02_cleaned
blob 02_cleaned/2013_merged.csv
The number of rows so far is:  0
blob 02_cleaned/2014_merged.csv
The number of rows so far is:  0
blob 02_cleaned/2015_merged.csv
The number of rows so far is:  0
blob 02_cleaned/2016_merged.csv
The number of rows so far is:  0
blob 02_cleaned/2017_merged.csv
Processing file:  02_cleaned/2017_merged.csv
The number of rows so far is:  22729762
blob 02_cleanedupload_test.py
The number of rows so far is:  22729762


In [19]:
pd.crosstab(df_one_year.is_sole_proprietor, df_one_year.has_relocated, normalize = "index")

has_relocated,False,True
is_sole_proprietor,Unnamed: 1_level_1,Unnamed: 2_level_1
False,0.994512,0.005488


In [20]:
df_one_year = read_one_year_from_bucket_merged_csv(dir_prefix= '02_cleaned', year = '2018')

Starting with year:  2018
02_cleaned
blob 02_cleaned/2013_merged.csv
The number of rows so far is:  0
blob 02_cleaned/2014_merged.csv
The number of rows so far is:  0
blob 02_cleaned/2015_merged.csv
The number of rows so far is:  0
blob 02_cleaned/2016_merged.csv
The number of rows so far is:  0
blob 02_cleaned/2017_merged.csv
The number of rows so far is:  0
blob 02_cleanedupload_test.py
The number of rows so far is:  0


In [21]:
pd.crosstab(df_one_year.is_sole_proprietor, df_one_year.has_relocated, normalize = "index")

AttributeError: 'DataFrame' object has no attribute 'is_sole_proprietor'

<a id='inspect_data'></a>

# Inspect Data:


<a href=#top>Top</a>

In [None]:
# The loading cells above bind the data to `df_one_year`, while every cell from
# here on refers to `one_year_df`; alias it so the notebook runs on a fresh kernel.
one_year_df = df_one_year
one_year_df.shape

#### Preview df

In [None]:
HTML(DataFrame(one_year_df.head(5)).to_html())

In [None]:
one_year_df['has_relocated'].value_counts()

In [None]:
one_year_df['has_relocated'].value_counts()/one_year_df.shape[0]

In [None]:
one_year_df['is_sole_proprietor'].value_counts()

In [None]:
one_year_df['id_company'].value_counts()

In [None]:
one_year_df[one_year_df['id_company']==890460183][:3]

In [None]:
one_year_df[one_year_df['id_company']==890460183]['score_payment_assessment'].describe()

In [None]:
one_year_df.dtypes

In [None]:
type(one_year_df['qty_employees'])
#['has_relocated'].value_counts()

In [None]:
one_year_df['qty_employees'].describe()

In [None]:
one_year_df[one_year_df['qty_employees']>0]['qty_employees'].describe()

In [None]:
one_year_df['score_pd'].describe()

In [None]:
one_year_df['score_payment_assessment'].describe()

In [None]:
one_year_df['code_sbi_1'].describe()

In [None]:
one_year_df['code_sbi_2'].describe()

In [None]:
one_year_df['rat_pd'].describe()

<a id='logreg'></a>
# Logistic Regression:

<a href=#top>Top</a>

In [None]:
feature_cols = ['score_pd', 'score_payment_assessment', 'code_sbi_2']

In [None]:
X = one_year_df[feature_cols]
y = one_year_df.has_relocated 

In [None]:
X.shape, y.shape

In [None]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)


In [None]:
logisticRegr = LogisticRegression()

In [None]:
logisticRegr.fit(x_train, y_train)

In [None]:
y_pred=logisticRegr.predict(x_test)

In [None]:
cnf_matrix = metrics.confusion_matrix(y_test, y_pred)
cnf_matrix

In [None]:
print('Accuracy of logistic regression classifier on test set: {:.4f}'.format(logisticRegr.score(x_test, y_test)))

In [None]:
y_test.value_counts() / y_test.shape[0]

<a id='subset'></a>

# Subset Data

<a href=#top>Top</a>

In [None]:
one_year_df.shape

In [None]:
one_year_df[one_year_df['qty_employees']>1].shape

In [None]:
one_year_df['qty_employees'].describe()

In [None]:
mult_companies = one_year_df[one_year_df['qty_employees']>1]

In [None]:
mult_companies['has_relocated'].value_counts()

In [None]:
mult_companies['has_relocated'].value_counts()/mult_companies.shape[0]

In [None]:
X = mult_companies[feature_cols]
y = mult_companies.has_relocated 

In [None]:
X.shape, y.shape

In [None]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

In [None]:
logisticRegr.fit(x_train, y_train)

In [None]:
logisticRegr.fit(x_train, y_train)

In [None]:
y_pred=logisticRegr.predict(x_test)

In [None]:
cnf_matrix = metrics.confusion_matrix(y_test, y_pred)
cnf_matrix

In [None]:
print('Accuracy of logistic regression classifier on test set: {:.4f}'.format(logisticRegr.score(x_test, y_test)))

In [None]:
y_test.value_counts() / y_test.shape[0]

<a id='summary'></a>
# Summary Statistics

<a href=#top>Top</a>

In [None]:
plt.rc("font", size=14)
sns.set(style="white")
sns.set(style="whitegrid", color_codes=True)

In [None]:
mult_companies = one_year_df[one_year_df['qty_employees']>50]

In [None]:
mult_companies.shape

In [None]:
sns.countplot(x='has_relocated', data=mult_companies)
plt.show()

In [None]:
mult_companies.score_payment_assessment.hist()
plt.title('Histogram of score payment')
plt.xlabel('Score')
plt.ylabel('Frequency')

In [None]:
feature_cols = ['score_pd', 'score_payment_assessment', 'code_sbi_2']

In [None]:
subset_data = mult_companies[feature_cols]

In [None]:
for cols in feature_cols:
    print(cols)
    mult_companies[cols].hist()
#     plt.title('Histogram of', cols)
#     plt.xlabel(cols)
#     plt.ylabel('Frequency')

In [None]:
subset_data = mult_companies[feature_cols][:1000]

In [None]:
mult_companies.dtypes

In [None]:
feature_cols = ['score_pd', 'score_payment_assessment', 'code_sbi_2', 'qty_employees', 'qty_address_mutations_total']

### Remove qty_address_mutations_total

In [None]:
g = sns.pairplot(mult_companies[:10000], vars=feature_cols)

In [None]:
subset_large_companies = mult_companies.loc[mult_companies.qty_employees < 10000]

In [None]:
subset_large_companies['qty_employees'].describe()

In [None]:
feature_cols = ['score_pd', 'score_payment_assessment', 'code_sbi_2', 'qty_employees']

In [None]:
g = sns.pairplot(subset_large_companies[:10000], vars=feature_cols)

In [None]:
g = sns.pairplot(subset_large_companies, vars=feature_cols)

In [None]:
subset_large_companies.shape

In [None]:
subset_large_companies['has_relocated'].value_counts() / subset_large_companies.shape[0]

<a id='subsetII'></a>
# Subset for summary stats with relative fraction preserved

<a href=#top>Top</a>

In [None]:
subset_train=pd.read_csv("subset_2018.csv")

In [None]:
# Stratify on the target so the 25% sample keeps the same has_relocated
# fraction as mult_companies, as this section's heading states.
subset_train, subset_test, y_train, y_test = train_test_split(mult_companies, mult_companies.has_relocated, test_size=0.75, random_state=0, stratify=mult_companies.has_relocated)

In [None]:
mult_companies['has_relocated'].value_counts()

In [None]:
subset_train.shape

In [None]:
subset_train.has_relocated.value_counts()

In [None]:
subset_train.columns

In [None]:
subset_train.to_csv('subset_2018.csv')

In [None]:
subset_train.has_relocated.value_counts() / subset_train.shape[0]

In [None]:
subset_train.qty_employees.describe()


In [None]:
mult_companies.has_relocated.value_counts() / mult_companies.shape[0]

In [None]:
all_cols = subset_train.columns

In [None]:
all_cols


In [None]:
plot_cols = all_cols[4:6]

plot_cols

In [None]:
for cols in plot_cols[:2]:
    print(cols)
    subset_train[cols].hist()
#     plt.title('Histogram of', cols)
#     plt.xlabel(cols)
#     plt.ylabel('Frequency')

In [None]:
subset_train[subset_train['qty_employees']<200]['qty_employees'].hist()

In [None]:
subset_train.loc[(subset_train.qty_employees>10) & (subset_train.qty_employees<200)]['qty_employees'].hist()

In [None]:
np.log10(subset_train['qty_employees']).hist()

In [None]:
g = sns.pairplot(subset_train, vars=plot_cols)

<a id='correlation'></a>
# Correlation Analysis:

<a href=#top>Top</a>

In [None]:
# Compute the correlation matrix
corr_df = subset_train.drop(columns=['Unnamed: 0']).corr()


In [None]:
corr_df

In [None]:

# Generate a mask for the upper triangle
# (builtin `bool` instead of the `np.bool` alias, which newer NumPy releases removed)
mask = np.zeros_like(corr_df, dtype=bool)
mask[np.triu_indices_from(mask)] = True


In [None]:
import matplotlib.pyplot as plt

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(220, 10, as_cmap=True)

# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr_df, mask=mask, cmap=cmap, vmax=.3, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5})


Recompute the correlation matrix after dropping the columns listed in the next cell:

In [None]:
# Compute the correlation matrix
corrII_df = subset_train.drop(columns=['Unnamed: 0',
                                      'id_company',
                                      'id_branch',
                                      'year_qty_employees',
                                      'id_company_creditproxy',
                                      'year_revenue',
                                      'amt_consolidated_revenue',
                                      'year_consolidated_revenue',
                                      'amt_consolidated_operating_result',
                                      'year_consolidated_operating_result',
                                      'is_sole_proprietor',
                                      'qty_address_mutations_total',
                                      'qty_address_mutations_month',
                                      'has_name_change',
                                      'vice_president']).corr()


In [None]:

# Generate a mask for the upper triangle
# (builtin `bool` instead of the `np.bool` alias, which newer NumPy releases removed)
mask = np.zeros_like(corrII_df, dtype=bool)
mask[np.triu_indices_from(mask)] = True


# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(220, 10, as_cmap=True)

# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corrII_df, mask=mask, cmap=cmap, vmax=.3, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5})


In [None]:

# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corrII_df, cmap=cmap, vmax=.3, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .9})


<a id='variance'></a>
# Variance of Columns

<a href=#top>Top</a>

In [None]:
subset_train.drop(columns=['Unnamed: 0',
                                      'id_company',
                                      'id_branch',
                                      'year_qty_employees',
                                      'id_company_creditproxy',
                                      'year_revenue',
                                      'amt_consolidated_revenue',
                                      'year_consolidated_revenue',
                                      'amt_consolidated_operating_result',
                                      'year_consolidated_operating_result',
                                      'is_sole_proprietor',
                                      'qty_address_mutations_total',
                                      'qty_address_mutations_month',
                                      'has_name_change',
                                      'vice_president']).var()

In [None]:
subset_train.columns

In [None]:
subset_data = subset_train.drop(columns=['Unnamed: 0',
                                      'id_company',
                                      'id_branch',
                                      'year_qty_employees',
                                      'id_company_creditproxy',
                                      'year_revenue',
                                      'amt_consolidated_revenue',
                                      'year_consolidated_revenue',
                                      'amt_consolidated_operating_result',
                                      'year_consolidated_operating_result',
                                      'is_sole_proprietor',
                                      'qty_address_mutations_total',
                                      'qty_address_mutations_month',
                                      'has_name_change',
                                      'vice_president'])
col_variances = subset_data.var()

col_variances

In [None]:
subset_data.shape

In [None]:
subset_data[:10]

In [None]:
np.abs(stats.zscore(subset_data[:10]))

In [None]:
exclude_outliers = subset_data[(np.abs(stats.zscore(subset_data)) < 2).all(axis=1)]

In [None]:
# `shape` is an attribute holding a tuple, not a method.
exclude_outliers.shape

In [None]:
# NOTE(review): `df` is not assigned in any visible cell of this notebook (it only
# appears as a function parameter), so this line presumably raises NameError as written.
# It reads like a 3-standard-deviation filter on a column named `Data` — confirm which
# frame and column were intended before relying on it.
subset_data[np.abs(df.Data-df.Data.mean()) <= (3*df.Data.std())]

In [None]:
type(col_variances)


<a id='ProbMoves'></a>
# Probability that company or branch moves:

<a href=#top>Top</a>

In [None]:
subset_train = pd.read_csv('subset_2018.csv')

In [None]:
num_unique_companies=subset_train['id_company'].nunique()
num_unique_companies

In [None]:
subset_train[subset_train.has_relocated==True].groupby(['id_company'])['id_company'].nunique().count()

In [None]:
num_companies_moved = subset_train[subset_train.has_relocated==True]['id_company'].nunique()
num_companies_moved

## Probability that a company moves:

In [None]:
prob_move = num_companies_moved/ num_unique_companies
prob_move

## Probability that a branch moves:

In [None]:
num_unique_branches=subset_train['id_branch'].nunique()
num_unique_branches

In [None]:
num_branches_moved = subset_train[subset_train.has_relocated==True]['id_branch'].nunique()
num_branches_moved

In [None]:
prob_branch_move = num_branches_moved/ num_unique_branches
prob_branch_move

In [None]:
subset_train[(subset_train.date_month=='2018-01-01') | (subset_train.date_month=='2018-02-01')]['id_company'].nunique()

In [None]:
jan_ids = subset_train[(subset_train.date_month=='2018-01-01')]['id_company'].unique()

In [None]:
feb_ids = subset_train[(subset_train.date_month=='2018-02-01')]['id_company'].unique()

In [None]:
len(jan_ids), len(feb_ids) 

In [None]:
len(set(np.append(jan_ids, feb_ids)))

In [None]:
subset_train.groupby(['id_company', 'has_relocated']).nunique()

In [None]:
subset_train.groupby('date_month')['id_company'].nunique()

In [None]:
# Number of relocation rows per company. reset_index() turns the grouped Series into a
# DataFrame with `id_company` and `has_relocated` columns, which the cells below access.
moved_cs = subset_train[subset_train.has_relocated==True].groupby(['id_company'])['has_relocated'].sum().reset_index()

In [None]:
subset_train[subset_train.has_relocated==True].groupby(['id_company'])['id_company'].count()

In [None]:
subset_train[subset_train.has_relocated==True].groupby(['id_company'])[['id_company']]

In [None]:
moved_cs['id_company'].nunique()

In [None]:
5724+64

In [None]:
moved_cs[moved_cs.has_relocated>1].shape

In [None]:
moved_cs[(moved_cs.has_relocated>1) & (moved_cs.has_relocated<3)].shape

In [None]:
moved_cs[moved_cs.has_relocated>2]

In [None]:
subset_train.groupby('id_company')['has_relocated'].nunique()

In [None]:
subset_train.loc[subset_train.has_relocated==True].groupby(['id_company'])['has_relocated'].sum()

In [None]:
432310 - num_unique_companies

In [None]:
subset_train['has_relocated'].value_counts()

In [None]:
subset_train['qty_address_mutations_month'].value_counts()

In [None]:
subset_train.shape

In [None]:
subset_train['has_relocated'].nunique()

In [None]:
subset_train.loc[subset_train.has_relocated==True].count()

In [None]:
subset_train.columns

<a id='bottom'></a>

<a href=#top>Top</a>

# Bottom