In [1]:
# Ignore 'dask' warning
import pandas as pd
import gcsfs
from google.cloud import storage
from pandas import DataFrame
from IPython.display import HTML
from google.cloud.storage import Blob
import datalab.storage as gcs_datalab
from datetime import date
import numpy as np

In [2]:
# Required constants: the project id handed to the GCS clients and the name
# of the bucket that holds the merged CSV files.
project = "graydon-moving-indicator"
bucket_name = "graydon-data"

In [3]:
# Initializing bucket access.
# `fs` is the gcsfs file system used to open blobs for reading; `bucket` is
# the google-cloud-storage handle used to list blobs. The project id comes
# from the `project` constant so it is defined in exactly one place.
fs = gcsfs.GCSFileSystem(project=project)
gcs = storage.Client()
bucket = gcs.get_bucket(bucket_name)


In [4]:
# Setting up dictionary of column types
dtype={ 'id_company'  :np.float64,
        'id_branch'    :np.int64,
        'is_discontinued':bool,
        'code_discontinuation': np.float64,
        'code_financial_calamity':object,
        'financial_calamity_outcome'   : np.float64,
        'code_legal_form' : np.float64,
        'qty_employees' :np.float64,
        'year_qty_employees' :np.float64,
        'id_company_creditproxy':object,
        'score_payment_assessment'    : np.float64,
        'amt_revenue'  : np.float64,
        'year_revenue'  : np.float64,
        'amt_operating_result'   : np.float64,
        'year_operating_result'    :object,
        'amt_consolidated_revenue'   : np.float64,
        'year_consolidated_revenue'   :object,
        'amt_consolidated_operating_result'     : np.float64,
        'year_consolidated_operating_result'   :object,
        'qty_issued_credit_reports' : np.float64,
        'perc_credit_limit_adjustment' :object,
        'color_credit_status'  :object,
        'rat_pd'              :object,
        'score_pd'            : np.float64,
        'has_increased_risk'  :bool,
        'is_sole_proprietor'   :bool,
        'code_sbi_2'         : np.float64,
        'qty_address_mutations_total'  :np.float64,
        'qty_address_mutations_month'   :np.float64,
        'has_relocated':bool,
        'qty_started_names': np.float64,
        'qty_stopped_names': np.float64,
        'total_changeof_board_members_' :np.float64
      }

In [5]:
# List of the columns that pd.read_csv should parse as dates
# (passed as its `parse_dates` argument when loading a merged yearly file).
parse_dates= ['date_established' , 'date_financial_calamity_started',
        'date_financial_calamity_stopped', 'date_start', 'from_date_start', 'date_month' ]

In [6]:
def read_one_year_from_bucket_merged_csv(year, dir_prefix = ''):
    """Read the merged CSV file(s) of one year from the bucket.

    Every blob under ``dir_prefix`` whose name contains ``year`` is read with
    the module-level ``dtype`` and ``parse_dates`` settings. If more than one
    blob matches, the frames are concatenated; previously each match silently
    replaced the one before it, although the progress message reports the
    number of rows "so far".

    Parameters
    ----------
    year : str or int
        Year to look for in the blob names (matched as a substring).
    dir_prefix : str
        Prefix passed to ``bucket.list_blobs`` to restrict the listing.

    Returns
    -------
    pandas.DataFrame
        The rows of all matching files, or an empty frame if nothing matched.
    """
    year = str(year)  # allow ints; the match below is a substring test
    matching_frames = []
    rows_so_far = 0
    print('Starting with year: ', year)
    print(dir_prefix)
    for blob in bucket.list_blobs(prefix=dir_prefix):
        print("blob", blob.name)
        if year in blob.name:
            print('Processing file: ', blob.name)
            # Build the path from the bucket actually being listed rather
            # than from a second, hard-coded copy of its name.
            with fs.open(bucket.name + '/' + blob.name) as f:
                frame = pd.read_csv(f, sep=',', index_col=0, dtype=dtype,
                                    parse_dates=parse_dates)
            matching_frames.append(frame)
            rows_so_far += frame.shape[0]
        print('The number of rows so far is: ', rows_so_far)
    if not matching_frames:
        return pd.DataFrame()
    if len(matching_frames) == 1:
        return matching_frames[0]
    return pd.concat(matching_frames)

In [7]:
# Load the merged 2017 file; the function scans every blob under the prefix
# and reads the one(s) whose name contains the year.
one_year_df = read_one_year_from_bucket_merged_csv(
    dir_prefix= 'including_scores/merged_per_year/merged_cleaned/relocation_dates'
                                                   , year = '2017')

Starting with year:  2017
including_scores/merged_per_year/merged_cleaned/relocation_dates
blob including_scores/merged_per_year/merged_cleaned/relocation_dates/
The number of rows so far is:  0
blob including_scores/merged_per_year/merged_cleaned/relocation_dates/2008_merged.csv
The number of rows so far is:  0
blob including_scores/merged_per_year/merged_cleaned/relocation_dates/2009_merged.csv
The number of rows so far is:  0
blob including_scores/merged_per_year/merged_cleaned/relocation_dates/2010_merged.csv
The number of rows so far is:  0
blob including_scores/merged_per_year/merged_cleaned/relocation_dates/2011_merged.csv
The number of rows so far is:  0
blob including_scores/merged_per_year/merged_cleaned/relocation_dates/2012_merged.csv
The number of rows so far is:  0
blob including_scores/merged_per_year/merged_cleaned/relocation_dates/2013_merged.csv
The number of rows so far is:  0
blob including_scores/merged_per_year/merged_cleaned/relocation_dates/2014_merged.csv
The n

  if (yield from self.run_code(code, result)):
  mask |= (ar1 == a)


The number of rows so far is:  22729762
blob including_scores/merged_per_year/merged_cleaned/relocation_dates/2018_merged.csv
The number of rows so far is:  22729762


#### Previewing input data

In [14]:
# Render the first 10 rows of the loaded year as an HTML table.
HTML(DataFrame(one_year_df).head(10).to_html())

Unnamed: 0,date_month,id_company,id_branch,date_established,is_discontinued,code_discontinuation,code_financial_calamity,date_financial_calamity_started,date_financial_calamity_stopped,financial_calamity_outcome,code_legal_form,qty_employees,year_qty_employees,id_company_creditproxy,score_payment_assessment,amt_revenue,year_revenue,amt_operating_result,year_operating_result,amt_consolidated_revenue,year_consolidated_revenue,amt_consolidated_operating_result,year_consolidated_operating_result,qty_issued_credit_reports,perc_credit_limit_adjustment,color_credit_status,rat_pd,score_pd,has_increased_risk,is_sole_proprietor,code_sbi_2,code_sbi_1,qty_address_mutations_total,qty_address_mutations_month,date_start,from_date_start,has_relocated,qty_started_names,qty_stopped_names,has_name_change,total_changeof_board_members_,date_relocation_last,date_relocation_penultimate
0,2017-01-01,3.0,10079408,1921-03-17,False,,,NaT,NaT,-1.0,5.0,1.0,2017.0,3,21.0,3.55253e-316,2009.0,9.219265e-318,2015.0,4.954293e-316,2015,2.34681e-318,2015,4.0,15,G,CCC,-4.907,False,False,64.0,,0.0,0.0,NaT,NaT,False,0.0,0.0,False,0.0,2015-06-11,2014-10-02
1,2017-01-01,5.0,10079416,1740-01-01,False,,,NaT,NaT,-1.0,5.0,9.0,2017.0,1064993,20.0,0.0,0.0,0.0,,0.0,0,0.0,0,1.0,5,G,A,-4.892,False,False,46.0,,0.0,0.0,NaT,NaT,False,0.0,0.0,False,0.0,,
2,2017-01-01,6.0,10079424,1874-11-20,False,,F,2013-01-29,NaT,-1.0,5.0,25.0,2014.0,6,24.0,4.446591e-317,2011.0,0.0,,0.0,0,0.0,0,0.0,-100,R,D,0.0,True,False,41.0,,0.0,0.0,NaT,NaT,False,0.0,0.0,False,0.0,2001-06-15,
3,2017-01-01,9.0,10079432,1897-05-01,False,,,NaT,NaT,-1.0,5.0,45.0,2017.0,9,20.0,0.0,0.0,0.0,,0.0,0,0.0,0,0.0,25,G,CCC,-4.998,False,False,47.0,,0.0,0.0,NaT,NaT,False,0.0,0.0,False,0.0,1997-03-20,
4,2017-01-01,12.0,35,1924-08-01,False,,,NaT,NaT,-1.0,5.0,5.0,2017.0,12,28.0,0.0,0.0,0.0,,0.0,0,0.0,0,0.0,30,G,BB,-5.096,False,False,64.0,,0.0,0.0,NaT,NaT,False,0.0,0.0,False,0.0,2017-08-03,
5,2017-01-01,14.0,43,1821-05-01,False,,,NaT,NaT,-1.0,5.0,6.0,2017.0,14,20.0,0.0,0.0,0.0,,0.0,0,0.0,0,0.0,45,G,BBB,-5.005,False,False,64.0,,0.0,0.0,NaT,NaT,False,0.0,0.0,False,0.0,,
6,2017-01-01,17.0,51,NaT,False,,,NaT,NaT,-1.0,5.0,1.0,2017.0,17,20.0,0.0,0.0,0.0,,0.0,0,0.0,0,0.0,20,G,A,-5.014,False,False,64.0,,0.0,0.0,NaT,NaT,False,0.0,0.0,False,0.0,2002-02-07,
7,2017-01-01,23.0,94,1915-11-15,True,7.0,,NaT,NaT,-1.0,5.0,0.0,,23,29.0,0.0,0.0,0.0,,0.0,0,0.0,0,0.0,-5,R,D,0.0,True,False,46.0,,0.0,0.0,NaT,NaT,False,0.0,0.0,False,0.0,,
8,2017-01-01,25.0,10079467,1970-12-07,True,7.0,,NaT,NaT,-1.0,5.0,0.0,,25,29.0,0.0,0.0,0.0,,0.0,0,0.0,0,0.0,0,R,D,0.0,True,False,61.0,,0.0,0.0,NaT,NaT,False,0.0,0.0,False,0.0,1988-12-01,
9,2017-01-01,47.0,140,1862-01-01,False,,,NaT,NaT,-1.0,5.0,1.0,2007.0,47,20.0,0.0,0.0,0.0,,0.0,0,0.0,0,0.0,50,G,BBB,-5.122,False,False,70.0,,0.0,0.0,NaT,NaT,False,0.0,0.0,False,0.0,,


#### Ages of dates

In [15]:
def _extract_year(dates):
    """Return the calendar year of every entry of a Series of dates."""
    if pd.api.types.is_datetime64_any_dtype(dates):
        # Vectorised path for properly parsed datetime columns.
        return dates.dt.year
    # Fallback for object columns holding date-like objects with a `.year`.
    return dates.apply(lambda value: value.year)


def calculate_age_based_on_date(df, col_list):
    """Add "age" columns relative to the latest month of each entity.

    The reference year of a row is the year of the maximum ``date_month``
    within its (id_branch, id_company) group.

    * ``'date_established'`` adds ``company_age`` = reference year minus the
      year of ``date_established``.
    * Each supported ``year_*`` column adds the matching ``years_since_last_*``
      column = reference year minus the year value, for rows where the year
      value is greater than 0; all other rows get NaN.

    Names in ``col_list`` that are not supported are ignored. The input frame
    is not modified (the previous implementation added helper columns to the
    caller's frame as a side effect).

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``id_branch``, ``id_company``, ``date_month`` and every column
        named in ``col_list``.
    col_list : list of str
        Source columns to derive ages from.

    Returns
    -------
    pandas.DataFrame
        A new frame with the derived columns appended in ``col_list`` order.
    """
    years_since_names = {
        'year_consolidated_operating_result': 'years_since_last_amt_consolidated_operating_result',
        'year_consolidated_revenue': 'years_since_last_amt_consolidated_revenue',
        'year_operating_result': 'years_since_last_amt_operating_result',
        'year_qty_employees': 'years_since_last_qty_employees',
        'year_revenue': 'years_since_last_amt_revenue',
    }
    latest_month = df.groupby(['id_branch', 'id_company'])['date_month'].transform('max')
    reference_year = _extract_year(latest_month)

    derived = {}
    for col in col_list:
        if col == 'date_established':
            derived['company_age'] = reference_year - _extract_year(df['date_established'])
        elif col in years_since_names:
            # Some year columns are read as object, hence the float cast.
            reported_year = df[col].astype(float)
            # Years <= 0 (and missing years) carry no information -> NaN.
            derived[years_since_names[col]] = (
                (reference_year - reported_year).where(reported_year > 0))
    return df.assign(**derived)

#### Deltas

In [16]:
def calculate_delta_of_column(df, col_list):
    """Add first-to-last change columns per (id_branch, id_company).

    Rows are ordered by ``date_month`` inside each entity; "first" and "last"
    are the first and last non-missing value of a column in that order.

    * Numeric columns (``qty_employees``, ``qty_issued_credit_reports``,
      ``score_payment_assessment``, ``score_pd``) get ``delta_<col>`` =
      last - first.
    * ``code_legal_form`` / ``code_SBI_2_group`` get a boolean
      ``code_legal_form_has_changed`` / ``SBI_has_changed`` = last != first.

    Two defects of the previous version are fixed:

    * the result was merged back on ``date_month`` as well, so only the row
      of each entity's first month received a value and all other rows NaN;
      the values are now attached to every row of the entity;
    * an entity whose column is entirely missing was flagged as "changed"
      because ``NaN != NaN`` is True; it is now flagged False.

    Names in ``col_list`` that are not supported are ignored.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``date_month``, ``id_company``, ``id_branch`` and the supported
        columns named in ``col_list``.
    col_list : list of str
        Columns to compute a delta / change flag for.

    Returns
    -------
    pandas.DataFrame
        ``df`` left-merged with the derived per-entity columns.
    """
    numeric_delta_names = {
        'qty_employees': 'delta_qty_employees',
        'qty_issued_credit_reports': 'delta_qty_issued_credit_reports',
        'score_payment_assessment': 'delta_score_payment_assessment',
        'score_pd': 'delta_score_pd',
    }
    change_flag_names = {
        'code_legal_form': 'code_legal_form_has_changed',
        'code_SBI_2_group': 'SBI_has_changed',
    }
    group_keys = ['id_branch', 'id_company']

    tracked = []
    for col in col_list:
        supported = col in numeric_delta_names or col in change_flag_names
        if supported and col not in tracked:
            tracked.append(col)
    if not tracked:
        return df

    # Only carry the columns that are needed through the sort and group-by.
    ordered = (df.loc[:, group_keys + ['date_month'] + tracked]
                 .sort_values(['id_company', 'id_branch', 'date_month']))
    grouped = ordered.groupby(group_keys)

    derived = {}
    for col in tracked:
        first_value = grouped[col].first()
        last_value = grouped[col].last()
        if col in numeric_delta_names:
            derived[numeric_delta_names[col]] = last_value - first_value
        else:
            both_missing = first_value.isna() & last_value.isna()
            derived[change_flag_names[col]] = (last_value != first_value) & ~both_missing

    per_entity = pd.DataFrame(derived).reset_index()
    return df.merge(per_entity, how='left', on=group_keys)

#### If any true then true

In [17]:
def calculate_if_any_true(df, col_list):
    """Add a per-entity "any value is truthy" flag for the supported columns.

    For every supported name in ``col_list`` the column is reduced with
    ``any()`` per (id_branch, id_company) and the result is merged back onto
    ``df`` under a new name:

    * ``is_discontinued``         -> ``is_discontinued_any``
    * ``code_financial_calamity`` -> ``has_financial_calamity``
    * ``has_relocated``           -> ``has_relocated_next_year``

    Other names are ignored. The merge uses pandas' defaults (inner join on
    the columns both frames share).
    """
    flag_names = {
        'is_discontinued': 'is_discontinued_any',
        'code_financial_calamity': 'has_financial_calamity',
        'has_relocated': 'has_relocated_next_year',
    }
    for col in col_list:
        flag_name = flag_names.get(col)
        if flag_name is None:
            continue
        any_per_entity = df.groupby(['id_branch', 'id_company'])[col].any()
        lookup = any_per_entity.rename(flag_name).to_frame().reset_index()
        df = df.merge(lookup)
    return df

#### Mean

In [18]:
def calculate_mean_of_column(df, col_list):
    """Add ``mean_<col>`` (mean per id_branch/id_company) for supported columns.

    Each supported name in ``col_list`` is averaged per entity and the result
    is merged back onto ``df`` with pandas' default merge (inner join on the
    shared columns). Unsupported names are ignored.
    """
    supported_columns = (
        'amt_consolidated_operating_result',
        'amt_consolidated_revenue',
        'amt_operating_result',
        'amt_revenue',
        'qty_employees',
        'qty_issued_credit_reports',
        'score_payment_assessment',
        'score_pd',
    )
    entity_keys = ['id_branch', 'id_company']
    for col in col_list:
        if col not in supported_columns:
            continue
        entity_mean = df.groupby(entity_keys)[col].agg('mean')
        lookup = entity_mean.rename('mean_' + col).to_frame().reset_index()
        df = df.merge(lookup)
    return df

#### Dummies into counts

In [19]:
def column_dummies_into_counts(df, col_list):
    """Add, per entity, the number of rows holding each category value.

    The frame's index is first moved into a column with ``reset_index()`` and
    a ``unique_id`` column ("<id_branch>_<id_company>") is added. For every
    name in ``col_list`` a ``pd.crosstab`` of ``unique_id`` against that
    column is left-merged back, giving one count column per category value:

    * ``color_credit_status``: G/O/R -> ``qty_green_flags`` /
      ``qty_orange_flags`` / ``qty_red_flags``
    * ``rat_pd``: columns keep the raw category value as their name
    * ``code_SBI_2_group``: "1"/"2" -> ``SBI_group_1`` / ``SBI_group_2``
    * ``code_legal_form_group``: "1"/"2" -> ``code_legal_form_group_1`` /
      ``code_legal_form_group_2``

    The previous version copied a column subset on every iteration without
    using it, and failed with an unrelated ``KeyError: 'unique_id'`` for an
    unsupported column name; unsupported names are now rejected up front.

    Raises
    ------
    KeyError
        If ``col_list`` contains a column this function does not support.
    """
    count_column_names = {
        'color_credit_status': {"G": "qty_green_flags", "O": "qty_orange_flags",
                                "R": "qty_red_flags"},
        'rat_pd': {},
        'code_SBI_2_group': {"1": "SBI_group_1", "2": "SBI_group_2"},
        'code_legal_form_group': {"1": "code_legal_form_group_1",
                                  "2": "code_legal_form_group_2"},
    }
    unsupported = [col for col in col_list if col not in count_column_names]
    if unsupported:
        raise KeyError('column_dummies_into_counts does not support: %s' % unsupported)

    df = df.reset_index()
    df['unique_id'] = df['id_branch'].astype(str) + '_' + df['id_company'].astype(str)
    for col in col_list:
        counts = (pd.crosstab(df['unique_id'], df[col])
                    .reset_index()
                    .rename_axis(None, axis=1)
                    .rename(columns=count_column_names[col]))
        df = df.merge(counts, how='left', on=['unique_id'])
    return df

#### Ratio 

In [20]:
def calculate_ratio_of_column(df, col_list):
    """Add per-entity ratios of an amount to its consolidated counterpart.

    For each supported name in ``col_list`` the column and its consolidated
    counterpart are summed per (id_branch, id_company), and the quotient of
    the two sums is left-merged back onto ``df``:

    * ``amt_operating_result`` -> ``ratio_operating_result_consolidated_operating_result``
      (sum amt_operating_result / sum amt_consolidated_operating_result)
    * ``amt_revenue`` -> ``ratio_revenue_consolidated_revenue``
      (sum amt_revenue / sum amt_consolidated_revenue)

    Other names are ignored. The division is a plain ``np.divide``, so a zero
    denominator is not special-cased here.

    The previous version built a throw-away column subset with
    ``subset_columns.extend(col)``, which extends the list with the single
    characters of the column name; selecting those non-existent labels only
    warned on old pandas and raises ``KeyError`` on current versions. That
    unused selection has been removed.
    """
    ratio_specs = {
        # numerator column: (denominator column, name of the ratio column)
        'amt_operating_result': ('amt_consolidated_operating_result',
                                 'ratio_operating_result_consolidated_operating_result'),
        'amt_revenue': ('amt_consolidated_revenue',
                        'ratio_revenue_consolidated_revenue'),
    }
    entity_keys = ['id_branch', 'id_company']
    for col in col_list:
        if col not in ratio_specs:
            continue
        denominator_col, ratio_name = ratio_specs[col]
        sums = df.groupby(entity_keys).agg({col: 'sum', denominator_col: 'sum'})
        ratio = np.divide(sums[col], sums[denominator_col]).rename(ratio_name)
        df = df.merge(ratio.reset_index(), how='left', on=entity_keys)
    return df

#### Sum

In [21]:
def calculate_sum_of_column(df, col_list):
    """Add per-entity totals for the supported monthly count columns.

    Each supported name in ``col_list`` is summed per (id_branch, id_company)
    and merged back onto ``df`` (pandas' default inner merge on the shared
    columns) under a new name:

    * ``qty_address_mutations_month``   -> ``qty_address_mutations_year``
    * ``qty_started_names``             -> ``qty_started_names_year``
    * ``qty_stopped_names``             -> ``qty_stopped_names_year``
    * ``total_changeof_board_members_`` -> ``qty_board_changes_year``

    Other names are ignored.
    """
    total_names = {
        'qty_address_mutations_month': 'qty_address_mutations_year',
        'qty_started_names': 'qty_started_names_year',
        'qty_stopped_names': 'qty_stopped_names_year',
        'total_changeof_board_members_': 'qty_board_changes_year',
    }
    entity_keys = ['id_branch', 'id_company']
    for col in col_list:
        if col not in total_names:
            continue
        entity_total = df.groupby(entity_keys)[col].agg('sum')
        lookup = entity_total.rename(total_names[col]).to_frame().reset_index()
        df = df.merge(lookup)
    return df

#### Variance

In [22]:
def calculate_variance_of_column(df, col_list):
    """Add ``variance_<col>`` (variance per id_branch/id_company).

    Supported columns: ``qty_employees``, ``qty_issued_credit_reports``,
    ``score_payment_assessment`` and ``score_pd``; other names are ignored.
    The per-entity value is computed with the group-by ``'var'`` aggregation
    and merged back onto ``df`` with pandas' default merge (inner join on the
    shared columns).

    The previous version aggregated ``score_payment_assessment`` and
    ``score_pd`` with ``'sum'`` (apparently copied from the sum helper), so
    the ``variance_*`` columns for those two held sums, not variances.
    """
    supported_columns = (
        'qty_employees',
        'qty_issued_credit_reports',
        'score_payment_assessment',
        'score_pd',
    )
    entity_keys = ['id_branch', 'id_company']
    for col in col_list:
        if col not in supported_columns:
            continue
        entity_variance = df.groupby(entity_keys)[col].agg('var')
        lookup = entity_variance.rename('variance_' + col).to_frame().reset_index()
        df = df.merge(lookup)
    return df

#### Get has_relocated from next year DF

In [37]:
def replace_has_relocated_with_nextyear(df, next_year, dir_prefix = ''):
    """Append ``has_relocated_next_year`` taken from the following year's file.

    Reads only ``id_company``, ``id_branch`` and ``has_relocated`` from every
    blob under ``dir_prefix`` whose name contains ``next_year``, reduces
    ``has_relocated`` to one "any True" flag per (id_branch, id_company) via
    ``calculate_if_any_true`` and left-merges that flag onto ``df``.

    Fixes relative to the previous version:

    * ``suffixes='_C'`` (a string, not a pair of suffixes) is no longer passed
      to ``merge``; the right-hand frame only contributes the key columns and
      ``has_relocated_next_year``;
    * the blob path is built from ``bucket.name`` instead of a second,
      hard-coded copy of the bucket name;
    * several matching files are concatenated instead of each one replacing
      the one before it;
    * the garbled start-up log message is repaired.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend; needs ``id_branch`` and ``id_company``.
    next_year : str or int
        Year to look for in the blob names (matched as a substring).
    dir_prefix : str
        Prefix passed to ``bucket.list_blobs``.

    Returns
    -------
    pandas.DataFrame
        ``df`` with the extra column; entities that do not occur in the
        next-year data get a missing value.
    """
    next_year = str(next_year)
    # NOTE(review): id_company is read as int64 here, while the main `dtype`
    # mapping of this notebook reads it as float64 -- confirm that the
    # next-year files never contain a missing id_company.
    next_year_dtype = {
        'id_branch': np.int64,
        'id_company': np.int64,
        'has_relocated': bool,
    }
    cols = ['id_company', 'id_branch', 'has_relocated']
    frames = []
    rows_so_far = 0
    print('Starting with year: ', next_year)
    print(dir_prefix)
    for blob in bucket.list_blobs(prefix=dir_prefix):
        if next_year in blob.name:
            print('Processing file: ', blob.name)
            with fs.open(bucket.name + '/' + blob.name) as f:
                frame = pd.read_csv(f, sep=',', dtype=next_year_dtype, usecols=cols)
            frames.append(frame)
            rows_so_far += frame.shape[0]
        print('The number of rows so far is: ', rows_so_far)
    full_next_year_df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
    full_next_year_df = calculate_if_any_true(full_next_year_df, col_list = ['has_relocated'])
    # Keep a single row per entity holding only the keys and the new flag.
    full_next_year_df = (full_next_year_df.drop(columns='has_relocated')
                                          .drop_duplicates()
                                          .reset_index(drop=True))
    return df.merge(full_next_year_df, on=['id_branch', 'id_company'], how='left')

#### Creating SBI code groups

In [24]:
def create_sbi_groups(df):
    """Replace ``code_sbi_2`` by a two-valued ``code_SBI_2_group`` column.

    Rows whose ``code_sbi_2`` is in the hard-coded list below get the string
    "1", all other rows (including missing codes) get "2". The original
    ``code_sbi_2`` column is dropped.

    The input frame is left untouched; the previous version added the new
    column to the caller's frame before returning the reduced copy.
    """
    code_SBI_2_group1 = [1,19,35,51,53,59,61,62,63,69,72,73,74,78,79,80,82,85,86,87,88,90,93,94]
    group_labels = np.where(df['code_sbi_2'].isin(code_SBI_2_group1), "1", "2")
    return df.drop(columns='code_sbi_2').assign(code_SBI_2_group=group_labels)

#### Creating code legal form groups

In [25]:
def create_code_legal_form_groups(df):
    """Replace ``code_legal_form`` by a two-valued ``code_legal_form_group``.

    Rows whose ``code_legal_form`` is in the hard-coded list below get the
    string "1", all other rows (including missing codes) get "2". The original
    ``code_legal_form`` column is dropped.

    The input frame is left untouched; the previous version added the new
    column to the caller's frame before returning the reduced copy.
    """
    code_legal_form_group = [1,4,6,7,8,9,15,17,18]
    group_labels = np.where(df['code_legal_form'].isin(code_legal_form_group), "1", "2")
    return df.drop(columns='code_legal_form').assign(code_legal_form_group=group_labels)

#### Dropping old columns

In [26]:
def drop_old_columns(df, col_list):
    """Return a new frame without the columns named in ``col_list``."""
    return df.drop(columns=col_list)

#### Aggregating dataframe into one year

In [27]:
%%time
def aggregate_full_year(df):
    """Run all feature-engineering steps on one year of monthly rows.

    The steps run strictly in the order listed below; each takes the frame
    returned by the previous one. The order matters: the SBI groups must
    exist before the delta step (which reads ``code_SBI_2_group``), and the
    delta step must run before ``code_legal_form`` is replaced by its group.
    A start and a completion message are printed around every step.

    Returns the frame after the final column clean-up. Rows are not collapsed
    here; that is done separately by ``deduplicate_rows``.
    """
    # (message before, message after, step) -- lambdas keep the helper lookup
    # at call time, exactly as with direct calls.
    pipeline = [
        ('Creating SBI groups ',
         'Done creating SBI groups',
         lambda frame: create_sbi_groups(frame)),

        ('Calculating delta of variables ',
         'Done calculating delta of variables ',
         lambda frame: calculate_delta_of_column(frame, col_list=[
             'qty_employees', 'qty_issued_credit_reports',
             'score_payment_assessment',
             'code_legal_form', 'code_SBI_2_group'])),

        ('Creating code legal form groups ',
         'Done creating code legal form groups',
         lambda frame: create_code_legal_form_groups(frame)),

        ('Calculating ages of variables ',
         'Done calculating ages of variables ',
         lambda frame: calculate_age_based_on_date(frame, [
             'date_established', 'year_consolidated_operating_result',
             'year_consolidated_revenue',
             'year_operating_result', 'year_qty_employees', 'year_revenue'])),

        ('Calculating ratio of columns',
         'Done calculating ratio of columns',
         lambda frame: calculate_ratio_of_column(frame, col_list=[
             'amt_operating_result',
             'amt_consolidated_operating_result',
             'amt_revenue',
             'amt_consolidated_revenue'])),

        ('Making dummies into counts',
         'Done making dummies into counts',
         lambda frame: column_dummies_into_counts(frame, col_list=[
             'color_credit_status', 'rat_pd', 'code_legal_form_group',
             'code_SBI_2_group'])),

        ('Calculating if any true ',
         'Done calculating if any true ',
         lambda frame: calculate_if_any_true(frame, col_list=[
             'is_discontinued', 'code_financial_calamity'])),

        ('Calculating mean of columns ',
         'Done calculating mean of columns ',
         lambda frame: calculate_mean_of_column(frame, col_list=[
             'amt_consolidated_operating_result',
             'amt_consolidated_revenue',
             'amt_operating_result', 'amt_revenue',
             'qty_employees', 'qty_issued_credit_reports',
             'score_payment_assessment',
             'score_pd'])),

        ('Calculating sum of columns',
         'Done calculating sum of columns',
         lambda frame: calculate_sum_of_column(frame, col_list=[
             'qty_address_mutations_month',
             'qty_started_names',
             'qty_stopped_names',
             'total_changeof_board_members_'])),

        ('Calculating variance of columns',
         'Done calculatinh variance of columns',
         lambda frame: calculate_variance_of_column(frame, col_list=[
             'qty_employees',
             'qty_issued_credit_reports',
             'score_payment_assessment',
             'score_pd'])),

        ('Dropping old columns',
         'Done dropping old columns',
         lambda frame: drop_old_columns(frame, col_list=[
             'date_established', 'year_consolidated_operating_result',
             'year_consolidated_revenue',
             'year_operating_result', 'year_qty_employees', 'year_revenue',
             'is_discontinued', 'code_financial_calamity',
             'amt_consolidated_operating_result', 'amt_consolidated_revenue',
             'amt_operating_result', 'amt_revenue', 'qty_employees',
             'qty_issued_credit_reports', 'score_payment_assessment',
             'score_pd', 'color_credit_status', 'rat_pd',
             'qty_address_mutations_month', 'qty_started_names',
             'qty_stopped_names', 'total_changeof_board_members_', 'is_sole_proprietor',
             'code_discontinuation', 'date_financial_calamity_started',
             'date_financial_calamity_stopped', 'id_company_creditproxy',
             'financial_calamity_outcome', 'has_increased_risk',
             'perc_credit_limit_adjustment',
             'date_start', 'from_date_start', 'qty_address_mutations_total',
             'code_legal_form_group', 'code_SBI_2_group'])),
    ]

    for start_message, done_message, run_step in pipeline:
        print(start_message)
        df = run_step(df)
        print(done_message)

    print('Done aggreating dataframe')
    return df

CPU times: user 9 µs, sys: 0 ns, total: 9 µs
Wall time: 52.5 µs


In [28]:
%%time
# NOTE: this rebinds one_year_df to the aggregated frame, from which input
# columns such as code_sbi_2 have been dropped -- re-running this cell
# requires re-running the load cell first.
one_year_df = aggregate_full_year(one_year_df)

Creating SBI groups 
Done creating SBI groups
Calculating delta of variables 
Done calculating delta of variables 
Creating code legal form groups 
Done creating code legal form groups
Calculating ages of variables 
Done calculating ages of variables 
Calculating ratio of columns


Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return self._getitem_tuple(key)
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':


Done calculating ratio of columns
Making dummies into counts
Done making dummies into counts
Calculating if any true 
Done calculating if any true 
Calculating mean of columns 
Done calculating mean of columns 
Calculating sum of columns
Done calculating sum of columns
Calculating variance of columns
Done variance sum of columns
Dropping old columns
Done dropping old columns
Done aggreating dataframe
CPU times: user 30min 27s, sys: 16min 27s, total: 46min 54s
Wall time: 46min 50s


#### Deduplicating rows of original dataframe

In [29]:
def deduplicate_rows(df):
    """Collapse ``df`` to one row per (id_branch, id_company).

    Uses the group-by ``first()`` aggregation, which (per the pandas
    documentation) takes the first non-null entry of every column within a
    group, so the values of a result row may come from different input rows.
    A leftover ``index`` column, if present, is removed.

    The previous version ignored its argument and always read the global
    ``one_year_df``; it also raised when no ``index`` column existed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``id_branch`` and ``id_company`` columns.

    Returns
    -------
    pandas.DataFrame
        One row per entity, with the two keys restored as ordinary columns.
    """
    deduplicated = df.groupby(['id_branch', 'id_company']).first().reset_index()
    return deduplicated.drop(columns='index', errors='ignore')

In [30]:
# Collapse the monthly rows to a single row per (id_branch, id_company).
one_year_df = deduplicate_rows(one_year_df)

#### Appending has_relocated column from next year 

In [38]:
# Add has_relocated_next_year, derived from the has_relocated column of the
# 2018 file found under the same bucket prefix.
one_year_df = replace_has_relocated_with_nextyear(df= one_year_df, next_year='2018',
                                   dir_prefix= 'including_scores/merged_per_year/merged_cleaned/relocation_dates')

Starting withGra year:  2018
including_scores/merged_per_year/merged_cleaned/relocation_dates
The number of rows so far is:  0
The number of rows so far is:  0
The number of rows so far is:  0
The number of rows so far is:  0
The number of rows so far is:  0
The number of rows so far is:  0
The number of rows so far is:  0
The number of rows so far is:  0
The number of rows so far is:  0
The number of rows so far is:  0
The number of rows so far is:  0
Processing file:  including_scores/merged_per_year/merged_cleaned/relocation_dates/2018_merged.csv
The number of rows so far is:  19311866


#### Previewing data

In [39]:
# Render the first 100 rows of the aggregated frame as an HTML table.
HTML(DataFrame(one_year_df).head(100).to_html())

Unnamed: 0,id_branch,id_company,date_month,code_sbi_1,has_relocated,has_name_change,date_relocation_last,date_relocation_penultimate,delta_qty_employees,delta_qty_issued_credit_reports,delta_score_payment_assessment,code_legal_form_has_changed,SBI_has_changed,company_age,years_since_last_amt_consolidated_operating_result,years_since_last_amt_consolidated_revenue,years_since_last_amt_operating_result,years_since_last_qty_employees,years_since_last_amt_revenue,ratio_operating_result_consolidated_operating_result,ratio_revenue_consolidated_revenue,unique_id,qty_green_flags,qty_orange_flags,qty_red_flags,A,AA,AAA,B,BB,BBB,C,CC,CCC,D,NR,code_legal_form_group_1,code_legal_form_group_2,SBI_group_1,SBI_group_2,is_discontinued_any,has_financial_calamity,mean_amt_consolidated_operating_result,mean_amt_consolidated_revenue,mean_amt_operating_result,mean_amt_revenue,mean_qty_employees,mean_qty_issued_credit_reports,mean_score_payment_assessment,mean_score_pd,qty_address_mutations_year,qty_started_names_year,qty_stopped_names_year,qty_board_changes_year,variance_qty_employees,variance_qty_issued_credit_reports,variance_score_payment_assessment,variance_score_pd,has_relocated_next_year
0,0,428134.0,2017-01-01,,False,False,,,0.0,0.0,0.0,False,False,37.0,,,,,,,,0_428134.0,0,0,12,0,0,0,12,0,0,0,0,0,0,0,0,12,0,12,False,False,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False
1,0,803351.0,2017-01-01,,False,False,,,0.0,0.0,0.0,False,False,,,,,,,,,0_803351.0,0,0,12,0,0,0,0,0,0,0,0,0,0,12,0,12,0,12,False,False,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False
2,0,810695.0,2017-01-01,,False,False,,,0.0,0.0,0.0,False,False,,,,,,,,,0_810695.0,0,0,12,0,0,0,0,0,0,0,0,0,0,12,0,12,0,12,False,False,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False
3,0,811258.0,2017-01-01,,False,False,,,0.0,0.0,0.0,False,False,,,,,,,,,0_811258.0,0,0,12,0,0,0,0,0,0,0,0,0,0,12,0,12,0,12,False,False,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False
4,0,811333.0,2017-01-01,,False,False,,,0.0,0.0,0.0,False,False,,,,,,,,,0_811333.0,0,0,12,0,0,0,0,0,0,0,0,0,0,12,0,12,0,12,False,False,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False
5,0,818258.0,2017-01-01,,False,False,,,0.0,0.0,0.0,False,False,,,,,,,,,0_818258.0,0,0,12,0,0,0,0,0,0,0,0,0,0,12,0,12,0,12,False,False,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False
6,0,823722.0,2017-01-01,,False,False,,,0.0,0.0,0.0,False,False,,,,,,,,,0_823722.0,0,0,12,0,0,0,0,0,0,0,0,0,0,12,0,12,0,12,False,False,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False
7,0,829630.0,2017-01-01,,False,False,,,0.0,0.0,0.0,False,False,,,,,,,,,0_829630.0,0,0,12,0,0,0,0,0,0,0,0,0,0,12,0,12,0,12,False,False,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False
8,0,834432.0,2017-01-01,,False,False,,,0.0,0.0,0.0,False,False,,,,,,,,,0_834432.0,0,0,12,0,0,0,0,0,0,0,0,0,0,12,0,12,0,12,False,False,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False
9,0,834909.0,2017-01-01,,False,False,,,0.0,0.0,0.0,False,False,,,,,,,,,0_834909.0,0,0,12,0,0,0,0,0,0,0,0,0,0,12,0,12,0,12,False,False,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False


In [35]:
# Display the dtype of every column in one_year_df.
one_year_df.dtypes

id_branch                                                        int64
id_company                                                     float64
date_month                                              datetime64[ns]
code_sbi_1                                                      object
has_relocated                                                     bool
has_name_change                                                   bool
date_relocation_last                                            object
date_relocation_penultimate                                     object
delta_qty_employees                                            float64
delta_qty_issued_credit_reports                                float64
delta_score_payment_assessment                                 float64
code_legal_form_has_changed                                       bool
SBI_has_changed                                                   bool
company_age                                                    float64
years_

#### Cleaning after aggregation

In [31]:
def clean_after_aggregations(df):
    """Fill missing values left by the aggregation step and neutralise infinities.

    Steps, in order:
      1. Missing values in the four boolean flag columns become ``False``.
      2. Missing values in the count-like columns become ``0``.
      3. ``+inf`` / ``-inf`` anywhere in the frame become ``NaN``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the four flag columns and every column listed in
        ``columns_to_zero``; a missing one raises ``KeyError``.

    Returns
    -------
    pandas.DataFrame
        The frame produced by the final ``replace``. Steps 1 and 2 are assigned
        into the ``df`` that was passed in, so the caller's frame is modified too.
    """
    # Use ONE list for both sides of the assignment. `df[keys] = other_frame`
    # pairs columns by position, not by label, so two differently ordered lists
    # would write 'code_legal_form_has_changed' into 'SBI_has_changed' and
    # vice versa.
    flag_columns = ['has_financial_calamity', 'is_discontinued_any',
                    'SBI_has_changed', 'code_legal_form_has_changed']
    df[flag_columns] = df[flag_columns].fillna(value=False)

    columns_to_zero = ['mean_qty_issued_credit_reports', 'qty_green_flags', 'qty_orange_flags',
                       'qty_red_flags', 'AAA', 'AA', 'A', 'BBB', 'B', 'CCC', 'CC', 'C', 'D',
                       'NR', 'qty_address_mutations_year', 'qty_started_names_year',
                       'qty_stopped_names_year', 'qty_board_changes_year', 'code_legal_form_group_1',
                       'code_legal_form_group_2', 'SBI_group_1', 'SBI_group_2']
    # 'BB' sits alongside the other count columns in the previewed frame but was
    # absent from this list. Only add it when present so frames without the
    # column keep working.
    if 'BB' in df.columns:
        columns_to_zero.append('BB')

    df[columns_to_zero] = df[columns_to_zero].fillna(value=0)

    # Infinities are turned into NaN rather than a number.
    return df.replace([np.inf, -np.inf], np.nan)

In [32]:
# Apply the post-aggregation cleanup and keep the returned frame.
one_year_df = clean_after_aggregations(df=one_year_df)

In [40]:
# Remove the code_sbi_1 column; `columns=` already implies the column axis.
one_year_df = one_year_df.drop(columns=['code_sbi_1'])

#### Saving DF Locally

In [41]:
def save_df_locally(df, dir_prefix, year, as_json= False):
    """Write ``df`` to ``<dir_prefix>/<year>_merged_cleaned.{csv,json}``.

    Parameters
    ----------
    df : DataFrame to serialise.
    dir_prefix : str, directory part of the target path (joined with '/').
    year : str, prefix of the file name.
    as_json : when truthy write JSON via ``to_json``; otherwise CSV via ``to_csv``.
    """
    # Both formats share the same path stem; only the extension differs.
    path_stem = dir_prefix + '/' + year + '_merged_cleaned'
    if not as_json:
        df.to_csv(path_stem + '.csv')
        return
    df.to_json(path_stem + '.json')

In [43]:
# Write the 2017 frame as CSV to files_to_bucket/2017_merged_cleaned.csv.
save_df_locally(df=one_year_df, dir_prefix='files_to_bucket', year='2017', as_json=False)

In [44]:
# Display (rows, columns) of the frame that was just saved.
one_year_df.shape

(1966208, 58)