In [1]:
# Ignore 'dask' warning
import pandas as pd
import gcsfs
from google.cloud import storage
from pandas import DataFrame
from IPython.display import HTML
from google.cloud.storage import Blob
import datalab.storage as gcs_datalab
from datetime import date
import numpy as np

In [2]:
# Notebook-wide constants (both required): the GCP project id and the name of
# the Cloud Storage bucket that the bucket-initialisation cell opens.
project = 'graydon-moving-indicator'
bucket_name = 'graydon-data'

In [3]:
# Initializing bucket access.
# `fs` is the gcsfs file system used to open blobs as file objects; `bucket`
# is the google-cloud-storage handle used to list blobs. The project id comes
# from the `project` constant so it is configured in exactly one place.
fs = gcsfs.GCSFileSystem(project=project)
gcs = storage.Client()
bucket = gcs.get_bucket(bucket_name)

In [4]:
def create_dict_types_aggregated_data():
    """Return the column-name -> dtype mapping for the aggregated dataset.

    The mapping is meant to be passed as ``dtype=`` to ``pd.read_csv`` when an
    aggregated yearly file is loaded.

    Keys must match the CSV header exactly. Four keys previously carried a
    trailing space, so they could not match the intended columns and their
    dtype was never applied; the stray spaces have been removed.

    NOTE(review): ``bool`` columns cannot hold missing values when parsed by
    ``read_csv``. If ``has_relocated_next_year`` can be empty in a stored file
    (e.g. after a left merge), confirm that ``bool`` is still the right dtype.

    Returns:
        dict: column name -> numpy / Python type.
    """
    dtype = {
        'id_company': np.float64,
        'id_branch': np.int64,
        'code_sbi_2': np.float64,
        'has_relocated': bool,
        'has_relocated_next_year': bool,
        'has_name_change': bool,
        'qty_address_mutations_total': np.float64,
        'ratio_operating_result_consolidated_operating_result': np.float64,
        'ratio_revenue_consolidated_revenue': np.float64,
        'qty_green_flags': np.float64,
        'qty_orange_flags': np.float64,
        'qty_red_flags': np.float64,
        'A': np.float64,
        'AA': np.float64,
        'AAA': np.float64,
        'B': np.float64,
        'BB': np.float64,
        'BBB': np.float64,
        'C': np.float64,
        'CC': np.float64,
        'CCC': np.float64,
        'D': np.float64,
        'NR': np.float64,
        'code_legal_form_group_1': np.int64,
        'code_legal_form_group_2': np.int64,
        'SBI_group_1': np.int64,
        'SBI_group_2': np.int64,
        'company_age': np.float64,
        'years_since_last_amt_consolidated_operating_result': np.float64,
        'years_since_last_amt_consolidated_revenue': np.float64,
        'years_since_last_amt_operating_result': np.float64,
        'years_since_last_qty_employees': np.float64,
        'years_since_last_amt_revenue': np.float64,
        'delta_qty_employees': np.float64,
        'delta_qty_issued_credit_reports': np.float64,
        'delta_score_payment_assessment': np.float64,
        'SBI_has_changed': bool,
        'unique_id': object,
        'code_legal_form_has_changed': bool,
        'is_discontinued_any': bool,
        'has_financial_calamity': bool,
        'mean_amt_consolidated_operating_result': np.float64,
        'mean_amt_consolidated_revenue': np.float64,
        'mean_amt_operating_result': np.float64,
        'mean_amt_revenue': np.float64,
        'mean_qty_employees': np.float64,
        'mean_qty_issued_credit_reports': np.float64,
        'mean_score_payment_assessment': np.float64,
        'mean_score_pd': np.float64,
        'qty_address_mutations_year': np.float64,
        'qty_started_names_year': np.float64,
        'qty_stopped_names_year': np.float64,
        'qty_board_changes_year': np.float64,
        'variance_qty_employees': np.float64,
        'variance_qty_issued_credit_reports': np.float64,
        'variance_score_payment_assessment': np.float64,
        'variance_score_pd': np.float64,
    }
    return dtype

def create_parse_dates_list_aggregated_data():
    """Return the list of aggregated-dataset column names to parse as dates.

    Intended for the ``parse_dates=`` argument of ``pd.read_csv``.
    """
    return ['date_month', 'years_in_current_location']

In [9]:
def calculate_if_any_true(df, col_list):
    """Add per-(id_branch, id_company) "any value truthy" flag columns to ``df``.

    For every recognised name in ``col_list`` the source column is reduced with
    ``any()`` within each (``id_branch``, ``id_company``) group and the result
    is merged back onto ``df`` under a new column name:

        is_discontinued          -> is_discontinued_any
        code_financial_calamity  -> has_financial_calamity
        has_relocated            -> has_relocated_next_year

    Names that are not listed above are ignored.

    Args:
        df: DataFrame holding ``id_branch``, ``id_company`` and the requested
            source columns.
        col_list: iterable of source column names to process, in order.

    Returns:
        DataFrame with one extra flag column per recognised name.
    """
    output_name_by_source = {
        'is_discontinued': 'is_discontinued_any',
        'code_financial_calamity': 'has_financial_calamity',
        'has_relocated': 'has_relocated_next_year',
    }
    group_keys = ['id_branch', 'id_company']

    for source_col in col_list:
        flag_col = output_name_by_source.get(source_col)
        if flag_col is None:
            continue
        # True for a group as soon as one of its rows is truthy.
        any_per_group = df.groupby(group_keys)[source_col].any()
        lookup = any_per_group.rename(flag_col).to_frame().reset_index()
        # No explicit `on`: the merge joins on the column names both frames share.
        df = df.merge(lookup)
    return df

#### Get the `has_relocated` target from next year's DataFrame

In [5]:
def replace_has_relocated_with_nextyear(df, next_year, dir_prefix = ''):
    """Left-merge next year's relocation flag onto ``df``.

    Every blob under ``dir_prefix`` whose name contains ``str(next_year)`` is
    read from the bucket (only ``id_company``, ``id_branch`` and
    ``has_relocated``). The rows of all matching blobs are combined, reduced to
    one ``has_relocated_next_year`` flag per (``id_branch``, ``id_company``)
    via ``calculate_if_any_true``, and merged onto ``df``.

    Args:
        df: DataFrame with ``id_branch`` and ``id_company`` columns.
        next_year: year whose files supply the flag; matched as a substring of
            the blob name.
        dir_prefix: blob-name prefix used when listing the bucket.

    Returns:
        ``df`` with a ``has_relocated_next_year`` column added. Rows without a
        match in the next-year data get a missing value (left join).

    Raises:
        ValueError: if no blob under ``dir_prefix`` matches ``next_year``.
    """
    dtype = {
        'id_branch': np.int64,
        'id_company': np.int64,
        'has_relocated': bool,
    }
    cols = ['id_company', 'id_branch', 'has_relocated']
    print('Starting withGra year: ', next_year)
    print(dir_prefix)

    year_token = str(next_year)
    frames = []
    rows_so_far = 0
    for blob in bucket.list_blobs(prefix=dir_prefix):
        if year_token in blob.name:
            print('Processing file: ', blob.name)
            # Open through the same bucket that was listed, not a hard-coded name.
            with fs.open(bucket.name + '/' + blob.name) as f:
                frames.append(pd.read_csv(f, sep=',', dtype=dtype, usecols=cols))
            rows_so_far += frames[-1].shape[0]
        print('The number of rows so far is: ', rows_so_far)

    if not frames:
        raise ValueError(
            "No blob under prefix %r contains %r in its name; cannot derive "
            "has_relocated_next_year." % (dir_prefix, year_token)
        )

    # Accumulate all matching files; previously each file replaced the last one.
    full_next_year_df = pd.concat(frames, ignore_index=True)
    full_next_year_df = calculate_if_any_true(full_next_year_df, col_list = ['has_relocated'])
    full_next_year_df = (
        full_next_year_df
        .drop(columns='has_relocated')
        .drop_duplicates()
        .reset_index(drop=True)
    )
    # `suffixes` must be a 2-tuple: newer pandas rejects a plain string. The
    # tuple ('_', 'C') is exactly what older pandas unpacked from '_C', so
    # overlapping column names keep the labels they had before.
    df = df.merge(full_next_year_df, on=['id_branch', 'id_company'], how='left',
                  suffixes=('_', 'C'))
    return df

#### Saving DF locally

In [6]:
def save_df_locally(df, dir_prefix, year, as_json= False):
    """Save ``df`` as ``<dir_prefix>/<year>_aggregated.csv`` (or ``.json``).

    Args:
        df: DataFrame to write.
        dir_prefix: target directory; created if it does not exist yet.
        year: year used in the file name; may be a string or an int.
        as_json: write JSON via ``DataFrame.to_json`` instead of CSV.
    """
    import os

    # An empty prefix is left alone: os.makedirs('') would raise.
    if dir_prefix:
        os.makedirs(dir_prefix, exist_ok=True)

    extension = 'json' if as_json else 'csv'
    # str(year) lets callers pass an int year without a TypeError.
    file_path = dir_prefix + '/' + str(year) + '_aggregated.' + extension
    if as_json:
        df.to_json(file_path)
    else:
        df.to_csv(file_path)

#### Aggregating dataframe into one year. Main function that calls them all

In [7]:
# Aggregating dataframe into one year. Main function that calls them all
def aggregate_full_year(year, dir_prefix = '', save_df_locally_flag = False, df = None):
    """Add next year's relocation target to a year's aggregated DataFrame.

    Args:
        year: year of the aggregated data (string or int); the target is taken
            from ``int(year) + 1``.
        dir_prefix: accepted for backward compatibility; not read by this
            function. The next-year files are always listed under
            'including_scores/merged_per_year/merged_cleaned/relocation_dates'.
        save_df_locally_flag: when true, also write the result through
            ``save_df_locally`` into 'files_to_bucket/aggregated'.
        df: aggregated DataFrame to extend. When omitted, the notebook-level
            ``df`` variable is used; prefer passing it explicitly.

    Returns:
        The DataFrame returned by ``replace_has_relocated_with_nextyear``.

    Raises:
        NameError: if ``df`` is not passed and no notebook-level ``df`` exists.
    """
    if df is None:
        # Previously `df` was read before being assigned, so every call failed
        # with UnboundLocalError. Fall back to the module-level frame so that
        # existing calls without `df=` keep working.
        module_namespace = globals()
        if 'df' not in module_namespace:
            raise NameError(
                "aggregate_full_year needs a DataFrame: pass df=... or define "
                "a notebook-level `df` first."
            )
        df = module_namespace['df']

    next_year = int(year) + 1
    print('Getting target of next year and adding it as a column') 
    df = replace_has_relocated_with_nextyear(df= df, next_year= next_year,
                                   dir_prefix= 'including_scores/merged_per_year/merged_cleaned/relocation_dates')
    print('Done getting target of next year and adding it as a column' )
    if save_df_locally_flag:
        print('Saving DF local to VM into files_to_bucket folder')
        # str(year): the file name is built by string concatenation.
        save_df_locally(df= df, dir_prefix= 'files_to_bucket/aggregated', year = str(year))
    return df

In [11]:
# Load the locally stored 2017 aggregate. Column dtypes and the columns to
# parse as dates come from the two helper functions; the first CSV column is
# used as the DataFrame index.
file_name = 'files_to_bucket/2017_aggregated_old.csv'
dtype = create_dict_types_aggregated_data()
parse_dates = create_parse_dates_list_aggregated_data()
df = pd.read_csv(file_name, sep=',', index_col=0, dtype=dtype, parse_dates=parse_dates)

  interactivity=interactivity, compiler=compiler, result=result)


In [12]:
df.shape

(1966208, 58)

In [14]:
next_year = 2018

In [17]:
# Merge the `next_year` relocation flag (read from blobs under '02_cleaned')
# onto df. This rebinds `df`, so re-running the cell merges onto the already
# merged frame.
df = replace_has_relocated_with_nextyear(df= df, next_year= next_year,
                               dir_prefix= '02_cleaned')

Starting withGra year:  2018
02_cleaned
The number of rows so far is:  0
The number of rows so far is:  0
The number of rows so far is:  0
The number of rows so far is:  0
The number of rows so far is:  0
Processing file:  02_cleaned/2018_merged.csv
The number of rows so far is:  23224251


In [20]:
# Write the merged frame to files_to_bucket/aggregated/2017_aggregated.csv
# (CSV because as_json is left at its default).
save_df_locally(df= df, dir_prefix= 'files_to_bucket/aggregated', year = '2017')

# All at once

In [18]:
%%time
# Run the full pipeline for 2017 and save the result locally.
# NOTE(review): dir_prefix is '02_clean' here but '02_cleaned' in the manual
# call earlier in the notebook — confirm which one is intended.
one_year_df = aggregate_full_year(year = '2017', dir_prefix= '02_clean',
                                  save_df_locally_flag= True)

KeyboardInterrupt: 

#### Previewing data

In [None]:
# Render the first 100 rows of the result as an HTML table.
HTML(DataFrame(one_year_df).head(100).to_html())

In [None]:
len(one_year_df.columns)

In [None]:
### one_year_df.shape