#### Reading Graydon CSV data from Google Cloud Storage buckets
#### Merging monthly CSV files into yearly files
#### Uploading yearly files to Google Cloud Storage

In [1]:
# Ignore 'dask' warning
import pandas as pd
import gcsfs
from google.cloud import storage
from pandas import DataFrame
from IPython.display import HTML
from google.cloud.storage import Blob
import datalab.storage as gcs_datalab

In [2]:
# Required settings: the GCP project id and the name of the bucket to read from / write to.
project = "graydon-moving-indicator"
bucket_name = "graydon-data"

In [3]:
# Initializing the bucket handles used by the reader functions below.
# `fs` opens blobs as file-like objects; `bucket` is used to list blobs by prefix.
# The project id comes from the `project` constant instead of a second hard-coded copy.
fs = gcsfs.GCSFileSystem(project=project)
gcs = storage.Client()
bucket = gcs.get_bucket(bucket_name)

In [4]:
# Columns passed as `usecols` when reading the monthly CSV files.
# NOTE(review): a stray 'Vice President' entry was dropped here — it does not follow
# the snake_case column naming and is absent from selected_columns_small; confirm
# against the CSV header.
selected_columns = [
    'date_month',
    'id_company',
    'id_branch',
    'is_discontinued',
    'financial_calamity_outcome',
    'qty_employees',
    'year_qty_employees',
    'id_company_creditproxy',
    'score_payment_assessment',
    'amt_revenue',
    'year_revenue',
    'amt_consolidated_revenue',
    'year_consolidated_revenue',
    'amt_consolidated_operating_result',
    'year_consolidated_operating_result',
    'perc_credit_limit_adjustment',
    'color_credit_status',
    'rat_pd',
    'score_pd',
    'has_increased_risk',
    'is_sole_proprietor',
    'code_SBI_2',
    'code_SBI_1',
    'qty_address_mutations_total',
    'qty_address_mutations_month',
    'has_relocated',
    'has_name_change',
]

In [5]:
# Alternative column selection; unlike selected_columns it includes 'date_established'
# and omits the consolidated operating result columns.
selected_columns_small = [
    'date_month',
    'id_company',
    'id_branch',
    'date_established',
    'is_discontinued',
    'financial_calamity_outcome',
    'qty_employees',
    'year_qty_employees',
    'id_company_creditproxy',
    'score_payment_assessment',
    'amt_revenue',
    'year_revenue',
    'amt_consolidated_revenue',
    'year_consolidated_revenue',
    'perc_credit_limit_adjustment',
    'color_credit_status',
    'rat_pd',
    'score_pd',
    'has_increased_risk',
    'is_sole_proprietor',
    'code_SBI_2',
    'code_SBI_1',
    'qty_address_mutations_total',
    'qty_address_mutations_month',
    'has_relocated',
    'has_name_change',
]

In [6]:
#%%time
def read_all_csv_months_yearly_from_bucket(years_to_read_in_list, dir_prefix = '', selected_columns = ''):
    """Read the monthly CSV files of each requested year from the bucket.

    Relies on the module-level `bucket` (blob listing), `fs` (file access)
    and `bucket_name` (path prefix for `fs.open`).

    Args:
        years_to_read_in_list: iterable of years as strings, e.g. ['2017', '2018'].
            Each year is appended to `dir_prefix` to build the blob prefix.
        dir_prefix: bucket directory that holds one sub-directory per year.
        selected_columns: column names handed to `pd.read_csv(usecols=...)`.
            An empty string (the default) or None reads all columns.

    Returns:
        dict mapping each year to a list of DataFrames, one per blob whose
        name contains 'CSV' (empty list when the year has no such blob).
    """
    # pandas rejects usecols='' — translate "nothing selected" into None.
    if selected_columns is None or (isinstance(selected_columns, str) and selected_columns == ''):
        usecols = None
    else:
        usecols = selected_columns

    all_years_dict = {}
    for year in years_to_read_in_list:
        print('Starting with year: ', year)
        # Build the prefix per year in a separate variable: reassigning
        # dir_prefix would make every following year nest under the previous one.
        year_prefix = dir_prefix + '/' + year
        one_year_csvs = []
        for blob in bucket.list_blobs(prefix=year_prefix):
            if 'CSV' not in blob.name:
                continue
            print('Processing file: ', blob.name)
            with fs.open(bucket_name + '/' + blob.name) as f:
                one_year_csvs.append(pd.read_csv(f, sep=';', usecols=usecols))
        all_years_dict[year] = one_year_csvs
    return all_years_dict

In [7]:
#%%time
def read_all_csv_months_yearly_from_bucket_merged(years_to_read_in_list, dir_prefix = '', selected_columns = ''):
    """Read the monthly CSV files of the requested years and merge them into one DataFrame.

    Rows with `is_sole_proprietor != 0` are dropped from every monthly file and
    column names are normalised (stripped, lower-cased, spaces replaced by
    underscores, parentheses removed) before merging.

    Relies on the module-level `bucket` (blob listing), `fs` (file access)
    and `bucket_name` (path prefix for `fs.open`).

    Args:
        years_to_read_in_list: iterable of years as strings, e.g. ['2018'].
            Each year is appended to `dir_prefix` to build the blob prefix.
        dir_prefix: bucket directory that holds one sub-directory per year.
        selected_columns: column names handed to `pd.read_csv(usecols=...)`.
            An empty string (the default) or None reads all columns. When given,
            it must contain 'is_sole_proprietor' because of the row filter.

    Returns:
        A DataFrame with the filtered rows of all months in listing order;
        an empty DataFrame when no blob containing 'CSV' was found.
    """
    # pandas rejects usecols='' — translate "nothing selected" into None.
    if selected_columns is None or (isinstance(selected_columns, str) and selected_columns == ''):
        usecols = None
    else:
        usecols = selected_columns

    # Collect the monthly frames and concatenate once at the end; appending to a
    # growing DataFrame copies all previous rows on every iteration.
    monthly_frames = []
    rows_so_far = 0
    for year in years_to_read_in_list:
        print('Starting with year: ', year)
        # Separate variable so that the prefix does not accumulate across years.
        year_prefix = dir_prefix + '/' + year
        for blob in bucket.list_blobs(prefix=year_prefix):
            if 'CSV' in blob.name:
                print('Processing file: ', blob.name)
                with fs.open(bucket_name + '/' + blob.name) as f:
                    one_month_df = pd.read_csv(f, sep=';', usecols=usecols)
                one_month_df = one_month_df[one_month_df['is_sole_proprietor'] == 0]
                # regex=False: '(' and ')' are meant literally.
                one_month_df.columns = (
                    one_month_df.columns.str.strip()
                    .str.lower()
                    .str.replace(' ', '_', regex=False)
                    .str.replace('(', '', regex=False)
                    .str.replace(')', '', regex=False)
                )
                monthly_frames.append(one_month_df)
                rows_so_far += one_month_df.shape[0]
            print('The number of rows so far is: ', rows_so_far)

    if not monthly_frames:
        # pd.concat raises on an empty list; keep returning an empty frame instead.
        return pd.DataFrame()
    return pd.concat(monthly_frames)

In [8]:
def read_one_month_csv_from_bucket(year, month, last_day_of_month, dir_prefix = '', selected_columns= ''):
    """Read one monthly CSV file from the bucket into a DataFrame.

    The file is picked by looking for '<month>-<last_day_of_month>' in the names
    of the blobs listed under '<dir_prefix>/<year>'. If several blobs match, the
    last one listed wins. Column names of the result are normalised (stripped,
    lower-cased, spaces replaced by underscores, parentheses removed).

    Relies on the module-level `bucket` (blob listing), `fs` (file access)
    and `bucket_name` (path prefix for `fs.open`).

    Args:
        year: year as a string, e.g. '2017'.
        month: two-character month as a string, e.g. '01'.
        last_day_of_month: last day of that month as a string, e.g. '31'.
        dir_prefix: bucket directory that holds one sub-directory per year.
        selected_columns: column names handed to `pd.read_csv(usecols=...)`.
            An empty string (the default) or None reads all columns.

    Returns:
        The DataFrame read from the matching file, or an empty DataFrame when
        no blob name matches.
    """
    # `selected_columns == '' or None` never tested for None; make both cases explicit.
    if selected_columns is None or (isinstance(selected_columns, str) and selected_columns == ''):
        usecols = None
    else:
        usecols = selected_columns

    one_month_df = pd.DataFrame()
    year_prefix = dir_prefix + '/' + year
    print(year_prefix)
    file_marker = month + '-' + last_day_of_month
    for blob in bucket.list_blobs(prefix=year_prefix):
        if file_marker in blob.name:
            print('Processing file: ', blob.name)
            with fs.open(bucket_name + '/' + blob.name) as f:
                one_month_df = pd.read_csv(f, sep=';', usecols=usecols)

    if len(one_month_df.columns) == 0:
        # Nothing was read: skip the renaming, the .str accessor is not
        # available on every kind of empty column index.
        return one_month_df

    # regex=False: '(' and ')' are meant literally.
    one_month_df.columns = (
        one_month_df.columns.str.strip()
        .str.lower()
        .str.replace(' ', '_', regex=False)
        .str.replace('(', '', regex=False)
        .str.replace(')', '', regex=False)
    )
    return one_month_df

In [None]:
def upload_df_to_gc_bucket(df, dir_prefix, year, as_json= False):
    """Serialise a pandas DataFrame and write it to the bucket named by `bucket_name`.

    The object is stored as '<dir_prefix>/<year>_merged.json' when `as_json`
    is true, otherwise as '<dir_prefix>/<year>_merged.csv'.
    """
    # Serialise first, then pick the matching file suffix and content type.
    if as_json:
        payload, suffix, content_type = df.to_json(), '_merged.json', 'text/json'
    else:
        payload, suffix, content_type = df.to_csv(), '_merged.csv', 'text/csv'
    target_path = dir_prefix + '/' + year + suffix
    gcs_datalab.Bucket(bucket_name).item(target_path).write_to(payload, content_type)

In [None]:
# Show the (rows, columns) shape of the one-month frame.
# NOTE(review): jan_2017_df is only assigned in the "Read one month" cell further
# down — that cell has to be run before this one.
jan_2017_df.shape

#### Read one month

In [None]:
# January 2017, restricted to the columns listed in selected_columns.
jan_2017_df = read_one_month_csv_from_bucket(
    year='2017',
    month='01',
    last_day_of_month='31',
    dir_prefix='including_scores/unzipped',
    selected_columns=selected_columns,
)

#### Read one month all columns

In [None]:
# January 2017 again, this time without a column selection.
jan_2017_df_all_columns = read_one_month_csv_from_bucket(
    year='2017',
    month='01',
    last_day_of_month='31',
    dir_prefix='including_scores/unzipped',
)

#### Read one full year

In [None]:
# All months of 2018 merged into a single frame.
one_year_df = read_all_csv_months_yearly_from_bucket_merged(
    years_to_read_in_list=['2018'],
    dir_prefix='including_scores/unzipped',
    selected_columns=selected_columns,
)

Starting with year:  2018
The number of rows so far is:  0
Processing file:  including_scores/unzipped/2018/modelling_2018-01-01_2018-01-31.CSV


  if (yield from self.run_code(code, result)):


The number of rows so far is:  1907886
Processing file:  including_scores/unzipped/2018/modelling_2018-02-01_2018-02-28.CSV
The number of rows so far is:  3821899
Processing file:  including_scores/unzipped/2018/modelling_2018-03-01_2018-03-31.CSV


#### Upload df to bucket

In [None]:
# Writes jan_2017_df as CSV (as_json is left at its default).
upload_df_to_gc_bucket(
    df=jan_2017_df,
    dir_prefix='including_scores/merged_per_year',
    year='2017',
)

#### Preview df

In [None]:
# Render the first 20 rows of the yearly frame as an HTML table.
HTML(
    DataFrame(one_year_df.head(20)).to_html()
)

In [None]:
#upload_df_to_gc_bucket(df= one_year_df, dir_prefix='including_scores/merged_per_year', year= '2018')