#### Reading Graydon CSV data from buckets
#### Merging monthly CSV files into yearly files
#### Uploading yearly files to Google Cloud Storage

In [1]:
# Ignore 'dask' warning
import pandas as pd
import gcsfs
from google.cloud import storage
from pandas import DataFrame
from IPython.display import HTML
from google.cloud.storage import Blob
import datalab.storage as gcs_datalab

In [2]:
# Constants shared by every cell below; both must be set before running the notebook.
# Name of the storage bucket that is listed, read from and uploaded to.
bucket_name = 'graydon-data'
# Cloud project identifier.
project = 'graydon-moving-indicator'

In [3]:
# Initializing bucket access.
# `fs` gives file-like access to objects (used with pd.read_csv); it reuses the
# `project` constant instead of repeating the project id as a literal.
fs = gcsfs.GCSFileSystem(project=project)
# `gcs` / `bucket` are used to list the blobs under a prefix.
gcs = storage.Client()
bucket = gcs.get_bucket(bucket_name)

In [4]:
# Column names handed to pd.read_csv(usecols=...) when reading the monthly files.
# Names are spelled as they appear in the raw CSV header (before normalisation).
selected_columns = [
    'date_month',
    'id_company',
    'id_branch',
    'is_discontinued',
    'financial_calamity_outcome',
    'qty_employees',
    'year_qty_employees',
    'id_company_creditproxy',
    'score_payment_assessment',
    'amt_revenue',
    'year_revenue',
    'amt_consolidated_revenue',
    'year_consolidated_revenue',
    'amt_consolidated_operating_result',
    'year_consolidated_operating_result',
    'perc_credit_limit_adjustment',
    'color_credit_status',
    'rat_pd',
    'score_pd',
    'has_increased_risk',
    'is_sole_proprietor',
    'code_SBI_2',
    'code_SBI_1',
    'qty_address_mutations_total',
    'qty_address_mutations_month',
    'has_relocated',
    'has_name_change',
    'Vice President',
]

In [5]:
# Alternative column selection for pd.read_csv(usecols=...): includes
# 'date_established' and leaves out the consolidated operating result columns.
selected_columns_small = [
    'date_month',
    'id_company',
    'id_branch',
    'date_established',
    'is_discontinued',
    'financial_calamity_outcome',
    'qty_employees',
    'year_qty_employees',
    'id_company_creditproxy',
    'score_payment_assessment',
    'amt_revenue',
    'year_revenue',
    'amt_consolidated_revenue',
    'year_consolidated_revenue',
    'perc_credit_limit_adjustment',
    'color_credit_status',
    'rat_pd',
    'score_pd',
    'has_increased_risk',
    'is_sole_proprietor',
    'code_SBI_2',
    'code_SBI_1',
    'qty_address_mutations_total',
    'qty_address_mutations_month',
    'has_relocated',
    'has_name_change',
]

In [6]:
#%%time
def read_all_csv_months_yearly_from_bucket(years_to_read_in_list, dir_prefix = '', selected_columns = ''):
    """Read the monthly CSV files of each requested year from the bucket.

    Uses the module-level ``bucket`` to list blobs and ``fs`` to open them.

    Args:
        years_to_read_in_list: Iterable of year strings (e.g. ``['2016']``).
            Each year is appended to ``dir_prefix`` as ``dir_prefix + '/' + year``
            to form the blob prefix that is listed.
        dir_prefix: Base prefix inside the bucket under which the per-year
            prefixes live.
        selected_columns: Columns forwarded to ``pd.read_csv(usecols=...)``.
            ``''`` (the default) or ``None`` reads all columns.

    Returns:
        dict mapping each year to a list holding one DataFrame per blob whose
        name contains ``'CSV'``, in listing order.
    """
    # pandas rejects usecols=''; map "nothing selected" to None (= all columns).
    if selected_columns is None or (isinstance(selected_columns, str) and selected_columns == ''):
        usecols = None
    else:
        usecols = selected_columns

    all_years_dict = {}
    for year in years_to_read_in_list:
        print('Starting with year: ', year)
        # Build the prefix from the untouched base prefix on every iteration;
        # reassigning dir_prefix here would stack the years (base/2016/2017).
        year_prefix = dir_prefix + '/' + year
        one_year_csvs = []
        for blob in bucket.list_blobs(prefix=year_prefix):
            if 'CSV' not in blob.name:
                continue
            print('Processing file: ', blob.name)
            # Open the object in the same bucket that was just listed.
            with fs.open(bucket.name + '/' + blob.name) as f:
                one_year_csvs.append(pd.read_csv(f, sep=';', usecols=usecols))
        all_years_dict[year] = one_year_csvs
    return all_years_dict

In [7]:
#%%time
def read_all_csv_months_yearly_from_bucket_merged(years_to_read_in_list, dir_prefix = '', selected_columns = ''):
    """Read the monthly CSV files of the requested years and merge them.

    Uses the module-level ``bucket`` to list blobs and ``fs`` to open them.
    For every blob whose name contains ``'CSV'`` the file is read, rows with
    ``is_sole_proprietor != 0`` are dropped, and the column names are
    normalised (stripped, lower-cased, spaces replaced by ``_``, parentheses
    removed).

    Args:
        years_to_read_in_list: Iterable of year strings (e.g. ``['2016']``).
            Each year is appended to ``dir_prefix`` as ``dir_prefix + '/' + year``.
        dir_prefix: Base prefix inside the bucket under which the per-year
            prefixes live.
        selected_columns: Columns forwarded to ``pd.read_csv(usecols=...)``.
            ``''`` (the default) or ``None`` reads all columns. When given, it
            must include ``'is_sole_proprietor'`` because the row filter uses it.

    Returns:
        One DataFrame with the filtered rows of all files stacked in listing
        order (original row indices are kept); an empty DataFrame when no
        file was read.
    """
    # pandas rejects usecols=''; map "nothing selected" to None (= all columns).
    if selected_columns is None or (isinstance(selected_columns, str) and selected_columns == ''):
        usecols = None
    else:
        usecols = selected_columns

    # Collect the monthly frames and concatenate once at the end: appending to
    # a growing DataFrame copies all previous rows on every month, and
    # DataFrame.append no longer exists in pandas 2.x.
    monthly_frames = []
    rows_so_far = 0
    for year in years_to_read_in_list:
        print('Starting with year: ', year)
        # Build the prefix from the untouched base prefix on every iteration;
        # reassigning dir_prefix here would stack the years (base/2016/2017).
        year_prefix = dir_prefix + '/' + year
        for blob in bucket.list_blobs(prefix=year_prefix):
            if 'CSV' in blob.name:
                print('Processing file: ', blob.name)
                with fs.open(bucket.name + '/' + blob.name) as f:
                    one_month_df = pd.read_csv(f, sep=';', usecols=usecols)
                # Filter on the raw column name before the header is normalised.
                one_month_df = one_month_df[one_month_df['is_sole_proprietor'] == 0]
                one_month_df.columns = (one_month_df.columns.str.strip().str.lower().
                    str.replace(' ', '_').str.replace('(', '').str.replace(')', ''))
                monthly_frames.append(one_month_df)
                rows_so_far += one_month_df.shape[0]
            # Progress is reported for every listed blob, CSV or not.
            print('The number of rows so far is: ', rows_so_far)
    if not monthly_frames:
        return pd.DataFrame()
    return pd.concat(monthly_frames)

In [16]:
def read_one_year_from_bucket_merged_csv(year, dir_prefix = '', sep = ';'):
    """Read one year of data from the already merged CSV file(s).

    Uses the module-level ``bucket`` to list blobs and ``fs`` to open them.
    Every blob under ``dir_prefix`` whose name contains ``year`` and ``.csv``
    (case-insensitive) is read; other blobs (for example a ``.json`` export
    stored under the same prefix) are skipped.

    Args:
        year: Year as a string; matched as a substring of the blob name.
        dir_prefix: Blob prefix under which the merged files are stored.
        sep: Field separator of the merged files. Defaults to ``';'``; pass
            ``','`` for files written with the pandas ``to_csv`` default.

    Returns:
        DataFrame with the content of the matching file. If several files
        match, their rows are concatenated in listing order instead of the
        later file silently replacing the earlier one. An empty DataFrame is
        returned when nothing matches.
    """
    print('Starting with year: ', year)
    print(dir_prefix)
    yearly_frames = []
    rows_so_far = 0
    for blob in bucket.list_blobs(prefix=dir_prefix):
        print("blob", blob.name)
        if year in blob.name and '.csv' in blob.name.lower():
            print('Processing file: ', blob.name)
            with fs.open(bucket.name + '/' + blob.name) as f:
                one_file_df = pd.read_csv(f, sep=sep)
            yearly_frames.append(one_file_df)
            rows_so_far += one_file_df.shape[0]
        print('The number of rows so far is: ', rows_so_far)
    if not yearly_frames:
        return pd.DataFrame()
    if len(yearly_frames) == 1:
        # Common case: exactly one merged file per year, returned as read.
        return yearly_frames[0]
    return pd.concat(yearly_frames)

In [8]:
def read_one_month_csv_from_bucket(year, month, last_day_of_month, dir_prefix = '', selected_columns= ''):
    """Read one monthly CSV file from the bucket into a DataFrame.

    Uses the module-level ``bucket`` to list blobs and ``fs`` to open them.
    A blob is selected when its name contains ``month + '-' + last_day_of_month``
    (e.g. ``'01-31'``). If several blobs match, the last one listed wins.

    Args:
        year: Year as a string; appended to ``dir_prefix`` to form the prefix.
        month: Two-character month string used in the name match.
        last_day_of_month: Day string used in the name match.
        dir_prefix: Base prefix inside the bucket.
        selected_columns: Columns forwarded to ``pd.read_csv(usecols=...)``.
            ``''`` (the default) or ``None`` reads all columns.

    Returns:
        DataFrame with normalised column names (stripped, lower-cased, spaces
        replaced by ``_``, parentheses removed), or an empty DataFrame when no
        blob matches.
    """
    # Treat both '' and None as "read all columns"; pandas rejects usecols=''.
    if selected_columns is None or (isinstance(selected_columns, str) and selected_columns == ''):
        usecols = None
    else:
        usecols = selected_columns

    year_prefix = dir_prefix + '/' + year
    print(year_prefix)
    name_marker = month + '-' + last_day_of_month
    one_month_df = pd.DataFrame()
    for blob in bucket.list_blobs(prefix=year_prefix):
        if name_marker in blob.name:
            print('Processing file: ', blob.name)
            with fs.open(bucket.name + '/' + blob.name) as f:
                one_month_df = pd.read_csv(f, sep=';', usecols=usecols)

    # Nothing was read: return the empty frame as is. The .str accessor below
    # is not available on the column index of an empty frame in recent pandas.
    if len(one_month_df.columns) == 0:
        return one_month_df

    one_month_df.columns = (one_month_df.columns.str.strip().str.lower().
                    str.replace(' ', '_').str.replace('(', '').str.replace(')', ''))
    return one_month_df

In [9]:
def upload_df_to_gc_bucket(df, dir_prefix, year, as_json= False):
    """Serialise ``df`` and write it to the bucket named by ``bucket_name``.

    The object is stored as ``dir_prefix + '/' + year + '_merged.json'`` when
    ``as_json`` is true, otherwise as ``dir_prefix + '/' + year + '_merged.csv'``.
    Serialisation uses ``df.to_json()`` / ``df.to_csv()`` with their defaults.
    """
    # Pick serialised payload, file suffix and content type in one place.
    if as_json:
        payload, suffix, content_type = df.to_json(), '_merged.json', 'text/json'
    else:
        payload, suffix, content_type = df.to_csv(), '_merged.csv', 'text/csv'
    new_file_path = dir_prefix + '/' + year + suffix
    target_item = gcs_datalab.Bucket(bucket_name).item(new_file_path)
    target_item.write_to(payload, content_type)

In [10]:
def save_df_locally(df, dir_prefix, year, as_json= False):
    """Save ``df`` as JSON or CSV on the local file system.

    Args:
        df: DataFrame to write.
        dir_prefix: Target directory; created (including parents) when it does
            not exist yet.
        year: Year string used in the file name.
        as_json: When true the file is ``<year>_merged.json`` written with
            ``df.to_json``; otherwise ``<year>_merged.csv`` written with
            ``df.to_csv`` (pandas defaults in both cases).
    """
    import os

    # Writing into a missing directory fails, so make sure it exists first.
    # An empty prefix is skipped because os.makedirs('') raises.
    if dir_prefix:
        os.makedirs(dir_prefix, exist_ok=True)

    if as_json:
        df.to_json(dir_prefix + '/' + year + '_merged.json')
    else:
        df.to_csv(dir_prefix + '/' + year + '_merged.csv')
        

#### Read one month

In [None]:
# Read January 2017 restricted to the columns listed in `selected_columns`.
jan_2017_df = read_one_month_csv_from_bucket(
    year='2017',
    month='01',
    last_day_of_month='31',
    dir_prefix='including_scores/unzipped',
    selected_columns=selected_columns,
)

#### Read one month all columns

In [None]:
# Read January 2017 without a column selection (all columns).
jan_2017_df_all_columns = read_one_month_csv_from_bucket(
    year='2017',
    month='01',
    last_day_of_month='31',
    dir_prefix='including_scores/unzipped',
)

#### Read one full year monthly and merge it

In [11]:
# Read every monthly file of 2016 and merge them into a single DataFrame.
one_year_df = read_all_csv_months_yearly_from_bucket_merged(
    years_to_read_in_list=['2016'],
    dir_prefix='including_scores/unzipped',
    selected_columns=selected_columns,
)

Starting with year:  2016
The number of rows so far is:  0
Processing file:  including_scores/unzipped/2016/modelling_2016-01-01_2016-01-31.CSV


  if (yield from self.run_code(code, result)):


The number of rows so far is:  1832645
Processing file:  including_scores/unzipped/2016/modelling_2016-02-01_2016-02-29.CSV
The number of rows so far is:  3672550
Processing file:  including_scores/unzipped/2016/modelling_2016-03-01_2016-03-31.CSV
The number of rows so far is:  5517412
Processing file:  including_scores/unzipped/2016/modelling_2016-04-01_2016-04-30.CSV
The number of rows so far is:  7366476
Processing file:  including_scores/unzipped/2016/modelling_2016-05-01_2016-05-31.CSV
The number of rows so far is:  9218843
Processing file:  including_scores/unzipped/2016/modelling_2016-06-01_2016-06-30.CSV
The number of rows so far is:  11075834
Processing file:  including_scores/unzipped/2016/modelling_2016-07-01_2016-07-31.CSV
The number of rows so far is:  12936952
Processing file:  including_scores/unzipped/2016/modelling_2016-08-01_2016-08-31.CSV
The number of rows so far is:  14801923
Processing file:  including_scores/unzipped/2016/modelling_2016-09-01_2016-09-30.CSV
The n

#### Read one full year from already merged files

In [None]:
# Read 2016 from the already merged per-year file. The function name was
# garbled in this cell; the intended call is read_one_year_from_bucket_merged_csv.
one_year_df = read_one_year_from_bucket_merged_csv(dir_prefix='including_scores/merged_per_year', year='2016')

#### Upload df to bucket

In [None]:
# Upload the January 2017 frame as CSV under the merged-per-year prefix.
upload_df_to_gc_bucket(
    df=jan_2017_df,
    dir_prefix='including_scores/merged_per_year',
    year='2017',
)

#### Save df locally on the server

In [17]:
# Write the 2016 frame as CSV into the local 'files_to_bucket' directory.
save_df_locally(
    df=one_year_df,
    dir_prefix='files_to_bucket',
    year='2016',
)

#### Preview df

In [13]:
# Render the first 20 rows of the yearly frame as an HTML table.
HTML(
    DataFrame(one_year_df.head(20))
    .to_html()
)

Unnamed: 0,date_month,id_company,id_branch,is_discontinued,financial_calamity_outcome,qty_employees,year_qty_employees,id_company_creditproxy,score_payment_assessment,amt_revenue,year_revenue,amt_consolidated_revenue,year_consolidated_revenue,amt_consolidated_operating_result,year_consolidated_operating_result,perc_credit_limit_adjustment,color_credit_status,rat_pd,score_pd,has_increased_risk,is_sole_proprietor,code_sbi_2,code_sbi_1,qty_address_mutations_total,qty_address_mutations_month,has_relocated,has_name_change,vice_president
0,2016-01-01,3,10079408,False,,1.0,2016.0,3,20.0,3.55253e-310,2009.0,,,,,20,G,BB,-4975.0,False,False,41.0,,0,0,False,False,0
1,2016-01-01,5,10079416,False,,9.0,2015.0,1064993,20.0,,,,,,,5,O,CC,-4756.0,False,False,46.0,,0,0,False,False,0
2,2016-01-01,6,10079424,False,,25.0,2014.0,6,24.0,4.446591e-311,2011.0,,,,,-100,R,D,,,False,41.0,,0,0,False,False,0
3,2016-01-01,9,10079432,False,,33.0,2015.0,9,20.0,,,,,,,25,G,BB,-4857.0,False,False,47.0,,0,0,False,False,0
4,2016-01-01,12,35,False,,5.0,2016.0,12,28.0,,,,,,,30,G,BB,-5089.0,False,False,64.0,,0,0,False,False,0
5,2016-01-01,14,43,False,,6.0,2015.0,14,20.0,,,,,,,45,G,BBB,-4993.0,False,False,64.0,,0,0,False,False,0
6,2016-01-01,17,51,False,,1.0,2016.0,17,20.0,,,,,,,20,G,A,-5020.0,False,False,64.0,,0,0,False,False,0
7,2016-01-01,23,94,True,,,,23,29.0,,,,,,,-5,R,D,,,False,46.0,,0,0,False,False,0
8,2016-01-01,25,10079467,True,,,,25,29.0,,,,,,,5,R,D,,,False,61.0,,0,0,False,False,0
9,2016-01-01,47,140,False,,1.0,2007.0,47,20.0,5.281809e-313,2006.0,,,,,45,G,A,-5101.0,False,False,70.0,,0,0,False,False,0
