# Practice Optimizing DataFrames and Processing in Chunks

*Reading a large CSV file in chunks and reducing the memory footprint of each chunk.*

In [15]:
import pandas as pd
import numpy as np
# Show up to 99 columns so the wide loans table is not truncated when displayed.
pd.set_option('display.max_columns', 99)

# Reading the data

Let's take a look at the data and find an appropriate chunk size - one that keeps each chunk under 5 MB of memory.

In [16]:
# Load only the first five rows to preview the columns without reading the whole file.
loans_first_5 = pd.read_csv(
    'loans_2007.csv',
    nrows=5,
)
loans_first_5

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,inq_last_6mths,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,last_credit_pull_d,collections_12_mths_ex_med,policy_code,application_type,acc_now_delinq,chargeoff_within_12_mths,delinq_amnt,pub_rec_bankruptcies,tax_liens
0,1077501,1296599.0,5000.0,5000.0,4975.0,36 months,10.65%,162.87,B,B2,,10+ years,RENT,24000.0,Verified,Dec-2011,Fully Paid,n,credit_card,Computer,860xx,AZ,27.65,0.0,Jan-1985,1.0,3.0,0.0,13648.0,83.7%,9.0,f,0.0,0.0,5863.155187,5833.84,5000.0,863.16,0.0,0.0,0.0,Jan-2015,171.62,Jun-2016,0.0,1.0,INDIVIDUAL,0.0,0.0,0.0,0.0,0.0
1,1077430,1314167.0,2500.0,2500.0,2500.0,60 months,15.27%,59.83,C,C4,Ryder,< 1 year,RENT,30000.0,Source Verified,Dec-2011,Charged Off,n,car,bike,309xx,GA,1.0,0.0,Apr-1999,5.0,3.0,0.0,1687.0,9.4%,4.0,f,0.0,0.0,1008.71,1008.71,456.46,435.17,0.0,117.08,1.11,Apr-2013,119.66,Sep-2013,0.0,1.0,INDIVIDUAL,0.0,0.0,0.0,0.0,0.0
2,1077175,1313524.0,2400.0,2400.0,2400.0,36 months,15.96%,84.33,C,C5,,10+ years,RENT,12252.0,Not Verified,Dec-2011,Fully Paid,n,small_business,real estate business,606xx,IL,8.72,0.0,Nov-2001,2.0,2.0,0.0,2956.0,98.5%,10.0,f,0.0,0.0,3005.666844,3005.67,2400.0,605.67,0.0,0.0,0.0,Jun-2014,649.91,Jun-2016,0.0,1.0,INDIVIDUAL,0.0,0.0,0.0,0.0,0.0
3,1076863,1277178.0,10000.0,10000.0,10000.0,36 months,13.49%,339.31,C,C1,AIR RESOURCES BOARD,10+ years,RENT,49200.0,Source Verified,Dec-2011,Fully Paid,n,other,personel,917xx,CA,20.0,0.0,Feb-1996,1.0,10.0,0.0,5598.0,21%,37.0,f,0.0,0.0,12231.89,12231.89,10000.0,2214.92,16.97,0.0,0.0,Jan-2015,357.48,Apr-2016,0.0,1.0,INDIVIDUAL,0.0,0.0,0.0,0.0,0.0
4,1075358,1311748.0,3000.0,3000.0,3000.0,60 months,12.69%,67.79,B,B5,University Medical Group,1 year,RENT,80000.0,Source Verified,Dec-2011,Current,n,other,Personal,972xx,OR,17.94,0.0,Jan-1996,0.0,15.0,0.0,27783.0,53.9%,38.0,f,461.73,461.73,3581.12,3581.12,2538.27,1042.85,0.0,0.0,0.0,Jun-2016,67.79,Jun-2016,0.0,1.0,INDIVIDUAL,0.0,0.0,0.0,0.0,0.0


In [17]:
# Read a 3000-row sample to estimate how much memory a chunk of that size needs.
loans_3000 = pd.read_csv('loans_2007.csv', nrows=3000)
# DataFrame.info() prints its report itself and returns None, so it must not be
# wrapped in print() (that would emit a stray "None" after the report).
# memory_usage='deep' also counts the Python string objects behind object columns.
loans_3000.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 52 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   id                          3000 non-null   int64  
 1   member_id                   3000 non-null   float64
 2   loan_amnt                   3000 non-null   float64
 3   funded_amnt                 3000 non-null   float64
 4   funded_amnt_inv             3000 non-null   float64
 5   term                        3000 non-null   object 
 6   int_rate                    3000 non-null   object 
 7   installment                 3000 non-null   float64
 8   grade                       3000 non-null   object 
 9   sub_grade                   3000 non-null   object 
 10  emp_title                   2829 non-null   object 
 11  emp_length                  2917 non-null   object 
 12  home_ownership              3000 non-null   object 
 13  annual_inc                  3000 

We will create a fresh `chunk_iter` for each task: the iterator is exhausted after a single pass, and this lets us display each result in its own cell.

Let's:
* find the total memory usage across all chunks in MB
* find the number of rows
* see if any chunk of this size has a memory footprint of more than 4.9 MB

In [18]:
# Stream the file in 3000-row chunks, accumulating the row count and the
# deep memory usage (in MB), and stop early if any chunk exceeds 4.9 MB.
chunk_iter = pd.read_csv('loans_2007.csv', chunksize=3000)
total_memory_usage = 0
total_rows = 0
for chunk in chunk_iter:
    # Measure once per chunk; deep=True includes the string payloads of object columns.
    # The divisor is parenthesised so the result really is in MB
    # (x / 1024 * 1024 would just give back the byte count).
    chunk_memory_mb = chunk.memory_usage(deep=True).sum() / (1024 * 1024)
    total_rows += len(chunk)
    total_memory_usage += chunk_memory_mb
    if chunk_memory_mb > 4.9:
        print('not enough memory', chunk_memory_mb)
        break

total_memory_usage, total_rows

(66.21605968475342, 42538)

Our dataset takes 66.216 MB in memory and has 42,538 rows.  
The 'not enough memory' message was never printed, which means that every chunk is under 4.9 MB.

# Inconsistency

Are string columns consistent across all chunks?

In [20]:
# Compare the object-typed columns of every chunk against those of the first
# chunk and report each chunk whose set of object columns differs.
prev_column_names = []
chunk_iter = pd.read_csv('loans_2007.csv', chunksize=3000)
for chunk in chunk_iter:
    object_column_names = list(chunk.select_dtypes(include=['object']).columns)

    # No baseline recorded yet: remember this chunk's columns and move on.
    if not prev_column_names:
        prev_column_names = object_column_names
        continue

    # Same columns as the baseline: nothing to report.
    if object_column_names == prev_column_names:
        continue

    print(
        'previous columns:', prev_column_names, '\n',
        'current columns:', object_column_names
    )

previous columns: ['term', 'int_rate', 'grade', 'sub_grade', 'emp_title', 'emp_length', 'home_ownership', 'verification_status', 'issue_d', 'loan_status', 'pymnt_plan', 'purpose', 'title', 'zip_code', 'addr_state', 'earliest_cr_line', 'revol_util', 'initial_list_status', 'last_pymnt_d', 'last_credit_pull_d', 'application_type'] 
 current columns: ['id', 'term', 'int_rate', 'grade', 'sub_grade', 'emp_title', 'emp_length', 'home_ownership', 'verification_status', 'issue_d', 'loan_status', 'pymnt_plan', 'purpose', 'title', 'zip_code', 'addr_state', 'earliest_cr_line', 'revol_util', 'initial_list_status', 'last_pymnt_d', 'last_credit_pull_d', 'application_type']
previous columns: ['term', 'int_rate', 'grade', 'sub_grade', 'emp_title', 'emp_length', 'home_ownership', 'verification_status', 'issue_d', 'loan_status', 'pymnt_plan', 'purpose', 'title', 'zip_code', 'addr_state', 'earliest_cr_line', 'revol_util', 'initial_list_status', 'last_pymnt_d', 'last_credit_pull_d', 'application_type'] 
 c

We can see that in two chunks `id` is an `object`. Since it is not useful for further research, we can ignore this inconsistency.

# Memory optimization - to category

We need to change `object` columns to `categorical` (if fewer than 50% of their values are unique).

In [21]:
# For every chunk, turn object columns with a low share of distinct values
# (< 50% of the chunk's rows) into the more compact category dtype.
chunk_iter = pd.read_csv('loans_2007.csv', chunksize=3000)
for chunk in chunk_iter:
    chunk_length = len(chunk)
    object_columns = chunk.select_dtypes(include=['object'])
    for column in object_columns.columns:
        # nunique() ignores missing values, just like counting value_counts() entries.
        unique_values = object_columns[column].nunique()
        if unique_values / chunk_length >= 0.5:
            continue
        chunk[column] = chunk[column].astype('category')

# Only the last chunk is still bound here; inspect its resulting dtypes.
chunk.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 538 entries, 42000 to 42537
Data columns (total 52 columns):
 #   Column                      Non-Null Count  Dtype   
---  ------                      --------------  -----   
 0   id                          538 non-null    object  
 1   member_id                   536 non-null    float64 
 2   loan_amnt                   536 non-null    float64 
 3   funded_amnt                 536 non-null    float64 
 4   funded_amnt_inv             536 non-null    float64 
 5   term                        536 non-null    category
 6   int_rate                    536 non-null    category
 7   installment                 536 non-null    float64 
 8   grade                       536 non-null    category
 9   sub_grade                   536 non-null    category
 10  emp_title                   499 non-null    object  
 11  emp_length                  536 non-null    category
 12  home_ownership              536 non-null    category
 13  annual_inc    

We converted all the object columns that have less than 50% unique values to `category` type to save memory.

We can see some changes here in the last chunk. For example, `term` and `revol_util` are now stored as `category`, so they already take less memory even before we clean them and convert them to a numeric type.

# Memory optimization - to datetime

Next we need to:
* convert `issue_d`, `earliest_cr_line`, `last_pymnt_d` and `last_credit_pull_d` to `datetime`
* check if there are any `float` columns with no missing values to convert them to `integer`

In [23]:
# Parse the four date columns while reading, and count the missing values of
# every float column across all chunks.
chunk_iter = pd.read_csv(
    'loans_2007.csv',
    chunksize=3000,
    parse_dates=[
        'issue_d', 'earliest_cr_line', 'last_pymnt_d', 'last_credit_pull_d'
    ])

# One Series of per-column null counts per chunk.
float_columns_withoutnulls = []
for chunk in chunk_iter:
    # The np.float alias was removed in NumPy 1.24; the 'float' string is the
    # supported way to select float columns.
    float_columns = chunk.select_dtypes(include='float').isnull().sum()
    float_columns_withoutnulls.append(float_columns)

# Stack the per-chunk counts, then sum them per column name to get file-wide totals.
float_columns_withoutnulls_df = pd.concat(float_columns_withoutnulls)
float_columns_for_conversion = float_columns_withoutnulls_df.groupby(float_columns_withoutnulls_df.index).sum()
float_columns_for_conversion

acc_now_delinq                  32
annual_inc                       7
chargeoff_within_12_mths       148
collection_recovery_fee          3
collections_12_mths_ex_med     148
delinq_2yrs                     32
delinq_amnt                     32
dti                              3
funded_amnt                      3
funded_amnt_inv                  3
inq_last_6mths                  32
installment                      3
last_pymnt_amnt                  3
loan_amnt                        3
member_id                        3
open_acc                        32
out_prncp                        3
out_prncp_inv                    3
policy_code                      3
pub_rec                         32
pub_rec_bankruptcies          1368
recoveries                       3
revol_bal                        3
tax_liens                      108
total_acc                       32
total_pymnt                      3
total_pymnt_inv                  3
total_rec_int                    3
total_rec_late_fee  

As we can see, every float column has at least one missing value, so none of them can be converted to an `integer` type.

# Memory optimization - downcast to subtype

Let's downcast the float columns to a more space-efficient `subtype` instead.  
We don't touch `integer` types, because we have only one integer column - `id`, - which we decided to ignore in our further research.

In [None]:
# Downcast every float column of each chunk to the smallest float subtype
# that can hold its values.
chunk_iter = pd.read_csv('loans_2007.csv', chunksize=3000)
for chunk in chunk_iter:
    # The np.float alias was removed in NumPy 1.24; the 'float' string is the
    # supported way to select float columns.
    float_columns = chunk.select_dtypes(include='float')
    for column in float_columns:
        chunk[column] = pd.to_numeric(chunk[column], downcast='float')

# Only the last chunk is still bound here; inspect its resulting dtypes.
chunk.info()

Finally, we can also clean the `revol_util` column and convert it to a numeric data type.

In [28]:
# Remove the percent sign from revol_util in every chunk and store the
# column as a downcast float.
chunk_iter = pd.read_csv('loans_2007.csv', chunksize=3000)

for chunk in chunk_iter:
    # '%' is matched as a literal character, not as a regular expression.
    revol_util = chunk['revol_util'].str.replace('%', '', regex=False)
    chunk['revol_util'] = pd.to_numeric(revol_util, downcast='float')

# Only the last chunk is still bound here; inspect its resulting dtypes.
chunk.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 538 entries, 42000 to 42537
Data columns (total 52 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   id                          538 non-null    object 
 1   member_id                   536 non-null    float64
 2   loan_amnt                   536 non-null    float64
 3   funded_amnt                 536 non-null    float64
 4   funded_amnt_inv             536 non-null    float64
 5   term                        536 non-null    object 
 6   int_rate                    536 non-null    object 
 7   installment                 536 non-null    float64
 8   grade                       536 non-null    object 
 9   sub_grade                   536 non-null    object 
 10  emp_title                   499 non-null    object 
 11  emp_length                  536 non-null    object 
 12  home_ownership              536 non-null    object 
 13  annual_inc                  5

# Combining it all together

Finally, let's put it all together and calculate the total memory usage again.

In [29]:
# Apply every optimization to each chunk (date parsing, numeric revol_util,
# category conversion, float downcasting) and total the resulting memory in MB.
chunk_iter = pd.read_csv(
    'loans_2007.csv',
    chunksize=3000,
    parse_dates=[
        'issue_d', 'earliest_cr_line', 'last_pymnt_d', 'last_credit_pull_d'
    ])

total_memory_usage = 0

for chunk in chunk_iter:
    # Strip the literal percent sign so revol_util can be stored as a number.
    revol_util = chunk['revol_util'].str.replace('%', '', regex=False)
    chunk['revol_util'] = pd.to_numeric(revol_util)

    # Object columns with fewer than 50% distinct values become categories.
    object_columns = chunk.select_dtypes(include=['object'])
    chunk_length = len(chunk.index)
    for column in object_columns:
        unique_values = len(object_columns[column].value_counts())
        if unique_values/chunk_length < 0.5:
            chunk[column] = chunk[column].astype('category')

    # The np.float alias was removed in NumPy 1.24; the 'float' string is the
    # supported way to select float columns.
    float_columns = chunk.select_dtypes(include='float')
    for column in float_columns:
        chunk[column] = pd.to_numeric(chunk[column], downcast='float')

    total_memory_usage += (chunk.memory_usage(deep=True).sum())/(1024*1024)

# info() prints its report and returns None, so call it as a statement instead
# of placing it in the displayed tuple; the total is then the cell's only output value.
chunk.info()
total_memory_usage

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 538 entries, 42000 to 42537
Data columns (total 52 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   id                          538 non-null    object        
 1   member_id                   536 non-null    float32       
 2   loan_amnt                   536 non-null    float32       
 3   funded_amnt                 536 non-null    float32       
 4   funded_amnt_inv             536 non-null    float32       
 5   term                        536 non-null    category      
 6   int_rate                    536 non-null    category      
 7   installment                 536 non-null    float32       
 8   grade                       536 non-null    category      
 9   sub_grade                   536 non-null    category      
 10  emp_title                   499 non-null    object        
 11  emp_length                  536 non-null    category

(14.272079467773438, None)

As we can see, our `memory footprint` has been reduced from `66.216` MB to `14.272` MB.