In [2]:
import pandas as pd
pd.options.display.max_columns = 99

In [3]:
loans = pd.read_csv('loans_2007.csv',nrows = 5)

print(loans)

        id  member_id  loan_amnt  funded_amnt  funded_amnt_inv        term  \
0  1077501  1296599.0     5000.0       5000.0           4975.0   36 months   
1  1077430  1314167.0     2500.0       2500.0           2500.0   60 months   
2  1077175  1313524.0     2400.0       2400.0           2400.0   36 months   
3  1076863  1277178.0    10000.0      10000.0          10000.0   36 months   
4  1075358  1311748.0     3000.0       3000.0           3000.0   60 months   

  int_rate  installment grade sub_grade                 emp_title emp_length  \
0   10.65%       162.87     B        B2                       NaN  10+ years   
1   15.27%        59.83     C        C4                     Ryder   < 1 year   
2   15.96%        84.33     C        C5                       NaN  10+ years   
3   13.49%       339.31     C        C1       AIR RESOURCES BOARD  10+ years   
4   12.69%        67.79     B        B5  University Medical Group     1 year   

  home_ownership  annual_inc verification_status  

How many rows can be read at a time to use less than 5MB of memory?

In [4]:
loans = pd.read_csv('loans_2007.csv',nrows = 3200)

print(loans.memory_usage(deep = True).sum() /  (1024*1024) )

4.959044456481934


In [5]:
total_cols = len(list(loans.columns))
print("Number of columns =", total_cols)

obj_cols = len(list(loans.select_dtypes(include=['object'])))

print("Number of numeric columns =", total_cols - obj_cols)
print("Number of string columns =", obj_cols)


Number of columns = 52
Number of numeric columns = 31
Number of string columns = 21


In [6]:
loans_iter = pd.read_csv('loans_2007.csv',chunksize= 3200)

string_dict = {}
string_rows = {}

# Loop through each chunk
for chunk in loans_iter:
    # Loop through each object column on each chunk
    for c in chunk.select_dtypes(include=['object']):
        unique_lst = []
        row_count = 0
        
        col_stats = {}
        
        if c in string_dict.keys():
            unique_lst = string_dict[c]['unique']
            row_count = string_dict[c]['row_count']     
      
        # Get unique values
        unique_lst.append(chunk[c].value_counts())
        
        # Get total rows 
        row_count += len(chunk[c])
        
        # Save data
        col_stats = {'unique': unique_lst, 'row_count': row_count }
        string_dict[c] = col_stats

# Concatenate  chunk list into one per column
str_combined_dict = {}
for k, v in string_dict.items():
    col_stats = {'unique': pd.concat(v['unique']),
                 'row_count': v['row_count']}
    
    str_combined_dict[k] = col_stats
        
# Calculate unique values and total rows per column    
str_vc = {}    
for k, v in str_combined_dict.items():
    col_stats = {'unique': v['unique'].groupby(level=0).sum(),
                  'row_count': v['row_count']}
    
    str_vc[k] = col_stats

final_str = {}
for c, v in str_vc.items():
    
    less_50_pct_unique = 0 # No
    if len(v['unique'])/v['row_count'] < 0.5:
        less_50_pct_unique = 1 # Yes
        
    col_stats = {'unique': len(v['unique']),
                 'row_count': v['row_count'],
                 'less_50_unique': less_50_pct_unique}
    final_str[c] = col_stats

    
cat_cols = [k for k,v in final_str.items() if v['less_50_unique'] == 1]

print("""How many string columns contain values that are less 
than 50% unique?""",
      len(cat_cols))

How many string columns contain values that are less 
than 50% unique? 20


In [75]:
loans_iter = pd.read_csv('loans_2007.csv',chunksize= 3200)

total_mem = 0
memory_chunk_vc = []
for chunk in loans_iter:
    total_mem += chunk.memory_usage(deep= True).sum()

print("Total csv file Memory =",total_mem/(1024*1024))

Total csv file Memory = 66.2467565536499


In [25]:
loans = pd.read_csv('loans_2007.csv',nrows= 10)

# Get list of string column and loop through them to get a sample data
# TO determine which object columns can be converted to a numeric data type
for c in loans.select_dtypes(include=['object']).columns:
    print(c)
    print(loans[c])
    print()


term
0     36 months
1     60 months
2     36 months
3     36 months
4     60 months
5     36 months
6     60 months
7     36 months
8     60 months
9     60 months
Name: term, dtype: object

int_rate
0     10.65%
1     15.27%
2     15.96%
3     13.49%
4     12.69%
5      7.90%
6     15.96%
7     18.64%
8     21.28%
9     12.69%
Name: int_rate, dtype: object

grade
0    B
1    C
2    C
3    C
4    B
5    A
6    C
7    E
8    F
9    B
Name: grade, dtype: object

sub_grade
0    B2
1    C4
2    C5
3    C1
4    B5
5    A4
6    C5
7    E1
8    F2
9    B5
Name: sub_grade, dtype: object

emp_title
0                          NaN
1                        Ryder
2                          NaN
3          AIR RESOURCES BOARD
4     University Medical Group
5         Veolia Transportaton
6    Southern Star Photography
7              MKC Accounting 
8                          NaN
9                    Starbucks
Name: emp_title, dtype: object

emp_length
0    10+ years
1     < 1 year
2    10+ years
3   

Columns that can be converted to numeric after clean up:

- earliest_cr_line, last_pymnt_d, last_credit_pull_d, 
    - convert to date
- addr_state, initial_list_status, application_type
    - Convert to category
- zip_code
    - Rename column to first_three_digit_zip_code
    - Remove xx
- issue_d
    - Convert to date
- home_ownership, verification_status, loan_status, pymnt_plan, purpose
    - Covert to Category
- emp_length
    - rename it to emp_length_year
    - remove both years and year from the row
- Grade and sub_grade
    - Convert both to category data type
- int_rate, revol_util
    - remove %
    - Convert it to numeric(4,2)
- term
    - Rename it to term_in_months
    - Remove months from each row
    - Convert to int


In [77]:
# Cast object columns to category as they are read from file
cat_cols = {'addr_state': 'category', 'initial_list_status': 'category', 
            'application_type': 'category', 'home_ownership': 'category', 
            'verification_status': 'category', 'loan_status': 'category', 
            'pymnt_plan': 'category', 'purpose': 'category', 
            'grade': 'category', 'sub_grade': 'category'}

# Cast object columns to date as they are read from file
date_cols = ['earliest_cr_line', 'last_pymnt_d', 'last_credit_pull_d']

loans_iter = pd.read_csv('loans_2007.csv', chunksize = 3000, 
                         dtype= cat_cols,
                        parse_dates = date_cols)

tot_mem = 0
for chunk in loans_iter:

    # Cleanup and cast object columns to a numeric data type when possible
    chunk['zip_code'] = pd.to_numeric(chunk['zip_code'].str.rstrip('xxx'))
    chunk['emp_length'] = chunk['emp_length'].str.rstrip(' ').str.rstrip('year').str.strip('years')
    chunk['int_rate'] = pd.to_numeric(chunk['int_rate'].str.rstrip('%'))
    chunk['revol_util'] = pd.to_numeric(chunk['revol_util'].str.rstrip('%'))
    chunk['term'] = pd.to_numeric(chunk['term'].str.rstrip(' ').str.rstrip('months'))
    
     
    # Identify float columns with missing data that can be downcast
    for c in chunk.select_dtypes(include=['float']):
          
        missing_value = len(chunk) - chunk[c].count()
        if c in float_cols.keys():
            float_cols[c] += missing_value
        else:
            float_cols[c] = missing_value
    
    # Downscast float column to either float subtypeor integer
    for c in float_cols.keys():
        if float_cols[c] == 0:
            chunk[c] = pd.to_numeric(chunk[c], downcast='float')
        else:
            chunk[c] = pd.to_numeric(chunk[c], downcast='integer')
            
    tot_mem += (chunk.memory_usage(deep=True).sum()/(1024*1024))  

print('New csv Total memory =', tot_mem)



New csv Total memory = 19.4713191986084
here


In [76]:
loans_iter = pd.read_csv('loans_2007.csv',chunksize= 3200)

total_mem = 0
memory_chunk_vc = []
for chunk in loans_iter:
    total_mem += chunk.memory_usage(deep= True).sum()

print("Original csv file Memory =",total_mem/(1024*1024))


Original csv file Memory = 66.2467565536499


In [88]:
chunk_iter = pd.read_csv('loans_2007.csv', chunksize=3000)

# Create dictionary (key: column, value: list of Series objects representing each chunk's value counts)
str_cols_vc = {}

# Loop through each chunk
for chunk in chunk_iter:
    
    # Get object columns only
    str_cols = chunk.select_dtypes(include=['object'])
    
    # Loop through object columns and get unique values
    for col in str_cols.columns:
        current_col_vc = str_cols[col].value_counts()
        
        #  Save to dictionary
        if col in str_cols_vc:            
            str_cols_vc[col].append(current_col_vc)
        else:
            str_cols_vc[col] = [current_col_vc]

# Combine the value counts.
combined_vcs = {}
for col in str_cols_vc:
    combined_vc = pd.concat(str_cols_vc[col])
    final_vc = combined_vc.groupby(combined_vc.index).sum()
    combined_vcs[col] = final_vc

# Identify possible columns candidates to be converted to category data type
print('Columns low unique values (less than 50%):')
count = 0
for c in combined_vcs.keys():
    if len(combined_vcs[c])/ combined_vcs[c].sum() < 0.5:
        print(c)
        count = 1

if count == 0:
    print('none')

Columns low unique values (less than 50%):
application_type
home_ownership
last_pymnt_d
last_credit_pull_d
loan_status
int_rate
earliest_cr_line
verification_status
zip_code
addr_state
purpose
issue_d
sub_grade
emp_length
revol_util
initial_list_status
term
pymnt_plan
grade
