In [None]:
############################################
# 
# Marcus Bischof
# Divvy EDA : Chicago
#
############################################

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

# Notebook-wide pandas display settings: render floats with two decimals and
# allow up to 500 rows / 500 columns before pandas truncates the output.
pd.set_option('display.float_format', '{:.2f}'.format)
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500

In [None]:
# Do we need to create and pickle a memory efficient version of the divvy data?
CREATE_SMALL_MEMORY_SET = False


def shrink_divvy_dtypes(frame: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of the raw Divvy frame with narrower column dtypes.

    The target dtypes were picked by inspecting the numeric ranges of the raw
    data (e.g. ``frame.describe().T[['min', 'max']]``).

    Parameters
    ----------
    frame : pd.DataFrame
        Raw Divvy data containing every column named below.

    Returns
    -------
    pd.DataFrame
        A new frame; ``frame`` itself is left untouched. Columns not named
        below keep their original dtype.

    Raises
    ------
    KeyError
        If any of the expected columns is missing from ``frame``.
    """
    dtype_plan = {}

    # Objects with defined ranges (from_/to_station are included because we
    # are just dealing with Chicago).
    for name in ['gender', 'usertype', 'events', 'from_station_name', 'to_station_name']:
        dtype_plan[name] = 'category'

    # Ints with values ranging from 0 to 255 can be stored as uint8.
    # NOTE(review): the cast itself does not range-check, so this relies on the
    # ranges having been verified on the raw data -- confirm for 'tripduration'.
    for name in ['day', 'month', 'week', 'hour', 'tripduration', 'dpcapacity_start', 'dpcapacity_end']:
        dtype_plan[name] = 'uint8'

    # Ints with values ranging from 0 to 65535 can be stored as uint16.
    # (Previously this loop indexed with the uint8 loop's variable, so these
    # two columns were never converted and 'dpcapacity_end' became uint16.)
    for name in ['year', 'from_station_id']:
        dtype_plan[name] = 'uint16'

    # Coordinates do not need double precision, but float16 is too coarse:
    # its step is 0.03125 in [32, 64) and 0.0625 in [64, 128), which would
    # collapse Chicago latitudes/longitudes onto a handful of values.
    # float32 keeps sub-metre resolution at half the size of float64.
    for name in ['latitude_start', 'longitude_start', 'latitude_end', 'longitude_end']:
        dtype_plan[name] = 'float32'

    # int8 -128 to 127
    dtype_plan['temperature'] = 'int8'

    return frame.astype(dtype_plan)


if CREATE_SMALL_MEMORY_SET:
    # We apply space saving operations on the data here to make it more
    # manageable in local memory, then cache the result for later runs.
    df = shrink_divvy_dtypes(pd.read_csv('../data/raw/divvy_data.csv'))
    df.to_pickle('../data/processed/divvy_data_small_memory.pkl')

In [None]:
# Pick the data source for the rest of the notebook: the processed pickle
# (True) or the raw CSV (False). The pickle has to exist already, i.e. the
# CREATE_SMALL_MEMORY_SET cell must have been run with its flag enabled once.
LOAD_FROM_SMALLER_MEMORY = True

if LOAD_FROM_SMALLER_MEMORY:
    df = pd.read_pickle('../data/processed/divvy_data_small_memory.pkl')
else:
    df = pd.read_csv('../data/raw/divvy_data.csv')

In [None]:
# Print a summary of the loaded frame (columns, dtypes, memory usage) to check
# which dtypes the chosen data source produced.
df.info()