In [1]:
import pandas as pd

In [3]:
import random

In [2]:
def multi_assign(df, transform_fn, condition):
    df_to_use = df.copy()
    
    return (df_to_use
        .assign(
            **{col: transform_fn(df_to_use[col])
               for col in condition(df_to_use)})
           )

In [16]:
def all_low_cardinality_to_categorical(df):
    df_to_use = df.copy()
    
    transform_fn = lambda x: x.astype("category")
    
    condition = lambda x:  (x.select_dtypes(include=['object']).nunique() > 0.5).index
    
    return multi_assign(df_to_use, transform_fn, condition)

    

In [7]:
df = pd.DataFrame(
    {"A": [random.choice(["animal", "vegetable", "mineral"]) for _ in range(10)],
     "B": list(range(10)),
     "C": [random.choice(["alice", "bob", "charlie"]) for _ in range(10)]}
)

In [23]:
all_low_cardinality_to_categorical(df).dtypes

A    category
B       int64
C    category
dtype: object

In [18]:
import dask.dataframe as da45

In [27]:
(
    df.pipe(all_low_cardinality_to_categorical)
    .pipe(da.from_pandas, npartitions=4)
    .pipe(all_float_to_int)
    .pipe(downcast_all, "float")
    .pipe(downcast_all, "integer")
    .pipe(downcast_all, target_type="unsigned", inital_type="integer")
).compute()

Unnamed: 0,A,B,C
0,mineral,0,bob
1,mineral,1,alice
2,animal,2,charlie
3,vegetable,3,alice
4,mineral,4,alice
5,animal,5,alice
6,mineral,6,charlie
7,mineral,7,charlie
8,vegetable,8,charlie
9,vegetable,9,charlie


In [25]:
def float_to_int(ser):
    try:
        int_ser = ser.astype(int)
        if (ser == int_ser).all():
            return int_ser
        else:
            return ser
    except ValueError:
        return ser
    
def multi_assign(df, transform_fn, condition):
    df_to_use = df.copy()
    
    return (df_to_use
        .assign(
            **{col: transform_fn(df_to_use[col])
               for col in condition(df_to_use)})
           )

def all_float_to_int(df):
    df_to_use = df.copy()
    transform_fn = float_to_int
    condition = lambda x: list(x
                    .select_dtypes(include=["float"])
                    .columns)    
    
    return multi_assign(df_to_use, transform_fn, condition)

def downcast_all(df, target_type, inital_type=None):
    #Gotta specify floats, unsigned, or integer
    #If integer, gotta be 'integer', not 'int'
    #Unsigned should look for Ints
    if inital_type is None:
        inital_type = target_type
    
    df_to_use = df.copy()
    
    transform_fn = lambda x: pd.to_numeric(x, 
                                downcast=target_type)
    
    condition = lambda x: list(x
                    .select_dtypes(include=[inital_type])
                    .columns) 
    
    return multi_assign(df_to_use, transform_fn, condition)



In [None]:
test_new_fn2 = (small_merged_with_cats
    .pipe(remove_dup_cols)
    .pipe(all_float_to_int)
     .pipe(downcast_all, "float")
     .pipe(downcast_all, "integer")
     .pipe(downcast_all,  
           target_type = "unsigned", 
           inital_type = "integer")
     .to_hdf("../data/interim/small_merged_retention_finaid_frsh_soph.h5",
                   "df",
                   format="table")
)

In [None]:
test_new_fn2.to_hdf("../data/interim/small_merged_retention_finaid_frsh_soph.h5",
                   "df",
                   format="table")

In [4]:
[random.choice(["animal",
              "vegetable",
              "mineral"])
for _ in range(10)]

['animal',
 'vegetable',
 'mineral',
 'mineral',
 'animal',
 'vegetable',
 'vegetable',
 'mineral',
 'mineral',
 'vegetable']

In [None]:
#Note that it might be helpful to actually look at the value counts
#If one turns out to be WAY more common that the others, might be easier to do the 
#feature engineering in a different way

In [None]:
smallMergedDF.select_dtypes(include=['object']).nunique()

In [None]:
df["A"]

In [None]:
small_merged_with_cats = smallMergedDF.assign(**{col : lambda x: x[col].astype("category")  
                                                 for col in 
                                                 smallMergedDF.select_dtypes(include=['object']) 
                                                 if col not in {"emplid", 
                                                                "person_nm", 
                                                                "cche_student_id"}})

In [11]:
(df.select_dtypes(include=['object']).nunique() > 0.5).index

Index(['A', 'C'], dtype='object')

In [12]:
df.assign(**{col: lambda x: x[col].astype("category")
            for col in (df.select_dtypes(include=['object']).nunique() > 0.5).index})

Unnamed: 0,A,B,C
0,bob,0,bob
1,alice,1,alice
2,charlie,2,charlie
3,alice,3,alice
4,alice,4,alice
5,alice,5,alice
6,charlie,6,charlie
7,charlie,7,charlie
8,charlie,8,charlie
9,charlie,9,charlie


In [8]:
[col for col in df.select_dtypes(include=['object']) 
 if 

Unnamed: 0,A,C
0,mineral,bob
1,mineral,alice
2,animal,charlie
3,vegetable,alice
4,mineral,alice
5,animal,alice
6,mineral,charlie
7,mineral,charlie
8,vegetable,charlie
9,vegetable,charlie


In [13]:
def all_low_cardinality_to_categorical(df):
    df_to_use = df.copy()
    
    transform_fn = lambda x: x.astype("category")
    
    condition = lambda x:  (x.select_dtypes(include=['object']).nunique() > 0.5).index
    
    return multi_assign(df_to_use, transform_fn, condition)

    

In [15]:
all_low_cardinality_to_categorical(df).dtypes

A    category
B       int64
C    category
dtype: object