In [14]:
import pandas as pd
import numpy as np

# ------------------------------------------------------------------------------
# CONF: Global Settings for Production
# ------------------------------------------------------------------------------
pd.options.mode.copy_on_write = True

# ------------------------------------------------------------------------------
# MODULE 1: NUMERIC & STRING OPTIMIZATION (College Dataset)
# ------------------------------------------------------------------------------
def clean_college_types(df: pd.DataFrame) -> pd.DataFrame:
    """
    Optimizes numeric, boolean, and string types for the College dataset.
    *FIXED: Adjusted for lowercase column schema (instnm vs INSTNM).*
    """
    return (df
        .assign(
            # 1. STRING OPTIMIZATION (PyArrow)
            instnm=lambda x: x['instnm'].astype("string[pyarrow]"),
            city=lambda x: x['city'].astype("string[pyarrow]"),
            
            # 2. NULLABLE INTEGERS (Int16)
            # SAT scores (0-800) fit in Int16. 'Int16' handles NaNs safely.
            satmtmid=lambda x: x['satmtmid'].astype("Int16"),
            satvrmid=lambda x: x['satvrmid'].astype("Int16"),
            
            # 3. BOOLEAN COERCION
            # Raw data is 0/1 integers. We cast to proper Bools.
            hbcu=lambda x: x['hbcu'].astype(bool),
            menonly=lambda x: x['menonly'].astype(bool),
            womenonly=lambda x: x['womenonly'].astype(bool),
            
            # 4. FLOAT OPTIMIZATION
            # Float32 is sufficient for demographics (0.0 to 1.0) and saves 50% RAM.
            ugds=lambda x: x['ugds'].astype("Float32")
        )
        # Select specific columns to validate our work
        .loc[:, ['instnm', 'city', 'hbcu', 'satmtmid', 'satvrmid', 'ugds']]
    )

# ------------------------------------------------------------------------------
# MODULE 2: ORDERED CATEGORICALS (Diamonds Dataset)
# ------------------------------------------------------------------------------
def enforce_diamond_schema(df: pd.DataFrame) -> pd.DataFrame:
    """
    Enforces Strict Ordered Categorical types for Diamonds.
    """
    # Define strict hierarchies (Business Logic)
    cut_order = pd.CategoricalDtype(
        categories=['Fair', 'Good', 'Very Good', 'Premium', 'Ideal'], 
        ordered=True
    )
    clarity_order = pd.CategoricalDtype(
        categories=['I1', 'SI2', 'SI1', 'VS2', 'VS1', 'VVS2', 'VVS1', 'IF'], 
        ordered=True
    )
    
    return (df
        .assign(
            # Apply ordered categorical types
            cut=lambda x: x['cut'].astype(cut_order),
            clarity=lambda x: x['clarity'].astype(clarity_order),
            
            # Unordered categorical (Nominal)
            color=lambda x: x['color'].astype('category')
        )
        .sort_values(by=['cut', 'clarity'], ascending=[False, True])
    )

# ------------------------------------------------------------------------------
# MODULE 3: DATETIME, TIMEDELTA & PERIOD (Bikes Dataset)
# ------------------------------------------------------------------------------
def process_temporal_data(df: pd.DataFrame) -> pd.DataFrame:
    """
    Handles Temporal data types: Datetime, Timedelta, and Period.
    """
    return (df
        .assign(
            # 1. DATETIME CONVERSION
            starttime=lambda x: pd.to_datetime(x['starttime']),
            stoptime=lambda x: pd.to_datetime(x['stoptime']),
            
            # 2. TIMEDELTA (Duration)
            # Vectorized subtraction creates 'timedelta64' objects
            actual_duration=lambda x: x['stoptime'] - x['starttime'],
            
            # 3. PERIOD (Reporting)
            # Convert to 'Month' period for easy cohort grouping
            ride_month=lambda x: x['starttime'].dt.to_period('M')
        )
        # Filter using readable Timedelta semantics
        .loc[
            lambda x: x['actual_duration'] > pd.Timedelta(minutes=2),
             ['starttime','stoptime', 'actual_duration', 'ride_month']
            ]

    )

# ------------------------------------------------------------------------------
# MAIN PIPELINE
# ------------------------------------------------------------------------------
def main():
    # 1. College Data
    print("--- 1. Processing College Types (Fixed Schema) ---")
    college_df = (pd.read_csv('../data/college.csv')
        .pipe(clean_college_types)
    )
    print(college_df.info())
    print("\n")

    # 2. Diamonds Data
    print("--- 2. Processing Diamonds Schema ---")
    diamonds_df = (pd.read_csv('../data/diamonds.csv')
        .pipe(enforce_diamond_schema)
    )
    print(diamonds_df[['cut', 'clarity']].head())
    print("\n")

    # 3. Bikes Data
    print("--- 3. Processing Temporal Data ---")
    bikes_df = (pd.read_csv('../data/bikes.csv')
        .pipe(process_temporal_data)
    )
    print(bikes_df.head())
    print(f"\nDuration Type: {bikes_df['actual_duration'].dtype}")

if __name__ == "__main__":
    main()

--- 1. Processing College Types (Fixed Schema) ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7535 entries, 0 to 7534
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   instnm    7535 non-null   string 
 1   city      7535 non-null   string 
 2   hbcu      7535 non-null   bool   
 3   satmtmid  1196 non-null   Int16  
 4   satvrmid  1185 non-null   Int16  
 5   ugds      6874 non-null   Float32
dtypes: Float32(1), Int16(2), bool(1), string(2)
memory usage: 495.8 KB
None


--- 2. Processing Diamonds Schema ---
       cut clarity
315  Ideal      I1
535  Ideal      I1
551  Ideal      I1
653  Ideal      I1
718  Ideal      I1


--- 3. Processing Temporal Data ---
            starttime            stoptime actual_duration ride_month
0 2013-06-28 19:01:00 2013-06-28 19:17:00 0 days 00:16:00    2013-06
1 2013-06-28 22:53:00 2013-06-28 23:03:00 0 days 00:10:00    2013-06
2 2013-06-30 14:43:00 2013-06-30 15:01:00 0 days 00:18:

In [10]:
diamonds_df = (pd.read_csv('../data/diamonds.csv')
        .pipe(enforce_diamond_schema)
    )

diamonds_df

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
315,0.96,Ideal,F,I1,60.7,55.0,2801,6.37,6.41,3.88
535,0.96,Ideal,F,I1,60.7,55.0,2826,6.41,6.37,3.88
551,0.97,Ideal,F,I1,60.7,56.0,2830,6.41,6.43,3.90
653,1.01,Ideal,I,I1,61.5,57.0,2844,6.45,6.46,3.97
718,0.97,Ideal,F,I1,60.7,56.0,2856,6.43,6.41,3.90
...,...,...,...,...,...,...,...,...,...,...
41242,0.30,Fair,D,IF,60.5,57.0,1208,4.47,4.35,2.67
43778,0.37,Fair,D,IF,61.2,57.0,1440,4.68,4.73,2.88
47407,0.52,Fair,G,IF,65.5,55.0,1849,4.98,5.06,3.29
49683,0.52,Fair,F,IF,64.6,58.0,2144,5.04,5.17,3.30


In [15]:
bikes_df = (pd.read_csv('../data/bikes.csv')
        .pipe(process_temporal_data)
    )

bikes_df

Unnamed: 0,starttime,stoptime,actual_duration,ride_month
0,2013-06-28 19:01:00,2013-06-28 19:17:00,0 days 00:16:00,2013-06
1,2013-06-28 22:53:00,2013-06-28 23:03:00,0 days 00:10:00,2013-06
2,2013-06-30 14:43:00,2013-06-30 15:01:00,0 days 00:18:00,2013-06
3,2013-07-01 10:05:00,2013-07-01 10:16:00,0 days 00:11:00,2013-07
5,2013-07-01 12:37:00,2013-07-01 12:48:00,0 days 00:11:00,2013-07
...,...,...,...,...
50084,2017-12-30 13:07:00,2017-12-30 13:34:00,0 days 00:27:00,2017-12
50085,2017-12-30 13:34:00,2017-12-30 13:44:00,0 days 00:10:00,2017-12
50086,2017-12-30 13:34:00,2017-12-30 13:48:00,0 days 00:14:00,2017-12
50087,2017-12-31 09:30:00,2017-12-31 09:33:00,0 days 00:03:00,2017-12
