In [1]:
import pandas as pd
import numpy as np
import logging
import warnings

# NOTE(review): this silences every warning for the whole process, including
# pandas deprecation notices and numpy RuntimeWarnings (e.g. divide-by-zero).
# Consider narrowing the filter to specific categories once the notebook is stable.
warnings.filterwarnings("ignore")

# Configure the root logger: INFO and above, each record prefixed with a
# timestamp and its level name.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)

def get_column_stats(df):
    """Summarise every column of *df* in a dict keyed by column name.

    Numeric columns map to ``{'min', 'max', 'mean'}``; every other column
    maps to ``{'unique'}`` holding its number of distinct values.
    """
    def _summarise(series):
        # Non-numeric data only gets a cardinality figure.
        if not pd.api.types.is_numeric_dtype(series):
            return {'unique': series.nunique()}
        return {
            'min': series.min(),
            'max': series.max(),
            'mean': series.mean(),
        }

    return {name: _summarise(df[name]) for name in df.columns}

def compare_stats(stats_before, stats_after):
    """Log a warning for every column whose statistics changed.

    Args:
        stats_before: Mapping of column name to a dict of statistics taken
            before the conversion (as produced by ``get_column_stats``).
        stats_after: The same mapping taken after the conversion.

    A column that is present in ``stats_before`` but absent from
    ``stats_after`` is reported with a warning instead of raising
    ``KeyError``. Two statistics that are both NaN are treated as equal, so
    an all-NaN or empty column does not produce a spurious warning.
    """
    def _same_value(left, right):
        # NaN never compares equal to itself (x != x is true only for NaN),
        # so handle "both NaN" explicitly before falling back to ==.
        if left is right:
            return True
        try:
            if left != left and right != right:
                return True
        except (TypeError, ValueError):
            pass
        return bool(left == right)

    for col, before in stats_before.items():
        if col not in stats_after:
            logging.warning(f"Column {col} is missing from the statistics after conversion")
            continue

        after = stats_after[col]
        unchanged = before.keys() == after.keys() and all(
            _same_value(before[key], after[key]) for key in before
        )
        if not unchanged:
            logging.warning(f"Column {col} has changed: {before} != {after}")

def calculate_precision_loss(stats_before, stats_after):
    """Log the relative change of each numeric column's mean, in percent.

    Args:
        stats_before: Mapping of column name to statistics before conversion.
            Only entries that contain a ``'mean'`` key are considered.
        stats_after: The same mapping taken after conversion.

    The loss is ``|mean_before - mean_after| / |mean_before| * 100``. When
    ``mean_before`` is zero the ratio is undefined: an unchanged mean is
    reported as 0%, a changed one is reported with its absolute difference.
    Columns that have no mean after the conversion are skipped with a
    warning instead of raising ``KeyError``.
    """
    for col, before in stats_before.items():
        if 'mean' not in before:
            continue

        after = stats_after.get(col)
        if after is None or 'mean' not in after:
            logging.warning(f"Column {col} has no mean after conversion; precision loss not computed")
            continue

        mean_before = before['mean']
        mean_after = after['mean']
        difference = abs(mean_before - mean_after)

        if mean_before == 0:
            # A relative loss cannot be expressed against a zero baseline.
            if difference != 0:
                logging.info(
                    f"Column {col} precision loss: undefined "
                    f"(mean before is 0, absolute difference {difference:.6g})"
                )
                continue
            precision_loss = 0.0
        else:
            precision_loss = difference / abs(mean_before) * 100

        logging.info(f"Column {col} precision loss: {precision_loss:.6f}%")

def _downcast_target(series):
    """Return a smaller numpy dtype able to hold *series*, or None to leave it as is."""
    dtype = series.dtype

    # Only plain numpy signed/unsigned integers and floats are downcast.
    # Booleans, datetimes, categoricals and pandas extension dtypes are left
    # alone: they are either already compact or cannot be range-checked
    # against numpy's integer/float limits.
    if not isinstance(dtype, np.dtype) or dtype.kind not in 'iuf':
        return None

    c_min = series.min()
    c_max = series.max()

    if dtype.kind == 'f':
        limits = np.finfo(np.float32)
        # Never upcast: float16/float32 columns keep their dtype. An all-NaN
        # column fails both comparisons and is left untouched as well.
        if dtype.itemsize > 4 and limits.min <= c_min and c_max <= limits.max:
            return np.dtype(np.float32)
        return None

    if dtype.kind == 'i':
        candidates = (np.int8, np.int16, np.int32)
    else:
        candidates = (np.uint8, np.uint16, np.uint32)

    for candidate in candidates:
        candidate = np.dtype(candidate)
        if candidate.itemsize >= dtype.itemsize:
            break
        limits = np.iinfo(candidate)
        # Inclusive bounds: a value exactly at a type's limit still fits.
        if limits.min <= c_min and c_max <= limits.max:
            return candidate
    return None


def reduce_mem_usage(df, verbose=True):
    """Shrink the memory footprint of *df* by downcasting its columns in place.

    Args:
        df: DataFrame to optimise. It is modified in place.
        verbose: When true, log the memory usage before and after and the
            percentage saved.

    Returns:
        The same DataFrame object, with converted columns.

    Conversion rules:
        * ``object`` columns become ``category``.
        * Signed and unsigned integer columns are moved to the narrowest
          integer type of the same signedness whose range contains the
          column's min and max.
        * 64-bit float columns whose values fit the float32 range become
          float32. This is lossy; the mean shift per column is logged by
          ``calculate_precision_loss``.
        * Every other dtype (bool, datetime, category, extension dtypes) is
          left unchanged.

    Column statistics are captured before and after, and any change is
    logged through ``compare_stats`` and ``calculate_precision_loss``.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        logging.info(f'Start memory usage of dataframe: {start_mem:.2f} MB')

    stats_before = get_column_stats(df)

    for col in df.columns:
        if df[col].dtype == object:
            df[col] = df[col].astype('category')
            continue

        target = _downcast_target(df[col])
        if target is not None:
            df[col] = df[col].astype(target)

    stats_after = get_column_stats(df)
    compare_stats(stats_before, stats_after)
    calculate_precision_loss(stats_before, stats_after)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        logging.info(f'End memory usage of dataframe: {end_mem:.2f} MB')
        logging.info(f'Decreased by {(100 * (start_mem - end_mem) / start_mem):.1f}%')

    return df

def safe_map(df, column, mapping):
    """Replace the values of ``df[column]`` using *mapping*, warning about gaps.

    Args:
        df: DataFrame holding the column. It is modified in place.
        column: Name of the column to map.
        mapping: Dict from original value to replacement value.

    Returns:
        The same DataFrame, with ``df[column]`` replaced by the mapped values.
        Values that have no entry in *mapping* become missing (NaN).

    A warning listing the unmapped values is logged when any are present.
    Missing values (None/NaN) are not reported as unknown categories: they
    are not categories, and each NaN would otherwise be listed separately.
    """
    observed = df[column].dropna().unique()
    unknown_categories = set(observed) - set(mapping)
    if unknown_categories:
        logging.warning(f"Unknown categories in column {column}: {unknown_categories}")
    df[column] = df[column].map(mapping)
    return df

# Load a sample of the dataset
def load_sample_data(file, sample_size=10000):
    """Read the first *sample_size* rows of a CSV into a DataFrame.

    Args:
        file: Path or file-like object accepted by ``pd.read_csv``.
        sample_size: Maximum number of data rows to read.

    Returns:
        The loaded DataFrame.

    Raises:
        Exception: Any error raised while reading is logged and re-raised
            unchanged.
    """
    try:
        sample = pd.read_csv(file, nrows=sample_size)
        logging.info(f'Sample data loaded from {file}')
        return sample
    except Exception as err:
        logging.error(f"Error loading sample data from {file}: {err}")
        raise

def preprocess_data(df):
    """Encode the categorical columns as integers and drop ``Driving_License``.

    ``Gender``, ``Vehicle_Damage`` and ``Vehicle_Age`` are mapped through
    ``safe_map`` in that order. The frame is modified in place and returned.
    """
    encodings = {
        'Gender': {'Male': 1, 'Female': 0},
        'Vehicle_Damage': {'Yes': 1, 'No': 0},
        'Vehicle_Age': {'< 1 Year': 0, '1-2 Year': 1, '> 2 Years': 2},
    }

    for column_name, value_map in encodings.items():
        df = safe_map(df, column_name, value_map)

    df.drop(columns=['Driving_License'], inplace=True)
    return df

def feature_engineering(df):
    """Add interaction features pairing ``Previously_Insured`` with other columns.

    For each of ``Annual_Premium``, ``Vehicle_Age``, ``Vehicle_Damage`` and
    ``Vintage``, the two columns are rendered as strings, concatenated, and
    the resulting labels are integer-encoded with ``pd.factorize``. The codes
    are stored in a new column named ``Previously_Insured_<other column>``.
    The frame is modified in place and returned.
    """
    anchor = 'Previously_Insured'
    partners = ('Annual_Premium', 'Vehicle_Age', 'Vehicle_Damage', 'Vintage')

    for partner in partners:
        combined = df[anchor].astype(str) + df[partner].astype(str)
        codes, _ = pd.factorize(combined)
        df[f'{anchor}_{partner}'] = codes

    return df

# Path to the training CSV.
# NOTE(review): absolute, machine-specific path; the cell only runs where this file exists.
file_path = r"C:\Users\paulo\OneDrive\Documents\kaggle_competition_2_datasets\train.csv"

# Load a sample of the data (load_sample_data reads at most 10000 rows by default)
sample_df = load_sample_data(file_path)

# Keep an untouched copy: reduce_mem_usage converts columns of the frame it
# is given, so the original values are needed for the comparison below.
original_sample_df = sample_df.copy()

# Downcast column dtypes to reduce memory usage
sample_df = reduce_mem_usage(sample_df)

# Compare the original and the optimized frame cell by cell; anything that
# compare() reports as different is logged and printed.
comparison = original_sample_df.compare(sample_df)
if not comparison.empty:
    logging.warning("Differences found between original and optimized data:")
    print(comparison)

# Apply preprocessing (categorical encoding, drops Driving_License)
sample_df = preprocess_data(sample_df)

# Apply feature engineering (adds the Previously_Insured_* interaction columns)
sample_df = feature_engineering(sample_df)

logging.info("Sample data processed successfully.")

# Display the first rows of the processed sample
print(sample_df.head())


2024-07-23 21:16:21,727 - INFO - Sample data loaded from C:\Users\paulo\OneDrive\Documents\kaggle_competition_2_datasets\train.csv
2024-07-23 21:16:21,729 - INFO - Start memory usage of dataframe: 0.92 MB
2024-07-23 21:16:21,744 - INFO - Column id precision loss: 0.000000%
2024-07-23 21:16:21,744 - INFO - Column Age precision loss: 0.000000%
2024-07-23 21:16:21,744 - INFO - Column Driving_License precision loss: 0.000000%
2024-07-23 21:16:21,745 - INFO - Column Region_Code precision loss: 0.000001%
2024-07-23 21:16:21,745 - INFO - Column Previously_Insured precision loss: 0.000000%
2024-07-23 21:16:21,745 - INFO - Column Annual_Premium precision loss: 0.000001%
2024-07-23 21:16:21,747 - INFO - Column Policy_Sales_Channel precision loss: 0.000003%
2024-07-23 21:16:21,747 - INFO - Column Vintage precision loss: 0.000000%
2024-07-23 21:16:21,748 - INFO - Column Response precision loss: 0.000000%
2024-07-23 21:16:21,749 - INFO - End memory usage of dataframe: 0.22 MB
2024-07-23 21:16:21,74

   id Gender  Age  Region_Code  Previously_Insured Vehicle_Age Vehicle_Damage  \
0   0      1   21         35.0                   0           1              1   
1   1      1   43         28.0                   0           2              1   
2   2      0   25         14.0                   1           0              0   
3   3      0   35          1.0                   0           1              1   
4   4      0   36         15.0                   1           1              0   

   Annual_Premium  Policy_Sales_Channel  Vintage  Response  \
0         65101.0                 124.0      187         0   
1         58911.0                  26.0      288         1   
2         38043.0                 152.0      254         0   
3          2630.0                 156.0       76         0   
4         31951.0                 152.0      294         0   

   Previously_Insured_Annual_Premium  Previously_Insured_Vehicle_Age  \
0                                  0                               0