# Imports

In [6]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
import pandas as pd
import numpy as np

# Read Data

In [5]:
test_data = pd.read_csv(filepath_or_buffer='data/test.csv', nrows=100)
train_data = pd.read_csv(filepath_or_buffer='data/train.csv', nrows=100)

# Data Preprocessing/Transformations

## Explore

In [7]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 21 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   id                    100 non-null    int64  
 1   Age                   99 non-null     float64
 2   Gender                100 non-null    object 
 3   Annual Income         94 non-null     float64
 4   Marital Status        99 non-null     object 
 5   Number of Dependents  92 non-null     float64
 6   Education Level       100 non-null    object 
 7   Occupation            65 non-null     object 
 8   Health Score          94 non-null     float64
 9   Location              100 non-null    object 
 10  Policy Type           100 non-null    object 
 11  Previous Claims       71 non-null     float64
 12  Vehicle Age           100 non-null    float64
 13  Credit Score          86 non-null     float64
 14  Insurance Duration    100 non-null    float64
 15  Policy Start Date     10

In [13]:
def cleanse_column_names(data):
    new_cols = []
    for col in data.columns:
        new_cols.append(col.lower().replace(' ', '_'))
    data.columns = new_cols
    return data.columns

In [16]:
cleanse_column_names(test_data)

Index(['id', 'age', 'gender', 'annual_income', 'marital_status',
       'number_of_dependents', 'education_level', 'occupation', 'health_score',
       'location', 'policy_type', 'previous_claims', 'vehicle_age',
       'credit_score', 'insurance_duration', 'policy_start_date',
       'customer_feedback', 'smoking_status', 'exercise_frequency',
       'property_type'],
      dtype='object')

In [11]:
numeric_features = train_data.select_dtypes(include='number').columns
numeric_features

Index(['id', 'Age', 'Annual Income', 'Number of Dependents', 'Health Score',
       'Previous Claims', 'Vehicle Age', 'Credit Score', 'Insurance Duration',
       'Premium Amount'],
      dtype='object')

In [None]:
categorical_features = []

In [None]:
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())   
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('encoder', OneHotEncoder(drop='first', handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('numeric', numeric_transformer, numeric_features)
    ]
)

In [None]:

def create_pipeline(numeric_features, categorical_features):
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])
    
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(drop='first', handle_unknown='ignore'))
    ])
    
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)
        ])
    
    full_pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', RandomForestRegressor(random_state=42))
    ])
    
    return full_pipeline


In [None]:
numeric_features = ['age', 'income']
categorical_features = ['education', 'location']
    
pipeline = create_pipeline(numeric_features, categorical_features)
pipeline.fit(X, y)
    
# Make predictions
predictions = pipeline.predict(X)