In [10]:
# !pip install pandas
# !pip install scikit-learn

In [1]:
import warnings
warnings.filterwarnings("ignore")

In [None]:
import pandas as pd

df = pd.read_csv("../data/MachineLearningRating_v3.txt", sep="|")


print("Shape:", df.shape)
df.head()

In [None]:
df.info()
df.describe(include='all')
df.isnull().sum()

In [None]:
# Convert TransactionMonth to datetime
df["TransactionMonth"] = pd.to_datetime(df["TransactionMonth"])

In [None]:
df = df.drop(columns=["CrossBorder", "NumberOfVehiclesInFleet"])

### Handling Missing Values

In [8]:
# Missing values before applying ML model to hanldle it
df['CustomValueEstimate'].isnull().sum()  

np.int64(779642)

In [9]:
# import pandas as pd
# from sklearn.ensemble import RandomForestRegressor

# # Features that are useful for predicting
# features = [
#     'make', 'Province', 'RegistrationYear', 'SumInsured',
#     'kilowatts', 'cubiccapacity', 'bodytype'
# ]

# # Split into rows with and without missing CustomValueEstimate
# df_train = df[df['CustomValueEstimate'].notnull()]
# df_missing = df[df['CustomValueEstimate'].isnull()]

# # Sample dataset to train (only 10,000 rows for training to avoid memory issues)
# df_train_sample = df_train.sample(10000, random_state=42)

# # Combine both sets 
# df_all = pd.concat([df_train_sample, df_missing], axis=0)

# # One-hot encode the categorical columns
# df_encoded = pd.get_dummies(df_all[features])

# # Split encoded features back into X_train and X_pred
# X_train = df_encoded.iloc[:len(df_train_sample), :]
# X_pred = df_encoded.iloc[len(df_train_sample):, :]

# # Target variable
# y_train = df_train_sample['CustomValueEstimate']

# # Train model
# model = RandomForestRegressor(n_estimators=50, max_depth=10, random_state=42)
# model.fit(X_train, y_train)

# # Predict missing values
# predicted_values = model.predict(X_pred)

# # Fill in the missing values in the original dataframe
# df.loc[df['CustomValueEstimate'].isnull(), 'CustomValueEstimate'] = predicted_values


In [10]:
# After handling missing values 
# df['CustomValueEstimate'].isnull().sum() 

### Handling some common missing values

In [11]:
# # Fill common missing values
# df['Gender'].fillna(df['Gender'].mode()[0], inplace=True)
# df['MaritalStatus'].fillna(df['MaritalStatus'].mode()[0], inplace=True)

# df['Bank'].fillna("Unknown", inplace=True)
# df['AccountType'].fillna("Unknown", inplace=True)

In [12]:
# After handling missing values 
df['CustomValueEstimate'].isnull().sum() 

np.int64(779642)

In [13]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import train_test_split

# Fill Simple Columns
def fill_simple_values(df):
    df['Gender'].fillna(df['Gender'].mode()[0], inplace=True)
    df['MaritalStatus'].fillna(df['MaritalStatus'].mode()[0], inplace=True)
    df['Bank'].fillna("Unknown", inplace=True)
    df['AccountType'].fillna("Unknown", inplace=True)
    return df


# Predict Binary Columns
def predict_binary_column(df, target_col, features):
    print(f"Imputing {target_col}...")

    # Convert Yes/No to binary
    df[target_col] = df[target_col].map({'Yes': 1, 'No': 0})

    # Create subset where target and features are not missing
    df_subset = df[features + [target_col]].dropna()

    # Split into known and missing
    train_data = df_subset[df_subset[target_col].notnull()]
    predict_data = df[df[target_col].isnull()]

    # One-hot encode features
    X = pd.get_dummies(train_data[features])
    y = train_data[target_col]

    X_pred = pd.get_dummies(predict_data[features])
    X_pred = X_pred.reindex(columns=X.columns, fill_value=0)

    # Train classifier
    model = RandomForestClassifier(n_estimators=50, max_depth=8, random_state=42)
    model.fit(X, y)

    # Predict
    predicted = model.predict(X_pred)

    # Fill predictions back
    df.loc[df[target_col].isnull(), target_col] = predicted

    return df


# Predict CustomValueEstimate

def predict_custom_value(df):
    print("Imputing CustomValueEstimate...")

    features = [
        'make', 'Province', 'bodytype',
        'RegistrationYear', 'SumInsured', 'kilowatts', 'cubiccapacity'
    ]

    # Clean training data
    df_train = df[df['CustomValueEstimate'].notnull()].dropna(subset=features)
    df_missing = df[df['CustomValueEstimate'].isnull()].dropna(subset=features)

    # Save the indexes of rows we're predicting
    missing_indexes = df_missing.index

    # Sample only 10k rows to avoid memory issues
    df_train_sample = df_train.sample(10000, random_state=42)

    # Combine for consistent encoding
    df_all = pd.concat([df_train_sample[features], df_missing[features]])
    df_encoded = pd.get_dummies(df_all)

    # Split into train and predict sets
    X_train = df_encoded.iloc[:len(df_train_sample), :]
    X_pred = df_encoded.iloc[len(df_train_sample):]
    y_train = df_train_sample['CustomValueEstimate']

    # Train model
    from sklearn.ensemble import RandomForestRegressor
    model = RandomForestRegressor(n_estimators=50, max_depth=8, random_state=42)
    model.fit(X_train, y_train)

    # Predict
    predicted = model.predict(X_pred)

    # ✅ Assign back using matching indexes
    df.loc[missing_indexes, 'CustomValueEstimate'] = predicted

    print("CustomValueEstimate imputation complete.")
    return df



def run_full_imputation(df):
    # Fill simple columns
    df = fill_simple_values(df)

    # Common features to use in all imputations
    features = [
    'make', 'Province', 'bodytype',
    'RegistrationYear', 'SumInsured', 'kilowatts', 'cubiccapacity'
]

    # Predict binary targets
    for col in ['WrittenOff', 'Rebuilt', 'Converted']:
        df = predict_binary_column(df, col, features)

    # Predict CustomValueEstimate
    df = predict_custom_value(df)

    return df


In [14]:
df = run_full_imputation(df)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Gender'].fillna(df['Gender'].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['MaritalStatus'].fillna(df['MaritalStatus'].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate o

Imputing WrittenOff...
Imputing Rebuilt...
Imputing Converted...
Imputing CustomValueEstimate...
CustomValueEstimate imputation complete.


In [15]:
# After handling missing values 
df['CustomValueEstimate'].isnull().sum() 

np.int64(552)

In [16]:
# Final clean-up after model-based predictions
df['CustomValueEstimate'].fillna(df['CustomValueEstimate'].median(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['CustomValueEstimate'].fillna(df['CustomValueEstimate'].median(), inplace=True)


In [17]:
df['CustomValueEstimate'].isnull().sum() 

np.int64(0)

In [18]:
df.isnull().sum()

UnderwrittenCoverID              0
PolicyID                         0
TransactionMonth                 0
IsVATRegistered                  0
Citizenship                      0
LegalType                        0
Title                            0
Language                         0
Bank                             0
AccountType                      0
MaritalStatus                    0
Gender                           0
Country                          0
Province                         0
PostalCode                       0
MainCrestaZone                   0
SubCrestaZone                    0
ItemType                         0
mmcode                         552
VehicleType                    552
RegistrationYear                 0
make                           552
Model                          552
Cylinders                      552
cubiccapacity                  552
kilowatts                      552
bodytype                       552
NumberOfDoors                  552
VehicleIntroDate    

In [1]:
categorical_car_cols = ['make', 'Model', 'bodytype', 'VehicleType']
for col in categorical_car_cols:
    df[col] = df[col].fillna('Unknown')

numerical_car_cols = ['kilowatts', 'cubiccapacity', 'Cylinders']
for col in numerical_car_cols:
    df[col] = df[col].astype(str).str.replace(",", "").astype(float)
    df[col] = df[col].fillna(df[col].median())

if df['VehicleIntroDate'].isnull().sum() > 0:
    df['VehicleIntroDate'] = df['VehicleIntroDate'].fillna(df['VehicleIntroDate'].mode()[0])

df['NewVehicle'] = df['NewVehicle'].fillna('Unknown')

df['CapitalOutstanding'] = (
    df['CapitalOutstanding']
    .astype(str)
    .str.replace(",", ".")  # comma to dot
    .str.replace(r"[^\d.]", "", regex=True)  # remove any extra characters
)
df['CapitalOutstanding'] = pd.to_numeric(df['CapitalOutstanding'], errors='coerce')
df['CapitalOutstanding'] = df['CapitalOutstanding'].fillna(df['CapitalOutstanding'].median())


NameError: name 'df' is not defined