### Part 1: setting up the environment

Here I am going to set up the basics of our environment. 
I skipped the dependency installation because most of us, as data analysts/scientists, already have these packages installed.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
import re

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

print("All dependencies are running.")

### Part 2: data cleaning

In [None]:
# Dataset loading: read 'test.csv' (path relative to the working directory) into `data`.
data = pd.read_csv('test.csv')
print("Dataset loaded successfully.")

Let's evaluate some issues our data might have, such as missing or invalid values in the rows. Here we want to know which columns have NULL values and what share of the rows they affect. 

This is going to help us decide whether to act, because not every missing value needs to be filled: some columns are not important here.

In [None]:
# Percentage of null entries in every column (null count over total row count).
missing_percentage = data.isnull().sum().div(len(data)).mul(100)
print(missing_percentage)

Look closely: the metered fare column has only 0.40% of missing values, probably from failed trips because... well, a finished trip always has an actual fare. As we knew, the upfront price has 31% of missing values, maybe from rides that started without waiting for the prediction. 

Those missing values are going to be filled later, when we have the prediction model.

As we see, there's no need to fill missing values right now. Instead, we are going to drop the rows where either of those two columns is missing, so they won't get in our way in the models.

In [None]:
# Keep only the trips where both the metered and the upfront fare are present.
data = data.dropna(subset=['metered_price', 'upfront_price'])

# Sanity check: rows left, and the share of NaN still present in each price column.
print(f"Remaining rows: {data.shape[0]}")
print(f"% of NaN in metered_price: {data['metered_price'].isna().mean() * 100:.2f}%")
print(f"% of NaN in upfront_price: {data['upfront_price'].isna().mean() * 100:.2f}%")

Now let's remove duplicates, using the column 'order_id_new' since it's the unique ID of each trip. 

In [None]:
# Rows sharing an 'order_id_new' are duplicates; only the first occurrence is kept.
data = data.drop_duplicates(subset=['order_id_new'], keep='first')

print(f"Remaining rows: {data.shape[0]}")
print(f"% of NaN in order_id_new: {data['order_id_new'].isna().mean() * 100:.2f}%")


As we were told in the case explanation, every time the metered fare goes 20% above or below the upfront fare, we fall back to the metered fare. 

It's important to flag when this happens, so let's create a column beside the upfront fare to see it. 
- True (1) for when it happens; 
- and False (0) when it doesn't.

In [None]:
# Absolute gap between metered and upfront fare, as a percentage of the upfront fare
data['deviation_percent'] = (
    (data['metered_price'] - data['upfront_price'])
    .abs()
    .div(data['upfront_price'])
    .mul(100)
)

# 1 when the gap is strictly greater than 20%, 0 otherwise
data['deviation_flag'] = data['deviation_percent'].gt(20).astype(int)

# Preview the price columns next to the two new deviation columns
print(data[['metered_price', 'upfront_price', 'deviation_percent', 'deviation_flag']].head())

After removing the NaNs, nulls, missing values, etc., we have to take care of the outliers. 

But first, we create a backup copy of the dataset named filtered_data.

In [None]:
# Independent copy of the cleaned data; the outlier handling below starts from this copy.
filtered_data = data.copy()

Dealing with outliers using interquartile range separation and quartile splits.

In [None]:
def remove_outliers_iqr(df, columns, factor=1.5):
    """
    Removes outliers from specified columns in a DataFrame using the IQR method.

    A row is kept for a column when its value lies inside
    [Q1 - factor * IQR, Q3 + factor * IQR] (both ends inclusive).

    Columns are processed one after another: the quartiles of each column are
    computed on the rows that survived the previous columns, so the result can
    depend on the order of ``columns``. Rows holding NaN in a checked column
    are dropped, because NaN never satisfies the bounds comparison.

    Parameters:
        df (DataFrame): The DataFrame to process. It is not modified.
        columns (list): The list of columns to apply IQR outlier removal.
        factor (float): Multiple of the IQR allowed beyond Q1 and Q3.
            Defaults to 1.5, the conventional Tukey fence.

    Returns:
        DataFrame: The filtered rows. When ``columns`` is empty the input
        object itself is returned unchanged.
    """
    for col in columns:
        q1 = df[col].quantile(0.25)
        q3 = df[col].quantile(0.75)
        margin = factor * (q3 - q1)

        # between() is inclusive on both ends, i.e. lower <= value <= upper
        df = df[df[col].between(q1 - margin, q3 + margin)]

    return df

# Numeric columns whose extreme values should be filtered out
columns_to_check = [
    'distance',
    'duration',
    'metered_price',
    'upfront_price',
]

# Filter a copy of filtered_data; filtered_data itself is still used for the "before" count
filtered_data_no_outliers = remove_outliers_iqr(filtered_data.copy(), columns_to_check)

print(f"Rows before: {len(filtered_data)}")
print(f"Rows after: {len(filtered_data_no_outliers)}")

Splitting in long and short rides...

In [None]:
# Threshold: the 75th percentile of metered_price after outlier removal
Q3 = filtered_data_no_outliers['metered_price'].quantile(0.75)

# Rides at or below the threshold are "short", the rest are "long"; each side is an independent copy
short_rides = filtered_data_no_outliers.loc[filtered_data_no_outliers['metered_price'].le(Q3)].copy()
long_rides = filtered_data_no_outliers.loc[filtered_data_no_outliers['metered_price'].gt(Q3)].copy()

print(f"Threshold for short rides (Q3): {Q3:.2f}")
print(f"Short rides: {len(short_rides)} rows")
print(f"Long rides: {len(long_rides)} rows")

I think there's no need for transformations

In [None]:
# filtered_data_no_outliers['distance_log'] = np.log1p(filtered_data_no_outliers['distance'])
# filtered_data_no_outliers['duration_log'] = np.log1p(filtered_data_no_outliers['duration'])
# filtered_data_no_outliers['metered_price_log'] = np.log1p(filtered_data_no_outliers['metered_price'])
# filtered_data_no_outliers['upfront_price_log'] = np.log1p(filtered_data_no_outliers['upfront_price'])

# # Log-transform distance, duration, and prices for short rides
# short_rides['distance_log'] = np.log1p(short_rides['distance'])
# short_rides['duration_log'] = np.log1p(short_rides['duration'])
# short_rides['metered_price_log'] = np.log1p(short_rides['metered_price'])
# short_rides['upfront_price_log'] = np.log1p(short_rides['upfront_price'])

# # Log-transform distance, duration, and prices for long rides
# long_rides['distance_log'] = np.log1p(long_rides['distance'])
# long_rides['duration_log'] = np.log1p(long_rides['duration'])
# long_rides['metered_price_log'] = np.log1p(long_rides['metered_price'])
# long_rides['upfront_price_log'] = np.log1p(long_rides['upfront_price'])

# print("Log transformations applied successfully.")

In [None]:
# # Function to calculate statistics
# def calculate_statistics(df, columns):
#     stats = {}
#     for col in columns:
#         stats[col] = {
#             'mean': df[col].mean(),
#             'std': df[col].std(),
#             'skew': df[col].skew(),
#             'kurtosis': df[col].kurtosis()
#         }
#     return pd.DataFrame(stats).T

# # Columns before and after transformations
# original_columns = ['distance', 'duration', 'metered_price', 'upfront_price']
# log_transformed_columns = ['distance_log', 'duration_log', 'metered_price_log', 'upfront_price_log']

# # Calculate statistics before and after transformations
# stats_before = calculate_statistics(filtered_data_no_outliers, original_columns)
# stats_after = calculate_statistics(filtered_data_no_outliers, log_transformed_columns)

# # Display the statistics
# print("Statistics Before Transformations:")
# print(stats_before)

# print("\nStatistics After Transformations:")
# print(stats_after)

Plotting before and after the transformations to analyse the changes in skewness (the cell below is commented out, since the transformations were not applied).

In [None]:
# import matplotlib.pyplot as plt
# import seaborn as sns

# # Create subplots for each variable before and after transformation
# fig, axes = plt.subplots(4, 2, figsize=(14, 16))
# fig.suptitle('Distributions Before and After Log Transformations', fontsize=16)

# # List of variables and their log-transformed counterparts
# variables = ['distance', 'duration', 'metered_price', 'upfront_price']
# log_variables = ['distance_log', 'duration_log', 'metered_price_log', 'upfront_price_log']

# # Plot each variable before and after transformation
# for i, var in enumerate(variables):
#     # Plot original variable
#     sns.histplot(filtered_data_no_outliers[var], kde=True, ax=axes[i, 0], bins=30)
#     axes[i, 0].set_title(f'{var.capitalize()} (Before Transformation)')
#     axes[i, 0].set_xlabel(var.capitalize())
#     axes[i, 0].set_ylabel('Frequency')
    
#     # Plot log-transformed variable
#     sns.histplot(filtered_data_no_outliers[log_variables[i]], kde=True, ax=axes[i, 1], bins=30)
#     axes[i, 1].set_title(f'{var.capitalize()} (After Log Transformation)')
#     axes[i, 1].set_xlabel(f'{var.capitalize()} (Log)')
#     axes[i, 1].set_ylabel('Frequency')

# plt.tight_layout(rect=[0, 0, 1, 0.96])
# plt.show()

### Part 3: data exploration and model creation

Now that we have cleaned and split the data into short and long rides, we aim to model the fare calculation. Fares are often calculated using linear combinations of distance, time, and base fare, but real-world data may include non-linearity and interactions (such as dynamic fare multipliers). Because of this, we are going to use Random Forest, a non-linear model, to capture these relationships without assuming a pre-defined function. Specifically, we want to predict the upfront prices based on:	

- Distance.

- Duration.

- Base fare

The model allows us to approximate the fare calculation while dealing with complexities and outliers in the data.

Using a regression algorithm, we are going to learn how the pricing equation behaves, separately for short and long rides, due to their differences. Because of the non-linear relationship, let's use something different from the usual linear regression algorithm.

In [None]:
# Short Rides Random Forest Model
# Fits a RandomForestRegressor that predicts metered_price from distance,
# duration and upfront_price, then prints MAE, MSE and R2 on a 20% hold-out.
# NOTE(review): short_rides/long_rides are presumably the subsets split on
# metered_price in the cleaning part; that column is also the target here,
# so confirm the reported metrics are interpreted with that in mind.
print("=== Short Rides ===")

# Define features and target for short rides
X_short = short_rides[['distance', 'duration', 'upfront_price']]
y_short = short_rides['metered_price']

# Split the data into training and testing sets (80/20, fixed seed for repeatability)
X_train_short, X_test_short, y_train_short, y_test_short = train_test_split(
    X_short, y_short, test_size=0.2, random_state=42
)

# Initialize and train the Random Forest model (100 trees, fixed seed)
rf_model_short = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model_short.fit(X_train_short, y_train_short)

# Make predictions on the held-out rows and evaluate
y_pred_short = rf_model_short.predict(X_test_short)
mae_short = mean_absolute_error(y_test_short, y_pred_short)
mse_short = mean_squared_error(y_test_short, y_pred_short)
r2_short = r2_score(y_test_short, y_pred_short)

print(f"Mean Absolute Error (MAE): {mae_short}")
print(f"Mean Squared Error (MSE): {mse_short}")
print(f"R-squared (R2): {r2_short}")

# Long Rides Random Forest Model
# Same procedure as above, applied to the long_rides subset.
print("\n=== Long Rides ===")

# Define features and target for long rides
X_long = long_rides[['distance', 'duration', 'upfront_price']]
y_long = long_rides['metered_price']

# Split the data into training and testing sets (80/20, fixed seed for repeatability)
X_train_long, X_test_long, y_train_long, y_test_long = train_test_split(
    X_long, y_long, test_size=0.2, random_state=42
)

# Initialize and train the Random Forest model (100 trees, fixed seed)
rf_model_long = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model_long.fit(X_train_long, y_train_long)

# Make predictions on the held-out rows and evaluate
y_pred_long = rf_model_long.predict(X_test_long)
mae_long = mean_absolute_error(y_test_long, y_pred_long)
mse_long = mean_squared_error(y_test_long, y_pred_long)
r2_long = r2_score(y_test_long, y_pred_long)

print(f"Mean Absolute Error (MAE): {mae_long}")
print(f"Mean Squared Error (MSE): {mse_long}")
print(f"R-squared (R2): {r2_long}")

Now we have a fairly accurate model to understand how metered prices work using basic variables such as distance and time. Let's bring in the upfront prices, repeating the analysis to understand, in terms of the same variables, where they come from, and compare them with the metered prices to understand the difference.

There's some spread here because we aren't taking into consideration some external variables such as GPS and destination changes, but the R-squared gives some degree of trustworthiness in the model. For now, that's enough. 

Now, let's keep looking at the upfront prices and understand how much some variables influenced them, mainly in the bad predictions, and that's why we kept them before.

- Rider/driver app version: some versions may be buggy about predicting pricing.
- Device brand/model: some devices may only run buggy versions.
- GPS confidence: this one is special. We can cross it with device specifications to see what models are likely to deliver bad predictions.

To understand the weight of each column in the prediction, we are going to use the Information Value (IV) tool.

In [None]:
# App version treatment (rider and driver columns): missing values become '0',
# every character other than digits and dots is removed, leading dots are
# stripped, empty results become '0', and the remainder is parsed as a float.
# NOTE(review): a value that still holds more than one dot after cleaning
# (e.g. '4.17.2') cannot be parsed by astype(float) - confirm the data never has one.
for version_column in ('rider_app_version', 'driver_app_version'):
    raw_versions = filtered_data_no_outliers[version_column].fillna('0')
    digits_and_dots = raw_versions.str.replace(r'[^0-9.]', '', regex=True).str.lstrip('.')
    filtered_data_no_outliers[version_column] = digits_and_dots.replace('', '0').astype(float)

print(filtered_data_no_outliers[['rider_app_version', 'driver_app_version']].head())

Data treatment of the device name, generating the device brand column to distinguish iOS from Android devices. 

In [None]:

def extract_brand(device_name):
    """
    Return the brand part of a raw device name.

    Any name containing "iphone" (case-insensitive) maps to 'iPhone'.
    Otherwise the name is split on spaces, underscores, hyphens and commas,
    and the first non-empty piece is returned.

    Parameters:
        device_name: Raw device name. Non-string values (None, NaN) are accepted.

    Returns:
        str: The brand, or 'Unknown' when the input is not a string or holds
        no usable token.
    """
    # Missing values reach this function as None/NaN when applied to a column;
    # they have no .lower(), so give them an explicit bucket instead of failing.
    if not isinstance(device_name, str):
        return 'Unknown'

    # Normalise every iPhone model to a single 'iPhone' brand
    if 'iphone' in device_name.lower():
        return 'iPhone'

    # Split on the separators and take the first non-empty token, so a name
    # that starts with a separator does not yield an empty brand
    for token in re.split(r'[ _\-,]', device_name):
        token = token.strip()
        if token:
            return token

    return 'Unknown'

# Derive the brand for every value of the 'device_name' column
filtered_data_no_outliers['device_brand'] = filtered_data_no_outliers['device_name'].map(extract_brand)

# Show the first rows to validate the result
print(filtered_data_no_outliers[['device_name', 'device_brand']].head())

In [None]:
#Information value and weight of evidence
def calculate_woe_iv(df, feature, target):
    """
    Compute Weight of Evidence per category of `feature` and its Information Value.

    `target` is summed per category to count events; the non-null count minus
    that sum gives the non-events. Each count is turned into its share of the
    overall total, and 1e-10 is added to both shares inside the logarithm.

    Parameters:
        df (DataFrame): Data holding both columns.
        feature (str): Column whose values define the groups.
        target (str): Column summed to count events (a 0/1 flag in this notebook).

    Returns:
        tuple: (DataFrame indexed by category with the columns 'event_rate',
        'non_event_rate', 'WoE' and 'IV', total Information Value).
    """
    counts = df.groupby(feature)[target].agg(['count', 'sum'])
    events = counts['sum']
    non_events = counts['count'] - events

    # Share of all events / non-events that falls into each category
    event_rate = events / events.sum()
    non_event_rate = non_events / non_events.sum()

    # The small constant keeps the ratio finite when a share is exactly zero
    woe = np.log((event_rate + 1e-10) / (non_event_rate + 1e-10))

    table = counts.assign(
        event_rate=event_rate,
        non_event_rate=non_event_rate,
        WoE=woe,
        IV=(event_rate - non_event_rate) * woe,
    )[['event_rate', 'non_event_rate', 'WoE', 'IV']]

    return table, table['IV'].sum()

# Information Value of each candidate variable, measured against deviation_flag.
woe_iv_rider_app, iv_rider_app = calculate_woe_iv(filtered_data_no_outliers, 'rider_app_version', 'deviation_flag')

woe_iv_driver_app, iv_driver_app = calculate_woe_iv(filtered_data_no_outliers, 'driver_app_version', 'deviation_flag')

# The brand table gets its own name so the device-name table below does not overwrite it.
woe_iv_device_brand, iv_device_brand = calculate_woe_iv(filtered_data_no_outliers, 'device_brand', 'deviation_flag')

woe_iv_device_name, iv_device_name = calculate_woe_iv(filtered_data_no_outliers, 'device_name', 'deviation_flag')

# GPS confidence is read from the 'gps_confidence' column so the value matches its printed label.
woe_iv_gps_confidence, iv_gps_confidence = calculate_woe_iv(filtered_data_no_outliers, 'gps_confidence', 'deviation_flag')


print("\nRider App Version")
print(f"Information Value: {iv_rider_app:.4f}")

print("\nDriver App Version")
print(f"Information Value: {iv_driver_app:.4f}")

print("\nDevice brand")
print(f"Information Value: {iv_device_brand:.4f}")

print("\nDevice name")
print(f"Information Value: {iv_device_name:.4f}")

print("\nGPS confidence")
print(f"Information Value: {iv_gps_confidence:.4f}")

Now we have the weight of influence of each column in the prediction.

To avoid bloating the dataset with boolean columns to mark devices, we are going to use a feature interaction to cross both variables (device name and rider app version) and create a new one, more useful to our needs.

### Part 4: the end

Long road, isn't it? We are almost wrapping everything up.
Now we are going to use the model to actually predict the new upfront prices, based on what we did above. The final output is a new column with predictions.

In [None]:
import pandas as pd
import numpy as np

# Target encoding helper based on the median
def target_encoding_with_median(dataframe, column, target):
    """
    Encode a categorical column as the median of the target within each category.

    The result is written to ``dataframe`` in place, in a new column named
    ``<column>_median_target``. Rows that end up without a value after the
    mapping receive the median of the encoded column itself.

    Parameters:
        dataframe (DataFrame): Data to encode; modified in place.
        column (str): Categorical column to encode.
        target (str): Column whose per-category median is used.

    Returns:
        DataFrame: The same ``dataframe`` object, with the new column added.
    """
    encoded_name = f"{column}_median_target"
    category_medians = dataframe.groupby(column)[target].median()
    encoded = dataframe[column].map(category_medians)
    dataframe[encoded_name] = encoded.fillna(encoded.median())
    return dataframe

# Apply median target encoding to 'device_name' and 'rider_app_version'
filtered_data_no_outliers = target_encoding_with_median(filtered_data_no_outliers, 'device_name', 'metered_price')
filtered_data_no_outliers = target_encoding_with_median(filtered_data_no_outliers, 'rider_app_version', 'metered_price')

# 'gps_confidence' is copied as-is, without any transformation
filtered_data_no_outliers['gps_confidence_clean'] = filtered_data_no_outliers['gps_confidence']

# Check that the new columns were created
print("Visualização das colunas criadas:")
print(filtered_data_no_outliers[['device_name_median_target', 'rider_app_version_median_target', 'gps_confidence_clean']].head())

# Split into short and long rides; the threshold is the midpoint between the
# median and the 75th percentile of metered_price
quartiles = filtered_data_no_outliers['metered_price'].quantile([0.25, 0.5, 0.75])
Q1, median, Q3 = quartiles[0.25], quartiles[0.5], quartiles[0.75]
threshold = (median + Q3) / 2

short_rides = filtered_data_no_outliers[filtered_data_no_outliers['metered_price'] <= threshold]
long_rides = filtered_data_no_outliers[filtered_data_no_outliers['metered_price'] > threshold]

# Features and target for the model
features = [
    'distance', 
    'duration', 
    'device_name_median_target', 
    'rider_app_version_median_target',
    'gps_confidence_clean'  # used as-is, without transformation
]
target = 'metered_price'

# Prepare the model inputs for each subset
X_short = short_rides[features]
y_short = short_rides[target]

X_long = long_rides[features]
y_long = long_rides[target]

# Validate the prepared shapes
print("\nDados preparados para o modelo:")
print(f"Corridas Curtas: X_short = {X_short.shape}, y_short = {y_short.shape}")
print(f"Corridas Longas: X_long = {X_long.shape}, y_long = {y_long.shape}")

In [None]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Median-based target encoding helper
def target_encoding_with_median(dataframe, column, target):
    """
    Replace each category of ``column`` by the median of ``target`` in that category.

    A new column named ``<column>_median_target`` is added to ``dataframe``
    in place. Entries that stay empty after the mapping are filled with the
    median of the newly encoded column.

    Parameters:
        dataframe (DataFrame): Data to encode; modified in place.
        column (str): Categorical column to encode.
        target (str): Column whose per-category median is used.

    Returns:
        DataFrame: The same ``dataframe`` object that was passed in.
    """
    medians_by_category = dataframe.groupby(column)[target].median()
    mapped = dataframe[column].map(medians_by_category)
    fallback = mapped.median()
    dataframe[f"{column}_median_target"] = mapped.fillna(fallback)
    return dataframe

# Apply target encoding for 'device_name' and 'rider_app_version'
# NOTE(review): the medians are computed on all rows before the train/test
# split below, so held-out targets contribute to the encoded features -
# confirm this is acceptable for the reported metrics.
filtered_data_no_outliers = target_encoding_with_median(filtered_data_no_outliers, 'device_name', 'metered_price')
filtered_data_no_outliers = target_encoding_with_median(filtered_data_no_outliers, 'rider_app_version', 'metered_price')

# Retain 'gps_confidence' without transformation
filtered_data_no_outliers['gps_confidence_clean'] = filtered_data_no_outliers['gps_confidence']

# Define threshold for splitting rides into short and long:
# the 75th percentile of metered_price (Q1 and median are computed but not used for the split)
quartiles = filtered_data_no_outliers['metered_price'].quantile([0.25, 0.5, 0.75])
Q1, median, Q3 = quartiles[0.25], quartiles[0.5], quartiles[0.75]
threshold = Q3

short_rides = filtered_data_no_outliers[filtered_data_no_outliers['metered_price'] <= threshold]
long_rides = filtered_data_no_outliers[filtered_data_no_outliers['metered_price'] > threshold]

# Features and target for the model
features = [
    'distance', 
    'duration', 
    'device_name_median_target', 
    'rider_app_version_median_target',
    'gps_confidence_clean'
]
target = 'metered_price'

# Prepare data for the model
X_short = short_rides[features]
y_short = short_rides[target]

X_long = long_rides[features]
y_long = long_rides[target]

# Random Forest Model for Short Rides (80/20 split, 100 trees, fixed seeds)
print("\n=== Short Rides ===")
X_train_short, X_test_short, y_train_short, y_test_short = train_test_split(X_short, y_short, test_size=0.2, random_state=42)
rf_model_short = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model_short.fit(X_train_short, y_train_short)
y_pred_short = rf_model_short.predict(X_test_short)

# Evaluate Short Rides Model on the held-out rows
mae_short = mean_absolute_error(y_test_short, y_pred_short)
mse_short = mean_squared_error(y_test_short, y_pred_short)
r2_short = r2_score(y_test_short, y_pred_short)
print(f"Mean Absolute Error (MAE): {mae_short}")
print(f"Mean Squared Error (MSE): {mse_short}")
print(f"R-squared (R2): {r2_short}")

# Random Forest Model for Long Rides (80/20 split, 100 trees, fixed seeds)
print("\n=== Long Rides ===")
X_train_long, X_test_long, y_train_long, y_test_long = train_test_split(X_long, y_long, test_size=0.2, random_state=42)
rf_model_long = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model_long.fit(X_train_long, y_train_long)
y_pred_long = rf_model_long.predict(X_test_long)

# Evaluate Long Rides Model on the held-out rows
mae_long = mean_absolute_error(y_test_long, y_pred_long)
mse_long = mean_squared_error(y_test_long, y_pred_long)
r2_long = r2_score(y_test_long, y_pred_long)
print(f"Mean Absolute Error (MAE): {mae_long}")
print(f"Mean Squared Error (MSE): {mse_long}")
print(f"R-squared (R2): {r2_long}")

In [None]:
import matplotlib.pyplot as plt

# Plotting helper
def plot_results(y_test, y_pred, title_prefix):
    """
    Draw two side-by-side diagnostic scatter plots for a set of predictions.

    Left panel: actual vs. predicted values with a dashed red identity line.
    Right panel: residuals (actual minus predicted) vs. predicted values with
    a dashed red line at zero.

    Parameters:
        y_test: Actual target values; at most len(y_pred) leading values are used.
        y_pred: Predicted values.
        title_prefix (str): Text placed at the start of both subplot titles.
    """
    # Keep at most len(y_pred) leading actual values before subtracting
    y_test = y_test[:len(y_pred)]
    residuals = y_test - y_pred

    fig, (ax_fit, ax_resid) = plt.subplots(1, 2, figsize=(12, 5))

    # Left panel: actual vs. predicted, with the identity line as reference
    lowest, highest = min(y_test), max(y_test)
    ax_fit.scatter(y_test, y_pred, color='blue', alpha=0.6)
    ax_fit.plot([lowest, highest], [lowest, highest], 'r--', lw=2)
    ax_fit.set_title(f'{title_prefix}: Reais vs Preditos')
    ax_fit.set_xlabel('Valores Reais')
    ax_fit.set_ylabel('Valores Preditos')

    # Right panel: residuals vs. predicted, with the zero line as reference
    ax_resid.scatter(y_pred, residuals, color='orange', alpha=0.6)
    ax_resid.axhline(0, color='red', linestyle='--', lw=2)
    ax_resid.set_title(f'{title_prefix}: Resíduos vs Preditos')
    ax_resid.set_xlabel('Valores Preditos')
    ax_resid.set_ylabel('Resíduos')

    plt.tight_layout()
    plt.show()

# Keep at most len(y_pred_*) leading values of each y_test_* to avoid length errors
y_test_short = y_test_short[:len(y_pred_short)]
y_test_long = y_test_long[:len(y_pred_long)]

# Plot results for short rides
plot_results(y_test_short, y_pred_short, 'Corridas Curtas')

# Plot results for long rides
plot_results(y_test_long, y_pred_long, 'Corridas Longas')