In [2]:
# Task 3: Hypothesis Testing

# **1. Import Necessary Libraries and Functions**
import pandas as pd
import sys
import os
sys.path.append(os.path.abspath('../src/utils'))
from hypothesis_testing import (
    calculate_claim_metrics,
    segment_data,
    perform_t_test,
    perform_chi_squared_test,
    interpret_results
)
import matplotlib.pyplot as plt
import seaborn as sns

# **2. Load the Dataset**
# Replace with your actual dataset path
data_path = "../data/raw/MachineLearningRating_v3.txt"
df = pd.read_csv(data_path, sep="|")

# **3. Explore and Preprocess Data**
print("Initial Dataset Overview:")
print(df.head())
print("\nDataset Summary:")
print(df.info())

# Add calculated metrics
print("\nCalculating Claim Metrics...")
df = calculate_claim_metrics(df)
print(df.head())


# **4. Hypothesis 1: Risk Differences Across Provinces**
# Data segmentation
province_a, province_b = segment_data(df, "Province", "Province_A", "Province_B")

# Perform statistical testing
province_test_result = perform_t_test(province_a["ClaimFrequency"], province_b["ClaimFrequency"])

# Interpret results
province_interpretation = interpret_results("Province", province_test_result)
print(province_interpretation)

# **5. Hypothesis 2: Risk Differences Between Postal Codes**
# Perform Chi-Squared Test
zip_test_result = perform_chi_squared_test(df, "PostalCode", "ClaimFrequencyCat")

# Interpret results
zip_interpretation = interpret_results("PostalCode", zip_test_result)
print(zip_interpretation)

# **6. Visualization**
# Clean data for plotting
plot_df = df.copy()
plot_df = plot_df.dropna(subset=["Province", "ClaimFrequency"])
plot_df["Province"] = plot_df["Province"].astype(str).str.strip()

# Ensure there are at least two unique provinces
if plot_df["Province"].nunique() < 2:
    print("Not enough unique provinces for boxplot.")
else:
    plt.figure(figsize=(10, 6))
    sns.boxplot(x="Province", y="ClaimFrequency", data=plot_df)
    plt.title("Claim Frequency by Province")
    plt.show()

# **7. Save Results**
# Save metrics and results
df.to_csv("../data/processed/claim_metrics.csv", index=False)
print("Claim metrics saved to claim_metrics.csv.")


  df = pd.read_csv(data_path, sep="|")


Initial Dataset Overview:
   UnderwrittenCoverID  PolicyID     TransactionMonth  IsVATRegistered  \
0               145249     12827  2015-03-01 00:00:00             True   
1               145249     12827  2015-05-01 00:00:00             True   
2               145249     12827  2015-07-01 00:00:00             True   
3               145255     12827  2015-05-01 00:00:00             True   
4               145255     12827  2015-07-01 00:00:00             True   

  Citizenship          LegalType Title Language                 Bank  \
0              Close Corporation    Mr  English  First National Bank   
1              Close Corporation    Mr  English  First National Bank   
2              Close Corporation    Mr  English  First National Bank   
3              Close Corporation    Mr  English  First National Bank   
4              Close Corporation    Mr  English  First National Bank   

       AccountType  ...                    ExcessSelected CoverCategory  \
0  Current account  .

  return f(*args, **kwargs)


The null hypothesis for Province is not rejected (p = nan). No significant effect detected.
   PostalCode  ClaimFrequency  ClaimFrequencyCat
0        1459             NaN                  0
1        1459             NaN                  0
2        1459             NaN                  0
3        1459             NaN                  0
4        1459             NaN                  0
The null hypothesis for PostalCode is not rejected (p = 1.000). No significant effect detected.
Not enough unique provinces for boxplot.
Not enough unique provinces for boxplot.
Claim metrics saved to claim_metrics.csv.
Claim metrics saved to claim_metrics.csv.
