In [None]:
# Task 3: Hypothesis Testing

# **1. Import Necessary Libraries and Functions**
import pandas as pd
import sys
import os
sys.path.append(os.path.abspath('../src/utils'))
from hypothesis_testing import (
    calculate_claim_metrics,
    segment_data,
    perform_t_test,
    perform_chi_squared_test,
    interpret_results
)
import matplotlib.pyplot as plt
import seaborn as sns

# **2. Load the Dataset**
# Replace with your actual dataset path
data_path = "../data/raw/MachineLearningRating_v3.txt"
# The raw file is pipe-delimited. Reading it with a tab separator collapses
# every field into one column whose name is the entire "A|B|C|..." header,
# so no individual column (e.g. "Province") can be selected afterwards.
df = pd.read_csv(data_path, sep="|")

# **3. Explore and Preprocess Data**
# Preview the first rows to confirm the file parsed into separate columns.
print("Initial Dataset Overview:")
print(df.head())

# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
print("\nDataset Summary:")
df.info()

# Add calculated metrics: df is rebound to whatever calculate_claim_metrics
# returns. NOTE(review): later steps read a "ClaimFrequency" column, which is
# presumably added here -- confirm against hypothesis_testing.py.
print("\nCalculating Claim Metrics...")
df = calculate_claim_metrics(df)
print(df.head())


# **4. Hypothesis 1: Risk Differences Across Provinces**
# Data segmentation: segment_data returns two objects, unpacked as the two
# groups to compare (presumably the rows of df whose "Province" equals each of
# the two given labels -- verify against hypothesis_testing.py).
# NOTE(review): "Province_A" and "Province_B" look like placeholder labels;
# confirm they match values that actually occur in the Province column.
province_a, province_b = segment_data(df, "Province", "Province_A", "Province_B")

# Perform statistical testing: compare the "ClaimFrequency" values of the two
# groups with the project's t-test helper.
province_test_result = perform_t_test(province_a["ClaimFrequency"], province_b["ClaimFrequency"])

# Interpret results: build the interpretation for the "Province" feature from
# the test result and print it.
province_interpretation = interpret_results("Province", province_test_result)
print(province_interpretation)

# **5. Hypothesis 2: Risk Differences Between Zip Codes**
# Perform Chi-Squared Test
# The dataset header has no "ZipCode" column; zip codes are stored under
# "PostalCode", so that is the column handed to the test.
# NOTE(review): assumes perform_chi_squared_test treats its second and third
# arguments as df column names -- confirm against hypothesis_testing.py.
zip_test_result = perform_chi_squared_test(df, "PostalCode", "ClaimFrequency")

# Interpret results. The first argument is kept as the original "ZipCode"
# label so the reported text is unchanged.
zip_interpretation = interpret_results("ZipCode", zip_test_result)
print(zip_interpretation)

# **6. Visualization**
# Box plot of claim frequency per province, drawn on an explicitly created
# 10x6 inch figure/axes pair rather than through pyplot's implicit state.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df, x="Province", y="ClaimFrequency", ax=ax)
ax.set_title("Claim Frequency by Province")
plt.show()

# **7. Save Results**
# Save metrics and results
output_path = "../data/processed/claim_metrics.csv"
# DataFrame.to_csv does not create missing parent directories and raises
# OSError on a fresh checkout, so make sure the target folder exists first.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df.to_csv(output_path, index=False)
print("Claim metrics saved to claim_metrics.csv.")


Initial Dataset Overview:
  UnderwrittenCoverID|PolicyID|TransactionMonth|IsVATRegistered|Citizenship|LegalType|Title|Language|Bank|AccountType|MaritalStatus|Gender|Country|Province|PostalCode|MainCrestaZone|SubCrestaZone|ItemType|mmcode|VehicleType|RegistrationYear|make|Model|Cylinders|cubiccapacity|kilowatts|bodytype|NumberOfDoors|VehicleIntroDate|CustomValueEstimate|AlarmImmobiliser|TrackingDevice|CapitalOutstanding|NewVehicle|WrittenOff|Rebuilt|Converted|CrossBorder|NumberOfVehiclesInFleet|SumInsured|TermFrequency|CalculatedPremiumPerTerm|ExcessSelected|CoverCategory|CoverType|CoverGroup|Section|Product|StatutoryClass|StatutoryRiskType|TotalPremium|TotalClaims
0  145249|12827|2015-03-01 00:00:00|True|  |Close...                                                                                                                                                                                                                                                                                   