1. Import claims_data.csv and cust_data.csv, which are provided to you, and 
combine the two datasets appropriately to create a 360-degree view of 
the data. Use the combined dataset for the subsequent questions.

In [None]:
import pandas as pd

# Load the two raw extracts named in the task statement.
claims_data = pd.read_csv('claims_data.csv')
cust_data = pd.read_csv('cust_data.csv')

# Join claims to customers on the shared key. pd.merge defaults to an inner
# join, so only customer_id values present in both files are kept.
combined_data = pd.merge(claims_data, cust_data, on='customer_id')

# Persist the merged frame: the later cells start by reading
# 'combined_data.csv', and no other cell in this notebook writes that file.
combined_data.to_csv('combined_data.csv', index=False)


2. Perform a data audit of the datatypes and find out whether there is any 
mismatch between the current datatypes of the columns and their 
business significance.

In [None]:
import pandas as pd

combined_data = pd.read_csv('combined_data.csv')

# Show the dtype pandas inferred for every column.
print(combined_data.dtypes)

# Expected dtype per column, keyed by column name.
# TODO: the three entries below are placeholders -- replace them with the real
# column names and the dtype each should have from a business point of view.
expected_dtypes = {
    'column1': 'int',
    'column2': 'float',
    'column3': 'object',
}

# Report every column whose inferred dtype differs from the expected one, and
# every column that has no expectation recorded yet.
for col, dtype in combined_data.dtypes.items():
    if col in expected_dtypes:
        if dtype != expected_dtypes[col]:
            print(f"Mismatch: Column {col} has datatype {dtype}, but expected datatype is {expected_dtypes[col]}")
    else:
        print(f"Warning: Column {col} is not specified in the expected_dtypes dictionary.")

 3. Convert the column claim_amount to numeric. Use the appropriate 
modules/attributes to remove the $ sign.

In [None]:
import pandas as pd
combined_data = pd.read_csv('combined_data.csv')
# Strip the dollar sign, then cast to float. The pattern is a raw string so
# that the backslash reaches the regex engine as written ('\$' in a normal
# string literal is an invalid escape sequence).
combined_data['claim_amount'] = combined_data['claim_amount'].replace({r'\$': ''}, regex=True).astype(float)

4. Of all the injury claims, some have not been reported to the 
police. Create an alert flag (1, 0) for all such claims.

In [None]:
import pandas as pd
combined_data = pd.read_csv('combined_data.csv')
combined_data['alert_flag'] = 0

# The police-report column may appear under either of these two names.
if 'police_report' in combined_data.columns:
    police_report_col = 'police_report'
elif 'police_report_exists' in combined_data.columns:
    police_report_col = 'police_report_exists'
else:
    raise KeyError("No police report column found (expected 'police_report' or 'police_report_exists')")

# NOTE(review): assumes the claim type lives in a 'claim_type' column whose
# text contains "injury" for injury claims, and that an unreported claim is
# recorded as "No" in the police-report column -- confirm against the data.
is_injury_claim = combined_data['claim_type'].astype(str).str.contains('injury', case=False)
is_unreported = combined_data[police_report_col].astype(str).str.strip().str.lower() == 'no'

# 1 = injury claim with no police report, 0 = everything else.
combined_data['alert_flag'] = (is_injury_claim & is_unreported).astype(int)

 5. One customer can claim for insurance more than once and in each claim,
 multiple categories of claims can be involved. However, customer ID 
should remain unique. 
Retain the most recent observation and delete any duplicated records in
 the data based on the customer ID column.

In [None]:
import pandas as pd
combined_data = pd.read_csv('combined_data.csv')
# Parse the dates so the sort below is chronological rather than a
# character-by-character comparison of date strings.
combined_data['claim_date'] = pd.to_datetime(combined_data['claim_date'])
# Newest claim first within each customer ...
combined_data = combined_data.sort_values(['customer_id', 'claim_date'], ascending=[True, False])
# ... so keeping the first row per customer retains the most recent observation.
combined_data = combined_data.drop_duplicates(subset='customer_id', keep='first')

 6. Check for missing values and impute the missing values with an 
appropriate value. (mean for continuous and mode for categorical)

In [None]:
import pandas as pd
import numpy as np
combined_data = pd.read_csv('combined_data.csv')

# Count of missing values per column, before imputation.
missing_values = combined_data.isnull().sum()
print(missing_values)

continuous_cols = ['claim_amount', 'age']
categorical_cols = ['gender', 'segment', 'incident_cause', 'alert_flag', 'fraudulent_claim']

# Continuous columns: fill with the column mean. The result is assigned back
# because fillna(inplace=True) on a column selection acts on an intermediate
# object and is not guaranteed to update the DataFrame.
for col in continuous_cols:
    combined_data[col] = combined_data[col].fillna(combined_data[col].mean())

# Categorical columns: fill with the most frequent value.
for col in categorical_cols:
    modes = combined_data[col].mode()
    if not modes.empty:  # a column with no non-missing values has no mode
        combined_data[col] = combined_data[col].fillna(modes.iloc[0])

 7. Calculate the age of customers in years. Based on the age, categorize the
 customers according to the criteria below:
 Children: < 18
 Youth: 18-30
 Adult: 30-60
 Senior: > 60

In [None]:
import pandas as pd
# Load the table stored in 'combined_data.csv' for this cell.
combined_data = pd.read_csv('combined_data.csv')
# datetime supplies today's date for the age calculation below.
from datetime import datetime

def calculate_age(birth_date):
    """Return the age in completed years as of today.

    Args:
        birth_date: Date of birth. Anything ``pd.to_datetime`` can parse is
            accepted (``datetime``, ``pd.Timestamp`` or a date string), so the
            function also works on date values that were read from CSV as
            text.

    Returns:
        Whole years elapsed since ``birth_date``, or ``NaN`` when
        ``birth_date`` is missing.
    """
    if pd.isna(birth_date):
        return float('nan')

    born = pd.to_datetime(birth_date)
    if pd.isna(born):
        return float('nan')

    today = datetime.today()
    # Subtract one year if this year's birthday has not happened yet.
    had_birthday = (today.month, today.day) >= (born.month, born.day)
    return today.year - born.year - (0 if had_birthday else 1)

# Parse dob first: a column read from CSV holds text, while the age
# calculation reads .year/.month/.day. Unparseable values become NaT.
birth_dates = pd.to_datetime(combined_data['dob'], errors='coerce')
combined_data['age'] = birth_dates.apply(calculate_age)

# Right-closed bands on whole-year ages:
#   (-1, 17] Children (< 18), (17, 30] Youth, (30, 60] Adult, (60, inf) Senior (> 60).
# Starting below zero keeps age 0 inside 'Children'; the open upper edge keeps
# ages above 100 inside 'Senior'.
combined_data['age_category'] = pd.cut(
    combined_data['age'],
    bins=[-1, 17, 30, 60, float('inf')],
    labels=['Children', 'Youth', 'Adult', 'Senior'],
)

 8. What is the average amount claimed by the customers from various 
segments?

In [None]:
import pandas as pd
combined_data = pd.read_csv('combined_data.csv')
# Mean claim amount within each customer segment.
claims_grouped_by_segment = combined_data.groupby('segment')
avg_claim_by_segment = claims_grouped_by_segment['claim_amount'].mean()
print(avg_claim_by_segment)

9. What is the total claim amount based on incident cause for all the claims
 that were made at least 20 days prior to 1st October, 2018?

In [None]:
import pandas as pd
from datetime import datetime
combined_data = pd.read_csv('combined_data.csv')
combined_data['claim_date'] = pd.to_datetime(combined_data['claim_date'])

# Cut-off: 20 days before 1 October 2018. pandas' own Timestamp/Timedelta are
# used so no name beyond the cell's existing imports is needed.
threshold_date = pd.Timestamp(2018, 10, 1) - pd.Timedelta(days=20)

# Claims dated on or before the cut-off are "at least 20 days prior".
claims_before_threshold = combined_data[combined_data['claim_date'] <= threshold_date]
total_claim_by_incident_cause = claims_before_threshold.groupby('incident_cause')['claim_amount'].sum()
print(total_claim_by_incident_cause)

 10. How many adults from TX, DE and AK claimed insurance for driver 
related issues and causes?

In [None]:
import pandas as pd
combined_data = pd.read_csv('combined_data.csv')

in_target_states = combined_data['state'].isin(['TX', 'DE', 'AK'])
# case=False matches "Driver ..." as well as "driver"; na=False counts a
# missing category as "not driver related" instead of putting NaN into the
# boolean mask, which boolean indexing rejects.
is_driver_related = combined_data['claim_category'].str.contains('driver', case=False, na=False)
driver_claims_by_state = combined_data[in_target_states & is_driver_related]

# Adult band (30, 60]: the same edges the age_category binning uses for
# 'Adult' (18-30 is 'Youth').
customer_age = driver_claims_by_state['age']
adult_driver_claims_by_state = driver_claims_by_state[(customer_age > 30) & (customer_age <= 60)]

# Distinct adult customers per state, plus the overall total.
claim_count_by_state = adult_driver_claims_by_state.groupby('state')['customer_id'].nunique()
print(claim_count_by_state)
print("Total adults:", adult_driver_claims_by_state['customer_id'].nunique())

11. Draw a pie chart between the aggregated value of claim amount based 
on gender and segment. Represent the claim amount as a percentage on
 the pie chart.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
combined_data = pd.read_csv('combined_data.csv')
claim_amount_by_gender_segment = combined_data.groupby(['gender', 'segment'])['claim_amount'].sum().reset_index()

# One slice per (gender, segment) pair, labelled "gender - segment".
slice_labels = (
    claim_amount_by_gender_segment['gender'].astype(str)
    + ' - '
    + claim_amount_by_gender_segment['segment'].astype(str)
)

# seaborn has no pie-chart function, so the chart is drawn with matplotlib.
# autopct prints each slice's share of the total as a percentage.
fig, pie_chart = plt.subplots(figsize=(8, 6))
pie_chart.pie(
    claim_amount_by_gender_segment['claim_amount'],
    labels=slice_labels,
    autopct='%1.1f%%',
    wedgeprops={'linewidth': 1, 'edgecolor': 'black'},
)
pie_chart.set_title("Aggregated Claim Amount by Gender and Segment")
pie_chart.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
pie_chart.axis('equal')
plt.show()

12. Among males and females, which gender had claimed the most for any 
type of driver related issues? E.g. This metric can be compared using a 
bar chart

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
combined_data = pd.read_csv('combined_data.csv')

# case=False matches "Driver ..." as well as "driver"; na=False counts a
# missing category as "not driver related" instead of putting NaN into the
# boolean mask, which boolean indexing rejects.
is_driver_related = combined_data['claim_category'].str.contains('driver', case=False, na=False)
driver_claims = combined_data[is_driver_related]

# Number of driver-related claims per gender, largest first.
claim_count_by_gender = driver_claims['gender'].value_counts()

plt.figure(figsize=(8, 6))
claim_count_by_gender.plot(kind='bar')
plt.title("Number of Driver-Related Claims by Gender")
plt.xlabel('Gender')
plt.ylabel('Number of Claims')
plt.show()

 13. Which age group had the maximum fraudulent policy claims? Visualize 
it on a bar chart.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
combined_data = pd.read_csv('combined_data.csv')

# Group on 'age_category', the column name the age-binning cell creates and
# the later cells read; no cell creates a column called 'age_group'.
age_group_claims = combined_data.groupby('age_category')['fraudulent_claim'].sum().reset_index()

# Name the age group with the most fraudulent claims.
if not age_group_claims.empty:
    top_row = age_group_claims.loc[age_group_claims['fraudulent_claim'].idxmax()]
    print("Age group with the most fraudulent claims:", top_row['age_category'])

plt.figure(figsize=(8, 6))
plt.bar(age_group_claims['age_category'].astype(str), age_group_claims['fraudulent_claim'])
plt.title("Number of Fraudulent Policy Claims by Age Group")
plt.xlabel('Age Group')
plt.ylabel('Number of Claims')
plt.show()

 14. Visualize the monthly trend of the total amount that has been claimed 
by the customers. Ensure that on the “month” axis, the month is in a 
chronological order not alphabetical order.

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

import calendar

# Load the combined data
data = pd.read_csv('combined_data.csv')

# Derive the calendar month number (1-12) from the claim date. Grouping on the
# number rather than on a month name keeps the result in chronological order.
claim_month = pd.to_datetime(data['claim_date']).dt.month
monthly_claims = data.groupby(claim_month)['claim_amount'].sum().sort_index()

# Month names are used only as tick labels, in the already-sorted order.
month_labels = [calendar.month_abbr[int(month_number)] for month_number in monthly_claims.index]

# Create a line plot of the monthly claim amount
plt.plot(month_labels, monthly_claims.values, marker='o')
plt.xlabel('Month')
plt.ylabel('Total Claim Amount')
plt.title('Monthly Trend of Total Claim Amount')
plt.show()

15. What is the average claim amount for gender and age categories and 
suitably represent the above using a facetted bar chart, one facet that 
represents fraudulent claims and the other for non-fraudulent claims.
 Based on the conclusions from exploratory analysis as well as suitable 
statistical tests, answer the below questions. Please include a detailed 
write-up on the parameters taken into consideration, the Hypothesis 
testing steps, conclusion from the p-values and the business implications of 
the statements.

In [None]:
import pandas as pd

import matplotlib.pyplot as plt
from scipy.stats import ttest_ind

# Load the combined data
data = pd.read_csv('combined_data.csv')

# Average claim amount for each gender and age category (all claims together).
avg_claim_amount = data.groupby(['gender', 'age_category'])['claim_amount'].mean().reset_index()

# The same averages split by the 'fraudulent' column: one row per
# (claim_type, gender, age_category), which is the shape the facets need.
avg_claim_amount_facetted = (
    data.groupby(['fraudulent', 'gender', 'age_category'])['claim_amount']
    .mean()
    .reset_index()
    .rename(columns={'fraudulent': 'claim_type'})
)

# Facetted bar chart: one panel per distinct value of the fraud indicator,
# age categories on the x axis, one bar per gender.
claim_types = list(avg_claim_amount_facetted['claim_type'].unique())
panel_count = max(len(claim_types), 1)
fig, axes = plt.subplots(1, panel_count, figsize=(6 * panel_count, 5), sharey=True, squeeze=False)
for ax, claim_type in zip(axes[0], claim_types):
    panel = avg_claim_amount_facetted[avg_claim_amount_facetted['claim_type'] == claim_type]
    panel.pivot(index='age_category', columns='gender', values='claim_amount').plot(kind='bar', ax=ax)
    ax.set_title(f"Fraudulent: {claim_type}")
    ax.set_xlabel('Age category')
    ax.set_ylabel('Average claim amount')
plt.tight_layout()
plt.show()

# Welch two-sample t-test (male vs. female) within each age category. The test
# runs on the individual claim amounts: the aggregated table holds a single
# mean per gender, which leaves nothing to estimate a variance from.
t_stats = []
p_values = []
age_categories = avg_claim_amount['age_category'].unique()
for age_cat in age_categories:
    age_cat_data = data[data['age_category'] == age_cat]
    male_claims = age_cat_data.loc[age_cat_data['gender'] == 'Male', 'claim_amount'].dropna()
    female_claims = age_cat_data.loc[age_cat_data['gender'] == 'Female', 'claim_amount'].dropna()
    t_stat, p_val = ttest_ind(male_claims, female_claims, equal_var=False)
    t_stats.append(t_stat)
    p_values.append(p_val)

# Attach each age category's result to its rows. Mapping by category avoids
# the length mismatch of assigning one value per category to a table that has
# one row per (gender, age_category).
avg_claim_amount['t_stat'] = avg_claim_amount['age_category'].map(dict(zip(age_categories, t_stats)))
avg_claim_amount['p_val'] = avg_claim_amount['age_category'].map(dict(zip(age_categories, p_values)))


16. Is there any similarity in the amount claimed by males and females?

In [None]:
# Import necessary libraries
import pandas as pd
from scipy.stats import ttest_ind

# Assuming avg_claim_amount (average claim per gender and age category) and
# data (the row-level frame it was computed from) are still in memory from the
# question-15 cell.

# Overall comparison: Welch two-sample t-test on the individual claim amounts.
# H0: males and females claim the same amount on average.
male_claims = data.loc[data['gender'] == 'Male', 'claim_amount'].dropna()
female_claims = data.loc[data['gender'] == 'Female', 'claim_amount'].dropna()
overall_t_stat, overall_p_val = ttest_ind(male_claims, female_claims, equal_var=False)
print("Overall t-statistic:", overall_t_stat)
print("Overall p-value:", overall_p_val)
if overall_p_val < 0.05:
    print("Reject H0: the average claim amounts of males and females differ.")
else:
    print("Fail to reject H0: no significant difference between male and female claim amounts.")

# Create lists to store t-statistics and p-values
t_stats = []
p_values = []

# Perform the same test within each age category. It runs on the raw rows:
# the aggregated table has a single mean per gender, which cannot be tested.
age_categories = avg_claim_amount['age_category'].unique()
for age_cat in age_categories:
    age_cat_data = data[data['age_category'] == age_cat]
    t_stat, p_val = ttest_ind(age_cat_data.loc[age_cat_data['gender'] == 'Male', 'claim_amount'].dropna(),
                              age_cat_data.loc[age_cat_data['gender'] == 'Female', 'claim_amount'].dropna(),
                              equal_var=False)
    t_stats.append(t_stat)
    p_values.append(p_val)

# Add the t-statistics and p-values to the dataframe, matched by age category
# so that every row of a category receives that category's result.
avg_claim_amount['t_stat'] = avg_claim_amount['age_category'].map(dict(zip(age_categories, t_stats)))
avg_claim_amount['p_val'] = avg_claim_amount['age_category'].map(dict(zip(age_categories, p_values)))

17. Is there any relationship between age category and segment?

In [None]:
import pandas as pd
import numpy as np
from scipy.stats import chi2_contingency

# Load the data. The merged table is used because the test needs both
# age_category and segment; 'combined_data.csv' is the file the other cells read.
data = pd.read_csv("combined_data.csv")

# Create a contingency table between age category and segment
contingency_table = pd.crosstab(data["age_category"], data["segment"])

# Perform the chi-square test of independence
# H0: age category and segment are independent.
chi2, p_value, _, _ = chi2_contingency(contingency_table)

print("Chi-square statistic:", chi2)
print("p-value:", p_value)
if p_value < 0.05:
    print("Reject H0: age category and segment are related.")
else:
    print("Fail to reject H0: no evidence of a relationship between age category and segment.")

18. The current year has shown a significant rise in claim amounts as 
compared to 2016-17 fiscal average which was $10,000

In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from scipy.stats import ttest_1samp

# Load the data
data = pd.read_csv("combined_data.csv")

# Benchmark quoted in the question: the 2016-17 fiscal average claim amount.
FISCAL_2016_17_AVERAGE = 10000

# NOTE(review): "current year" is taken to be the latest calendar year present
# in claim_date -- confirm that this is the intended period.
claim_year = pd.to_datetime(data["claim_date"]).dt.year
current_year = claim_year.max()
current_year_claims = data.loc[claim_year == current_year, "claim_amount"].dropna()

# One-sample t-test.
# H0: the current-year mean claim amount equals 10,000.
# H1: the current-year mean claim amount is greater than 10,000.
t_stat, two_sided_p = ttest_1samp(current_year_claims, FISCAL_2016_17_AVERAGE)

# The alternative is directional, so convert the two-sided p-value to the
# one-sided ("greater") p-value.
p_value = two_sided_p / 2 if t_stat > 0 else 1 - two_sided_p / 2

print("Current year:", current_year)
print("Mean claim amount:", current_year_claims.mean())
print("t-statistic:", t_stat)
print("One-sided p-value:", p_value)
if p_value < 0.05:
    print("Reject H0: claim amounts are significantly higher than the 2016-17 average.")
else:
    print("Fail to reject H0: no significant rise over the 2016-17 average.")

19. Is there any difference between age groups and insurance claims?

In [None]:
# Mean claim amount for every (age category, segment) combination.
claims_by_age_and_segment = data.groupby(["age_category", "segment"])
average_claim_amounts = claims_by_age_and_segment["claim_amount"].mean()
print(average_claim_amounts)

20. Is there any relationship between total number of policy claims and the 
claimed amount?

In [None]:
from scipy.stats import pearsonr

# Keep only rows where both values are present: pearsonr does not skip
# missing values.
paired = data[["total_policy_claims", "claim_amount"]].dropna()

# Calculate the Pearson correlation coefficient together with its p-value
# (H0: the two variables are not linearly correlated).
correlation_coefficient, p_value = pearsonr(paired["total_policy_claims"], paired["claim_amount"])

print("Pearson correlation coefficient:", correlation_coefficient)
print("p-value:", p_value)
if p_value < 0.05:
    print("Reject H0: the number of policy claims and the claimed amount are linearly related.")
else:
    print("Fail to reject H0: no significant linear relationship.")