In [None]:
import pandas as pd

In [None]:
# Location of the GSAF5 workbook on the author's machine; adjust when running elsewhere.
GSAF_XLSX_PATH = r"C:\Users\Martijn\Downloads\GSAF5.xlsx"
df = pd.read_excel(GSAF_XLSX_PATH)

In [None]:
# Goal: Commit the perfect shark murder
# Problem Statement: Identify the right location, activity, time of day and time of year

# Hypothesis 1: The perfect location is Australia (can be narrowed down on state level if fitting to the country)
# Hypothesis 2: The perfect time of the year is early in the year on a morning
# Hypothesis 3: The deadliest sharks will be white, tiger and bull

# Columns we need: "Fatal Y/N", "Country", "Date", "Activity",  "Time"
# -> GFM: Fatal & Country
# -> LB: Date
# -> MB: Activity & Time


# Further stuff:
# - age, gender of the person we try to kill
# - species, e.g. for "should be killed by a white shark"

In [None]:
# Show the column labels of the loaded frame.
df.columns

In [None]:
# Count rows that repeat an earlier row in every column.
df.duplicated().sum()
# -> No duplicates (result noted by the author for their copy of the data)


# Activity cleaning

In [None]:

# Share of missing values in the "Time" and "Activity" columns.
# `df_time_activity_null` is read again in the Time cleaning section.
df_time_activity_null = df[["Time", "Activity"]].isnull().mean()
activity_null = df_time_activity_null["Activity"]
# Format to one decimal: a bare `round(x, 2) * 100` can print float artefacts
# such as 7.000000000000001.
print(f"Activity has {round(activity_null, 2) * 100:.1f}% empty values.")

In [None]:
def _normalize_activity(value):
    """Lower-case a label and join its whitespace-separated words with underscores.

    Hyphens are turned into underscores as well. Non-string values (e.g. NaN)
    are returned untouched.
    """
    if not isinstance(value, str):
        return value
    return "_".join(value.lower().split()).replace("-", "_")


# Normalise case, whitespace and hyphenation so spelling variants of the same
# activity collapse onto one label.
df.Activity = df.Activity.apply(_normalize_activity)

In [None]:
# Exact-match labels that are folded into each coarse category.
SURFACE_SPORT_LABELS = [
    "stand_up_paddleboarding", "paddle_boarding", "kite_surfing", "windsurfing",
    "surf_skiing", "kayaking", "rowing", "canoeing", "surfing",
    "body_boarding", "boogie_boarding",
]
DIVING_LABELS = ["diving", "snorkeling"]
SWIMMING_LABELS = [
    "walking", "wading", "playing", "floating_on_his_back", "treading_water",
    "body_surfing", "bathing", "standing", "floating",
]
ACCIDENT_LABELS = ["fell_overboard", "sea_disaster"]
ACTIVITY_CATEGORIES = ["surface_sport", "diving_activities", "swimming", "fishing", "accident"]

# summarise surfing activities into one
activity = df["Activity"].replace(dict.fromkeys(SURFACE_SPORT_LABELS, "surface_sport"))
# Disabled alternative kept for reference:
#df['Activity'] = df['Activity'].str.replace(pat=r"(?i)(\S*surf\S*|\S*board\S*)", repl="surfing", regex=True)

# summarise diving activities: any whitespace-free token containing "dive"/"divi"
# (case-insensitive) is rewritten, then the remaining exact labels are mapped
activity = activity.str.replace(r"(?i)(\S*dive\S*|\S*divi\S*)", "diving_activities", regex=True)
activity = activity.replace(dict.fromkeys(DIVING_LABELS, "diving_activities"))

# summarise bathing
activity = activity.replace(dict.fromkeys(SWIMMING_LABELS, "swimming"))

# summarise fishing: any whitespace-free token containing "fish" (case-insensitive)
activity = activity.str.replace(r"(?i)(\S*fish\S*)", "fishing", regex=True)

# disaster
activity = activity.replace(dict.fromkeys(ACCIDENT_LABELS, "accident"))

# categorize other activities: anything outside the five categories
# (missing values included) becomes "unqualified"
df["Activity"] = activity.where(activity.isin(ACTIVITY_CATEGORIES), "unqualified")

In [None]:
# Frequency of each activity label, limited to the 20 largest counts.
df.Activity.value_counts().nlargest(20)

# Time Cleaning

In [None]:
# Share of missing "Time" values, taken from the null statistics computed in the
# Activity cleaning section.
time_null = df_time_activity_null["Time"]
# Format to one decimal: a bare `round(x, 2) * 100` can print float artefacts
# such as 56.99999999999999.
print(f"Time has {round(time_null, 2) * 100:.1f}% empty values.")

In [None]:
# Collapse free-text descriptions onto coarse day-part labels. Each pattern is
# case-insensitive and replaces the whole value; "noon" also catches "afternoon"
# and "night" also catches "midnight".
df['Time'] = df['Time'].replace(to_replace=r'(?i).*noon.*', value='Afternoon', regex=True)
df['Time'] = df['Time'].replace(to_replace=r'(?i).*morning.*', value='Morning', regex=True)
df['Time'] = df['Time'].replace(to_replace=r'(?i).*Night.*', value='Night', regex=True)
# Exact-match shorthands. Dusk is evening twilight, so it maps to "Evening"
# (not "Morning"); dawn is its morning counterpart.
df.Time = df.Time.replace({"P.M.":"Afternoon", "A.M.":"Morning", "Dusk":"Evening", "Dawn":"Morning", "Midday": "Afternoon"})

In [None]:
def categorize_time(time):
    """Map a raw time value onto a coarse part-of-day label.

    Parameters:
    - time: a string whose first one or two characters (after stripping
      surrounding whitespace) give the hour, e.g. "14h30" or "9h00"; an object
      exposing an integer ``hour`` attribute (e.g. ``datetime.time``); or a
      missing value.

    Returns:
    - "Unknown" for missing values and for "?".
    - "Morning" (06-11), "Afternoon" (12-17), "Evening" (18-21) or
      "Night" (22-05) when an hour between 0 and 23 can be read.
    - The input unchanged when no valid hour can be read, so labels that are
      already categorised (e.g. "Afternoon") pass through.
    """
    if pd.isna(time) or time == "?":
        return "Unknown"

    if isinstance(time, str):
        text = time.strip()
        # Accept both "09h00" and "9h00": take two leading digits when present,
        # otherwise fall back to a single leading digit.
        prefix = text[:2] if text[:2].isdecimal() else text[:1]
        if not prefix.isdecimal():
            return time
        hour = int(prefix)
    else:
        # Spreadsheet readers may hand back time objects instead of strings.
        hour = getattr(time, "hour", None)
        if not isinstance(hour, int):
            return time

    # A leading number outside the 24-hour clock (e.g. "30 minutes after ...")
    # is not an hour of the day; leave such values untouched.
    if not 0 <= hour <= 23:
        return time

    if 6 <= hour < 12:
        return "Morning"
    if 12 <= hour < 18:
        return "Afternoon"
    if 18 <= hour < 22:
        return "Evening"
    return "Night"

# Create a new column for time category based on time.
# Values categorize_time cannot interpret are carried over unchanged, so they
# appear below as their own labels.
df['Time_Category'] = df['Time'].apply(categorize_time)
# Show the ten most frequent labels.
df['Time_Category'].value_counts().nlargest(10)

In [None]:
from scipy.stats import chi2_contingency

# Contingency table: one row per "Fatal Y/N" value, one column per activity label.
# NOTE(review): "Fatal Y/N" is used as loaded; no cleaning of that column is
# visible in this section - confirm it is normalised before trusting the test.
df_crosstab = pd.crosstab(index=df["Fatal Y/N"], columns=df["Activity"])

# Chi-square test of independence on the table. The degrees of freedom and the
# expected-frequency table returned alongside are discarded.
chi2_statistic, chi2_p_value, _, _ = chi2_contingency(observed=df_crosstab)

(chi2_statistic, chi2_p_value)

In [None]:
def clean_date_column(df, column_name, date_format='%Y-%m-%d', fill_na=None):
    """
    Parse a date column and split the frame into rows with and without a usable date.

    The column is parsed with ``pd.to_datetime(..., errors='coerce')``, so values
    that cannot be parsed (or are missing) become ``NaT``. The input frame is
    not modified.

    Parameters:
    - df (pd.DataFrame): The DataFrame containing the date column.
    - column_name (str): The name of the date column to clean.
    - date_format (str, optional): The strftime format the cleaned dates are
      rendered in (default: '%Y-%m-%d').
    - fill_na (str or pd.Timestamp, optional): A default date for missing/invalid
      values. If None, rows without a usable date are left out of the cleaned frame.

    Returns:
    - pd.DataFrame: Rows with a usable date; the date column holds strings in
      ``date_format``. When ``fill_na`` is given, rows whose date was invalid are
      kept here and carry the fill date.
    - pd.DataFrame: Rows whose date was missing or could not be parsed (the date
      column holds ``NaT``), whether or not they were filled.
    - dict: Counts of total, valid, invalid and filled rows.
    """
    df = df.copy()  # Avoid modifying the original DataFrame

    # Convert column to datetime, coercing errors to NaT
    parsed = pd.to_datetime(df[column_name], errors='coerce')
    invalid_mask = parsed.isna()
    df[column_name] = parsed

    # Rows that had no usable date, captured before any filling takes place
    invalid_df = df[invalid_mask].copy()
    invalid_count = len(invalid_df)

    # Fill before selecting the valid rows: filling afterwards would be a no-op,
    # because the valid selection contains no NaT by construction.
    if fill_na is not None:
        cleaned = parsed.fillna(pd.Timestamp(fill_na))
    else:
        cleaned = parsed
    usable_mask = cleaned.notna()
    filled_count = int((invalid_mask & usable_mask).sum())

    df[column_name] = cleaned
    valid_df = df[usable_mask].copy()

    # Standardize date format
    valid_df[column_name] = valid_df[column_name].dt.strftime(date_format)

    # Summary of changes
    summary = {
        "total_rows": len(df),
        "valid_rows": len(valid_df),
        "invalid_rows": invalid_count,
        "filled_rows": filled_count
    }

    print("Date Cleaning Summary:", summary)

    return valid_df, invalid_df, summary

# Clean the 'Date' column with the default output format and no fill value.
cleaned_df, invalid_df, summary = clean_date_column(df, 'Date')

# Rows whose 'Date' could be parsed.
print("\nCleaned DataFrame:")
display(cleaned_df)

# Rows whose 'Date' was missing or could not be parsed.
print("\nInvalid DataFrame:")
display(invalid_df)
