In [1]:
import pandas as pd

In [2]:
df=pd.read_excel(r"C:\Users\Martijn\Downloads\GSAF5.xlsx")

In [3]:
# Goal: Commit the perfect shark murder
# Problem Statement: Identify the correct location, activity, time of the day and what time of the year

# Hypothesis 1: The perfect location is Australia (can be narrowed down on state level if fitting to the country)
# Hypothesis 2: The perfect time of the year is early in the year on a morning
# Hypothesis 3: The deadliest sharks will be white, tiger and bull

# Columns we need: "Fatal Y/N", "Country", "Date", "Activity",  "Time"
# -> GFM: Fatal & Country
# -> LB: Date
# -> MB: Activty & Time


# further stuf:
# - age, gender of the person we try to kill
# species for "e.g should be killed by white shark"

In [4]:
df.columns

Index(['Date', 'Year', 'Type', 'Country', 'State', 'Location', 'Activity',
       'Name', 'Sex', 'Age', 'Injury', 'Fatal Y/N', 'Time', 'Species ',
       'Source', 'pdf', 'href formula', 'href', 'Case Number', 'Case Number.1',
       'original order', 'Unnamed: 21', 'Unnamed: 22'],
      dtype='object')

In [5]:
df.duplicated().sum()
# -> No duplicates


0

# Activity cleaning

In [6]:

df_time_activity_null = df[["Time", "Activity"]].isnull().mean()
activity_null = df_time_activity_null["Activity"]
print(f"Activity has {round(activity_null,2)*100}% empty values.")

Activity has 8.0% empty values.


In [7]:
# clean column from any typing mistakes
df.Activity = df.Activity.apply(lambda x: ' '.join(x.lower().split()).replace(" ", "_").replace("-","_") if isinstance(x,str) else x)

In [8]:
# summarise surfing activities into one
df.Activity = df.Activity.replace({"stand_up_paddleboarding":"surface_sport", "paddle_boarding":"surface_sport", "kite_surfing": "surface_sport","windsurfing":"surface_sport", "surf_skiing":"surface_sport","kayaking":"surface_sport","rowing":"surface_sport","canoeing":"surface_sport", "surfing":"surface_sport", "body_boarding": "surface_sport", "boogie_boarding": "surface_sport"})
#df['Activity'] = df['Activity'].str.replace(pat=r"(?i)(\S*surf\S*|\S*board\S*)", repl="surfing", regex=True)
# summarise diving activities
df['Activity'] = df['Activity'].str.replace(pat=r"(?i)(\S*dive\S*|\S*divi\S*)", repl="diving_activities", regex=True).replace({"diving":"diving_activities", "snorkeling":"diving_activities"})
# summarise bathing
df.Activity = df.Activity.replace({"walking":"swimming","wading":"swimming","playing":"swimming","floating_on_his_back": "swimming", "treading_water": "swimming", "body_surfing":"swimming", "bathing":"swimming", "standing":"swimming", "floating":"swimming"})
# summarise fishing
df['Activity'] = df['Activity'].str.replace(pat=r"(?i)(\S*fish\S*)", repl="fishing", regex=True)
# disaster
df.Activity = df.Activity.replace({"fell_overboard": "accident", "sea_disaster":"accident"})
# categorize other activities
df.Activity = df.Activity.apply(lambda x: x if x in ["surface_sport", "diving_activities", "swimming", "fishing", "accident"] else "unqualified")

In [9]:
df.Activity.value_counts().nlargest(20)

Activity
unqualified          1808
swimming             1641
surface_sport        1436
fishing              1286
diving_activities     786
accident               47
Name: count, dtype: int64

# Time Cleaning

In [10]:
time_null = df_time_activity_null["Time"]
print(f"Time has {round(time_null,2)*100}% empty values.")

Time has 50.0% empty values.


In [11]:
df['Time'] = df['Time'].replace(to_replace=r'(?i).*noon.*', value='Afternoon', regex=True)
df['Time'] = df['Time'].replace(to_replace=r'(?i).*morning.*', value='Morning', regex=True)
df['Time'] = df['Time'].replace(to_replace=r'(?i).*Night.*', value='Night', regex=True)
df.Time = df.Time.replace({"P.M.":"Afternoon", "A.M.":"Morning", "Dusk":"Morning", "Midday": "Afternoon"})

In [None]:
def categorize_time(time):
    if pd.isna(time) or time == "?":
        return "Unknown"
    try:
        hour = int(time[:2])
        if 6 <= hour < 12:
            return "Morning"
        elif 12 <= hour < 18:
            return "Afternoon"
        elif 18 <= hour < 22:
            return "Evening"
        else:
            return "Night"
    except:
        return time

# Create a new column for time category based on time
df['Time_Category'] = df['Time'].apply(categorize_time)
df['Time_Category'].value_counts().nlargest(10)

Time_Category
Unknown          3545
Afternoon        1825
Morning          1124
Evening           289
Night             138
Sunset              6
--                  5
AM                  4
Late afternon       2
Dark                2
Name: count, dtype: int64

In [None]:
def clean_date_column(df, column_name, date_format='%Y-%m-%d', fill_na=None):
    """
    Cleans a date column in a DataFrame and returns both cleaned and invalid rows.

    Parameters:
    - df (pd.DataFrame): The DataFrame containing the date column.
    - column_name (str): The name of the date column to clean.
    - date_format (str, optional): The format to standardize the date to (default: '%Y-%m-%d').
    - fill_na (str or pd.Timestamp, optional): A default date to fill missing/invalid values. If None, does not fill.

    Returns:
    - pd.DataFrame: The DataFrame with the cleaned date column.
    - pd.DataFrame: A DataFrame containing rows that had invalid dates.
    - dict: A summary of how many rows were affected.
    """
    df = df.copy()  # Avoid modifying the original DataFrame

    # Convert column to datetime, coercing errors to NaT
    df[column_name] = pd.to_datetime(df[column_name], errors='coerce')

    # Separate valid and invalid date rows
    valid_df = df[df[column_name].notna()].copy()
    invalid_df = df[df[column_name].isna()].copy()

    # Count invalid dates
    invalid_count = len(invalid_df)

    # Handle missing values in valid_df
    if fill_na is not None:
        valid_df[column_name] = valid_df[column_name].fillna(pd.Timestamp(fill_na))

    # Standardize date format
    valid_df[column_name] = valid_df[column_name].dt.strftime(date_format)

    # Summary of changes
    summary = {
        "total_rows": len(df),
        "valid_rows": len(valid_df),
        "invalid_rows": invalid_count,
        "filled_rows": invalid_count if fill_na else 0
    }

    print("Date Cleaning Summary:", summary)
    
    return valid_df, invalid_df, summary

cleaned_df, invalid_df, summary = clean_date_column(df, 'Date')

print("\nCleaned DataFrame:")
display(cleaned_df)

print("\nInvalid DataFrame:")
display(invalid_df)


Date Cleaning Summary: {'total_rows': 7004, 'valid_rows': 6129, 'invalid_rows': 875, 'filled_rows': 0}

Cleaned DataFrame:


Unnamed: 0,Date,Year,Type,Country,State,Location,Activity,Name,Sex,Age,...,Source,pdf,href formula,href,Case Number,Case Number.1,original order,Unnamed: 21,Unnamed: 22,Time_Category
0,2025-02-22,2025.0,Unprovoked,Australia,Western Australia,Blasck Wall Reach Swan River Perth,surface_sport,Georgina Smith,F,35,...,Bob Myatt GSAF,,,,,,,,,Morning
1,2025-02-22,2025.0,Questionable,Australia,NSW,Off Port Stephens near Newcastle,fishing,Paul Barning,M,58,...,Simon De Marchi: Bob Myatt GSAF,,,,,,,,,Afternoon
2,2025-02-22,2025.0,Unprovoked,Australia,Queensland,The Wrecks walking track Moreton Island,diving_activities,Unknown Male,M,29,...,Bob Myatt GSAF,,,,,,,,,Afternoon
3,2025-02-10,2025.0,Unprovoked,Bahamas,,Bimini Bay,swimming,Summer Layman,F,24,...,Dail Mail Australia Todd Smith: Kevin McMurray...,,,,,,,,,Evening
4,2025-02-10,2025.0,Unprovoked,Bahamas,,Bimini Bay,swimming,Rileigh Decker,F,20,...,Dail Mail Australia Todd Smith: Kevin McMurray...,,,,,,,,,Evening
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6853,1721-06-01,1721.0,Unprovoked,ITALY,Sardinia,"Ponte della Maddelena,",swimming,male,M,,...,F. Ricciardi; A. De Maddalena.,1721.06.00-Maddalena.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,1721.06.00,1721.06.00,150.0,,,Unknown
6854,1703-03-26,1703.0,Unprovoked,BARBADOS,Southwest coast,Carlisle Bay,swimming,"Samuel Jennings, a deserter from the British f...",M,19,...,"W.R.Cutter, Vol.1, p.252",1703.03.26-Jennings.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,1703.03.26,1703.03.26,149.0,,,Afternoon
6859,1691-01-01,1691.0,Unprovoked,CANADA,Quebec,Gaspé,unqualified,Sailor,M,,...,Canadian Shark Attack Registry,,,,,,,,,Unknown
6865,1970-01-01,1595.0,Unprovoked,INDIA,Kerala,River Cochin,unqualified,male,M,,...,The Voyage of John Huyghen van Linschoten,1595.00.00-Cochin.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,1595.00.00,1595.00.00,140.0,,,Unknown



Invalid DataFrame:


Unnamed: 0,Date,Year,Type,Country,State,Location,Activity,Name,Sex,Age,...,Source,pdf,href formula,href,Case Number,Case Number.1,original order,Unnamed: 21,Unnamed: 22,Time_Category
83,NaT,2023.0,Questionable,EGYPT,Southern Red Sea,Fury Shoal,diving_activities,Gary Virgo,M,66,...,"Daily Mail, 11/2/2023",,,,,,,,,Unknown
98,NaT,2023.0,Unprovoked,PHILIPPINES,Bolinao,"Pangasinan, Ilocos Norte, Luzon Island",fishing,male,M,,...,"Manila Bulletin, 9 /10/2023",,,,,,,,,Unknown
99,NaT,2023.0,Watercraft,Coral Sea,,,unqualified,9 m inflatable catamaran,M,,...,"B. Myatt, GSAF",,,,,,,,,Night
119,NaT,2023.0,Watercraft,SOUTH AFRICA,Eastern Cape Province,Chinta East,unqualified,crew,,,...,"Ladbible, 7/14/2023",,,,,,,,,Unknown
128,NaT,2023.0,Unprovoked,USA,Hawaii,Laysan,unqualified,Netflix crew,M,,...,"BBC, 6/14/2023",,,,,,,,,Unknown
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6999,NaT,0.0,Unprovoked,AUSTRALIA,Western Australia,Roebuck Bay,diving_activities,male,M,,...,"H. Taunton; N. Bartlett, p. 234",ND-0005-RoebuckBay.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,ND.0005,ND.0005,6.0,,,Unknown
7000,NaT,0.0,Unprovoked,AUSTRALIA,Western Australia,,diving_activities,Ahmun,M,,...,"H. Taunton; N. Bartlett, pp. 233-234",ND-0004-Ahmun.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,ND.0004,ND.0004,5.0,,,Unknown
7001,NaT,0.0,Unprovoked,USA,North Carolina,Ocracoke Inlet,swimming,Coast Guard personnel,M,,...,"F. Schwartz, p.23; C. Creswell, GSAF",ND-0003-Ocracoke_1900-1905.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,ND.0003,ND.0003,4.0,,,Unknown
7002,NaT,0.0,Unprovoked,PANAMA,,"Panama Bay 8ºN, 79ºW",unqualified,Jules Patterson,M,,...,"The Sun, 10/20/1938",ND-0002-JulesPatterson.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,ND.0002,ND.0002,3.0,,,Unknown
