In [None]:
import pandas as pd
import re
import datetime

In [None]:
# Load the raw workbook from the working directory; the cells below read and mutate `df`.
df = pd.read_excel("GSAF5.xlsx")

In [None]:
# Goal: Commit the perfect shark murder
# Problem Statement: Identify the correct location, activity, time of the day and what time of the year

# Hypothesis 1: The perfect location is Australia (can be narrowed down on state level if fitting to the country)
# Hypothesis 2: The perfect time of the year is early in the year on a morning
# Hypothesis 3: The deadliest sharks will be white, tiger and bull

# Columns we need: "Fatal Y/N", "Country", "Date", "Activity", "Time"
# -> GFM: Fatal & Country
# -> LB: Date
# -> MB: Activity & Time


# Further stuff:
# - age, gender of the person we try to kill
# - species, e.g. "should be killed by a white shark"

In [None]:
# Display the column labels of the loaded frame.
df.columns

In [None]:
# Count rows that are exact repeats of an earlier row (all columns equal).
df.duplicated().sum()
# -> No duplicates


# Activity cleaning

In [None]:
# Goal: Commit the perfect shark murder
# Problem Statement: Identify the correct location, activity,  time of the day and what time of the year

# Fraction (0..1) of missing values in each of the two columns.
# `df_time_activity_null` is read again by the Time-cleaning cell further down.
df_time_activity_null = df[["Time", "Activity"]].isnull().mean()
activity_null = df_time_activity_null["Activity"]
# Let the format spec produce the percentage: round(x, 2) * 100 leaks binary
# float artefacts into the message (e.g. 7.000000000000001).
print(f"Activity has {activity_null:.0%} empty values.")

In [None]:
# Normalise the free-text activity labels so that spelling variants compare equal.
def _normalize_activity(value):
    """Lower-case, join whitespace-separated words with '_' and turn '-' into '_'.

    Non-string values (e.g. NaN) are returned untouched.
    """
    if not isinstance(value, str):
        return value
    return "_".join(value.lower().split()).replace("-", "_")


df.Activity = df.Activity.apply(_normalize_activity)

In [None]:
# Exact (already normalised) labels that are folded into each coarse category.
surface_labels = [
    "stand_up_paddleboarding", "paddle_boarding", "kite_surfing", "windsurfing",
    "surf_skiing", "kayaking", "rowing", "canoeing", "surfing",
    "body_boarding", "boogie_boarding",
]
diving_labels = ["diving", "snorkeling"]
swimming_labels = [
    "walking", "wading", "playing", "floating_on_his_back", "treading_water",
    "body_surfing", "bathing", "standing", "floating",
]
accident_labels = ["fell_overboard", "sea_disaster"]
known_categories = ["surface_sport", "diving_activities", "swimming", "fishing", "accident"]

# The steps run in the same order as before; each one works on the result of the previous one.
activity = df["Activity"]
# board / paddle / surf sports
activity = activity.replace(dict.fromkeys(surface_labels, "surface_sport"))
# anything mentioning dive/divi, plus the exact diving labels
activity = activity.str.replace(pat=r"(?i)(\S*dive\S*|\S*divi\S*)", repl="diving_activities", regex=True)
activity = activity.replace(dict.fromkeys(diving_labels, "diving_activities"))
# bathing-like activities
activity = activity.replace(dict.fromkeys(swimming_labels, "swimming"))
# anything mentioning fish
activity = activity.str.replace(pat=r"(?i)(\S*fish\S*)", repl="fishing", regex=True)
# disasters
activity = activity.replace(dict.fromkeys(accident_labels, "accident"))
# everything that did not land in a known category (missing values included) is "unqualified"
df["Activity"] = activity.where(activity.isin(known_categories), "unqualified")

In [None]:
# Show the (up to) 20 most frequent activity categories after grouping.
df.Activity.value_counts().nlargest(20)

# Time Cleaning

In [None]:
# Missing-value share of the Time column, taken from the per-column shares
# computed in the Activity section (`df_time_activity_null`).
time_null = df_time_activity_null["Time"]
# Let the format spec produce the percentage: round(x, 2) * 100 leaks binary
# float artefacts into the message (e.g. 7.000000000000001).
print(f"Time has {time_null:.0%} empty values.")

In [None]:
# Collapse free-text descriptions into a day-part label. Each regex replaces the
# WHOLE cell as soon as the keyword occurs anywhere in it (case-insensitive).
df['Time'] = df['Time'].replace(to_replace=r'(?i).*noon.*', value='Afternoon', regex=True)
df['Time'] = df['Time'].replace(to_replace=r'(?i).*morning.*', value='Morning', regex=True)
df['Time'] = df['Time'].replace(to_replace=r'(?i).*Night.*', value='Night', regex=True)
# Exact-match shorthand labels. Dusk is the evening twilight, so it belongs to
# "Evening" (not "Morning"); Dawn is its morning counterpart.
df.Time = df.Time.replace({
    "P.M.": "Afternoon",
    "A.M.": "Morning",
    "Dawn": "Morning",
    "Dusk": "Evening",
    "Midday": "Afternoon",
})

In [None]:
def categorize_time(time):
    """Map a raw Time value to a day-part label.

    Parameters
    ----------
    time : object
        A raw cell value: typically a string such as "14h30", an already
        categorised label such as "Afternoon", a missing value, "?", or a
        ``datetime.time`` / ``datetime.datetime`` object.

    Returns
    -------
    object
        "Unknown" for missing values and "?".
        "Morning" (06-11), "Afternoon" (12-17), "Evening" (18-21) or "Night"
        (22-05, and 24) when an hour can be read from the value.
        The value itself, unchanged, when no plausible hour can be read.

    NOTE(review): no visible cell applies this function to df["Time"]; confirm
    whether ``df["Time"].apply(categorize_time)`` is missing from the notebook.
    """
    if pd.isna(time) or time == "?":
        return "Unknown"

    if isinstance(time, (datetime.time, datetime.datetime)):
        # Time cells that arrive as time objects carry the hour directly.
        hour = time.hour
    elif isinstance(time, str):
        # Accept one- or two-digit leading hours ("8h30" as well as "08h30").
        leading_hour = re.match(r"\s*(\d{1,2})", time)
        if leading_hour is None:
            return time
        hour = int(leading_hour.group(1))
    else:
        return time

    if hour > 24:
        # e.g. "30 minutes after ..." - the leading number is not an hour.
        return time

    if 6 <= hour < 12:
        return "Morning"
    if 12 <= hour < 18:
        return "Afternoon"
    if 18 <= hour < 22:
        return "Evening"
    return "Night"

# Columns we need: "Fatal Y/N", "Country", "Date", "Activity", "Time"
# -> GFM: Fatal & Country
# -> LB: Date
# -> MB: Activity & Time


# Further stuff:
# - age, gender of the person we try to kill
# - species, e.g. "should be killed by a white shark"





-----
# Guillermo Working on Cleaning  Fatal Y/N


In [None]:
# Clean the 'Fatal Y/N' column down to three codes: 'Y', 'N' and 'U' (unknown).

# Raw values observed in the column before cleaning (kept for reference only).
values = ['N', 'Y', 'F', 'M', float('nan'), 'n', 'Nq', 'UNKNOWN', 2017, 'Y x 2', ' N', 'N ', 'y']

# Canonical code for each NORMALISED raw value (stringified, stripped, upper-cased).
# Everything not listed here - 'F', 'M', 'UNKNOWN', 2017, missing values, and any
# variant not seen yet - is treated as unknown ('U').
mapping = {
    'N': 'N',
    'Y': 'Y',
    'NQ': 'N',      # 'Nq' is read as a typo for 'N'
    'Y X 2': 'Y',   # 'Y x 2' is a form of 'Yes'
}

# Normalising first means whitespace/case variants (' N', 'N ', 'n', 'y', ...) do not
# have to be enumerated one by one, and unlisted variants cannot slip through uncleaned.
fatal_normalised = df['Fatal Y/N'].astype(str).str.strip().str.upper()
df['Fatal Y/N'] = fatal_normalised.map(mapping).fillna('U')

# Print the cleaned DataFrame column
# print(df['Fatal Y/N'].value_counts())

# Reference counts recorded by the author.
# Before cleaning:
#   N 4877, Y 1474, UNKNOWN 71, ' N' 7, F 5, M 3, n 1, Nq 1, 2017 1, 'Y x 2' 1, 'N ' 1, y 1
#   unique: ['N' 'Y' 'F' 'M' nan 'n' 'Nq' 'UNKNOWN' 2017 'Y x 2' ' N' 'N ' 'y']
#   NaN values: 561
# After cleaning:
#   N 4887, Y 1476, U 641
#   unique: ['N' 'Y' 'U']
#   NaN values: 0




----
# Cleaning Country:

In [None]:
# Normalise country names: trim surrounding whitespace and lower-case them
country_text = df['Country'].str
df['Country'] = country_text.strip().str.lower()


In [None]:
# Rows without a country get the placeholder 'unknown'
df['Country'] = df['Country'].where(df['Country'].notna(), 'unknown')


In [None]:
# 1st step: split region-like entries out of the 'Country' column.
#
# Many 'Country' cells name an ocean, sea or continent instead of a country.
# Those labels move to a new 'Region' column so the rows stay usable at a more
# general level, and their 'Country' becomes 'unknown'.
regions = [
    # oceans and seas
    'atlantic ocean', 'pacific ocean', 'indian ocean', 'red sea', 'mediterranean sea',
    'caribbean sea', 'bay of bengal', 'gulf of aden', 'persian gulf', 'tasman sea',
    'south china sea', 'north atlantic ocean', 'south atlantic ocean', 'north pacific ocean',
    'south pacific ocean', 'southwest pacific ocean', 'central pacific', 'coral sea',
    'mid atlantic ocean', 'mid-pacifc ocean',
    # vague or composite areas
    'between portugal & india', 'coast of africa',
    'the balkans', 'british isles', 'west indies', 'antarctica', 'ocean',
    # continents
    'africa',
    'europe', 'north america', 'south america', 'oceania', 'middle east', 'asia',
]

# Match on a '?'-free, stripped copy of Country, held in a local Series instead
# of a temporary DataFrame column.
country_stripped = df['Country'].str.replace('?', '', regex=False).str.strip()
is_region = country_stripped.isin(regions)

# Region rows carry their label over; every other row starts as 'unknown'.
df['Region'] = country_stripped.where(is_region, 'unknown')
# A region is not a country, so blank those rows out in 'Country'.
df['Country'] = df['Country'].mask(is_region, 'unknown')

In [None]:
# Regroup regions: fold ambiguous or fine-grained labels into broader ones.
region_grouping = {
    # Ambiguous regions
    'british isles': 'northern europe',
    'ocean': 'unknown',
    'coast of africa': 'africa',
    'mid-pacifc ocean': 'pacific ocean',   # raw spelling as it appears in the data
    'mid-pacific ocean': 'pacific ocean',  # spelling after the typo fix in step 1 below
    'between portugal & india': 'unknown',
    'the balkans': 'southeast europe',
    'mid atlantic ocean': 'atlantic ocean',
    'central pacific': 'pacific ocean',

    # Optional: Group smaller regions under continents
    'caribbean sea': 'caribbean',
    'bay of bengal': 'indian ocean',
    'gulf of aden': 'indian ocean',
    'persian gulf': 'middle east',
    'tasman sea': 'pacific ocean',
    'south china sea': 'pacific ocean',
    'mediterranean sea': 'mediterranean',
}
# Step 1: Correct typos (e.g., "mid-pacifc ocean" -> "mid-pacific ocean").
# Literal replacement; this is why the grouping above also needs the corrected
# spelling as a key - the misspelled key alone can no longer match after this line.
df['Region'] = df['Region'].str.replace('pacifc', 'pacific', regex=False)

# Step 2: Apply grouping (exact-match replacement of whole cell values)
df['Region'] = df['Region'].replace(region_grouping)

# Step 3: Handle remaining ambiguous entries (e.g., "africa", "asia")
# Every entry currently maps a continent to itself, so this replace is a no-op;
# it is kept as the single place to change if continents should be regrouped.
continent_to_region = {
    'africa': 'africa',  # Keep if continents are valid regions for your analysis
    'asia': 'asia',
    'europe': 'europe',
    'north america': 'north america',
    'south america': 'south america',
}
df['Region'] = df['Region'].replace(continent_to_region)

In [None]:
# Move 'Region' so that it sits directly in front of 'Country'.
# Start from every column label except 'Region' ...
cols = [label for label in df.columns.tolist() if label != 'Region']

# ... and slot 'Region' back in at the index 'Country' currently occupies.
country_position = cols.index('Country')
cols[country_position:country_position] = ['Region']

# Select the columns in the new order
df = df[cols]

-------
Further cleaning of the Country column

In [None]:
# 2nd version of country mapping (after taking out the regions).
# The table is assembled from themed groups; keys are the raw (lower-cased,
# stripped) labels, values are the names they are replaced with.

# Spelling variants, abbreviations and historical names
_country_renames = {
    'st kitts / nevis': 'saint kitts and nevis',
    'st. martin': 'saint martin',
    'st. maartin': 'saint martin',
    'st helena, british overseas territory': 'saint helena',
    'turks & caicos': 'turks and caicos',
    'ceylon (sri lanka)': 'sri lanka',
    'ceylon': 'sri lanka',
    'burma': 'myanmar',
    'western samoa': 'samoa',
    'british new guinea': 'papua new guinea',
    'andaman / nicobar islandas': 'andaman and nicobar islands',
    'andaman islands': 'andaman and nicobar islands',
    'johnston island': 'johnston atoll',
    'san domingo': 'dominican republic',
    'grand cayman': 'cayman islands',
    'columbia': 'colombia',
    'united arab emirates (uae)': 'united arab emirates',
    'trinidad & tobago': 'trinidad and tobago',
}

# Labels folded into a broader or neighbouring entity
_country_merges = {
    'british isles': 'united kingdom',
    'british virgin islands': 'virgin islands',
    'netherlands antilles': 'netherlands antilles',  # maps to itself
    'coast of africa': 'africa',
    'red sea / indian ocean': 'indian ocean',
    'solomon islands / vanuatu': 'solomon islands and vanuatu',
}

# Entries naming two countries: the first one is kept
_two_country_entries = {
    'egypt / israel': 'egypt',
    'equatorial guinea / cameroon': 'equatorial guinea',
    'iran / iraq': 'iran',
    'italy / croatia': 'italy',
}

# Deliberately NOT mapped here: 'between portugal & india', 'ocean', 'unknown',
# 'mid-pacifc ocean' and 'the balkans'.
country_mapping = {**_country_renames, **_country_merges, **_two_country_entries}

# Swap every Country value that is a key of `country_mapping` for its mapped name
cleaned_country = df['Country'].replace(country_mapping)
df['Country'] = cleaned_country

# Eyeball the distinct country names that remain
print(cleaned_country.unique())

In [None]:
# Region -> member countries. `region_mapping` below is the inverted lookup
# (country -> region) derived from this table.
_countries_by_region = {
    # Australia and Oceania
    'australia and oceania': [
        'australia', 'new caledonia', 'new zealand', 'fiji', 'samoa', 'papua new guinea',
        'solomon islands', 'vanuatu', 'kiribati', 'tonga', 'micronesia', 'marshall islands',
        'palau', 'cook islands', 'french polynesia', 'american samoa', 'guam',
        'northern mariana islands', 'new guinea', 'admiralty islands', 'johnston atoll',
    ],

    # Caribbean
    'caribbean': [
        'bahamas', 'turks and caicos', 'jamaica', 'cuba', 'dominican republic',
        'cayman islands', 'aruba', 'puerto rico', 'saint kitts and nevis', 'saint martin',
        'trinidad and tobago', 'barbados', 'grenada', 'antigua', 'virgin islands', 'haiti',
        'belize', 'netherlands antilles', 'curacao',
    ],

    # Americas
    'north america': ['usa', 'canada', 'mexico', 'greenland', 'bermuda'],
    'south america': [
        'brazil', 'argentina', 'colombia', 'ecuador', 'chile', 'peru', 'venezuela',
        'uruguay', 'paraguay', 'guyana', 'falkland islands',
    ],

    # Europe
    'southern europe': [
        'spain', 'portugal', 'greece', 'italy', 'malta', 'croatia', 'montenegro',
        'slovenia', 'cyprus', 'crete',
    ],
    'western europe': ['france', 'united kingdom', 'ireland', 'azores'],
    'northern europe': ['iceland', 'norway', 'sweden'],

    # Africa
    'northern africa': ['egypt', 'morocco', 'libya', 'tunisia', 'algeria', 'sudan', 'sudan?'],
    'southern africa': [
        'south africa', 'mozambique', 'madagascar', 'mauritius', 'seychelles', 'comoros',
        'reunion', 'reunion island', 'saint helena', 'namibia',
    ],
    'central africa': ['angola', 'cameroon', 'gabon', 'equatorial guinea'],
    'eastern africa': ['kenya', 'tanzania', 'somalia', 'ethiopia', 'djibouti'],
    'western africa': ['senegal', 'nigeria', 'ghana', 'liberia', 'sierra leone', 'cape verde'],

    # Asia
    'south asia': ['india', 'sri lanka', 'maldives', 'maldive islands', 'bangladesh', 'pakistan'],
    'middle east': [
        'iran', 'iraq', 'saudi arabia', 'yemen', 'jordan', 'lebanon', 'syria', 'kuwait',
        'united arab emirates', 'qatar', 'oman', 'israel', 'palestinian territories', 'turkey',
    ],
    'east asia': ['china', 'japan', 'south korea', 'north korea', 'taiwan', 'hong kong'],
    'southeast asia': [
        'thailand', 'malaysia', 'singapore', 'indonesia', 'philippines', 'vietnam',
        'myanmar', 'cambodia', 'laos', 'brunei',
    ],

    # Pacific
    'pacific': [
        'hawaii', 'okinawa', 'new britain', 'java', 'solomon islands and vanuatu',
        'federated states of micronesia',
    ],

    # Other: placeholders and bodies of water
    'unknown': ['unknown'],
    'indian ocean': ['indian ocean', 'northern arabian sea'],
    'north sea': ['north sea'],
    'atlantic ocean': ['atlantic ocean'],
    'pacific ocean': ['pacific ocean'],
    'mediterranean': ['mediterranean sea'],
}

# Country -> region lookup used to fill the 'Region' column
region_mapping = {
    country: region
    for region, countries in _countries_by_region.items()
    for country in countries
}

# Look up a region for every row from its Country.
country_region = df['Country'].map(region_mapping)

# A lookup only counts when it yields a real region: countries missing from
# `region_mapping` give NaN, and Country 'unknown' maps to 'unknown'.
has_country_region = country_region.notna() & country_region.ne('unknown')

# Rows without a usable country keep the Region they already have. Those are the
# ocean / sea / continent labels that were moved out of Country earlier (their
# Country is 'unknown' by now), so assigning the lookup unconditionally would
# erase every one of them.
if 'Region' in df.columns:
    previous_region = df['Region']
else:
    previous_region = pd.Series('unknown', index=df.index)

# Fill any value that is still missing with 'unknown'
df['Region'] = country_region.where(has_country_region, previous_region).fillna('unknown')

In [None]:
# Work on the textual form of Date and drop any trailing " 00:00:00"
# (presumably left behind when datetime cells are stringified - confirm against the data).
df["Date"] = df["Date"].astype(str).str.replace(r"\s*00:00:00", "", regex=True)

# A date counts as valid when it is exactly "DD-Mon-YYYY" or "Mon-YYYY"
valid_date_pattern = r"^\d{2}-[A-Za-z]{3}-\d{4}$|^[A-Za-z]{3}-\d{4}$"
is_valid_date = df["Date"].str.match(valid_date_pattern, na=False)

# Independent copies: the later cleaning steps assign to columns of these frames,
# which must neither write through to `df` nor raise SettingWithCopyWarning.
correct_dates = df[is_valid_date].copy()

# Everything else is kept separately for cleaning
incorrect_dates = df[~is_valid_date].copy()
# Keywords that mark a date as too vague to repair (matched case-insensitively).
# The short abbreviations (ca, BC/BCE, AD, CE, OR) are anchored to word boundaries:
# unanchored, "CE" also matches the "ce" inside "December" and silently drops
# every date that spells that month out.
unwanted_patterns = (
    r"(?:no date|world war|before|after|between|circa|approx|around|prior"
    r"|\bc\.?\s?a\.?(?![A-Za-z])"      # ca / ca. / c.a. / c a
    r"|\bB\.?C\.?E?\.?(?![A-Za-z])"    # BC / B.C. / BCE
    r"|\bA\.?D\.?(?![A-Za-z])"         # AD / A.D.
    r"|\bC\.?E\.?(?![A-Za-z])"         # CE / C.E.
    r"|\bOR\b)"
)

# Drop rows containing any of these keywords. `.copy()` so that the column
# assignment below acts on an independent frame.
has_unwanted = incorrect_dates["Date"].str.contains(
    unwanted_patterns, flags=re.IGNORECASE, regex=True, na=False
)
incorrect_dates = incorrect_dates[~has_unwanted].copy()

# Remove the qualifier words but keep the rest of the date
incorrect_dates["Date"] = (
    incorrect_dates["Date"]
    .str.replace(r"\b(Reported|Late|Early)\b", "", case=False, regex=True)
    .str.strip()
)

# A bare 4-digit year or decade ("1905", "1950s") carries no month, so it cannot be repaired
standalone_year_pattern = r"^\d{4}s?$"
is_standalone_year = incorrect_dates["Date"].str.match(standalone_year_pattern, na=False)
incorrect_dates = incorrect_dates[~is_standalone_year].copy()

# Full month names -> three-letter abbreviations (e.g. "31-July-2009" -> "31-Jul-2009")
month_fix = {
    "January": "Jan", "February": "Feb", "March": "Mar", "April": "Apr", "May": "May", "June": "Jun",
    "July": "Jul", "August": "Aug", "September": "Sep", "October": "Oct", "November": "Nov", "December": "Dec"
}

# Ordered regex rewrites that bring near-miss dates towards "DD-Mon-YYYY".
# They are applied one after another, so later rules see the output of earlier ones.
date_rewrites = [
    # "5 Jul 2014" / "22 Jul-2023" -> "5-Jul-2014" / "22-Jul-2023"
    (r"(\d{1,2})\s([A-Za-z]+)-?\s?(\d{4})", r"\1-\2-\3"),
    # "YYYY.MM.DD.x" -> "YYYY.MM.DD": strip the trailing single character.
    # This must run BEFORE the dotted date is rearranged, otherwise the pattern
    # can never match and the suffix stays glued to the date.
    (r"(\d{4}\.\d{2}\.\d{2})\.\w$", r"\1"),
    # "YYYY.MM.DD" -> "DD-MM-YYYY"
    (r"(\d{4})\.(\d{2})\.(\d{2})", r"\3-\2-\1"),
    # leading dots and spaces
    (r"^[\.\s]+", ""),
    # "12-Decp1999" -> "12-Dec-1999"
    (r"(\d{1,2})-(Decp)(\d{4})", r"\1-Dec-\3"),
    # "08-Jun 2023" -> "08-Jun-2023"
    (r"(\d{1,2}-[A-Za-z]{3}) (\d{4})", r"\1-\2"),
    # "13 May-1871" -> "13-May-1871"
    (r"(\d{1,2}) ([A-Za-z]{3}-\d{4})", r"\1-\2"),
    # "YYYY-MM-DD" -> "DD-MM-YYYY" (the numeric month is turned into text in a later step)
    (r"(\d{4})-(\d{2})-(\d{2})", r"\3-\2-\1"),
    # ordinal days: "3rd-Oct-2016" -> "3-Oct-2016"
    (r"(\d{1,2})(st|nd|rd|th)-", r"\1-"),
    # full month names -> abbreviations
    *month_fix.items(),
    # single-digit day gets a leading zero: "9-Mar-2018" -> "09-Mar-2018"
    (r"\b(\d{1})-([A-Za-z]{3}-\d{4})", r"0\1-\2"),
    # "Mon-DD-YYYY" -> "DD-Mon-YYYY": "May-17-1803" -> "17-May-1803"
    (r"([A-Za-z]{3})-(\d{1,2})-(\d{4})", r"\2-\1-\3"),
]

for pattern, replacement in date_rewrites:
    incorrect_dates["Date"] = incorrect_dates["Date"].str.replace(pattern, replacement, regex=True)
# English month abbreviations, spelled out so the result does not depend on the
# process locale (strftime("%b") does).
_MONTH_ABBREVIATIONS = (
    "Jan", "Feb", "Mar", "Apr", "May", "Jun",
    "Jul", "Aug", "Sep", "Oct", "Nov", "Dec",
)


def convert_numeric_to_text_date(date_str):
    """Convert "DD-MM-YYYY" to "DD-Mon-YYYY" (e.g. "22-02-2025" -> "22-Feb-2025").

    Parameters
    ----------
    date_str : object
        Candidate date. Only strings that parse as day-month-year with a numeric
        month are converted.

    Returns
    -------
    object
        The converted string, or ``date_str`` unchanged when it is not a string,
        does not have that layout, or is not a real calendar date.
    """
    try:
        parsed = datetime.datetime.strptime(date_str, "%d-%m-%Y")
    except (ValueError, TypeError):
        # ValueError: wrong layout / impossible date; TypeError: not a string (e.g. NaN)
        return date_str
    # Build the text by hand: zero-padding of the year under strftime("%Y") is platform-dependent.
    return f"{parsed.day:02d}-{_MONTH_ABBREVIATIONS[parsed.month - 1]}-{parsed.year:04d}"

# Turn numeric-month dates ("DD-MM-YYYY") into "DD-Mon-YYYY"
incorrect_dates["Date"] = incorrect_dates["Date"].apply(convert_numeric_to_text_date)

# Re-test every repaired value against the valid-date pattern (one mask, used twice)
now_valid = incorrect_dates["Date"].str.match(valid_date_pattern, na=False)

# Rows that pass now join the correct set ...
new_correct_dates = incorrect_dates[now_valid]
correct_dates = pd.concat([correct_dates, new_correct_dates])

# ... and only the still-invalid ones remain for further cleaning
incorrect_dates = incorrect_dates[~now_valid]
# Order the valid dates newest-first while leaving the 'Date' text untouched.
#
# The sort keys live in a separate frame instead of helper columns on
# `correct_dates`: helper columns named "Year"/"Month"/"Day" would overwrite and
# then drop any same-named column already in the data.
# NOTE(review): GSAF5 normally ships its own "Year" column - confirm with df.columns.
# A preliminary chronological sort is not needed: the sort below determines the
# final order on its own (only the relative order of rows sharing a date may differ).

# Fresh positional index so the key frame and the data line up one-to-one
correct_dates = correct_dates.reset_index(drop=True)

# Convert month names to numbers for proper sorting
month_map = {"Jan": 1, "Feb": 2, "Mar": 3, "Apr": 4, "May": 5, "Jun": 6,
             "Jul": 7, "Aug": 8, "Sep": 9, "Oct": 10, "Nov": 11, "Dec": 12}

date_text = correct_dates["Date"]
sort_keys = pd.DataFrame({
    "year": date_text.str.extract(r"(\d{4})", expand=False).astype(float),
    # No month / no day ("Mon-YYYY" has no day) -> 0, which sorts last in descending order
    "month": date_text.str.extract(r"([A-Za-z]{3})", expand=False).map(month_map).fillna(0),
    "day": date_text.str.extract(r"(^\d{1,2})", expand=False).astype(float).fillna(0),
})

# Sort in descending order: first by year, then month, then day
newest_first = sort_keys.sort_values(
    by=["year", "month", "day"], ascending=[False, False, False]
).index
correct_dates = correct_dates.loc[newest_first].reset_index(drop=True)

In [None]:
# Write the rows with a valid, sorted date to a new workbook in the working directory.
# NOTE(review): no `index=` argument is passed, so the frame's index is exported as
# well - confirm whether `index=False` is wanted.
correct_dates.to_excel("SharkAttack_clean.xlsx")