In [None]:
import pandas as pd

In [None]:
# Read the Excel workbook "GSAF5.xlsx" (path is relative to the notebook's
# working directory) into `df`, the frame the later cells work on.
df = pd.read_excel("GSAF5.xlsx")

In [None]:
# Show the column labels of `df` (bare last expression, so the notebook displays it).
df.columns

In [None]:
# Goal: Commit the perfect shark murder
# Problem Statement: Identify the correct location, activity, time of day and time of year

# Hypothesis 1: The perfect location is Australia (can be narrowed down on state level if fitting to the country)
# Hypothesis 2: The perfect time of the year is early in the year on a morning
# Hypothesis 3: The deadliest sharks will be white, tiger and bull

# Columns we need: "Fatal Y/N", "Country", "Date", "Activity", "Time"
# -> GFM: Fatal Y/N & Country
# -> LB: Date
# -> MB: Activity & Time


# Further stuff:
# - age and gender of the person we try to kill
# - species, for requirements such as "should be killed by a white shark"





In [None]:
def clean_date_column(df, column_name, date_format='%Y-%m-%d', fill_na=None):
    """
    Parse a date column, split off the rows that fail to parse, and format the rest.

    The column is parsed with ``pd.to_datetime(..., errors='coerce')``, so every
    value that cannot be interpreted as a date (including values that were
    already missing) becomes NaT and is treated as invalid.

    Parameters:
    - df (pd.DataFrame): The DataFrame containing the date column. It is not modified.
    - column_name (str): The name of the date column to clean.
    - date_format (str, optional): strftime format used for the cleaned column
      (default: '%Y-%m-%d'). The cleaned column therefore holds strings.
    - fill_na (str or pd.Timestamp, optional): A default date for rows whose date
      is missing/invalid. If None (default), those rows are left out of the
      cleaned DataFrame. If given, those rows are kept in the cleaned DataFrame
      with this date.

    Returns:
    - pd.DataFrame: The cleaned DataFrame (dates formatted with ``date_format``).
    - pd.DataFrame: The rows that had missing/invalid dates (date column is NaT),
      returned regardless of ``fill_na`` so they can be inspected.
    - dict: Summary with the keys "total_rows", "valid_rows" (rows that parsed),
      "invalid_rows" (rows that did not) and "filled_rows" (rows that received
      ``fill_na``; 0 when ``fill_na`` is None).

    Raises:
    - KeyError: if ``column_name`` is not a column of ``df``.
    - ValueError: if ``fill_na`` cannot be converted by ``pd.Timestamp``.
    """
    frame = df.copy()  # Avoid modifying the caller's DataFrame

    # Convert column to datetime, coercing anything unparseable to NaT
    parsed = pd.to_datetime(frame[column_name], errors='coerce')
    unparsed = parsed.isna()
    frame[column_name] = parsed

    # Snapshot the invalid rows before any filling so they stay inspectable
    invalid_df = frame[unparsed].copy()
    invalid_count = int(unparsed.sum())

    if fill_na is not None:
        # Keep every row; the unparseable ones receive the default date.
        # (Previously the fill ran on the already-filtered valid rows and so
        # never changed anything, while the summary still claimed rows were filled.)
        frame[column_name] = parsed.fillna(pd.Timestamp(fill_na))
        valid_df = frame
        filled_count = invalid_count
    else:
        valid_df = frame[~unparsed].copy()
        filled_count = 0

    # Standardize date format (turns the datetime column into strings)
    valid_df[column_name] = valid_df[column_name].dt.strftime(date_format)

    # Summary of changes; valid_rows + invalid_rows always equals total_rows
    summary = {
        "total_rows": len(frame),
        "valid_rows": len(frame) - invalid_count,
        "invalid_rows": invalid_count,
        "filled_rows": filled_count
    }

    print("Date Cleaning Summary:", summary)

    return valid_df, invalid_df, summary

# Clean the 'Date' column with the default date_format and no fill value.
# cleaned_df holds the rows whose date parsed, invalid_df the rows whose date
# did not, and summary the row counts (also printed by the function).
cleaned_df, invalid_df, summary = clean_date_column(df, 'Date')

# Show the rows that kept a usable date.
print("\nCleaned DataFrame:")
display(cleaned_df)

# Show the rows whose date could not be parsed, for manual inspection.
print("\nInvalid DataFrame:")
display(invalid_df)


### Cleaning the `Fatal Y/N` column (Guillermo)

Original values in the `Fatal Y/N` column:

```python
# Sample data as a list
values = ['N', 'Y', 'F', 'M', float('nan'), 'n', 'Nq', 'UNKNOWN', 2017, 'Y x 2', ' N', 'N ', 'y']
```

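Value counts, unique values and number of missing entries before cleaning
(the second `N` row with count 1 is `'N '` with a trailing space):

```
different values in column fatal y/n:
 Fatal Y/N
N          4877
Y          1474
UNKNOWN      71
 N            7
F             5
M             3
n             1
Nq            1
2017          1
Y x 2         1
N             1
y             1
Name: count, dtype: int64
['N' 'Y' 'F' 'M' nan 'n' 'Nq' 'UNKNOWN' 2017 'Y x 2' ' N' 'N ' 'y']
Number of NaN values in 'Fatal Y/N' column: 561
```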

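The result after cleaning (variants of `N` and `Y` are normalised; `F`, `M`, `UNKNOWN`, `2017` and missing values become `U`):

```
different values in column fatal y/n:
 Fatal Y/N
N    4887
Y    1476
U     641
Name: count, dtype: int64
['N' 'Y' 'U']
Number of NaN values in 'Fatal Y/N' column: 0
```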



In [None]:
# Frequency of each distinct value in 'Fatal Y/N' (value_counts() leaves out NaN by default).
print('different values in column fatal y/n:\n', df['Fatal Y/N'].value_counts())

# Every distinct value in the column, NaN included.
unique_values = df['Fatal Y/N'].unique()
print(unique_values)

# Number of missing entries, reported separately because value_counts() above omits them.
num_nan_in_column = df['Fatal Y/N'].isna().sum()
print(f"Number of NaN values in 'Fatal Y/N' column: {num_nan_in_column}")

In [None]:
# Clean the 'Fatal Y/N' column so that it holds exactly three codes:
#   'Y' = fatal, 'N' = not fatal, 'U' = unknown / unusable entry.
#
# Recorded result on this dataset:
#   before: N 4877, Y 1474, UNKNOWN 71, ' N' 7, F 5, M 3, n 1, Nq 1, 2017 1,
#           'Y x 2' 1, 'N ' 1, y 1, plus 561 NaN
#   after:  N 4887, Y 1476, U 641, no NaN

# Raw values observed in the column before cleaning (kept for reference)
values = ['N', 'Y', 'F', 'M', float('nan'), 'n', 'Nq', 'UNKNOWN', 2017, 'Y x 2', ' N', 'N ', 'y']

# Explicit mapping from every known raw value to its cleaned code
mapping = {
    'N': 'N',
    'Y': 'Y',
    'F': 'U',           # 'F' and 'M' are unusual entries, so they are considered unknown
    'M': 'U',
    'n': 'N',
    'Nq': 'N',          # NOTE(review): treated as a typo for 'N' - confirm
    'UNKNOWN': 'U',
    2017: 'U',          # a bare year says nothing about fatality, so it counts as unknown
    'Y x 2': 'Y',       # 'Y x 2' is considered a form of 'Y'
    ' N': 'N',
    'N ': 'N',
    'y': 'Y'
}

# Series.map() looks each entry up in `mapping`; entries without a key (NaN and
# any raw value not listed above) come back as NaN and are then set to 'U'.
# Unlike replace(), this guarantees the column ends up containing only
# 'Y', 'N' and 'U'. Re-running the cell is safe: 'Y'/'N' map to themselves and
# 'U' falls through to the fill value.
df['Fatal Y/N'] = df['Fatal Y/N'].map(mapping).fillna('U')

# Print the cleaned DataFrame column
print(df['Fatal Y/N'])


