In [1]:
''' 
Script para descargar un archivo de Google Drive y mostrar las primeras 10 líneas
'''

url_view = "https://drive.google.com/file/d/1RMKdCJKS7vFBhiTLosCotau1k4UsN5iu/view?usp=sharing"   # Enlace de Google Drive
file_id = url_view.split('/')[5]  # Extrae el ID del archivo (la 6ª parte del enlace)
download_url = f"https://drive.google.com/uc?id={file_id}"   # Crea el enlace de descarga directa

import requests
response = requests.get(download_url)

# Muestra las primeras 10 líneas, para saber el tipo de separación del csv
for i, line in enumerate(response.text.splitlines()[:10], start=1):
    print(f"{i:02d}: {line}")


01: Date;Year;Type;Country;State;Location;Activity;Name;Sex;Age;Injury;Fatal Y/N;Time;Species ;Source;pdf;href formula;href;Case Number;Case Number;original order;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
02: 11th October;2025;Unprovoked;Australia;Queensland;Cook Esplanade Thursday Island;Fishing/swimming;Samuel Nai;M;14;Serious abdonminal injuries;N;1823 hrs;Tiger or Bull shark;Kevin McMurray Trackingsharks.com;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
03: 7th October;2025;Unprovoked;Australia;South Australia;Kangaroo Island;Surfing;Lee Berryman;M;50+;Lacerations to calf ;N;1330hrs;Bronze whaler?;Kevin

In [2]:
import pandas as pd
from typing import Optional, Iterable

shark_df = pd.read_csv(download_url, sep=';', encoding='utf-8', low_memory=False)
print("Inicialmente los datos tienen:", shark_df.shape)
print(shark_df.head())  # Muestra las primeras filas del DataFrame

shark_df = shark_df.dropna(axis=1, how='all') # Eliminar columnas vacías enteras
print("Ahora los datos tienen:",shark_df.shape)
print(shark_df.columns)  # Muestra las columnas del DataFrame: 255 col a 23 col

Inicialmente los datos tienen: (39139, 255)
             Date  Year        Type     Country            State  \
0    11th October  2025  Unprovoked   Australia       Queensland   
1     7th October  2025  Unprovoked   Australia  South Australia   
2  29th September  2025  Unprovoked         USA   Off California   
3  27th September  2025    Provoked  Costa Rica              NaN   
4   6th September  2025  Unprovoked   Australia              NSW   

                         Location               Activity                Name  \
0  Cook Esplanade Thursday Island       Fishing/swimming          Samuel Nai   
1                 Kangaroo Island                Surfing        Lee Berryman   
2                 Catalina Island               Swimming  Christopher Murray   
3                   Cocos Islands  Diving-Tagging sharks  Dr. Mauricio Hoyos   
4                Long Reef Sydney                Surfing  Mercury Psillaskis   

  Sex  Age  ... Unnamed: 245 Unnamed: 246 Unnamed: 247 Unnamed: 24

In [3]:
#Quitamos las columnas que no nos sirven para el análisis

shark_df.drop(['Date','Location','Name','Age','Injury','Time','Source', 'pdf', 'href formula', 'href', 'Case Number', 'Case Number.1',
       'original order', 'Unnamed: 21', 'Unnamed: 22'], axis=1, inplace=True)

print(shark_df.columns)  # Muestra las columnas del DataFrame después de eliminar las que no analizaremos

Index(['Year', 'Type', 'Country', 'State', 'Activity', 'Sex', 'Fatal Y/N',
       'Species '],
      dtype='object')


In [4]:
""""
Normalize text columns (strip spaces, set lowercase).

Parameters
----------
df : DataFrame
cols : columns to normalize; if None, all object/string columns
lower : convert to lowercase
strip : strip leading/trailing whitespace
normalize_columns: normalize column names (strip, lower, replace spaces with _)

Returns
-------
DataFrame (same object, modified in place style but returns df for chaining)
"""

def standardize_text(
    df: pd.DataFrame,
    cols: Optional[Iterable[str]] = None,
    lower: bool = True,
    strip: bool = True,
    normalize_columns: bool = True,
) -> pd.DataFrame:
       
    if cols is None:
        cols = df.select_dtypes(include=["object", "string"]).columns

    for c in cols:
        s = df[c].astype("string")
        if strip:
            s = s.str.strip()
        if lower:
            s = s.str.lower()
        df[c] = s
     
    if normalize_columns:
        df.columns = (
            df.columns
            .str.strip()
            .str.lower()
            .str.replace(" ", "_")
        )

    return df

shark_df = standardize_text(shark_df)
print(shark_df.head())  # Muestra las primeras filas del DataFrame después de la normalización del texto y las columnas

   year        type     country            state               activity sex  \
0  2025  unprovoked   australia       queensland       fishing/swimming   m   
1  2025  unprovoked   australia  south australia                surfing   m   
2  2025  unprovoked         usa   off california               swimming   m   
3  2025    provoked  costa rica             <NA>  diving-tagging sharks   m   
4  2025  unprovoked   australia              nsw                surfing   m   

  fatal_y/n              species  
0         n  tiger or bull shark  
1         n       bronze whaler?  
2         n   unknown 1.2m shark  
3         n       tiger shark 4m  
4         y    great white shark  


In [5]:
#Check valores únicos por columna:

for col in shark_df.columns:
    print(f'{col.upper()} --> valores únicos:', shark_df[col].unique())

YEAR --> valores únicos: <StringArray>
['2025', '2024', '2026', '2023', '2022', '2021', '2020', '2019', '2018',
 '2017',
 ...
 '1580', '1555', '1554', '1543', '1518', '1500', '1000', '0077', '0005',
 '0000']
Length: 263, dtype: string
TYPE --> valores únicos: <StringArray>
[         'unprovoked',            'provoked',        'questionable',
          'watercraft',        'sea disaster',                  <NA>,
                   '?',         'unconfirmed',          'unverified',
             'invalid', 'under investigation',                'boat']
Length: 12, dtype: string
COUNTRY --> valores únicos: <StringArray>
[               'australia',                      'usa',
               'costa rica',                  'bahamas',
              'puerto rico',         'french polynesia',
                    'spain',           'canary islands',
             'south africa',                  'vanuatu',
 ...
        'mediterranean sea',                   'sweden',
                   'roatan', 'b

In [6]:
shark_df.dtypes

year         string[python]
type         string[python]
country      string[python]
state        string[python]
activity     string[python]
sex          string[python]
fatal_y/n    string[python]
species      string[python]
dtype: object

In [7]:
print('Los valores iniciales eran:', shark_df['year'].nunique)
print('Los valores iniciales eran:', shark_df['year'].shape)
print('Los valores iniciales eran:', shark_df['type'].nunique)
print('Los valores iniciales eran:', shark_df['type'].shape)

#PRIMER FILTRO (Marlene):

shark_df = shark_df[shark_df['type'] == 'unprovoked'].copy() #Quitamos los accidentes que no sean "unprovoked"

Los valores iniciales eran: <bound method IndexOpsMixin.nunique of 0        2025
1        2025
2        2025
3        2025
4        2025
         ... 
39134    <NA>
39135    <NA>
39136    <NA>
39137    <NA>
39138    <NA>
Name: year, Length: 39139, dtype: string>
Los valores iniciales eran: (39139,)
Los valores iniciales eran: <bound method IndexOpsMixin.nunique of 0        unprovoked
1        unprovoked
2        unprovoked
3          provoked
4        unprovoked
            ...    
39134          <NA>
39135          <NA>
39136          <NA>
39137          <NA>
39138          <NA>
Name: type, Length: 39139, dtype: string>
Los valores iniciales eran: (39139,)


In [8]:
#Exportamos a excel post filtro: TYPE

shark_df.to_excel('SharkAttack_check1.xlsx')

In [9]:
print(shark_df[['year']].isnull().sum())

shark_df = shark_df.dropna(subset=['year']) #Quitamos los null de la columna "year"

print(shark_df.isnull().sum())

year    1
dtype: int64
year            0
type            0
country        33
state         300
activity      359
sex           168
fatal_y/n      17
species      2598
dtype: int64


In [10]:
#formateamos para que todos los string sean igual (con decimal ".")

shark_df['year'] = (
    shark_df['year']
    .astype(str)                 
    .str.replace(',', '.', regex=False)  # cambia coma por punto
    .astype(float)      
)
shark_df['year'] = pd.to_numeric(shark_df['year'])
shark_df = shark_df.loc[(shark_df['year'] >= 2000) & (shark_df['year'] <= 2025)] #filtramos los años sujetos a estudio

#CHECK FILTRO 1
print('Post filtro quedan:', shark_df['year'].shape , "valores")
print('Post filtro quedan:', shark_df['type'].shape , "valores")


Post filtro quedan: (2206,) valores
Post filtro quedan: (2206,) valores


In [11]:
#Check valores por columnas para próximos filtros

for col in shark_df.columns:
    print(f'{col.upper()} --> valores únicos:', shark_df[col].unique())

YEAR --> valores únicos: [2025. 2024. 2023. 2022. 2021. 2020. 2019. 2018. 2017. 2016. 2015. 2014.
 2013. 2012. 2011. 2010. 2009. 2008. 2007. 2006. 2005. 2004. 2003. 2002.
 2001. 2000.]
TYPE --> valores únicos: <StringArray>
['unprovoked']
Length: 1, dtype: string
COUNTRY --> valores únicos: <StringArray>
[                            'australia',
                                   'usa',
                               'bahamas',
                           'puerto rico',
                      'french polynesia',
                                 'spain',
                        'canary islands',
                          'south africa',
                               'vanuatu',
                               'jamaica',
                                'israel',
                              'maldives',
                      'turks and caicos',
                            'mozambique',
                         'new caledonia',
                                 'egypt',
                      

In [12]:
#Exportamos a excel post filtro: TYPE + YEAR

shark_df.to_excel('SharkAttack_check2.xlsx')

In [13]:
#SEGUNDO FILTRO (Marta):
#Decidimos filtrar por 11 países que representan el 90% de los ataques. 
country_selected = [
    "usa","australia","south africa","bahamas","brazil","new zealand",
    "new caledonia","egypt","reunion","french polynesia","mexico","reunion island"]

shark_df_filtered = shark_df[
    shark_df["country"].str.strip().str.lower().isin(country_selected)
].copy()

print("Después del segundo filtro, los datos tienen:", shark_df_filtered.shape)
# Reunion y Reunion island los llamamos como Reunion Island

map_reunion = {"reunion island": "reunion"}
shark_df_filtered["country"] = shark_df_filtered["country"].replace(map_reunion)
print(shark_df_filtered["country"].value_counts(dropna=False).sort_index())
shark_df = shark_df_filtered #sobrescribe el dataframe con los filtros y demás


Después del segundo filtro, los datos tienen: (1977, 8)
country
australia            434
bahamas               69
brazil                54
egypt                 30
french polynesia      26
mexico                25
new caledonia         35
new zealand           38
reunion               30
south africa         115
usa                 1121
Name: count, dtype: Int64


In [14]:
# Borramos state en blanco para esos paises
print("Antes:", shark_df_filtered.shape)
shark_df_filtered = shark_df_filtered.dropna(subset=["state"]).copy()
print("Después:", shark_df_filtered.shape)

Antes: (1977, 8)
Después: (1934, 8)


In [15]:
#Normalizando nombres de estados para los países seleccionados
#USA:

mask_usa = shark_df_filtered["country"].eq("usa")

# Diccionario de normalización (todo en minúsculas)
usa_state_map = {
    "floria": "florida",
    "franklin county, florida": "florida",
    "noirth carolina": "north carolina",
    "off california": "california",
    "los angeles": "california",
    "long island ny": "new york",
    "maui": "hawaii",
    "virgin islands": "us virgin islands",
 }

print("Después de normalizar estados USA:", shark_df_filtered.loc[mask_usa, "state"].replace(usa_state_map).value_counts())

Después de normalizar estados USA: state
florida              602
hawaii               152
california           103
south carolina        79
north carolina        69
texas                 34
new york              16
oregon                16
alabama               10
louisiana              6
georgia                5
new jersey             5
massachusetts          4
maine                  2
guam                   2
us virgin islands      2
virginia               2
galveston              1
samoa                  1
maryland               1
cayman islands         1
bahamas                1
washington             1
rhode island           1
delaware               1
palmyra atoll          1
puerto rico            1
johnston atoll         1
Name: count, dtype: Int64


In [16]:
#Australia:

mask_aus = shark_df_filtered["country"].eq("australia")

aus_state_map = {
    "new  south wales": "new south wales",
    "new south ales": "new south wales",
    "nsw": "new south wales",
    "wa": "western australia",
    "westerm australia": "western australia",
    "western  australia": "western australia",
}

print(
    "Después de normalizar estados AU:",
    shark_df_filtered.loc[mask_aus, "state"]
        .replace(aus_state_map)
        .value_counts()
)

Después de normalizar estados AU: state
new south wales                         164
western australia                       120
queensland                               77
south australia                          36
victoria                                 23
tasmania                                  6
northern territory                        5
torres strait                             1
territory of cocos (keeling) islands      1
Name: count, dtype: Int64


In [17]:
#South Africa:
mask_sa = shark_df_filtered["country"].eq("south africa")

sa_state_map = {
    "easten cape province": "eastern cape",
    "eastern cape  province": "eastern cape",
    "eastern cape province": "eastern cape",
    "eastern province": "eastern cape", 
    "kwazulu-natal between port edward and port st johns": "kwazulu-natal",
    "western cape province": "western cape",
    "western province": "western cape",
}

print(
    "Después de normalizar estados South Africa:",
    shark_df_filtered.loc[mask_sa, "state"]
        .replace(sa_state_map)
        .value_counts()
)


Después de normalizar estados South Africa: state
western cape     48
eastern cape     46
kwazulu-natal    19
Name: count, dtype: Int64


In [18]:
#Bahamas:
mask_bhs = shark_df_filtered["country"].eq("bahamas")

bahamas_state_map = {
    # --- Grand Bahama (incluye off-shore y ciudades) ---
    "40 miles off grand bahama island": "grand bahama",
    "grand  bahama island": "grand bahama",
    "grand bahama island": "grand bahama",
    "freeport": "grand bahama",
    "west end": "grand bahama",

    # --- Abaco ---
    "abaco islands": "abaco",
    "great abaco islands": "abaco",
    "atlantic ocean near big grand cay": "abaco",

    # --- Andros ---
    "andros islands": "andros",

    # --- Exuma ---
    "exuma islands": "exuma",
    "exumas": "exuma",
    "the exuma cays": "exuma",

    # --- New Providence (+ Paradise Island y variantes tipográficas) ---
    "new providence   isoad": "new providence",
    "new providence district": "new providence",
    "new providence island": "new providence",
    "paradise island": "new providence",

    # --- Long Island (typo en el dato) ---
    "clarence town long isand": "long island",
}

print(
    "Después de normalizar Bahamas:",
    shark_df_filtered.loc[mask_bhs, "state"]
        .str.strip().str.lower()
        .replace(bahamas_state_map)
        .value_counts()
)

Después de normalizar Bahamas: state
abaco                  22
grand bahama           10
exuma                   7
new providence          7
eleuthera               2
long island             1
lucayan archipelago     1
bimini                  1
northern bahamas        1
andros                  1
Name: count, dtype: Int64


In [19]:
#Brasil:
mask_bra = shark_df_filtered["country"].eq("brazil")

brazil_state_map = {
    "balneário camboriú": "santa catarina",
    "santa catarina state": "santa catarina",
    "rio grande de norte": "rio grande do norte",
    "fernando de noronha": "pernambuco",
    "são paulo.": "sao paulo",
}

print(
    "Después de normalizar Brasil:",
    shark_df_filtered.loc[mask_bra, "state"]
        .str.strip().str.lower()
        .replace(brazil_state_map)
        .value_counts())

Después de normalizar Brasil: state
pernambuco             42
sao paulo               2
santa catarina          2
bahia                   2
rio de janeiro          2
rio grande do sul       1
rio grande do norte     1
Name: count, dtype: Int64


In [20]:
#New Zealand:
mask_nz = shark_df_filtered["country"].eq("new zealand")

nz_state_map = {
    "south island, near karitane north of dunedin": "south island",
    "southland": "south island",
    "bay of waitangi": "north island",
    "mercury islands": "north island",
  }

print(
    "Después de normalizar New Zealand:",
    shark_df_filtered.loc[mask_nz, "state"]
        .str.strip().str.lower()
        .replace(nz_state_map)
        .value_counts()
)

Después de normalizar New Zealand: state
south island       18
north island       16
cook islands        3
chatham islands     1
Name: count, dtype: Int64


In [21]:
#New Caledonia:

mask_nc = shark_df_filtered["country"].eq("new caledonia")

nc_state_map = {
    "bélep islands": "belep islands",
    "grande terre": "grand terre",
}

print(
    "Después de normalizar New Caledonia:",
    shark_df_filtered.loc[mask_nc, "state"]
        .str.strip().str.lower()
        .replace(nc_state_map)
        .value_counts()
)


Después de normalizar New Caledonia: state
south province          8
north province          8
loyalty islands         5
grand terre             2
poum                    1
belep islands           1
noumea                  1
baie de sainte-marie    1
Name: count, dtype: Int64


In [22]:
#Egipto:
mask_egy = shark_df_filtered["country"].eq("egypt")

egypt_state_map = {
    "hurghada, red sea governorate": "red sea governorate",
    "north of marsa alam": "red sea governorate",
    "red sea": "red sea governorate",
    "red sea protectorate": "red sea governorate",
    "st. johns reef": "red sea governorate",
    "sinai peninsula": "south sinai",
    "south sinai peninsula": "south sinai",
}

print(
    "Después de normalizar Egipto:",
    shark_df_filtered.loc[mask_egy, "state"]
        .str.strip().str.lower()
        .replace(egypt_state_map)
        .value_counts()
)


Después de normalizar Egipto: state
red sea governorate    12
south sinai            12
suez                    1
Name: count, dtype: Int64


In [23]:
#Reunion:

mask_reu = shark_df_filtered["country"].eq("reunion")

s = (
    shark_df_filtered.loc[mask_reu, "state"]
        .astype("string")
        .str.strip().str.lower()
        .str.replace("-", " ", regex=False)         # "saint-gilles" -> "saint gilles"
        .replace({
            "saint guilles": "saint gilles",        # typo
            "saint gilles les bains": "saint gilles",
            "d'etang-sale": "etang sale",
            "d’etang-sale": "etang sale",
            "d'etang-salé": "etang sale",
            "d’etang-salé": "etang sale",
            "conservatória district": "conservatoria district",
        })
)

# Agrupa TODO lo que empiece por "saint " en "saint areas"
s = s.where(~s.str.startswith("saint "), "saint areas")

print("Después de normalizar Reunion:")
print(s.value_counts())

Después de normalizar Reunion:
state
saint areas               17
d’étang salé               2
le port                    1
trois bassins              1
bois blanc                 1
conservatoria district     1
Name: count, dtype: Int64


In [24]:
# French Polynesia
mask_fp = shark_df_filtered["country"].eq("french polynesia")

french_poly_map = {
    # Society Islands
    "bora bora": "society islands",
    "moorea": "society islands",
    "tahiti": "society islands",
    "nuku hiva": "marquesas",
    "central tuamotu": "tuamotu islands",
    "tuamotos": "tuamotu islands",
    "tuamotus": "tuamotu islands",
    "rangiroa": "tuamotu islands",
}

print(
    "Después de normalizar French Polynesia:",
    shark_df_filtered.loc[mask_fp, "state"]
        .str.strip().str.lower()
        .replace(french_poly_map)
        .value_counts()
)

Después de normalizar French Polynesia: state
society islands    14
tuamotu islands     6
marquesas           4
gambier islands     1
Name: count, dtype: Int64


In [25]:
#Mexico:
mask_mex = shark_df_filtered["country"].eq("mexico")

mexico_state_map = {
    "guerrero": "guerrero",
    "cabo san lucas": "baja california",
    "baja": "baja california",
    "baja california sur": "baja california",
    "guerro": "guerrero",
}

print(
    "Después de normalizar Méxipwdco:",
    shark_df_filtered.loc[mask_mex, "state"]
        .str.strip().str.lower()
        .replace(mexico_state_map)
        .value_counts()
)

Después de normalizar Méxipwdco: state
quintana roo          7
baja california       6
guerrero              6
sonora                3
jalisco               1
gulf of california    1
sinaloa               1
Name: count, dtype: Int64


In [26]:
#Asignamos los cambios al dataframe principal
# Use shark_df_filtered since the masks were created from it
shark_df_filtered.loc[mask_usa, "state"] = shark_df_filtered.loc[mask_usa, "state"].replace(usa_state_map)
shark_df_filtered.loc[mask_aus, "state"] = shark_df_filtered.loc[mask_aus, "state"].replace(aus_state_map)
shark_df_filtered.loc[mask_sa, "state"] = shark_df_filtered.loc[mask_sa, "state"].replace(sa_state_map)
shark_df_filtered.loc[mask_bhs, "state"] = shark_df_filtered.loc[mask_bhs, "state"].replace(bahamas_state_map)
shark_df_filtered.loc[mask_bra, "state"] = shark_df_filtered.loc[mask_bra, "state"].replace(brazil_state_map)
shark_df_filtered.loc[mask_nz, "state"] = shark_df_filtered.loc[mask_nz, "state"].replace(nz_state_map)
shark_df_filtered.loc[mask_nc, "state"] = shark_df_filtered.loc[mask_nc, "state"].replace(nc_state_map)
shark_df_filtered.loc[mask_egy, "state"] = shark_df_filtered.loc[mask_egy, "state"].replace(egypt_state_map)
shark_df_filtered.loc[mask_reu, "state"] = s
shark_df_filtered.loc[mask_fp, "state"] = shark_df_filtered.loc[mask_fp, "state"].replace(french_poly_map)
shark_df_filtered.loc[mask_mex, "state"] = shark_df_filtered.loc[mask_mex, "state"].replace(mexico_state_map)
shark_df

# Update shark_df to the cleaned version
shark_df = shark_df_filtered


In [27]:
#Exportamos a excel post filtro: TYPE + YEAR + COUNTRY + LOCATION

shark_df.to_excel('SharkAttack_check3.xlsx')

In [28]:
# TERCER FILTRO (Alejandro):
# Valoramos los duplicados de las actividades que estaban realizando los atacados, para agruparlas en categorias iguales.
shark_df['activity'].value_counts()

activity
surfing                                             722
swimming                                            318
spearfishing                                        113
snorkeling                                           87
wading                                               86
                                                   ... 
swimming out to porpoises                             1
windsurfing, but sitting on his board                 1
surfing / wading                                      1
spearfishing, holding mesh bag with speared fish      1
boogie boarding / wading                              1
Name: count, Length: 187, dtype: Int64

In [29]:
# Agrupamos las actividades en sus categorias
# Diving
mask_diving = shark_df_filtered["activity"].str.strip().str.lower().isin([
    "abalone diving",
    "diving",
    "diving (shell maintenance)",
    "diving / filming",
    "diving for abalone",
    "diving for crayfish",
    "diving into water",
    "diving with  surface-supplied air",
    "diving, but on the surface when bitten by the shark",
    "free diving",
    "free diving / modeling",
    "free diving / photographing pilot whales",
    "free diving / spearfishing",
    "freediving",
    "scuba diving",
    "snorkeling",
    "snorkeling (filming the sardine run)",
    "standing / snorkeling"
])

activity_diving_map = {
    "abalone diving": "diving",
    "diving": "diving",
    "diving (shell maintenance)": "diving",
    "diving / filming": "diving",
    "diving for abalone": "diving",
    "diving for crayfish": "diving",
    "diving into water": "diving",
    "diving with  surface-supplied air": "diving",
    "diving, but on the surface when bitten by the shark": "diving",
    "free diving": "diving",
    "free diving / modeling": "diving",
    "free diving / photographing pilot whales": "diving",
    "free diving / spearfishing": "diving",
    "freediving": "diving",
    "scuba diving": "diving",
    "snorkeling": "diving",
    "snorkeling (filming the sardine run)": "diving",
    "standing / snorkeling": "diving",
}

print(
    "Después de normalizar 'activity' → 'diving':",
    shark_df_filtered.loc[mask_diving, "activity"]
        .str.strip().str.lower()
        .replace(activity_diving_map)
        .value_counts()
)

shark_df_filtered.loc[mask_diving, "activity"] = (
    shark_df_filtered.loc[mask_diving, "activity"]
        .str.strip().str.lower()
        .replace(activity_diving_map)
)

Después de normalizar 'activity' → 'diving': activity
diving    171
Name: count, dtype: Int64


In [30]:
# Swimming
mask_swimming = shark_df_filtered["activity"].str.strip().str.lower().isin([
    "bathing",
    "batin",
    "crouching in 2' of water",
    "floating",
    "floating face-down in knee-deep water",
    "floating in inflatable pool ring",
    "floating in tube",
    "floating near boat & observing bioluminesce",
    "floating on a raft",
    "holding onto an inflatable boat",
    "in water with diving seabirds",
    "jumped into the water",
    "jumped into water",
    "jumpinf from dock to boat acidentally fell into water at marina",
    "jumping",
    "jumping in the waves",
    "kneeling in the water",
    "lifeguard exercises",
    "lifeguard training exercise",
    "lifesaving drill",
    "playing",
    "playing football in the water",
    "playing in the surf",
    "playing in the surf with his 2 dogs",
    "playing in the water",
    "playing on a sandbar",
    "playing soccer in the water",
    "squatting in the water",
    "standing",
    "swimming",
    "swimming  or snorkeling",
    "swimming & snorkeling",
    "swimming (using a float)",
    "swimming /  boogie boarding",
    "swimming / body surfing",
    "swimming / jumping off a jetty",
    "swimming / shipwreck",
    "swimming / snorkeling",
    "swimming / treading water",
    "swimming / wading",
    "swimming ocean training",
    "swimming or boogie boarding",
    "swimming out to porpoises",
    "swimming to shore from capsized kayak",
    "swimming with boogie board",
    "swimming with pod of dolphins",
    "swimming with sharks",
    "swimming, attempting to rescue a girl  believed to be drowning",
    "swimming, poaching abalone",
    "swimming, poaching perlemoen",
    "swimming, towing surfboard",
    "swimming, wearing black wetsuit & swim fins",
    "swimming/ treading water",
    "swimming/standing",
    "swimmingq",
    "treading water",
    "treading water/ surfing",
    "unconfirmed possibly swimming near the boat harbour",
    "undisclosed",
    "wade fishing",
    "wade-fishing",
    "wading",
    "wading / fishing & carrying a bag of fish",
    "wading near a fishing net",
    "wading or swimming",
    "wading?"
])

activity_swimming_map = {
    "bathing": "swimming",
    "batin": "swimming",
    "crouching in 2' of water": "swimming",
    "floating": "swimming",
    "floating face-down in knee-deep water": "swimming",
    "floating in inflatable pool ring": "swimming",
    "floating in tube": "swimming",
    "floating near boat & observing bioluminesce": "swimming",
    "floating on a raft": "swimming",
    "holding onto an inflatable boat": "swimming",
    "in water with diving seabirds": "swimming",
    "jumped into the water": "swimming",
    "jumped into water": "swimming",
    "jumpinf from dock to boat acidentally fell into water at marina": "swimming",
    "jumping": "swimming",
    "jumping in the waves": "swimming",
    "kneeling in the water": "swimming",
    "lifeguard exercises": "swimming",
    "lifeguard training exercise": "swimming",
    "lifesaving drill": "swimming",
    "playing": "swimming",
    "playing football in the water": "swimming",
    "playing in the surf": "swimming",
    "playing in the surf with his 2 dogs": "swimming",
    "playing in the water": "swimming",
    "playing on a sandbar": "swimming",
    "playing soccer in the water": "swimming",
    "squatting in the water": "swimming",
    "standing": "swimming",
    "swimming": "swimming",
    "swimming  or snorkeling": "swimming",
    "swimming & snorkeling": "swimming",
    "swimming (using a float)": "swimming",
    "swimming /  boogie boarding": "swimming",
    "swimming / body surfing": "swimming",
    "swimming / jumping off a jetty": "swimming",
    "swimming / shipwreck": "swimming",
    "swimming / snorkeling": "swimming",
    "swimming / treading water": "swimming",
    "swimming / wading": "swimming",
    "swimming ocean training": "swimming",
    "swimming or boogie boarding": "swimming",
    "swimming out to porpoises": "swimming",
    "swimming to shore from capsized kayak": "swimming",
    "swimming with boogie board": "swimming",
    "swimming with pod of dolphins": "swimming",
    "swimming with sharks": "swimming",
    "swimming, attempting to rescue a girl  believed to be drowning": "swimming",
    "swimming, poaching abalone": "swimming",
    "swimming, poaching perlemoen": "swimming",
    "swimming, towing surfboard": "swimming",
    "swimming, wearing black wetsuit & swim fins": "swimming",
    "swimming/ treading water": "swimming",
    "swimming/standing": "swimming",
    "swimmingq": "swimming",
    "treading water": "swimming",
    "treading water/ surfing": "swimming",
    "unconfirmed possibly swimming near the boat harbour": "swimming",
    "undisclosed": "swimming",
    "wade fishing": "swimming",
    "wade-fishing": "swimming",
    "wading": "swimming",
    "wading / fishing & carrying a bag of fish": "swimming",
    "wading near a fishing net": "swimming",
    "wading or swimming": "swimming",
    "wading?": "swimming",
}

print(
    "Después de normalizar 'activity' → 'swimming':",
    shark_df_filtered.loc[mask_swimming, "activity"]
        .str.strip().str.lower()
        .replace(activity_swimming_map)
        .value_counts()
)

shark_df_filtered.loc[mask_swimming, "activity"] = (
    shark_df_filtered.loc[mask_swimming, "activity"]
        .str.strip().str.lower()
        .replace(activity_swimming_map)
)

Después de normalizar 'activity' → 'swimming': activity
swimming    541
Name: count, dtype: Int64


In [31]:
# Study
mask_study = shark_df_filtered["activity"].str.strip().str.lower().isin([
    "attempting to rescue a shark",
    "photographing fish",
    "photographing the shark",
    "tagging sharks",
    "wrangling a shark",
])

activity_study_map = {
    "attempting to rescue a shark": "study",
    "photographing fish": "study",
    "photographing the shark": "study",
    "tagging sharks": "study",
    "wrangling a shark": "study",
}

print(
    "Después de normalizar 'activity':",
    shark_df_filtered.loc[mask_study, "activity"]
        .str.strip().str.lower()
        .replace(activity_study_map)
        .value_counts()
)

shark_df_filtered.loc[mask_study, "activity"] = (
    shark_df_filtered.loc[mask_study, "activity"]
        .str.strip().str.lower()
        .replace(activity_study_map)
)

Después de normalizar 'activity': activity
study    5
Name: count, dtype: Int64


In [32]:
# Paddle

mask_paddle = shark_df_filtered["activity"].str.strip().str.lower().isin([
    "body boarding",
    "body surfing",
    "body surfing or boogie boarding",
    "body surfing?",
    "bodyboarding",
    "body-boarding",
    "bodysurfing",
    "body-surfing",
    "boggie boarding",
    "boogie boarding",
    "boogie boarding / wading",
    "boogie boarding or surfing",
    "boogie-boarding / swimming",
    "paddle boarding",
    "paddleboarding",
    "paddle-boarding",
    "paddle-surfing",
    "paddling on kneeboard",
    "standing alongside surfboard",
    "standing in inner tube",
    "standing or boogie boardin",
    "standing, stepped on shark",
    "stand-up paddle boarding",
    "stand-up paddleboarding",
    "sup",
    "sup foil boarding"
])

activity_paddle_map = {
    "body boarding": "paddle",
    "body surfing": "paddle",
    "body surfing or boogie boarding": "paddle",
    "body surfing?": "paddle",
    "bodyboarding": "paddle",
    "body-boarding": "paddle",
    "bodysurfing": "paddle",
    "body-surfing": "paddle",
    "boggie boarding": "paddle",
    "boogie boarding": "paddle",
    "boogie boarding / wading": "paddle",
    "boogie boarding or surfing": "paddle",
    "boogie-boarding / swimming": "paddle",
    "paddle boarding": "paddle",
    "paddleboarding": "paddle",
    "paddle-boarding": "paddle",
    "paddle-surfing": "paddle",
    "paddling on kneeboard": "paddle",
    "standing alongside surfboard": "paddle",
    "standing in inner tube": "paddle",
    "standing or boogie boardin": "paddle",
    "standing, stepped on shark": "paddle",
    "stand-up paddle boarding": "paddle",
    "stand-up paddleboarding": "paddle",
    "sup": "paddle",
    "sup foil boarding": "paddle",
}

print(
    "Después de normalizar 'activity' → 'paddle':",
    shark_df_filtered.loc[mask_paddle, "activity"]
        .str.strip().str.lower()
        .replace(activity_paddle_map)
        .value_counts()
)

shark_df_filtered.loc[mask_paddle, "activity"] = (
    shark_df_filtered.loc[mask_paddle, "activity"]
        .str.strip().str.lower()
        .replace(activity_paddle_map)
)

Después de normalizar 'activity' → 'paddle': activity
paddle    151
Name: count, dtype: Int64


In [33]:
# Fishing
mask_fishing = shark_df_filtered["activity"].str.strip().str.lower().isin([
    "casting a net",
    "chumming for white sharks",
    "cleaning fish",
    "feeding fish",
    "feeding sharks",
    "fell into the water",
    "fihing",
    "filming",
    "fishing",
    "fishing from surfboard",
    "fishing from surfski",
    "fishing, fell in water",
    "fishing, standing in 2' of water",
    "fishing/swimming",
    "fly fishing",
    "hand feeding sharks",
    "lobstering",
    "removing fish from a trap",
    "scallop diving (using surface-supplied air & a pod)",
    "scallop diving on hookah",
    "scalloping",
    "spearfishing",
    "spearfishing (free diving)",
    "spearfishing / diving",
    "spearfishing / free diving",
    "spearfishing, holding mesh bag with speared fish",
    "washing hands",
    "washing sand off a speared fish",
])

activity_fishing_map = {
    "casting a net": "fishing",
    "chumming for white sharks": "fishing",
    "cleaning fish": "fishing",
    "feeding fish": "fishing",
    "feeding sharks": "fishing",
    "fell into the water": "fishing",
    "fihing": "fishing",
    "filming": "fishing",
    "fishing": "fishing",
    "fishing from surfboard": "fishing",
    "fishing from surfski": "fishing",
    "fishing, fell in water": "fishing",
    "fishing, standing in 2' of water": "fishing",
    "fishing/swimming": "fishing",
    "fly fishing": "fishing",
    "hand feeding sharks": "fishing",
    "lobstering": "fishing",
    "removing fish from a trap": "fishing",
    "scallop diving (using surface-supplied air & a pod)": "fishing",
    "scallop diving on hookah": "fishing",
    "scalloping": "fishing",
    "spearfishing": "fishing",
    "spearfishing (free diving)": "fishing",
    "spearfishing / diving": "fishing",
    "spearfishing / free diving": "fishing",
    "spearfishing, holding mesh bag with speared fish": "fishing",
    "washing hands": "fishing",
    "washing sand off a speared fish": "fishing",
}

print(
    "Después de normalizar 'activity' → 'fishing':",
    shark_df_filtered.loc[mask_fishing, "activity"]
        .str.strip().str.lower()
        .replace(activity_fishing_map)
        .value_counts()
)

shark_df_filtered.loc[mask_fishing, "activity"] = (
    shark_df_filtered.loc[mask_fishing, "activity"]
        .str.strip().str.lower()
        .replace(activity_fishing_map)
)

Después de normalizar 'activity' → 'fishing': activity
fishing    170
Name: count, dtype: Int64


In [34]:
# Surfing
mask_surfing = shark_df_filtered["activity"].str.strip().str.lower().isin([
    "foil boarding",
    "foil-boarding",
    "jet skiing",
    "kite boarding",
    "kite foiling",
    "kite surfing",
    "kiteboarding",
    "kite-boarding",
    "kitesurfing",
    "kite-surfing",
    "night surfing",
    "scurfing (surfboard being  towed behind a boat)",
    "sitting",
    "sitting in the water",
    "sitting on surfboard",
    "skimboarding",
    "standing / surfing",
    "surf fishing",
    "surf paddling",
    "surf skiing",
    "surface supplied  diving",
    "surfing",
    "surfing / wading",
    "surfing amid a shoal of sharks",
    "surfing, but standing in water alongside board",
    "surfng",
    "surf-skiing",
    "surf-sking",
    "tandem surfing",
    "wakeboarding",
    "walking",
    "walking in shallows",
    "walking in surf",
    "walking out of the water after surfing",
    "windsurfing",
    "windsurfing, but sitting on his board",
    "wing foil surfing"
])

activity_surfing_map = {
    "foil boarding": "surfing",
    "foil-boarding": "surfing",
    "jet skiing": "surfing",
    "kite boarding": "surfing",
    "kite foiling": "surfing",
    "kite surfing": "surfing",
    "kiteboarding": "surfing",
    "kite-boarding": "surfing",
    "kitesurfing": "surfing",
    "kite-surfing": "surfing",
    "night surfing": "surfing",
    "scurfing (surfboard being  towed behind a boat)": "surfing",
    "sitting": "surfing",
    "sitting in the water": "surfing",
    "sitting on surfboard": "surfing",
    "skimboarding": "surfing",
    "standing / surfing": "surfing",
    "surf fishing": "surfing",
    "surf paddling": "surfing",
    "surf skiing": "surfing",
    "surface supplied  diving": "surfing",
    "surfing": "surfing",
    "surfing / wading": "surfing",
    "surfing amid a shoal of sharks": "surfing",
    "surfing, but standing in water alongside board": "surfing",
    "surfng": "surfing",
    "surf-skiing": "surfing",
    "surf-sking": "surfing",
    "tandem surfing": "surfing",
    "wakeboarding": "surfing",
    "walking": "surfing",
    "walking in shallows": "surfing",
    "walking in surf": "surfing",
    "walking out of the water after surfing": "surfing",
    "windsurfing": "surfing",
    "windsurfing, but sitting on his board": "surfing",
    "wing foil surfing": "surfing",
}

print(
    "Después de normalizar 'activity' → 'surfing':",
    shark_df_filtered.loc[mask_surfing, "activity"]
        .str.strip().str.lower()
        .replace(activity_surfing_map)
        .value_counts()
)

shark_df_filtered.loc[mask_surfing, "activity"] = (
    shark_df_filtered.loc[mask_surfing, "activity"]
        .str.strip().str.lower()
        .replace(activity_surfing_map)
)

Después de normalizar 'activity' → 'surfing': activity
surfing    814
Name: count, dtype: Int64


In [35]:
# Kayaking
mask_kayaking = shark_df_filtered["activity"].str.strip().str.lower().isin([
    "canoeing",
    "kakaying",
    "kayak fishing",
    "kayaking",
    "kayaking / fishing"
])

activity_kayaking_map = {
    "canoeing": "kayaking",
    "kakaying": "kayaking",
    "kayak fishing": "kayaking",
    "kayaking": "kayaking",
    "kayaking / fishing": "kayaking",
}

print(
    "Después de normalizar 'activity' → 'kayaking':",
    shark_df_filtered.loc[mask_kayaking, "activity"]
        .str.strip().str.lower()
        .replace(activity_kayaking_map)
        .value_counts()
)

shark_df_filtered.loc[mask_kayaking, "activity"] = (
    shark_df_filtered.loc[mask_kayaking, "activity"]
        .str.strip().str.lower()
        .replace(activity_kayaking_map)
)

Después de normalizar 'activity' → 'kayaking': activity
kayaking    14
Name: count, dtype: Int64


In [36]:
# Hacemos un checkeo, para saber el numero de ataques dependiendo de la actividad.
shark_df["activity"].value_counts().head(10)

activity
surfing                    814
swimming                   541
diving                     171
fishing                    170
paddle                     151
kayaking                    14
study                        5
shipwreck                    1
attempting to fix motor      1
Name: count, dtype: Int64

In [37]:
# Valoramos los NaN de las categorias "activity" y "sex" para agruparlas en unkwnown
shark_df[["activity", "sex"]].isna().sum()

activity    66
sex         30
dtype: int64

In [38]:
import numpy as np

In [39]:
shark_df["activity"] = shark_df["activity"].replace("", np.nan)  # convierte strings vacíos en NaN
shark_df["activity"] = shark_df["activity"].fillna("unknown activity") #completa los NULL con "unknown"

In [40]:
shark_df["sex"] = shark_df["sex"].replace("", np.nan)  # convierte strings vacíos en NaN
shark_df["sex"] = shark_df["sex"].fillna("unknown sex") #completa los NULL con "unknown"

#replace de "lli" por "unknown" en la columna "sex"
shark_df["sex"] = shark_df["sex"].replace("lli", "unknown sex")

In [41]:
# Chekeamos los cambios.
shark_df[["activity", "sex"]].isna().sum()

activity    0
sex         0
dtype: int64

In [42]:
#Check valores por columnas para próximos filtros

for col in shark_df.columns:
    print(f'{col.upper()} --> valores únicos:', shark_df[col].unique())

YEAR --> valores únicos: [2025. 2024. 2023. 2022. 2021. 2020. 2019. 2018. 2017. 2016. 2015. 2014.
 2013. 2012. 2011. 2010. 2009. 2008. 2007. 2006. 2005. 2004. 2003. 2002.
 2001. 2000.]
TYPE --> valores únicos: <StringArray>
['unprovoked']
Length: 1, dtype: string
COUNTRY --> valores únicos: <StringArray>
[       'australia',              'usa',          'bahamas',
 'french polynesia',     'south africa',    'new caledonia',
            'egypt',      'new zealand',           'mexico',
           'brazil',          'reunion']
Length: 11, dtype: string
STATE --> valores únicos: <StringArray>
[                          'queensland',
                      'south australia',
                           'california',
                      'new south wales',
                              'florida',
                                'texas',
                                'abaco',
                            'marquesas',
                        'kwazulu-natal',
                             'new y

In [43]:
#Exportamos a excel post filtro: TYPE + YEAR + COUNTRY + LOCATION + ACTIVITY + SEX

shark_df.to_excel('SharkAttack_check4.xlsx')

In [44]:
# CUARTO FILTRO (CLAUDIA):
# Valoramos los duplicados de las actividades que estaban realizando los atacados, para agruparlas en categorias iguales.
shark_df['fatal_y/n'].value_counts()

fatal_y/n
n          1749
y           171
f             2
unknown       2
nq            1
Name: count, dtype: Int64

In [45]:
# Definimos el mapeo de reemplazo
fatal_map = {
    "f": "unknown",
    "nq": "n",
    "unknown": "unknown",
    "": "unknown",   # celdas vacías
    "nan": "unknown" # valores nulos
}

# Aplicamos el reemplazo

shark_df["fatal_y/n"] = (
    shark_df["fatal_y/n"]
        .astype("string")              # usa el tipo string nativo de pandas
        .str.strip().str.lower()
        .replace(fatal_map)
        .replace(r"^\s*$", pd.NA, regex=True)  # convierte blancos o espacios en NA
        .fillna("unknown") # completa los NaN con "unknown"
)  

# Checkeamos los cambios
shark_df["fatal_y/n"].value_counts(dropna=False)

fatal_y/n
n          1750
y           171
unknown      13
Name: count, dtype: Int64

In [46]:
shark_df.to_excel('SharkAttack_check5.xlsx')

In [47]:
shark_df.columns

Index(['year', 'type', 'country', 'state', 'activity', 'sex', 'fatal_y/n',
       'species'],
      dtype='object')

In [48]:
!pip install plotly




[notice] A new release of pip is available: 24.3.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [61]:
# Análisis por YEAR - COUNTRY

year_analysis_cols = ['year', 'country', 'fatal_y/n']
dupe_mask1 = shark_df.duplicated(subset=year_analysis_cols, keep=False)
shark_df[dupe_mask1].sort_values(by=year_analysis_cols)

# Cuántas veces se repite cada combinación de las columnas clave
shark_df.groupby(year_analysis_cols).size().sort_values(ascending=False).head(10)

year    country  fatal_y/n
2015.0  usa      n            56
2012.0  usa      n            54
2007.0  usa      n            54
2016.0  usa      n            54
2014.0  usa      n            53
2017.0  usa      n            52
2013.0  usa      n            51
2019.0  usa      n            50
2001.0  usa      n            50
2008.0  usa      n            49
dtype: int64

In [50]:
# Agrupamos los datos y contamos los ataques
shark_summary = (shark_df.groupby(year_analysis_cols).size().reset_index(name="attack_count"))

shark_summary.head()

Unnamed: 0,year,country,fatal_y/n,attack_count
0,2000.0,australia,n,8
1,2000.0,australia,y,3
2,2000.0,new caledonia,y,1
3,2000.0,new zealand,n,1
4,2000.0,new zealand,y,1


In [64]:
import plotly.express as px

fig = px.bar(
    shark_summary,
    x="year",
    y="attack_count",
    color="fatal_y/n",
    facet_col="country",       # crea subgráficos por país
    facet_col_wrap=3,          # organiza en filas/columnas
    title="Ataques de tiburón por año y país (divido por ataque fatal/no fatal)",
    labels={
        "year": "Año",
        "attack_count": "Nº de ataques",
        "fatal_y/n": "Fatalidad"
    }
)

fig.update_layout(height=600)
fig.show()


In [None]:
year_fatal_summary = (
    shark_summary.groupby(["year", "fatal_y/n"])["attack_count"]
    .sum()
    .reset_index()
)

fig = px.line(
    year_fatal_summary,
    x="year",
    y="attack_count",
    color="fatal_y/n",
    markers=True,
    title="Evolución de los ataques fatales vs no fatales"
)

fig.show()


In [52]:
# Análisis por COUNTRY - STATE
geo_analysis_cols = ['country', 'state', 'activity','fatal_y/n']
dupe_mask2 = shark_df.duplicated(subset=geo_analysis_cols, keep=False)
shark_df[dupe_mask2].sort_values(by=geo_analysis_cols)

# Cuántas veces se repite cada combinación de las columnas clave
shark_df.groupby(geo_analysis_cols).size().sort_values(ascending=False).head(10)

country    state              activity  fatal_y/n
usa        florida            surfing   n            293
                              swimming  n            184
australia  new south wales    surfing   n            105
usa        california         surfing   n             61
           hawaii             surfing   n             60
           south carolina     swimming  n             41
           north carolina     swimming  n             41
           florida            paddle    n             40
australia  western australia  surfing   n             38
usa        florida            fishing   n             32
dtype: int64

In [65]:
fig = px.density_heatmap(
    shark_summary,
    x="year",
    y="country",
    z="attack_count",
    color_continuous_scale="Cividis",
    title="Mapa de calor: dispersión de ataques por año y país"
)

fig.show()


In [66]:
geo_summary = (shark_df.groupby(['country', 'state', 'fatal_y/n']).size().reset_index(name="attack_count"))

fig = px.scatter_geo(
    geo_summary,
    locations="country",           # reconoce nombres de países automáticamente
    locationmode="country names",
    size="attack_count",
    color="fatal_y/n",
    title="Mapa interactivo de ataques de tiburón por país y estado",
    projection="natural earth"
)
fig.show()



The library used by the *country names* `locationmode` option is changing in an upcoming version. Country names in existing plots may not work in the new version. To ensure consistent behavior, consider setting `locationmode` to *ISO-3*.



In [53]:
# Análisis por FATAL-SEX-ACTIVITY
sex_analysis_cols = ['fatal_y/n','sex', 'activity']
dupe_mask3 = shark_df.duplicated(subset=sex_analysis_cols, keep=False)
shark_df[dupe_mask3].sort_values(by=sex_analysis_cols)

# Cuántas veces se repite cada combinación de las columnas clave
shark_df.groupby(sex_analysis_cols).size().sort_values(ascending=False).head(10)

fatal_y/n  sex  activity
n          m    surfing     676
                swimming    307
           f    swimming    171
           m    fishing     140
                paddle       98
                diving       97
           f    surfing      77
y          m    surfing      43
                swimming     41
n          f    diving       40
dtype: int64

In [67]:
geo_summary = (shark_df.groupby(['country', 'activity']).size().reset_index(name="attack_count"))

fig = px.density_heatmap(
    geo_summary,
    x="activity",
    y="country",
    z="attack_count",
    color_continuous_scale="Inferno",
    title="Mapa de calor: dispersión de los ataques por actividad en cada país"
)
fig.show()


In [54]:
# Análisis por SPECIES-FATAL
species_analysis_cols = ['species','fatal_y/n']
dupe_mask4 = shark_df.duplicated(subset=species_analysis_cols, keep=False)
shark_df[dupe_mask4].sort_values(by=species_analysis_cols)

# Cuántas veces se repite cada combinación de las columnas clave
shark_df.groupby(species_analysis_cols).size().sort_values(ascending=False).head(10)

species         fatal_y/n
white shark     n            43
bull shark      n            35
4' shark        n            29
tiger shark     n            27
white shark     y            22
4' to 5' shark  n            20
6' shark        n            18
3' shark        n            17
3' to 4' shark  n            16
blacktip shark  n            15
dtype: int64