In [1]:
import pandas as pd
import re

**LOADING DATA FROM CSV**

In [2]:
pd.set_option('display.max_columns', None)
shark_attacks = pd.read_csv("../data/attacks.csv", encoding="latin1")
shark_attacks.columns = [i.lower().replace(' ', '_') for i in shark_attacks.columns]
shark_attacks

Unnamed: 0,case_number,date,year,type,country,area,location,activity,name,sex_,age,injury,fatal_(y/n),time,species_,investigator_or_source,pdf,href_formula,href,case_number.1,case_number.2,original_order,unnamed:_22,unnamed:_23
0,2018.06.25,25-Jun-2018,2018.0,Boating,USA,California,"Oceanside, San Diego County",Paddling,Julie Wolfe,F,57,"No injury to occupant, outrigger canoe and pad...",N,18h00,White shark,"R. Collier, GSAF",2018.06.25-Wolfe.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.25,2018.06.25,6303.0,,
1,2018.06.18,18-Jun-2018,2018.0,Unprovoked,USA,Georgia,"St. Simon Island, Glynn County",Standing,Adysonï¿½McNeely,F,11,Minor injury to left thigh,N,14h00 -15h00,,"K.McMurray, TrackingSharks.com",2018.06.18-McNeely.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.18,2018.06.18,6302.0,,
2,2018.06.09,09-Jun-2018,2018.0,Invalid,USA,Hawaii,"Habush, Oahu",Surfing,John Denges,M,48,Injury to left lower leg from surfboard skeg,N,07h45,,"K.McMurray, TrackingSharks.com",2018.06.09-Denges.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.09,2018.06.09,6301.0,,
3,2018.06.08,08-Jun-2018,2018.0,Unprovoked,AUSTRALIA,New South Wales,Arrawarra Headland,Surfing,male,M,,Minor injury to lower leg,N,,2 m shark,"B. Myatt, GSAF",2018.06.08-Arrawarra.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.08,2018.06.08,6300.0,,
4,2018.06.04,04-Jun-2018,2018.0,Provoked,MEXICO,Colima,La Ticla,Free diving,Gustavo Ramos,M,,Lacerations to leg & hand shark PROVOKED INCIDENT,N,,"Tiger shark, 3m",A .Kipper,2018.06.04-Ramos.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.04,2018.06.04,6299.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25718,,,,,,,,,,,,,,,,,,,,,,,,
25719,,,,,,,,,,,,,,,,,,,,,,,,
25720,,,,,,,,,,,,,,,,,,,,,,,,
25721,,,,,,,,,,,,,,,,,,,,,,,,


**CLEANING**

First we we need to drop all the rows with null values.

In [3]:
attacks_subset = shark_attacks.dropna(how='all')
attacks_subset.isna().sum()

case_number                  1
date                      2401
year                      2403
type                      2405
country                   2451
area                      2856
location                  2941
activity                  2945
name                      2611
sex_                      2966
age                       5232
injury                    2429
fatal_(y/n)               2940
time                      5755
species_                  5239
investigator_or_source    2418
pdf                       2401
href_formula              2402
href                      2401
case_number.1             2401
case_number.2             2401
original_order            2394
unnamed:_22               8702
unnamed:_23               8701
dtype: int64

Now we need to have a look to those rows were the 'case_number' is not null but the rest of the values are null.

In [4]:
rows_with_nan = attacks_subset[attacks_subset.drop('case_number', axis=1).isna().all(axis=1)]
print(rows_with_nan.shape)
rows_with_nan.isna().sum()

(2394, 24)


case_number                  0
date                      2394
year                      2394
type                      2394
country                   2394
area                      2394
location                  2394
activity                  2394
name                      2394
sex_                      2394
age                       2394
injury                    2394
fatal_(y/n)               2394
time                      2394
species_                  2394
investigator_or_source    2394
pdf                       2394
href_formula              2394
href                      2394
case_number.1             2394
case_number.2             2394
original_order            2394
unnamed:_22               2394
unnamed:_23               2394
dtype: int64

As we can see, all the columns unless 'case_number' have the same null values as the lenght of the rows filtered, then we can remove this rows because they do not have any information.

In [5]:
# The '~' operator is used to negate the boolean condition, so it keeps the rows where the condition is False.
attacks_subset_ = attacks_subset[~attacks_subset.drop('case_number', axis=1).isna().all(axis=1)]
print(attacks_subset_.shape)
attacks_subset_[attacks_subset_['case_number'] == '0']

(6309, 24)


Unnamed: 0,case_number,date,year,type,country,area,location,activity,name,sex_,age,injury,fatal_(y/n),time,species_,investigator_or_source,pdf,href_formula,href,case_number.1,case_number.2,original_order,unnamed:_22,unnamed:_23
6302,0,,,,,,,,,,,,,,,,,,,,,6304.0,,
6303,0,,,,,,,,,,,,,,,,,,,,,6305.0,,
6304,0,,,,,,,,,,,,,,,,,,,,,6306.0,,
6305,0,,,,,,,,,,,,,,,,,,,,,6307.0,,
6306,0,,,,,,,,,,,,,,,,,,,,,6308.0,,
6307,0,,,,,,,,,,,,,,,,,,,,,6309.0,,
6308,0,,,,,,,,,,,,,,,,,,,,,6310.0,,


We still have some rows with almost all the data with null values so as I don't need the column 'original_order', I can remove this rows.

In [6]:
attacks_subset = attacks_subset[~attacks_subset.drop(['case_number', 'original_order'], axis=1).isna().all(axis=1)]
print(attacks_subset.shape)

(6302, 24)


Now I want to remove those columns that are not giving me any type of information.

In [7]:
attacks_subset = attacks_subset.drop(columns=['pdf', 'href_formula', 'href', 'case_number.1', 'case_number.2', 'original_order', 'unnamed:_22', 'unnamed:_23'])
attacks_subset


Unnamed: 0,case_number,date,year,type,country,area,location,activity,name,sex_,age,injury,fatal_(y/n),time,species_,investigator_or_source
0,2018.06.25,25-Jun-2018,2018.0,Boating,USA,California,"Oceanside, San Diego County",Paddling,Julie Wolfe,F,57,"No injury to occupant, outrigger canoe and pad...",N,18h00,White shark,"R. Collier, GSAF"
1,2018.06.18,18-Jun-2018,2018.0,Unprovoked,USA,Georgia,"St. Simon Island, Glynn County",Standing,Adysonï¿½McNeely,F,11,Minor injury to left thigh,N,14h00 -15h00,,"K.McMurray, TrackingSharks.com"
2,2018.06.09,09-Jun-2018,2018.0,Invalid,USA,Hawaii,"Habush, Oahu",Surfing,John Denges,M,48,Injury to left lower leg from surfboard skeg,N,07h45,,"K.McMurray, TrackingSharks.com"
3,2018.06.08,08-Jun-2018,2018.0,Unprovoked,AUSTRALIA,New South Wales,Arrawarra Headland,Surfing,male,M,,Minor injury to lower leg,N,,2 m shark,"B. Myatt, GSAF"
4,2018.06.04,04-Jun-2018,2018.0,Provoked,MEXICO,Colima,La Ticla,Free diving,Gustavo Ramos,M,,Lacerations to leg & hand shark PROVOKED INCIDENT,N,,"Tiger shark, 3m",A .Kipper
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6297,ND.0005,Before 1903,0.0,Unprovoked,AUSTRALIA,Western Australia,Roebuck Bay,Diving,male,M,,FATAL,Y,,,"H. Taunton; N. Bartlett, p. 234"
6298,ND.0004,Before 1903,0.0,Unprovoked,AUSTRALIA,Western Australia,,Pearl diving,Ahmun,M,,FATAL,Y,,,"H. Taunton; N. Bartlett, pp. 233-234"
6299,ND.0003,1900-1905,0.0,Unprovoked,USA,North Carolina,Ocracoke Inlet,Swimming,Coast Guard personnel,M,,FATAL,Y,,,"F. Schwartz, p.23; C. Creswell, GSAF"
6300,ND.0002,1883-1889,0.0,Unprovoked,PANAMA,,"Panama Bay 8ï¿½N, 79ï¿½W",,Jules Patterson,M,,FATAL,Y,,,"The Sun, 10/20/1938"


In order to find the duplicated rows we do it by 'original_order'.

In [8]:
duplicated = attacks_subset[attacks_subset['case_number'].duplicated(keep=False)]
duplicated

Unnamed: 0,case_number,date,year,type,country,area,location,activity,name,sex_,age,injury,fatal_(y/n),time,species_,investigator_or_source
522,2014.08.02,02-Aug-2014,2014.0,Unprovoked,USA,Florida,"South of Cocoa Beach, Brevard County",Surfing,male,M,50s,Foot bitten,N,,,"Florida Today, 8/8/2014"
523,2014.08.02,02-Aug-2014,2014.0,Unprovoked,USA,Florida,"Table Beach, Brevard County",Boogie boarding,Christian Sanhueza,M,8,Laceration to ankle,N,13h00,,"Florida Today, 8/2/2014"
615,2013.10.05,06-Oct-2013,2013.0,Unprovoked,USA,California,"Bunkers, Humboldt Bay, Eureka, Humboldt County",Surfing,Jay Scrivner,M,45,Laceration to thigh,N,08h45,"White shark, 8' to 10'","R. Collier, GSAF"
616,2013.10.05,10-Oct-2013,2013.0,Unprovoked,USA,Florida,"Destin, Okaloosa County",Wading,Zachary Tyke Standridge,M,12,Lacerations to right forearm,N,15h30,Small bull shark,"Monroe County Advocate, 10/9/2013"
746,2012.09.02.b,02-Sep-2012,2012.0,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Boogie boarding,female,F,8,Puncture wounds to calf and hand,N,18h30,3.5' to 4' shark,"WYTV, 9/3/2012"
747,2012.09.02.b,02-Sep-2012,2012.0,Provoked,USA,Hawaii,"Spreckelsville, Maui",Spearfishing,M. Malabon,,,Minor laceration to hand PROVOKED INCIDENT,N,12h00,"Tiger shark, 10' to 12'",HawaiiNow.com
1063,2009.12.18,18-Dec-2009,2009.0,Unprovoked,SOUTH AFRICA,Eastern Cape Province,"Second Beach, Port St. Johns",Paddling on kneeboard,Tshintshekile Nduva,M,22,FATAL,Y,14h30,,"B. Jordan & A. Ferreira, Times Live, 12/21/2009"
1064,2009.12.18,18-Dec-2009,2009.0,Invalid,SOUTH AFRICA,KwaZulu-Natal,"North Beach, Durban",Surfing,Lance Morris,M,,Minor lacerations to left leg. nitially report...,,,No shark involvement,"M. Addison, C. Eckstander, GSAF"
1436,2006.09.02,02-Sep-2006,2006.0,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Swimming,male,M,12 or 13,Arm bitten,N,17h55,,"S. Petersohn, GSAF"
1437,2006.09.02,02-Sep-2006,2006.0,Unprovoked,SOUTH AFRICA,Western Cape Province,Noordhoek,Surfing,Steven Harcourt-Wood,M,37,"No injury, shark rammed surfboard",N,,"White shark, 3.5m","Cape Times, 9/3/2006"


We check that there are no rows duplicated with the same information.

So now we can start cleaning the columns in order to standarize the data.

    1. Let's start with 'type' column of shark attack

In [9]:
attacks_subset['type'].value_counts()

type
Unprovoked      4595
Provoked         574
Invalid          547
Sea Disaster     239
Boating          203
Boat             137
Questionable       2
Boatomg            1
Name: count, dtype: int64

In [10]:
pattern = r'\w*[Bb]oat\w*'
attacks_subset['type'] = attacks_subset['type'].str.replace(pattern, 'Boat', regex=True)
attacks_subset['type'].value_counts()

type
Unprovoked      4595
Provoked         574
Invalid          547
Boat             341
Sea Disaster     239
Questionable       2
Name: count, dtype: int64

    2. For 'time' column I want to classify hours by ranges of time.

In [11]:
attacks_subset['time'].unique()

array(['18h00', '14h00  -15h00', '07h45', nan, 'Late afternoon', '17h00',
       '14h00', 'Morning', '15h00', '08h15', '11h00', '10h30', '10h40',
       '16h50', '07h00', '09h30', 'Afternoon', '21h50', '09h40', '08h00',
       '17h35', '15h30', '07h30', '19h00, Dusk', 'Night', '16h00',
       '15h01', '12h00', '13h45', '23h30', '09h00', '14h30', '18h30',
       '12h30', '16h30', '18h45', '06h00', '10h00', '10h44', '13h19',
       'Midday', '13h30', '10h45', '11h20', '11h45', '19h30', '08h30',
       '15h45', 'Shortly before 12h00', '17h34', '17h10', '11h15',
       '08h50', '17h45', '13h00', '10h20', '13h20', '02h00', '09h50',
       '11h30', '17h30', '9h00', '10h43', 'After noon', '15h15', '15h40',
       '19h05', '1300', '14h30 / 15h30', '22h00', '16h20', '14h34',
       '15h25', '14h55', '17h46', 'Morning ', '15h49', '19h00',
       'Midnight', '09h30 / 10h00', '10h15', '18h15', '04h00', '14h50',
       '13h50', '19h20', '10h25', '10h45-11h15', '16h45', '15h52',
       '06h15', '14h

In [12]:
hour_pattern = r'\d{1,2}(?::|h|j)\d{2}'
second_patter = r'\d{1,2}\w\d{2}|>\d{1,2}:\d{2}|(0?\d|1[0-2]):\d{2}j|Before (0?\d|1[0-2]):\d{2}|Between (0?\d|1[0-2]):\d{2} and (0?\d|1[0-2]):\d{2}|\d{1,2}:\d{2}|\d{1,2}:\d{2}'

def formatt_hour(string):
    if 'h' in string:
        string = string.split('h')
        if len(string[0]) == 1:
            string[0] += '0'
        return ":".join(string)
    elif 'j' in string:
        return string.replace('j', ':')
    elif string.isdigit():
        string = string[:len(string)//2] + ':' + string[len(string)//2:]
    else:
        return string



In [13]:
def class_by_string(string):
    if 'morning' in str(string).lower():
        return 'Morning'
    elif 'afternoon' in str(string).lower() or 'midday' in str(string).lower() or 'noon' in str(string).lower():
        return "Afternoon"
    elif 'evening' in str(string).lower():
        return "Evening"
    elif 'night' in str(string).lower() or 'dusk' in str(string).lower():
        return "Night"
    elif re.search(second_patter, str(string)):
        str_ = re.search(second_patter, str(string))
        return formatt_hour(str_.group())
    else:
        return "Unknown"

attacks_subset['clean_time'] = attacks_subset['time'].apply(class_by_string)
attacks_subset

Unnamed: 0,case_number,date,year,type,country,area,location,activity,name,sex_,age,injury,fatal_(y/n),time,species_,investigator_or_source,clean_time
0,2018.06.25,25-Jun-2018,2018.0,Boat,USA,California,"Oceanside, San Diego County",Paddling,Julie Wolfe,F,57,"No injury to occupant, outrigger canoe and pad...",N,18h00,White shark,"R. Collier, GSAF",18:00
1,2018.06.18,18-Jun-2018,2018.0,Unprovoked,USA,Georgia,"St. Simon Island, Glynn County",Standing,Adysonï¿½McNeely,F,11,Minor injury to left thigh,N,14h00 -15h00,,"K.McMurray, TrackingSharks.com",14:00
2,2018.06.09,09-Jun-2018,2018.0,Invalid,USA,Hawaii,"Habush, Oahu",Surfing,John Denges,M,48,Injury to left lower leg from surfboard skeg,N,07h45,,"K.McMurray, TrackingSharks.com",07:45
3,2018.06.08,08-Jun-2018,2018.0,Unprovoked,AUSTRALIA,New South Wales,Arrawarra Headland,Surfing,male,M,,Minor injury to lower leg,N,,2 m shark,"B. Myatt, GSAF",Unknown
4,2018.06.04,04-Jun-2018,2018.0,Provoked,MEXICO,Colima,La Ticla,Free diving,Gustavo Ramos,M,,Lacerations to leg & hand shark PROVOKED INCIDENT,N,,"Tiger shark, 3m",A .Kipper,Unknown
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6297,ND.0005,Before 1903,0.0,Unprovoked,AUSTRALIA,Western Australia,Roebuck Bay,Diving,male,M,,FATAL,Y,,,"H. Taunton; N. Bartlett, p. 234",Unknown
6298,ND.0004,Before 1903,0.0,Unprovoked,AUSTRALIA,Western Australia,,Pearl diving,Ahmun,M,,FATAL,Y,,,"H. Taunton; N. Bartlett, pp. 233-234",Unknown
6299,ND.0003,1900-1905,0.0,Unprovoked,USA,North Carolina,Ocracoke Inlet,Swimming,Coast Guard personnel,M,,FATAL,Y,,,"F. Schwartz, p.23; C. Creswell, GSAF",Unknown
6300,ND.0002,1883-1889,0.0,Unprovoked,PANAMA,,"Panama Bay 8ï¿½N, 79ï¿½W",,Jules Patterson,M,,FATAL,Y,,,"The Sun, 10/20/1938",Unknown


In [14]:
#regex_condition = attacks_subset['day_time'].str.contains(hour_pattern)
# Aplica la función 'formatt_hour' solo a los valores que cumplen el patrón regex
#non_null_condition = (regex_condition) & (~attacks_subset['day_time'].isnull())
#attacks_subset.loc[non_null_condition, 'day_time'] = attacks_subset.loc[non_null_condition, 'day_time'].apply(formatt_hour)
#attacks_subset['day_time'].unique()

In [15]:
def class_by_time(string):
    """
    This function receives a string in the format '12:00' and determines whether this time is 'Morning', 'Afternoon', 'Evening' or 'Night'.
    """
    morning_pattern = r'0[6-9]:[0-5][0-9]|1[0-1]:[0-5][0-9]'
    afternoon_pattern = r'1[2-7]:[0-5][0-9]'
    evening_pattern = r'1[8-9]:[0-5][0-9]|20:[0-5][0-9]|21:[0-5][0-9]'
    night_pattern = r'22:[0-5][0-9]|23:[0-5][0-9]|0[0-5]:[0-5][0-9]'

    if re.search(morning_pattern, str(string)):
        return 'Morning'
    elif re.search(afternoon_pattern, str(string)):
        return "Afternoon"
    elif re.search(evening_pattern, str(string)):
        return "Evening"
    elif re.search(night_pattern, str(string)):
        return "Night"
    else:
        return 'Unknown'
    
pattern = r'\d{1,2}:\d{1,2}'
attacks_subset['clean_time'] = attacks_subset['clean_time'].apply(lambda x: class_by_time(x) if pd.notna(x) and pd.Series(x).str.contains(pattern).any() else x)

attacks_subset['clean_time'].value_counts()




clean_time
Unknown      3414
Afternoon    1566
Morning       914
Evening       254
Night         146
Name: count, dtype: int64

- The cleaning of time column is useless because more than half of the data is 'Unknown'.

    3. Now let's clean 'age' column.

In [16]:
attacks_subset['age'].value_counts()

age
17                154
18                150
19                142
20                141
15                139
                 ... 
20?                 1
 28                 1
7      &    31      1
 30                 1
13 or 14            1
Name: count, Length: 157, dtype: int64

In [17]:
def extract_age(x):
    if pd.isna(x):
        return 'unknown'
    if x in ['teen', 'Teen', 'Teens']:
        return '15'
    if x in ['adult', '(adult)', 'middle-aged']:
        return '50'
    if x == '18 months':
        return '2'
    age_av = re.findall(r'(\d{1,2})\s*(&|or|to)\s*(\d{1,2})', str(x))
    if age_av:
        average_ages = [(int(match[0]) + int(match[2])) / 2 for match in age_av]
        return str(average_ages[0])
    age_match = re.search(r'\d{1,2}', str(x))
    if age_match:
        return age_match.group()
    return 'unknown'

    4. Column 'year'

In [18]:
attacks_subset['year'].unique()

array([2018., 2017.,   nan, 2016., 2015., 2014., 2013., 2012., 2011.,
       2010., 2009., 2008., 2007., 2006., 2005., 2004., 2003., 2002.,
       2001., 2000., 1999., 1998., 1997., 1996., 1995., 1984., 1994.,
       1993., 1992., 1991., 1990., 1989., 1969., 1988., 1987., 1986.,
       1985., 1983., 1982., 1981., 1980., 1979., 1978., 1977., 1976.,
       1975., 1974., 1973., 1972., 1971., 1970., 1968., 1967., 1966.,
       1965., 1964., 1963., 1962., 1961., 1960., 1959., 1958., 1957.,
       1956., 1955., 1954., 1953., 1952., 1951., 1950., 1949., 1948.,
       1848., 1947., 1946., 1945., 1944., 1943., 1942., 1941., 1940.,
       1939., 1938., 1937., 1936., 1935., 1934., 1933., 1932., 1931.,
       1930., 1929., 1928., 1927., 1926., 1925., 1924., 1923., 1922.,
       1921., 1920., 1919., 1918., 1917., 1916., 1915., 1914., 1913.,
       1912., 1911., 1910., 1909., 1908., 1907., 1906., 1905., 1904.,
       1903., 1902., 1901., 1900., 1899., 1898., 1897., 1896., 1895.,
       1894., 1893.,

In [19]:
pattern_year = r'\d{4}'
str_pattern = r'\d{4}\.'
def clean_year(year):
    if re.match(str_pattern, str(year)):
        return re.match(pattern_year, str(year)).group()
    else:
        return 'Unknown'

attacks_subset['clean_year'] = attacks_subset['year'].apply(clean_year)
attacks_subset['clean_year'].unique()
attacks_subset.sample(10)


Unnamed: 0,case_number,date,year,type,country,area,location,activity,name,sex_,age,injury,fatal_(y/n),time,species_,investigator_or_source,clean_time,clean_year
2546,1992.07.08.a,08-Jul-1992,1992.0,Unprovoked,BRAZIL,Maranhï¿½o,"Praia da Marcela, Sï¿½o Marcos Bay",Surfing,M.C.,M,,"Lower leg bitten, surgically amputated",N,Morning,,M. Szpilman,Morning,1992
563,2014.04.22,22-Apr-2014,2014.0,Unprovoked,USA,Florida,"Cocoa Beach, Brevard County",Swimming,male,M,42.0,Laceration & puncture wounds to right foot,N,15h30,,"R. Neale, Florida Today, 4/22/2014",Afternoon,2014
46,2018.02.01,01-Feb-2018,2018.0,Invalid,AUSTRALIA,Western Australia,"Avalon Point, Manurah",Spearfishing,Lucas Martin,M,14.0,"No injury no attack. This is considerd an ""enc...",N,,2m shark,"The West Australian, 2/2/2018",Unknown,2018
5768,1885.07.26.b,26-Jul-1885,1885.0,Sea Disaster,USA,Hawaii,Kau District,Wreck of the schooner Pohoiki,sailor,M,,Laceration to torso,N,,,"Hawaiian Gazette, 8124/1885",Unknown,1885
1400,2007.03.05.R,Reported 05-Mar-2007,2007.0,Provoked,NEW ZEALAND,Cook islans,Penhryn Island,Spearfishing,Turua William Maretapu,M,16.0,Leg bitten by shark after he shot at it & mis...,N,,Tiger shark,"R. Weeks, GSAF",Unknown,2007
6229,ND.0084,"No date, Before 3-Jan-1967",0.0,Unprovoked,SOUTH AFRICA,Eastern Cape Province,Keiskamma River mouth,Crossing river on a raft,Sinsa,M,,"FATAL, leg severed",Y,,,"Whitaker, The Sun, 1/3/1967",Unknown,Unknown
2573,1991.12.04,04-Dec-1991,1991.0,Unprovoked,USA,California,"Shelter Cover, north of Fort Bragg, Shelter Co...",Hookah diving for sea urchins,David Abernathy,M,25.0,"No injury, shark became tangled in hose & towe...",N,15h06,6 m [20'] white shark,"R. Collier, pp.129-130; Mark Marks; S. Waterma...",Afternoon,1991
4860,1937.00.00,1937,1937.0,Unprovoked,AUSTRALIA,Torres Strait,,,"O'Leary, a Torres Strait islander",M,,Survived,N,,,"Rpt. Dept. Harb. Mar. (Qld), 1937, p.13; G.P. ...",Unknown,1937
5438,1908.05.13,13-May-1908,1908.0,Sea Disaster,INDONESIA,Java,Jakarta Harbor,native boats sunk in storm,,F,,FATAL,Y,,,"The Advertiser, 6/26/1908",Unknown,1908
6036,1848.08.31,31-Aug-1848,1848.0,Unprovoked,USA,Maryland,Patapsco River,Swimming,William Haywood,M,15.0,Left leg severely bitten,N,,,"Adams Sentinel, 9/4/1848",Unknown,1848
