In [4]:
import pandas as pd
import numpy as np
import re
import matplotlib
import seaborn as sns

This file ("Clean.ipynb") will be used to clean the data set "attacks.csv" to then test my hypothesis about sharks: "Surfers and those participating in board sports account for most incidents specially in the US state of Florida"


In [117]:
data = pd.read_csv("Output/attacks.csv", encoding = "ISO-8859-1")
data

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,...,Species,Investigator or Source,pdf,href formula,href,Case Number.1,Case Number.2,original order,Unnamed: 22,Unnamed: 23
0,2018.06.25,25-Jun-2018,2018.0,Boating,USA,California,"Oceanside, San Diego County",Paddling,Julie Wolfe,F,...,White shark,"R. Collier, GSAF",2018.06.25-Wolfe.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.25,2018.06.25,6303.0,,
1,2018.06.18,18-Jun-2018,2018.0,Unprovoked,USA,Georgia,"St. Simon Island, Glynn County",Standing,Adyson McNeely,F,...,,"K.McMurray, TrackingSharks.com",2018.06.18-McNeely.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.18,2018.06.18,6302.0,,
2,2018.06.09,09-Jun-2018,2018.0,Invalid,USA,Hawaii,"Habush, Oahu",Surfing,John Denges,M,...,,"K.McMurray, TrackingSharks.com",2018.06.09-Denges.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.09,2018.06.09,6301.0,,
3,2018.06.08,08-Jun-2018,2018.0,Unprovoked,AUSTRALIA,New South Wales,Arrawarra Headland,Surfing,male,M,...,2 m shark,"B. Myatt, GSAF",2018.06.08-Arrawarra.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.08,2018.06.08,6300.0,,
4,2018.06.04,04-Jun-2018,2018.0,Provoked,MEXICO,Colima,La Ticla,Free diving,Gustavo Ramos,M,...,"Tiger shark, 3m",A .Kipper,2018.06.04-Ramos.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.04,2018.06.04,6299.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25718,,,,,,,,,,,...,,,,,,,,,,
25719,,,,,,,,,,,...,,,,,,,,,,
25720,,,,,,,,,,,...,,,,,,,,,,
25721,,,,,,,,,,,...,,,,,,,,,,


Having a first look at the dataset, we can quickly conclude that the data is very messy and extensive (25723 rows x 24 columns). Moreover, there are several 'Nan' values which aren't of any help to test our hypothesis. Hereunder, we will use several lines of code to help clean the dataset.

In [118]:
data.shape

(25723, 24)

In [119]:
data.columns

Index(['Case Number', 'Date', 'Year', 'Type', 'Country', 'Area', 'Location',
       'Activity', 'Name', 'Sex ', 'Age', 'Injury', 'Fatal (Y/N)', 'Time',
       'Species ', 'Investigator or Source', 'pdf', 'href formula', 'href',
       'Case Number.1', 'Case Number.2', 'original order', 'Unnamed: 22',
       'Unnamed: 23'],
      dtype='object')

In [120]:
data.isna().sum()

Case Number               17021
Date                      19421
Year                      19423
Type                      19425
Country                   19471
Area                      19876
Location                  19961
Activity                  19965
Name                      19631
Sex                       19986
Age                       22252
Injury                    19449
Fatal (Y/N)               19960
Time                      22775
Species                   22259
Investigator or Source    19438
pdf                       19421
href formula              19422
href                      19421
Case Number.1             19421
Case Number.2             19421
original order            19414
Unnamed: 22               25722
Unnamed: 23               25721
dtype: int64

In [121]:
data = data.drop_duplicates()
data

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,...,Species,Investigator or Source,pdf,href formula,href,Case Number.1,Case Number.2,original order,Unnamed: 22,Unnamed: 23
0,2018.06.25,25-Jun-2018,2018.0,Boating,USA,California,"Oceanside, San Diego County",Paddling,Julie Wolfe,F,...,White shark,"R. Collier, GSAF",2018.06.25-Wolfe.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.25,2018.06.25,6303.0,,
1,2018.06.18,18-Jun-2018,2018.0,Unprovoked,USA,Georgia,"St. Simon Island, Glynn County",Standing,Adyson McNeely,F,...,,"K.McMurray, TrackingSharks.com",2018.06.18-McNeely.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.18,2018.06.18,6302.0,,
2,2018.06.09,09-Jun-2018,2018.0,Invalid,USA,Hawaii,"Habush, Oahu",Surfing,John Denges,M,...,,"K.McMurray, TrackingSharks.com",2018.06.09-Denges.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.09,2018.06.09,6301.0,,
3,2018.06.08,08-Jun-2018,2018.0,Unprovoked,AUSTRALIA,New South Wales,Arrawarra Headland,Surfing,male,M,...,2 m shark,"B. Myatt, GSAF",2018.06.08-Arrawarra.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.08,2018.06.08,6300.0,,
4,2018.06.04,04-Jun-2018,2018.0,Provoked,MEXICO,Colima,La Ticla,Free diving,Gustavo Ramos,M,...,"Tiger shark, 3m",A .Kipper,2018.06.04-Ramos.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.04,2018.06.04,6299.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6307,0,,,,,,,,,,...,,,,,,,,6309.0,,
6308,0,,,,,,,,,,...,,,,,,,,6310.0,,
6309,0,,,,,,,,,,...,,,,,,,,,,
8702,,,,,,,,,,,...,,,,,,,,,,


In [122]:
data.shape

(6312, 24)

The dataset has considerably reduced the number of rows after dropping all the duplicate values (from 25723 rows to 6312).

In [123]:
data.isnull().sum(axis=0).sort_values(ascending=False)

Unnamed: 22               6311
Unnamed: 23               6310
Time                      3364
Species                   2848
Age                       2841
Sex                        575
Activity                   554
Location                   550
Fatal (Y/N)                549
Area                       465
Name                       220
Country                     60
Injury                      38
Investigator or Source      27
Type                        14
Year                        12
href formula                11
pdf                         10
href                        10
Case Number.1               10
Case Number.2               10
Date                        10
original order               3
Case Number                  2
dtype: int64

On the table above, we are checking the empty values within each column. For instance, the columns 'Unname: 22', 'Unnamed: 23' are almost empty and hence redundant.

In [124]:
data.isnull().sum(axis=1).sort_values(ascending=False)

8702     24
25722    23
6309     23
6308     22
6307     22
         ..
1383      2
3042      2
3039      2
1386      2
0         2
Length: 6312, dtype: int64

Similary, when setting the axis=1, we count the empty values within each row.

In [125]:
nas = data.isnull().sum(axis=1)[data.isna().sum(axis=1)>10].index
nas

Int64Index([6302, 6303, 6304, 6305, 6306, 6307, 6308, 6309, 8702, 25722], dtype='int64')

The rows where almost all the data is missing is useless to our study. Thererfore, we are going to drop all the rows with more than 10 values null.

In [126]:
data = data.drop(index = nas)
data.head()

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,...,Species,Investigator or Source,pdf,href formula,href,Case Number.1,Case Number.2,original order,Unnamed: 22,Unnamed: 23
0,2018.06.25,25-Jun-2018,2018.0,Boating,USA,California,"Oceanside, San Diego County",Paddling,Julie Wolfe,F,...,White shark,"R. Collier, GSAF",2018.06.25-Wolfe.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.25,2018.06.25,6303.0,,
1,2018.06.18,18-Jun-2018,2018.0,Unprovoked,USA,Georgia,"St. Simon Island, Glynn County",Standing,Adyson McNeely,F,...,,"K.McMurray, TrackingSharks.com",2018.06.18-McNeely.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.18,2018.06.18,6302.0,,
2,2018.06.09,09-Jun-2018,2018.0,Invalid,USA,Hawaii,"Habush, Oahu",Surfing,John Denges,M,...,,"K.McMurray, TrackingSharks.com",2018.06.09-Denges.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.09,2018.06.09,6301.0,,
3,2018.06.08,08-Jun-2018,2018.0,Unprovoked,AUSTRALIA,New South Wales,Arrawarra Headland,Surfing,male,M,...,2 m shark,"B. Myatt, GSAF",2018.06.08-Arrawarra.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.08,2018.06.08,6300.0,,
4,2018.06.04,04-Jun-2018,2018.0,Provoked,MEXICO,Colima,La Ticla,Free diving,Gustavo Ramos,M,...,"Tiger shark, 3m",A .Kipper,2018.06.04-Ramos.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.04,2018.06.04,6299.0,,


In [127]:
data.shape

(6302, 24)

Although the length hasn't shorten much, we have dispensed of 10 rows which weren't going to help.

In [128]:
data.columns

Index(['Case Number', 'Date', 'Year', 'Type', 'Country', 'Area', 'Location',
       'Activity', 'Name', 'Sex ', 'Age', 'Injury', 'Fatal (Y/N)', 'Time',
       'Species ', 'Investigator or Source', 'pdf', 'href formula', 'href',
       'Case Number.1', 'Case Number.2', 'original order', 'Unnamed: 22',
       'Unnamed: 23'],
      dtype='object')

In [129]:
data = data.drop(columns = ['Case Number', 'original order','Investigator or Source', 'pdf', 'href formula', 'href',
       'Case Number.1', 'Case Number.2', 'Unnamed: 22','Unnamed: 23'])
data

Unnamed: 0,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,Age,Injury,Fatal (Y/N),Time,Species
0,25-Jun-2018,2018.0,Boating,USA,California,"Oceanside, San Diego County",Paddling,Julie Wolfe,F,57,"No injury to occupant, outrigger canoe and pad...",N,18h00,White shark
1,18-Jun-2018,2018.0,Unprovoked,USA,Georgia,"St. Simon Island, Glynn County",Standing,Adyson McNeely,F,11,Minor injury to left thigh,N,14h00 -15h00,
2,09-Jun-2018,2018.0,Invalid,USA,Hawaii,"Habush, Oahu",Surfing,John Denges,M,48,Injury to left lower leg from surfboard skeg,N,07h45,
3,08-Jun-2018,2018.0,Unprovoked,AUSTRALIA,New South Wales,Arrawarra Headland,Surfing,male,M,,Minor injury to lower leg,N,,2 m shark
4,04-Jun-2018,2018.0,Provoked,MEXICO,Colima,La Ticla,Free diving,Gustavo Ramos,M,,Lacerations to leg & hand shark PROVOKED INCIDENT,N,,"Tiger shark, 3m"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6297,Before 1903,0.0,Unprovoked,AUSTRALIA,Western Australia,Roebuck Bay,Diving,male,M,,FATAL,Y,,
6298,Before 1903,0.0,Unprovoked,AUSTRALIA,Western Australia,,Pearl diving,Ahmun,M,,FATAL,Y,,
6299,1900-1905,0.0,Unprovoked,USA,North Carolina,Ocracoke Inlet,Swimming,Coast Guard personnel,M,,FATAL,Y,,
6300,1883-1889,0.0,Unprovoked,PANAMA,,"Panama Bay 8ºN, 79ºW",,Jules Patterson,M,,FATAL,Y,,


Out of the dataset, we considered that the columns ['Case Number', 'original order','Investigator or Source', 'pdf', 'href formula', 'href','Case Number.1', 'Case Number.2', 'Unnamed: 22','Unnamed: 23'] where useless to us and therefore dropped them.

In [130]:
data.columns

Index(['Date', 'Year', 'Type', 'Country', 'Area', 'Location', 'Activity',
       'Name', 'Sex ', 'Age', 'Injury', 'Fatal (Y/N)', 'Time', 'Species '],
      dtype='object')

In [131]:
data.shape

(6302, 14)

We now have 14 columns.

In [132]:
data["Species "].isnull().sum()

2838

We will now filter the dataset into the most common species. As we can see, the majority of the data in the column "Species" is missing, therefore, we will fill this data with a new value: "Unknown Species".

In [133]:
data['Species '] =data['Species '].fillna("unknown species")
data['Species '].value_counts()

unknown species                                       2838
White shark                                            163
Shark involvement prior to death was not confirmed     105
Invalid                                                102
Shark involvement not confirmed                         88
                                                      ... 
2' to 3' juvenile shark                                  1
Tawny nurse shark, 40cm                                  1
4.5' to 5' shark                                         1
White shark, 3 m [10'] k                                 1
Tiger shark & others                                     1
Name: Species , Length: 1550, dtype: int64

Since there are many unknowns, we will filter the dataset onto 2 main shark species: "White sharks" and "Tiger sharks".

In [134]:
data = data[(data["Species "]== "White shark" )| (data[ "Species "] == "Tiger shark")]
data['Species '].value_counts()

White shark    163
Tiger shark     73
Name: Species , dtype: int64

We now have 236 rows in the data set.

In [135]:
data.shape

(236, 14)

In [136]:
data.head()

Unnamed: 0,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,Age,Injury,Fatal (Y/N),Time,Species
0,25-Jun-2018,2018.0,Boating,USA,California,"Oceanside, San Diego County",Paddling,Julie Wolfe,F,57.0,"No injury to occupant, outrigger canoe and pad...",N,18h00,White shark
6,03-Jun-2018,2018.0,Unprovoked,BRAZIL,Pernambuco,"Piedade Beach, Recife",Swimming,Jose Ernesto da Silva,M,18.0,FATAL,Y,Late afternoon,Tiger shark
18,28-Apr-2018,2018.0,Unprovoked,COSTA RICA,Cocos Island,Manuelita,Scuba diving,male,M,30.0,"No injury, shark bit scuba gear",N,10h40,Tiger shark
23,23-Apr-2018,2018.0,Unprovoked,MALDIVES,Alifu Alifu Atoll,Madoogali,Fishing,Ahmed Rasheed,M,32.0,5-inch cut to hand,N,21h50,Tiger shark
30,14-Apr-2018,2018.0,Unprovoked,BAHAMAS,New Providence,Nirvana Beach,Surfing,Bruce Rowan,M,,No Injury. Shark swam away with the surf board,N,09h30,Tiger shark


Quick reminder that our hypothesis testing is to check if "surfers and those participating in board sports account for most incidents specially in the US state of Florida". By filtering by activity, we check that indeed, that the most shark attack accidents happen to surrfers (45 out of 236).

In [137]:
data['Activity'].value_counts()

Surfing                                                                                                     45
Swimming                                                                                                    29
Spearfishing                                                                                                17
Fishing                                                                                                     15
Kayaking                                                                                                     6
                                                                                                            ..
Swimming. Passer-by, Len Bedford, heard him shriek , saw shark leap from the water & swimmer disappeared     1
Wooden fishing boat                                                                                          1
Spearfishing (Free diving)                                                                                   1
F

We will now focus our study on the most common activities: Surfing, Swimming, Spearfishing, Fishing and Kayaking.

In [139]:
data = data[(data["Activity"]== "Surfing" )| (data[ "Activity"] == "Swimming") |(data[ "Activity"] == "Spearfishing") |(data[ "Activity"] == "Fishing")|(data[ "Activity"] == "Kayaking")]

data['Activity'].value_counts()

Surfing         45
Swimming        29
Spearfishing    17
Fishing         15
Kayaking         6
Name: Activity, dtype: int64

In [140]:
data.shape

(112, 14)

As mentioned before, we want to test whether if Florida is the place where most attacks ocurr. Therefore, we will now do a count of the attacks per location. First, we filter by Country and then we filter by Area.

In [141]:
data["Country"].value_counts()

USA                               33
AUSTRALIA                         24
SOUTH AFRICA                      21
CROATIA                            6
NEW CALEDONIA                      4
NEW ZEALAND                        3
PAPUA NEW GUINEA                   2
BRAZIL                             2
BAHAMAS                            2
FIJI                               2
GREECE                             2
MEXICO                             2
CANADA                             1
TURKEY                             1
NEW BRITAIN                        1
FEDERATED STATES OF MICRONESIA     1
MALDIVES                           1
MALTA                              1
FRANCE                             1
TONGA                              1
MAYOTTE                            1
Name: Country, dtype: int64

In [142]:
data["Area"].value_counts()

California                       13
Western Cape Province            13
Western Australia                 9
Hawaii                            9
New South Wales                   7
Eastern Cape Province             5
Oregon                            5
Florida                           4
South Australia                   4
KwaZulu-Natal                     3
Queensland                        2
South Province                    2
Pernambuco                        2
Victoria                          2
North Carolina                    2
Primorje-Gorski Kotar County      2
Corfu Island                      2
Eastern Caroline Islands          1
St. Thomas Bay                    1
Mozambique Channel                1
North Island                      1
Andros Islands                    1
New Britain                       1
Zadar County                      1
Istria County                     1
Loyalty Islands                   1
Guerro                            1
New Providence              

We will now check whether the attacks are provoked or unprovoked.

In [143]:
data['Type'].value_counts()

Unprovoked    93
Boating       10
Boat           6
Provoked       2
Invalid        1
Name: Type, dtype: int64

As we can see, almost 90% of the attacks are unprovoked, meaning that sharks have a predator nature. 

In [144]:
data["Sex "].value_counts

<bound method IndexOpsMixin.value_counts of 6       M
23      M
30      M
34      M
53      F
       ..
5041    M
5105    M
5231    M
5545    M
5887    M
Name: Sex , Length: 112, dtype: object>

We now check if the attack is mortal or not. We see that there is almost 70% chance of surviving to a shark attack.

In [145]:
data["Fatal (Y/N)"].value_counts()

N          74
Y          35
UNKNOWN     2
Name: Fatal (Y/N), dtype: int64

In [146]:
data["Year"].dtype

dtype('float64')

In [147]:
def years (x): 
    x = str(x) # the first thing is to change it from float
    x = x.split(".")
    x = x[0]
    
    if len(x) != 4:
        return 'date_unknown'
    else: 
        return x

In [152]:
data['Year'] = data['Year'].apply(years)
data.head()

Unnamed: 0,Year,Type,Country,Area,Activity,Sex,Fatal (Y/N),Species
6,2018,Unprovoked,BRAZIL,Pernambuco,Swimming,M,Y,Tiger shark
23,2018,Unprovoked,MALDIVES,Alifu Alifu Atoll,Fishing,M,N,Tiger shark
30,2018,Unprovoked,BAHAMAS,New Providence,Surfing,M,N,Tiger shark
34,2018,Unprovoked,SOUTH AFRICA,Eastern Cape Province,Surfing,M,N,White shark
53,2017,Unprovoked,USA,Hawaii,Surfing,F,N,Tiger shark


In [148]:
data.columns

Index(['Date', 'Year', 'Type', 'Country', 'Area', 'Location', 'Activity',
       'Name', 'Sex ', 'Age', 'Injury', 'Fatal (Y/N)', 'Time', 'Species '],
      dtype='object')

In [149]:
data = data.drop(columns = ['Location', 'Name','Age', 'Injury', 'Time',"Date"])

In [153]:
data

Unnamed: 0,Year,Type,Country,Area,Activity,Sex,Fatal (Y/N),Species
6,2018,Unprovoked,BRAZIL,Pernambuco,Swimming,M,Y,Tiger shark
23,2018,Unprovoked,MALDIVES,Alifu Alifu Atoll,Fishing,M,N,Tiger shark
30,2018,Unprovoked,BAHAMAS,New Providence,Surfing,M,N,Tiger shark
34,2018,Unprovoked,SOUTH AFRICA,Eastern Cape Province,Surfing,M,N,White shark
53,2017,Unprovoked,USA,Hawaii,Surfing,F,N,Tiger shark
...,...,...,...,...,...,...,...,...
5041,1930,Unprovoked,USA,Florida,Swimming,M,N,Tiger shark
5105,1928,Unprovoked,AUSTRALIA,Queensland,Fishing,M,N,Tiger shark
5231,1922,Unprovoked,AUSTRALIA,New South Wales,Swimming,M,Y,White shark
5545,1901,Unprovoked,SOUTH AFRICA,Western Cape Province,Swimming,M,Y,White shark


In [154]:
data.to_csv("output/attacks_Clean.csv")