# Global Shark Attacks

- note
- note
- note
-
-


## Import Libraries

In [42]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import statsmodels.api as sm
from statsmodels.formula.api import ols
from scipy import stats
from scipy.stats.mstats import winsorize

import math
from sklearn.preprocessing import OneHotEncoder, Normalizer, LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

from sklearn.neighbors import KNeighborsRegressor

import warnings
warnings.filterwarnings('ignore')

## Import Data

In [43]:
data = pd.read_csv('attacks.csv',encoding='latin-1')

## First Review and Clean Data

In [44]:
data.shape

(25723, 24)

In [45]:
data.head()

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,...,Species,Investigator or Source,pdf,href formula,href,Case Number.1,Case Number.2,original order,Unnamed: 22,Unnamed: 23
0,2018.06.25,25-Jun-2018,2018.0,Boating,USA,California,"Oceanside, San Diego County",Paddling,Julie Wolfe,F,...,White shark,"R. Collier, GSAF",2018.06.25-Wolfe.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.25,2018.06.25,6303.0,,
1,2018.06.18,18-Jun-2018,2018.0,Unprovoked,USA,Georgia,"St. Simon Island, Glynn County",Standing,Adyson McNeely,F,...,,"K.McMurray, TrackingSharks.com",2018.06.18-McNeely.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.18,2018.06.18,6302.0,,
2,2018.06.09,09-Jun-2018,2018.0,Invalid,USA,Hawaii,"Habush, Oahu",Surfing,John Denges,M,...,,"K.McMurray, TrackingSharks.com",2018.06.09-Denges.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.09,2018.06.09,6301.0,,
3,2018.06.08,08-Jun-2018,2018.0,Unprovoked,AUSTRALIA,New South Wales,Arrawarra Headland,Surfing,male,M,...,2 m shark,"B. Myatt, GSAF",2018.06.08-Arrawarra.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.08,2018.06.08,6300.0,,
4,2018.06.04,04-Jun-2018,2018.0,Provoked,MEXICO,Colima,La Ticla,Free diving,Gustavo Ramos,M,...,"Tiger shark, 3m",A .Kipper,2018.06.04-Ramos.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.04,2018.06.04,6299.0,,


### Cleaning Columns

In [46]:
for col in data:
    print(data[col].value_counts(), '\n')

# extract month and year from date (where possible)
# clean year to identical format or extract from date
# check if years are the same from date and year
# Type ??????? what do those categories mean and are they actually reliable? probaly objective call
# check country values if there are somekind of double counts?
# Area? what to do with the Area? probably has a connection to the kind of shark
# Location? what to do with Location? probably has a connection to the kind of shark
# Activity - Cleaning down to a few categories and sum the others under unknown
# name . drop
# sex
# age
# injury
# fatal

0                 2400
1913.08.27.R         2
2012.09.02.b         2
1915.07.06.a.R       2
1983.06.15           2
                  ... 
2015.09.20.d         1
1989.02.15           1
1964.01.01.b         1
1938.07.18           1
1958.09.06           1
Name: Case Number, Length: 6287, dtype: int64 

1957           11
1942            9
1956            8
1950            7
1941            7
               ..
03-Jan-1999     1
16-May-1981     1
27-Jan-1967     1
04-Nov-1968     1
13-Jun-1914     1
Name: Date, Length: 5433, dtype: int64 

2015.0    143
2017.0    136
2016.0    130
2011.0    128
2014.0    127
         ... 
1753.0      1
77.0        1
1785.0      1
1580.0      1
1543.0      1
Name: Year, Length: 249, dtype: int64 

Unprovoked      4595
Provoked         574
Invalid          547
Sea Disaster     239
Boating          203
Boat             137
Questionable       2
Boatomg            1
Name: Type, dtype: int64 

USA                       2229
AUSTRALIA                 1338
SOUTH AFR

After a first brief review of the data it becomes clear very fast that this data will require a lot of data cleaning which is why I will review column by columns.

#### Clean Column Names

In [47]:
data.columns

Index(['Case Number', 'Date', 'Year', 'Type', 'Country', 'Area', 'Location',
       'Activity', 'Name', 'Sex ', 'Age', 'Injury', 'Fatal (Y/N)', 'Time',
       'Species ', 'Investigator or Source', 'pdf', 'href formula', 'href',
       'Case Number.1', 'Case Number.2', 'original order', 'Unnamed: 22',
       'Unnamed: 23'],
      dtype='object')

**Changes needed:** <br>
lower case <br>
underscore '_' <br>
drop empty spaces ' ', <br>
replace '.' with underscore '_' <br>
remove (Y/N)

In [48]:
data.columns = [column.lower().replace('.', '').replace(' ','_').replace(':','').replace('(y/n)','') for column in data.columns]

In [49]:
data.columns

Index(['case_number', 'date', 'year', 'type', 'country', 'area', 'location',
       'activity', 'name', 'sex_', 'age', 'injury', 'fatal_', 'time',
       'species_', 'investigator_or_source', 'pdf', 'href_formula', 'href',
       'case_number1', 'case_number2', 'original_order', 'unnamed_22',
       'unnamed_23'],
      dtype='object')

#### drop columns

In [50]:
to_drop = ['name', 'time', 'investigator_or_source', 'pdf', 'href_formula', 'href', 'unnamed_22', 'unnamed_23', 'original_order']

data.drop(columns=to_drop, inplace=True)

In [51]:
data.head()

Unnamed: 0,case_number,date,year,type,country,area,location,activity,sex_,age,injury,fatal_,species_,case_number1,case_number2
0,2018.06.25,25-Jun-2018,2018.0,Boating,USA,California,"Oceanside, San Diego County",Paddling,F,57.0,"No injury to occupant, outrigger canoe and pad...",N,White shark,2018.06.25,2018.06.25
1,2018.06.18,18-Jun-2018,2018.0,Unprovoked,USA,Georgia,"St. Simon Island, Glynn County",Standing,F,11.0,Minor injury to left thigh,N,,2018.06.18,2018.06.18
2,2018.06.09,09-Jun-2018,2018.0,Invalid,USA,Hawaii,"Habush, Oahu",Surfing,M,48.0,Injury to left lower leg from surfboard skeg,N,,2018.06.09,2018.06.09
3,2018.06.08,08-Jun-2018,2018.0,Unprovoked,AUSTRALIA,New South Wales,Arrawarra Headland,Surfing,M,,Minor injury to lower leg,N,2 m shark,2018.06.08,2018.06.08
4,2018.06.04,04-Jun-2018,2018.0,Provoked,MEXICO,Colima,La Ticla,Free diving,M,,Lacerations to leg & hand shark PROVOKED INCIDENT,N,"Tiger shark, 3m",2018.06.04,2018.06.04


#### Drop Duplicates

In [53]:
data.shape

(25723, 15)

In [54]:
data = data.drop_duplicates()

In [55]:
data.shape

(6305, 15)

#### Null Values

In [56]:
data.isna().sum()

case_number        2
date               3
year               5
type               7
country           53
area             458
location         543
activity         547
sex_             568
age             2834
injury            31
fatal_           542
species_        2841
case_number1       3
case_number2       3
dtype: int64

In [57]:
data

Unnamed: 0,case_number,date,year,type,country,area,location,activity,sex_,age,injury,fatal_,species_,case_number1,case_number2
0,2018.06.25,25-Jun-2018,2018.0,Boating,USA,California,"Oceanside, San Diego County",Paddling,F,57,"No injury to occupant, outrigger canoe and pad...",N,White shark,2018.06.25,2018.06.25
1,2018.06.18,18-Jun-2018,2018.0,Unprovoked,USA,Georgia,"St. Simon Island, Glynn County",Standing,F,11,Minor injury to left thigh,N,,2018.06.18,2018.06.18
2,2018.06.09,09-Jun-2018,2018.0,Invalid,USA,Hawaii,"Habush, Oahu",Surfing,M,48,Injury to left lower leg from surfboard skeg,N,,2018.06.09,2018.06.09
3,2018.06.08,08-Jun-2018,2018.0,Unprovoked,AUSTRALIA,New South Wales,Arrawarra Headland,Surfing,M,,Minor injury to lower leg,N,2 m shark,2018.06.08,2018.06.08
4,2018.06.04,04-Jun-2018,2018.0,Provoked,MEXICO,Colima,La Ticla,Free diving,M,,Lacerations to leg & hand shark PROVOKED INCIDENT,N,"Tiger shark, 3m",2018.06.04,2018.06.04
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6300,ND.0002,1883-1889,0.0,Unprovoked,PANAMA,,"Panama Bay 8ºN, 79ºW",,M,,FATAL,Y,,ND.0002,ND.0002
6301,ND.0001,1845-1853,0.0,Unprovoked,CEYLON (SRI LANKA),Eastern Province,"Below the English fort, Trincomalee",Swimming,M,15,"FATAL. ""Shark bit him in half, carrying away t...",Y,,ND.0001,ND.0001
6302,0,,,,,,,,,,,,,,
8702,,,,,,,,,,,,,,,


#### case_number, case_number1, case_number2

In [58]:
cases = data[['date', 'year', 'case_number', 'case_number1', 'case_number2']].copy()

In [59]:
cases.head()

Unnamed: 0,date,year,case_number,case_number1,case_number2
0,25-Jun-2018,2018.0,2018.06.25,2018.06.25,2018.06.25
1,18-Jun-2018,2018.0,2018.06.18,2018.06.18,2018.06.18
2,09-Jun-2018,2018.0,2018.06.09,2018.06.09,2018.06.09
3,08-Jun-2018,2018.0,2018.06.08,2018.06.08,2018.06.08
4,04-Jun-2018,2018.0,2018.06.04,2018.06.04,2018.06.04


In [60]:
for col in cases:
    print(cases[col].value_counts(), '\n')

1957           11
1942            9
1956            8
1958            7
1941            7
               ..
16-May-1981     1
04-Nov-1968     1
05-Apr-2017     1
09-Feb-1927     1
13-Jun-1914     1
Name: date, Length: 5433, dtype: int64 

2015.0    143
2017.0    136
2016.0    130
2011.0    128
2014.0    127
         ... 
1801.0      1
1638.0      1
1834.0      1
1723.0      1
1786.0      1
Name: year, Length: 249, dtype: int64 

1920.00.00.b    2
1913.08.27.R    2
2009.12.18      2
1980.07.00      2
1990.05.10      2
               ..
1931.08.27      1
1977.02.04      1
2015.12.26      1
2008.06.21      1
2006.10.00.a    1
Name: case_number, Length: 6287, dtype: int64 

1920.00.00.b    2
2012.09.02.b    2
1913.08.27.R    2
2013.10.05      2
2009.12.18      2
               ..
2015.12.26      1
2008.06.21      1
1826.08.28      1
1844.07.20.     1
2006.10.00.a    1
Name: case_number1, Length: 6285, dtype: int64 

1920.00.00.b    2
2012.09.02.b    2
2013.10.05      2
1913.08.27.R    2
20

I drop the case numbers and work with a unique index instead

In [61]:
to_drop2 = ['case_number', 'case_number1', 'case_number2']

data.drop(columns=to_drop2, inplace=True)

In [62]:
data.head()

Unnamed: 0,date,year,type,country,area,location,activity,sex_,age,injury,fatal_,species_
0,25-Jun-2018,2018.0,Boating,USA,California,"Oceanside, San Diego County",Paddling,F,57.0,"No injury to occupant, outrigger canoe and pad...",N,White shark
1,18-Jun-2018,2018.0,Unprovoked,USA,Georgia,"St. Simon Island, Glynn County",Standing,F,11.0,Minor injury to left thigh,N,
2,09-Jun-2018,2018.0,Invalid,USA,Hawaii,"Habush, Oahu",Surfing,M,48.0,Injury to left lower leg from surfboard skeg,N,
3,08-Jun-2018,2018.0,Unprovoked,AUSTRALIA,New South Wales,Arrawarra Headland,Surfing,M,,Minor injury to lower leg,N,2 m shark
4,04-Jun-2018,2018.0,Provoked,MEXICO,Colima,La Ticla,Free diving,M,,Lacerations to leg & hand shark PROVOKED INCIDENT,N,"Tiger shark, 3m"


#### date

In [63]:
data.date.unique()

array(['25-Jun-2018', '18-Jun-2018', '09-Jun-2018', ..., '1883-1889',
       '1845-1853', nan], dtype=object)

#### year

In [64]:
data.year.unique()

array([2018., 2017.,   nan, 2016., 2015., 2014., 2013., 2012., 2011.,
       2010., 2009., 2008., 2007., 2006., 2005., 2004., 2003., 2002.,
       2001., 2000., 1999., 1998., 1997., 1996., 1995., 1984., 1994.,
       1993., 1992., 1991., 1990., 1989., 1969., 1988., 1987., 1986.,
       1985., 1983., 1982., 1981., 1980., 1979., 1978., 1977., 1976.,
       1975., 1974., 1973., 1972., 1971., 1970., 1968., 1967., 1966.,
       1965., 1964., 1963., 1962., 1961., 1960., 1959., 1958., 1957.,
       1956., 1955., 1954., 1953., 1952., 1951., 1950., 1949., 1948.,
       1848., 1947., 1946., 1945., 1944., 1943., 1942., 1941., 1940.,
       1939., 1938., 1937., 1936., 1935., 1934., 1933., 1932., 1931.,
       1930., 1929., 1928., 1927., 1926., 1925., 1924., 1923., 1922.,
       1921., 1920., 1919., 1918., 1917., 1916., 1915., 1914., 1913.,
       1912., 1911., 1910., 1909., 1908., 1907., 1906., 1905., 1904.,
       1903., 1902., 1901., 1900., 1899., 1898., 1897., 1896., 1895.,
       1894., 1893.,

#### Type

In [65]:
data.type.unique()

array(['Boating', 'Unprovoked', 'Invalid', 'Provoked', 'Questionable',
       'Sea Disaster', nan, 'Boat', 'Boatomg'], dtype=object)

#### Country

In [70]:
data.species_.unique()

array(['White shark', nan, '2 m shark', ..., "12' tiger shark",
       'Blue pointers',
       'Said to involve a grey nurse shark that leapt out of the water and  seized the boy but species identification is questionable'],
      dtype=object)

first step is to remove special charakter '?', remove spaces at the end of a string and change everything into upper letters

In [40]:
for i in data['country']:
    i.upper().replace('?', '').replace(' ','_')

AttributeError: 'NoneType' object has no attribute 'upper'

In [68]:
data

Unnamed: 0,date,year,type,country,area,location,activity,sex_,age,injury,fatal_,species_
0,25-Jun-2018,2018.0,Boating,USA,California,"Oceanside, San Diego County",Paddling,F,57,"No injury to occupant, outrigger canoe and pad...",N,White shark
1,18-Jun-2018,2018.0,Unprovoked,USA,Georgia,"St. Simon Island, Glynn County",Standing,F,11,Minor injury to left thigh,N,
2,09-Jun-2018,2018.0,Invalid,USA,Hawaii,"Habush, Oahu",Surfing,M,48,Injury to left lower leg from surfboard skeg,N,
3,08-Jun-2018,2018.0,Unprovoked,AUSTRALIA,New South Wales,Arrawarra Headland,Surfing,M,,Minor injury to lower leg,N,2 m shark
4,04-Jun-2018,2018.0,Provoked,MEXICO,Colima,La Ticla,Free diving,M,,Lacerations to leg & hand shark PROVOKED INCIDENT,N,"Tiger shark, 3m"
...,...,...,...,...,...,...,...,...,...,...,...,...
6300,1883-1889,0.0,Unprovoked,PANAMA,,"Panama Bay 8ºN, 79ºW",,M,,FATAL,Y,
6301,1845-1853,0.0,Unprovoked,CEYLON (SRI LANKA),Eastern Province,"Below the English fort, Trincomalee",Swimming,M,15,"FATAL. ""Shark bit him in half, carrying away t...",Y,
6302,,,,,,,,,,,,
8702,,,,,,,,,,,,


In [36]:
data.country.unique()

array([None], dtype=object)

In [28]:
data['Case Number'].is_unique

False

In [24]:
data.shape

(25723, 24)

In [67]:
data.dtypes

date         object
year        float64
type         object
country      object
area         object
location     object
activity     object
sex_         object
age          object
injury       object
fatal_       object
species_     object
dtype: object