In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, Normalizer, StandardScaler
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

import warnings
warnings.filterwarnings('ignore')

## Get data

In [2]:
data = pd.read_csv("files/attacks.csv", encoding='latin1')
data

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,...,Species,Investigator or Source,pdf,href formula,href,Case Number.1,Case Number.2,original order,Unnamed: 22,Unnamed: 23
0,2018.06.25,25-Jun-2018,2018.0,Boating,USA,California,"Oceanside, San Diego County",Paddling,Julie Wolfe,F,...,White shark,"R. Collier, GSAF",2018.06.25-Wolfe.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.25,2018.06.25,6303.0,,
1,2018.06.18,18-Jun-2018,2018.0,Unprovoked,USA,Georgia,"St. Simon Island, Glynn County",Standing,Adyson McNeely,F,...,,"K.McMurray, TrackingSharks.com",2018.06.18-McNeely.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.18,2018.06.18,6302.0,,
2,2018.06.09,09-Jun-2018,2018.0,Invalid,USA,Hawaii,"Habush, Oahu",Surfing,John Denges,M,...,,"K.McMurray, TrackingSharks.com",2018.06.09-Denges.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.09,2018.06.09,6301.0,,
3,2018.06.08,08-Jun-2018,2018.0,Unprovoked,AUSTRALIA,New South Wales,Arrawarra Headland,Surfing,male,M,...,2 m shark,"B. Myatt, GSAF",2018.06.08-Arrawarra.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.08,2018.06.08,6300.0,,
4,2018.06.04,04-Jun-2018,2018.0,Provoked,MEXICO,Colima,La Ticla,Free diving,Gustavo Ramos,M,...,"Tiger shark, 3m",A .Kipper,2018.06.04-Ramos.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.04,2018.06.04,6299.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25718,,,,,,,,,,,...,,,,,,,,,,
25719,,,,,,,,,,,...,,,,,,,,,,
25720,,,,,,,,,,,...,,,,,,,,,,
25721,,,,,,,,,,,...,,,,,,,,,,


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25723 entries, 0 to 25722
Data columns (total 24 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Case Number             8702 non-null   object 
 1   Date                    6302 non-null   object 
 2   Year                    6300 non-null   float64
 3   Type                    6298 non-null   object 
 4   Country                 6252 non-null   object 
 5   Area                    5847 non-null   object 
 6   Location                5762 non-null   object 
 7   Activity                5758 non-null   object 
 8   Name                    6092 non-null   object 
 9   Sex                     5737 non-null   object 
 10  Age                     3471 non-null   object 
 11  Injury                  6274 non-null   object 
 12  Fatal (Y/N)             5763 non-null   object 
 13  Time                    2948 non-null   object 
 14  Species                 3464 non-null 

## Data cleaning

In [4]:
data = data.drop_duplicates() #to remove duplicated rows

In [5]:
data.columns = [col.lower().replace(' ','_') for col in data.columns] #to standardize headers

In [6]:
data.rename(columns={"sex_": "gender", "species_": "species", "fatal_(y/n)": "fatal"}, inplace=True)

In [7]:
## dataframe has no duplicates, headers are standardized and we removed rows with lot of NaNs. Let's take another look at our dataframe: 
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6312 entries, 0 to 25722
Data columns (total 24 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   case_number             6310 non-null   object 
 1   date                    6302 non-null   object 
 2   year                    6300 non-null   float64
 3   type                    6298 non-null   object 
 4   country                 6252 non-null   object 
 5   area                    5847 non-null   object 
 6   location                5762 non-null   object 
 7   activity                5758 non-null   object 
 8   name                    6092 non-null   object 
 9   gender                  5737 non-null   object 
 10  age                     3471 non-null   object 
 11  injury                  6274 non-null   object 
 12  fatal                   5763 non-null   object 
 13  time                    2948 non-null   object 
 14  species                 3464 non-null  

We can already notice that several columns won't be important for our model: 
- case_number doesn't add any relevant information. same for the date, year, case_number.1, case_number.2, original_order as the information from those originates in the date
- type contains very unclear information
- name and investigator_or_source columns - to be dropped, they don't contain any information applicable for our model 
- same for pdf, href and href_formula which contain pdf with links 
I will focus on the data regarding location, activity, gender of the victim and injury/fatality of the attack. 

In [8]:
## The initial analysis suggests we can already drop several columns: case_number, year, name, pdf, href_formula, href, case_number.1, case_number.2, original_order, unnamed:_22, unnamed:_23, investigator_or_source
data = data.drop(columns = ["case_number", "date", "year", "type", "name", "age", "time", "pdf", "href_formula", "href", "case_number.1", "case_number.2", "original_order", "unnamed:_22", "unnamed:_23", "investigator_or_source"], axis=1)

In [9]:
def clean_empty(x):
    return 'UNK' if x == '' or x == ' ' else x

In [10]:
for col in data.select_dtypes('object'):
    data[col] = data[col].apply(clean_empty)

In [11]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6312 entries, 0 to 25722
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   country   6252 non-null   object
 1   area      5847 non-null   object
 2   location  5762 non-null   object
 3   activity  5758 non-null   object
 4   gender    5737 non-null   object
 5   injury    6274 non-null   object
 6   fatal     5763 non-null   object
 7   species   3464 non-null   object
dtypes: object(8)
memory usage: 443.8+ KB


In [12]:
data = data.dropna(thresh=5) #to remove rows which have NaN in min. 5 columns 

In [13]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6201 entries, 0 to 6301
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   country   6178 non-null   object
 1   area      5833 non-null   object
 2   location  5749 non-null   object
 3   activity  5726 non-null   object
 4   gender    5680 non-null   object
 5   injury    6183 non-null   object
 6   fatal     5684 non-null   object
 7   species   3443 non-null   object
dtypes: object(8)
memory usage: 436.0+ KB


We have only categorical data left. 

In [14]:
data1 = data.copy() #copy of the dataframe after initial clean up

In [15]:
data1.isna().sum()

country       23
area         368
location     452
activity     475
gender       521
injury        18
fatal        517
species     2758
dtype: int64

In [16]:
data1["country"] = data1["country"].fillna("ukn")

In [17]:
data1["area"] = data1["area"].fillna("ukn")

In [18]:
data1["location"] = data1["location"].fillna("ukn")

In [19]:
data1["activity"] = data1["activity"].fillna("ukn")

In [20]:
data1["gender"] = data1["gender"].fillna("ukn")

In [21]:
data1["injury"] = data1["injury"].fillna("ukn")

In [22]:
data1["fatal"] = data1["fatal"].fillna("ukn")

In [23]:
data1["species"] = data1["species"].fillna("ukn")

In [24]:
data1.isna().sum()

country     0
area        0
location    0
activity    0
gender      0
injury      0
fatal       0
species     0
dtype: int64

In [25]:
data2 = data1.copy() #copy of the dataframe after removing NaNs

In [26]:
data2.describe()

Unnamed: 0,country,area,location,activity,gender,injury,fatal,species
count,6201,6201,6201,6201,6201,6201,6201,6201
unique,210,822,4098,1519,7,3698,9,1550
top,USA,Florida,ukn,Surfing,M,FATAL,N,ukn
freq,2227,1037,452,969,5039,778,4255,2758


In [27]:
for col in data2:
    print(data2[col].value_counts(), '\n')

USA                           2227
AUSTRALIA                     1332
SOUTH AFRICA                   579
PAPUA NEW GUINEA               133
NEW ZEALAND                    126
                              ... 
NORTH ATLANTIC OCEAN             1
MAYOTTE                          1
GABON                            1
ANDAMAN / NICOBAR ISLANDAS       1
CEYLON (SRI LANKA)               1
Name: country, Length: 210, dtype: int64 

Florida                   1037
New South Wales            485
ukn                        368
Queensland                 310
Hawaii                     298
                          ... 
Aulong Island                1
Antibes                      1
Illeginni Atoll              1
Between Beira & Maputo       1
Moala Island                 1
Name: area, Length: 822, dtype: int64 

ukn                                               452
New Smyrna Beach, Volusia County                  163
Daytona Beach, Volusia County                      30
Ponce Inlet, Volusia County  

Looking at the information above we can see that we need to clean the data in all columns that are left so it's more clear for the model we'll build. 

In [28]:
## cleaning the gender column. 

In [29]:
data2["gender"].value_counts()

M      5039
F       635
ukn     521
M         2
N         2
lli       1
.         1
Name: gender, dtype: int64

In [30]:
def clean_gender(x):
    x = str(x).lower()
    if x.startswith("m"):
        return "m"
    elif x.startswith ("f"):
        return "f"
    elif x.startswith("n"):
        return "m"
    else: 
        return "ukn"

In [31]:
data2["gender"] = data2["gender"].apply(clean_gender)
data2["gender"].value_counts()

m      5043
f       635
ukn     523
Name: gender, dtype: int64

In [32]:
## In the activity column we have too many different values so too avoid problems with our model I'll bucket the ones with a very low count (often it's too precise explanation of the context when the incident took place) under the label "other"

In [33]:
def clean_activity(x):
    x = str(x).lower()
    if "surfing" in x:
        return "surfing"
    elif "swimming" in x:
        return "swimming"
    elif "fishing" in x:
        return "fishing"
    elif "spearfishing" in x:
        return "spearfishing"
    elif "bathing" in x:
        return "bathing"
    elif "diving" in x:
        return "diving"
    elif "surf-skiing" in x:
        return "surf skiing"
    elif "surf skiing" in x:
        return "surf skiing"
    elif "boat" in x:
        return "boating"
    elif "stand-up paddleboarding" in x:
        return "paddle boarding"
    elif "sitting on surfboard" in x:
        return "surfing"
    elif "floating on his back" in x:
        return "floating"
    elif "floating on his back" in x:
        return "floating"
    elif "fell overboard" in x:
        return "fell into the water"
    else: 
        return x

In [34]:
data2["activity"] = data2["activity"].apply(clean_activity)
data2["activity"].value_counts()

fishing                                 1157
surfing                                 1134
swimming                                1107
diving                                   524
ukn                                      475
                                        ... 
attempting to anesthetize shark            1
walking on reef                            1
aircraft exploded                          1
paddling rescue ski                        1
wreck of  large double sailing canoe       1
Name: activity, Length: 696, dtype: int64

In [35]:
rare_activities = [ind for ind, val in data2["activity"].value_counts().iteritems()if val <5]
rare_activities

['paddling on surfboard',
 'jumping',
 'clamming',
 'splashing',
 'body-boarding',
 'kite boarding',
 'lifesaving drill',
 'jumped into the water',
 'air disaster',
 'feeding sharks',
 'seine netting',
 'tagging sharks',
 'paddling',
 'floating on a raft',
 'their 9 m launch was run down by a 25,000-ton japanese freighter  on the night of 3-11-1977 & they drifted, clinging to an icebox for 2 days',
 'jumped overboard',
 '.',
 'shark watching',
 'knocked overboard',
 'spearing fish',
 'feeding fish',
 'playing in the surf',
 'sculling',
 'escaping from alacatraz',
 'unknown',
 'competing in the woodvale atlantic rowing race',
 'sup',
 'wreck of the schooner pohoiki ',
 'jumped overboard ',
 'standing in knee-deep water',
 'crabbing',
 'suicide',
 'thrown overboard',
 'cruising',
 'shark tagging',
 'shooting sharks ',
 'rowing ',
 'finning the shark that bit him',
 'the christie v sank on 11/6/1988, survivors were adrift on a dinghy',
 '"flying tiger" transport plane went down with 5 men

Before we bucket them let's take a moment to appreciate the bravery of few of those shark attack victims who were attacked while: 
- dragging a shark
- diving naked into the water on a bet
- attempting to catch a crocodile
- attempting to lasso a shark
- attempting to rescue a shark
- kissing a shark
- dragging banana seeds through the shallows. 

In [36]:
data2["activity"] = data2["activity"].apply(lambda x: "other" if x in rare_activities else x)
data2["activity"].value_counts()

fishing                       1157
surfing                       1134
swimming                      1107
other                          778
diving                         524
ukn                            475
bathing                        189
wading                         149
boating                        103
standing                        99
snorkeling                      89
body boarding                   64
fell into the water             62
boogie boarding                 45
kayaking                        33
treading water                  32
surf skiing                     31
floating                        22
walking                         17
paddle boarding                 15
canoeing                        14
sea disaster                    13
rowing                          12
sailing                          9
playing                          7
shipwreck                        6
murder                           5
paddleskiing                     5
dangling feet in the

In [194]:
data3 = data2.copy() #copy of the dataframe after cleaning gender and activity columns

In [195]:
# cleaning injury column

In [196]:
data3["injury"].value_counts()

FATAL                                                                   778
Survived                                                                 95
Foot bitten                                                              87
No injury                                                                81
Leg bitten                                                               72
                                                                       ... 
FATAL, arm bitten                                                         1
Multiple injuries                                                         1
Right thigh bitten PROVOKED INCIDENT                                      1
Bruised right leg                                                         1
FATAL. "Shark bit him in half, carrying away the lower extremities"       1
Name: injury, Length: 3698, dtype: int64

In [197]:
def clean_injury(x):
    x = str(x).lower()
    if "fatal" in x:
        return "fatal"
    elif "leg" in x:
        return "leg or foot injury"
    elif "thigh" in x:
        return "leg or foot injury"
    elif "knee" in x:
        return "leg or foot injury"
    elif "foot" in x:
        return "leg or foot injury"
    elif "feet" in x:
        return "leg or foot injury"
    elif "heel" in x:
        return "leg or foot injury"
    elif "ankle" in x:
        return "leg or foot injury"
    elif "shin" in x:
        return "leg or foot injury"
    elif "toe" in x:
        return "leg or foot injury"
    elif "calf" in x:
        return "leg or foot injury"
    elif "calves" in x:
        return "leg or foot injury"
    elif "human remains" in x:
        return "fatal"
    elif "bones recovered" in x:
        return "fatal"
    elif "body not recovered" in x:
        return "fatal"
    elif "body recovered" in x:
        return "fatal"
    elif "remains recovered" in x:
        return "fatal"
    elif "killed" in x:
        return "fatal"
    elif "laceration" in x:
        return "lacerations"
    elif "lacerated" in x:
        return "lacerations"
    elif "abdomen" in x:
        return "abdomen injury"
    elif "hand" in x:
        return "arm or hand injury"
    elif "finger" in x:
        return "arm or hand injury"
    elif "thumb" in x:
        return "arm or hand injury"
    elif "arm" in x:
        return "arm or hand injury"
    elif "shoulder" in x:
        return "arm or hand injury"
    elif "elbow" in x:
        return "arm or hand injury"
    elif "wrist" in x:
        return "arm or hand injury"
    elif "head" in x:
        return "head or face injury"
    elif "face" in x:
        return "head or face injury"
    elif "scalp" in x:
        return "head or face injury"
    elif "chest" in x:
        return "torso injury"
    elif "ribs" in x:
        return "torso injury"
    elif "back" in x:
        return "torso injury"
    elif "torso" in x:
        return "torso injury"
    elif "hip" in x:
        return "torso injury"
    elif "buttock" in x:
        return "torso injury"
    elif "no injury" in x:
        return "no injury"
    elif "no injuries" in x:
        return "no injury"
    elif "not injured" in x:
        return "no injury"
    elif "uninjured" in x:
        return "no injury"
    elif "swim fin bitten" in x:
        return "no injury"
    elif "minor" in x:
        return "minor injury"
    elif "bruise" in x:
        return "minor injury"
    elif "abrasion" in x:
        return "minor injury"
    elif "survived" in x:
        return "other"
    elif "drown" in x:
        return "shark involvement unconfirmed"
    elif "scaveng" in x:
        return "scavenging"
    elif "not confirmed" in x:
        return "shark involvement unconfirmed"
    elif "unconfirmed" in x:
        return "shark involvement unconfirmed"
    elif "post-mortem" in x:
        return "shark involvement unconfirmed"
    elif "post mortem" in x:
        return "shark involvement unconfirmed"
    elif "no details" in x:
        return "unknown"
    elif "provoked incident" in x:
        return "unknown"
    elif "ukn" in x:
        return "unknown"
    else: 
        return x

In [198]:
data3["injury"] = data3["injury"].apply(clean_injury)

In [199]:
data3["injury"].value_counts()

leg or foot injury                                                                      2317
fatal                                                                                   1400
no injury                                                                                763
arm or hand injury                                                                       534
lacerations                                                                              413
                                                                                        ... 
cracked jaw & broken tooth, shark took chunk out of surfboard                              1
the press reported this as an attack by a white shark but the diver was the agressor       1
shark bit rudder & hull                                                                    1
major injuries                                                                             1
shark bumped him                                                      

In [200]:
other_injuries = [ind for ind, val in data3["injury"].value_counts().iteritems()if val <6]
other_injuries

['recovered',
 'missing, believed taken by a shark',
 'thought to have been taken by a shark. body was not recovered',
 'multiple injuries',
 'multiple major injuries',
 'missing, thought to have been taken by a shark',
 'of her crew of 50, eight perished, including the two injured men',
 'survivors on life rafts were harassed by sharks',
 'injured by sharks, but managed to swim ashore 6.5 hours later',
 'paddle of surf ski bitten by shark',
 'general imamura, commander in chief of japanese forces in java was sentenced to 10 years imprisonment by australian military court for his role in the "pig basket atrocities"',
 '"mauled"',
 'clothing torn by sharks',
 'struck by shark immediately before it bit tanner (see below)',
 'no inury to occupants, shark struck boat',
 'an estimated  3,000 to 7,000 japanese troops perished, some were taken by sharks',
 'shark leapt onboard & into fishwell, tossing a crew member, pepino, in the sea',
 'disappeared & his torn clothing washed ashore',
 'shar

There were some happy endings: 
- 'reported to have been killed by a shark but 2 years later he was found very much alive'
- 'later found to be fiction, never happened'
- 'never happened; it was a  hoax' 

In [201]:
data3["injury"] = data3["injury"].apply(lambda x: "other" if x in other_injuries else x)
data3["injury"].value_counts()

leg or foot injury               2317
fatal                            1400
no injury                         763
arm or hand injury                534
lacerations                       413
other                             308
shark involvement unconfirmed     119
unknown                           117
torso injury                       88
minor injury                       74
head or face injury                40
abdomen injury                     16
scavenging                         12
Name: injury, dtype: int64

In [202]:
data4 = data3.copy()

In [203]:
#cleaning fatal column

In [204]:
data4["fatal"].value_counts()

n      4256
y      1359
ukn     586
Name: fatal, dtype: int64

In [205]:
def clean_fatal(x):
    x = str(x).lower()
    if x.startswith("y"):
        return "y"
    elif x.startswith("n"):
        return "n"
    else: 
        return "ukn"

In [206]:
data4["fatal"] = data4["fatal"].apply(clean_fatal)
data4["fatal"].value_counts()

n      4256
y      1359
ukn     586
Name: fatal, dtype: int64

In [207]:
# let's check if we can identify whether the injury was fatal or not based on the values in injury column

In [208]:
data4.loc[data4["injury"].str.contains ("fatal"), "fatal"] = "y"

In [209]:
data4["fatal"].value_counts()

n      4244
y      1469
ukn     488
Name: fatal, dtype: int64

In [210]:
data4["species"].value_counts()

ukn                                                                                                                              2758
White shark                                                                                                                       163
Shark involvement prior to death was not confirmed                                                                                102
Invalid                                                                                                                           101
Shark involvement not confirmed                                                                                                    86
                                                                                                                                 ... 
1.2 m to 1.5 m [4.5' to 5'] shark                                                                                                   1
Bull shark, 2.3 m [7.5']                                      

In [211]:
# species columns seems to contain a lot of incoherent or unconfirmed data so I will drop this column. 
data4 = data4.drop(columns = ["species"], axis=1)

In [212]:
data5 = data4.copy()

In [214]:
data5["country"].value_counts()

USA                           2227
AUSTRALIA                     1332
SOUTH AFRICA                   579
PAPUA NEW GUINEA               133
NEW ZEALAND                    126
                              ... 
NORTH ATLANTIC OCEAN             1
MAYOTTE                          1
GABON                            1
ANDAMAN / NICOBAR ISLANDAS       1
CEYLON (SRI LANKA)               1
Name: country, Length: 210, dtype: int64

In [216]:
low_countries = [ind for ind, val in data5["country"].value_counts().iteritems()if val <10]
low_countries

['VENEZUELA',
 'NEW GUINEA',
 'TAIWAN',
 'ECUADOR',
 'SCOTLAND',
 'MADAGASCAR',
 'SOUTH KOREA',
 'CHILE',
 'COLUMBIA',
 'TANZANIA',
 'THAILAND',
 'SAMOA',
 'SEYCHELLES',
 'NORTH PACIFIC OCEAN',
 'CARIBBEAN SEA',
 'ISRAEL',
 'YEMEN ',
 'CHINA',
 'KIRIBATI',
 'SOMALIA',
 'INDIAN OCEAN',
 'LIBYA',
 'SINGAPORE',
 'BARBADOS',
 'NEW BRITAIN',
 'DOMINICAN REPUBLIC',
 'SIERRA LEONE',
 'NICARAGUA',
 'PALAU',
 'MALAYSIA',
 'MALTA',
 'MID ATLANTIC OCEAN',
 'OKINAWA',
 'SAUDI ARABIA',
 'TURKS & CAICOS',
 'HONDURAS',
 'URUGUAY',
 'RUSSIA',
 'NORTH ATLANTIC OCEAN',
 'GRENADA',
 'SUDAN',
 'BURMA',
 'PERSIAN GULF',
 'NIGERIA',
 'EL SALVADOR',
 'TUNISIA',
 'GUYANA',
 'PORTUGAL',
 'HAITI',
 'CEYLON',
 'AMERICAN SAMOA',
 'MARTINIQUE',
 'LEBANON',
 'GUINEA',
 'MONTENEGRO',
 'GUAM',
 'BELIZE',
 'MICRONESIA',
 ' TONGA',
 'CAPE VERDE',
 'LIBERIA',
 'TRINIDAD & TOBAGO',
 'AZORES',
 'Fiji',
 'UNITED ARAB EMIRATES (UAE)',
 'SOUTH PACIFIC OCEAN',
 'CRETE',
 'EGYPT ',
 'TOBAGO',
 'NORWAY',
 'ICELAND',
 'WEST INDI

In [None]:
## Between columns: country, area and location some information might be repeated or sufficient to remove NaN in other columns