## Importing Libraries

In [48]:
import pandas as pd
import numpy as np

## Reading the archive

In [2]:
attacks = pd.read_csv('./attacks.csv')

## Cleaning DataFrame

In [3]:
attacks.columns 

Index(['Case Number', 'Date', 'Year', 'Type', 'Country', 'Area', 'Location',
       'Activity', 'Name', 'Sex ', 'Age', 'Injury', 'Fatal (Y/N)', 'Time',
       'Species ', 'Investigator or Source', 'pdf', 'href formula', 'href',
       'Case Number.1', 'Case Number.2', 'original order', 'Unnamed: 22',
       'Unnamed: 23'],
      dtype='object')

In [4]:
# Cleaning the spaces in the columns
columns = ['Case Number', 'Date', 'Year', 'Type', 'Country', 'Area', 'Location',\
           'Activity', 'Name', 'Sex', 'Age', 'Injury', 'Fatal', 'Time',\
           'Species', 'Source', 'pdf', 'href formula', 'href',\
           'Case Number.1', 'Case Number.2', 'original order', 'Unnamed: 22',\
           'Unnamed: 23']

columns = [col.lower().replace(' ','_') for col in columns]

In [5]:
# Updating the columns in the DataFrame
attacks.columns = columns

In [6]:
# Checking the rows in wich the 'unnamed:_23' columns is not null
attacks[attacks['unnamed:_23'].isnull()==False]

Unnamed: 0,case_number,date,year,type,country,area,location,activity,name,sex,...,species,source,pdf,href_formula,href,case_number.1,case_number.2,original_order,unnamed:_22,unnamed:_23
4415,1952.03.30,30-Mar-1952,1952.0,Unprovoked,NETHERLANDS ANTILLES,Curacao,,Went to aid of child being menaced by the shark,A.J. Eggink,M,...,"Bull shark, 2.7 m [9'] was captured & dragged ...","J. Randall, p.352 in Sharks & Survival; H.D. B...",1952.03.30-Eggink.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,1952.03.30,1952.03.30,1888.0,,Teramo
5840,1878.09.14.R,Reported 14-Sep-1878,1878.0,Provoked,USA,Connecticut,"Branford, New Haven County",Fishing,Captain Pattison,M,...,,"St. Joseph Herald, 9/14/1878",1878.09.14.R-Pattison.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,1878.09.14.R,1878.09.14.R,463.0,,change filename


In [7]:
# Checking the rows in wich the 'unnamed:_22' columns is not null
attacks[attacks['unnamed:_22'].isnull()==False]

Unnamed: 0,case_number,date,year,type,country,area,location,activity,name,sex,...,species,source,pdf,href_formula,href,case_number.1,case_number.2,original_order,unnamed:_22,unnamed:_23
1478,2006.05.27,27-May-2006,2006.0,Unprovoked,USA,Hawaii,"North Shore, O'ahu",Surfing,Bret Desmond,M,...,,R. Collier,2006.05.27-Desmond.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2006.05.27,2006.05.27,4825.0,stopped here,


In [8]:
# Defining the columns that aren't going to be used
drop_columns=['unnamed:_22', 'unnamed:_23', 'case_number.1', 'case_number.2', 'case_number',\
              'original_order', 'source', 'pdf', 'href_formula', 'href', 'original_order']

In [9]:
# Removing columns that aren't going to be used
clean_attacks = attacks.drop(columns=drop_columns)

In [10]:
# Removing duplicated columns
duplicated_cols = clean_attacks[attacks.duplicated() == True].index
clean_attacks.drop(index=duplicated_cols, inplace=True)

In [11]:
# Removing columns that are completely null
clean_attacks.dropna(how='all', inplace=True)

In [12]:
# Reseting the index
clean_attacks.reset_index(inplace=True, drop=True)

In [13]:
clean_attacks.columns

Index(['date', 'year', 'type', 'country', 'area', 'location', 'activity',
       'name', 'sex', 'age', 'injury', 'fatal', 'time', 'species'],
      dtype='object')

## Evaluating the sex column

In [45]:
clean_attacks.sex.unique()

array(['F', 'M', nan, 'M ', 'N', '.'], dtype=object)

### Cleaning the odd values

In [46]:
clean_attacks[clean_attacks.sex=='.']

Unnamed: 0,date,year,type,country,area,location,activity,name,sex,age,injury,fatal,time,species
5437,Reported 02-Jun-1908,1908.0,Sea Disaster,PAPUA NEW GUINEA,New Britain,Matupi,.,,.,,"Remains of 3 humans recovered from shark, but ...",Y,,Allegedly a 33-foot shark


In [34]:
clean_attacks[clean_attacks.sex=='lli']

Unnamed: 0,date,year,type,country,area,location,activity,name,sex,age,injury,fatal,time,species
1624,11-Nov-2004,2004.0,Unprovoked,USA,California,"Bunkers, Humboldt Bay, Eureka, Humboldt County",Surfing,Brian Kang,lli,38,"Lacerations to hand, knee & thigh",N,13h30,5.5 m [18'] white shark


In [42]:
clean_attacks[clean_attacks.sex=='N']

Unnamed: 0,date,year,type,country,area,location,activity,name,sex,age,injury,fatal,time,species
4938,11-Jul-1934,1934.0,Boating,AUSTRALIA,New South Wales,Cronulla,Fishing,"18' boat, occupants William & Leslie Newton",N,,No injury to occupants Sharks continually foll...,N,,"Blue pointer, 11'"
6131,Reported 18-Dec-1801,1801.0,Provoked,,,,Standing on landed shark's tail,Stephen Pettigew,N,,"FATAL, PROVOKED INCIDENT",Y,,12' shark


In [50]:
# Cleaning the wrong 'sex'
clean_attacks.iloc[1624, 8] = 'M'
clean_attacks.iloc[6131, 8] = 'M'
clean_attacks.iloc[4938, 8] = np.nan
clean_attacks.iloc[5437, 8] = np.nan

In [51]:
# Checking if there's any relation with gender
clean_attacks.sex.str.strip().value_counts(normalize=True)

M    0.888928
F    0.111072
Name: sex, dtype: float64

## Starting the analysis based on country

In [14]:
# Choosing a country to analyse based on the amount of attacks
clean_attacks.country.value_counts(normalize=True).head()

USA                 0.356526
AUSTRALIA           0.214012
SOUTH AFRICA        0.092610
PAPUA NEW GUINEA    0.021433
NEW ZEALAND         0.020473
Name: country, dtype: float64

In [16]:
# Starting the analisys on USA
attacks_usa = clean_attacks[clean_attacks.country=='USA']

In [17]:
# Understanding how the attacks occour
attacks_usa.type.value_counts(normalize=True)

Unprovoked      0.772095
Provoked        0.093315
Invalid         0.086586
Boating         0.023329
Sea Disaster    0.014356
Boat            0.010319
Name: type, dtype: float64

In [20]:
# Choosing the most targeted area
attacks_usa.area.value_counts(normalize=True)

Florida                   0.466067
Hawaii                    0.133933
California                0.130337
South Carolina            0.071461
North Carolina            0.045393
Texas                     0.032809
New Jersey                0.023371
New York                  0.013483
Oregon                    0.013034
Virginia                  0.008539
Puerto Rico               0.007191
Massachusetts             0.007191
Alabama                   0.006742
Georgia                   0.006292
Louisiana                 0.004944
Maryland                  0.003146
Delaware                  0.003146
Rhode Island              0.003146
Connecticut               0.002697
US Virgin Islands         0.002247
Mississippi               0.002247
New York                  0.001348
North Carolina            0.000899
Washington                0.000899
East coast                0.000899
Palmyra Atoll             0.000899
South Carolina            0.000449
Florida                   0.000449
New Mexico          

In [21]:
# Cleaning the area field
attacks_usa['area'] = attacks_usa['area'].str.strip()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  attacks_usa['area'] = attacks_usa['area'].str.strip()


In [22]:
# Choosing the most affected area
attacks_usa.area.value_counts(normalize=True).head()

Florida           0.466517
Hawaii            0.133933
California        0.130337
South Carolina    0.071910
North Carolina    0.046742
Name: area, dtype: float64

In [23]:
# Narrowing down
attacks_florida = attacks_usa[attacks_usa.area=='Florida']

In [90]:
attacks_florida.type.value_counts(normalize=True)

Unprovoked      0.835260
Provoked        0.083815
Invalid         0.064547
Boat            0.006744
Sea Disaster    0.005780
Boating         0.003854
Name: type, dtype: float64

In [25]:
attacks_florida.activity.str.strip().value_counts(normalize=True)

Surfing                                                        0.378462
Swimming                                                       0.145641
Wading                                                         0.068718
Fishing                                                        0.035897
Standing                                                       0.034872
                                                                 ...   
Adrift after the sinking of the motor yacht Princess Dianne    0.001026
Fishing for snapper                                            0.001026
Floating in inner tube                                         0.001026
Crawling                                                       0.001026
Underwater photography                                         0.001026
Name: activity, Length: 197, dtype: float64

In [98]:
# Checking if there's a common species 
attacks_florida[attacks_florida.activity=='Surfing'].species.value_counts()

4' to 5' shark                                   13
5' shark                                         10
4' shark                                         10
3' shark                                          9
6' shark                                          8
                                                 ..
9.5' shark?                                       1
Spinner shark                                     1
1.2 m [4'] bull shark                             1
Spinner shark, 1.2 m to 1.5 m [4' to 5']          1
6' shark, possibly a blactip or spinner shark     1
Name: species, Length: 89, dtype: int64

In [27]:
attacks_florida[attacks_florida.activity=='Surfing'].type.value_counts()

Unprovoked    352
Invalid         8
Provoked        7
Name: type, dtype: int64