In [4]:
import pandas as pd
import re
# Load the shark-attack records, then show the ten most common activities.
attacks = pd.read_csv("input/attacks.csv", encoding='utf-8')
attacks['Activity'].value_counts().head(10)

Surfing         971
Swimming        869
Fishing         431
Spearfishing    333
Bathing         162
Wading          149
Diving          127
Standing         99
Snorkeling       89
Scuba diving     76
Name: Activity, dtype: int64

In [5]:
# Remove every literal '?' from the country names. The vectorised string
# accessor replaces the per-element lambda; regex=False makes '?' a plain
# character rather than a regex quantifier. Missing values stay missing.
attacks['Country'] = attacks['Country'].str.replace('?', '', regex=False)

In [6]:
# Fifty most frequent countries after the '?' cleanup.
attacks['Country'].value_counts().head(50)

USA                     2229
AUSTRALIA               1338
SOUTH AFRICA             579
PAPUA NEW GUINEA         134
NEW ZEALAND              128
BRAZIL                   112
BAHAMAS                  109
MEXICO                    89
ITALY                     71
FIJI                      62
PHILIPPINES               61
REUNION                   60
NEW CALEDONIA             53
CUBA                      46
MOZAMBIQUE                45
SPAIN                     44
INDIA                     40
EGYPT                     38
JAPAN                     34
CROATIA                   34
PANAMA                    32
SOLOMON ISLANDS           30
IRAN                      29
JAMAICA                   27
GREECE                    25
FRENCH POLYNESIA          25
HONG KONG                 24
ENGLAND                   23
INDONESIA                 23
COSTA RICA                17
PACIFIC OCEAN             17
ATLANTIC OCEAN            17
BERMUDA                   16
VIETNAM                   15
TONGA         

In [7]:
# Attach to every row the number of records that share its 'Area' value.
attacks['Area Total'] = attacks.groupby('Area')['Area'].transform('count')

In [8]:

def get_month(date):
    """Return the first month abbreviation (or season word) found in a date string.

    The search is case-insensitive and looks for the three-letter month
    abbreviations ``jan`` .. ``dec`` plus the words ``summer`` and ``fall``
    anywhere inside ``date``.

    Args:
        date: Raw value from the 'Date' column. Non-string values (for
            example the float NaN pandas uses for missing cells) are accepted.

    Returns:
        The first match, lower-cased, or ``None`` when ``date`` is not a
        string or contains none of the recognised tokens.
    """
    if not isinstance(date, str):
        return None

    # The previous pattern ended in '|', an empty alternative that matched at
    # every position and forced a filter over a list of empty strings.
    # Searching with a clean pattern yields the same first hit directly.
    found = re.search(
        r'jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec|summer|fall',
        date,
        flags=re.IGNORECASE,
    )
    return found.group(0).lower() if found else None


# Derive a 'Month' column by applying get_month to every 'Date' value.
attacks['Month']= attacks['Date'].map(get_month)


In [9]:
def get_season(month):
    """Map a month token (as produced by get_month) to a half-year bucket.

    Returns 'northern_summer' for apr-sep and 'summer', 'northern_winter'
    for oct-mar and 'fall', and None for anything else.
    """
    buckets = (
        ('northern_summer', ('apr', 'may', 'jun', 'jul', 'aug', 'sep', 'summer')),
        ('northern_winter', ('jan', 'feb', 'mar', 'oct', 'nov', 'dec', 'fall')),
    )
    for label, tokens in buckets:
        if month in tokens:
            return label
    return None
    
# Derive a 'Season' column by applying get_season to every 'Month' value.
attacks['Season']= attacks['Month'].map(get_season)


In [10]:
def get_data_by_country(df, country):
    """Return the rows of ``df`` whose 'Country' value equals ``country``."""
    is_country = df['Country'] == country
    return df.loc[is_country]

In [11]:
def get_data_by_season(df, season):
    """Return the rows of ``df`` whose 'Season' value equals ``season``."""
    in_season = df['Season'] == season
    return df.loc[in_season]

In [12]:
def fix_species_name(row):
    """Reduce a row's 'Species ' text to the part preceding ' shark'.

    When the value is a string matching ``^(.*) shark`` the captured prefix
    is returned lower-cased (the match is greedy, so it extends to the last
    ' shark' on the line). Any other value, including non-strings such as
    NaN, is returned unchanged.
    """
    species = row['Species ']
    if not isinstance(species, str):
        return species

    hit = re.search(r'^(.*) shark', species)
    return hit.group(1).lower() if hit else species

In [13]:
# Build a 'Shark type' column; axis='columns' hands each row to fix_species_name.
attacks['Shark type']= attacks.apply(fix_species_name, axis='columns')


In [14]:

# Keep only the rows whose Country is 'USA'.
usa_attacks = get_data_by_country(attacks,'USA')

# Per Month value, count the rows that have a non-null 'Area'.
usa_attacks.groupby('Month')['Area'].count()

Month
apr       157
aug       325
dec        64
fall        3
feb        57
jan        51
jul       358
jun       243
mar        97
may       155
nov       118
oct       206
sep       299
summer      5
Name: Area, dtype: int64

In [15]:
# Per Season bucket, count the USA rows that have a non-null 'Area'.
usa_attacks.groupby('Season')['Area'].count()

Season
northern_summer    1542
northern_winter     596
Name: Area, dtype: int64

In [16]:
# Shark-type frequencies among attacks flagged fatal ('Y').
attacks.loc[attacks['Fatal (Y/N)'] == 'Y', ['Shark type']].value_counts()


Shark type                                                  
white                                                           98
tiger                                                           60
bull                                                            23
3.7 m [12']                                                     10
3 m [10']                                                        9
                                                                ..
Sharks averaged 1.8 m [6'] in length                             1
Shark involvement probable                                       1
Shark involvement prior to death was not confirmed               1
Shark involvement prior to death suspected but not confirmed     1
                                                                 1
Length: 173, dtype: int64

In [17]:
# Shark-type frequencies among attacks flagged non-fatal ('N').
attacks.loc[attacks['Fatal (Y/N)'] == 'N', ['Shark type']].value_counts()


Shark type                             
white                                      324
tiger                                      169
bull                                       101
bronze whaler                               53
nurse                                       49
                                          ... 
6' blacktip                                  1
6' shark, possibly a blactip or spinner      1
6' to 7'                                     1
6' to 7' blacktip                            1
�small brown                                 1
Length: 693, dtype: int64

Area                         
New South Wales                  99
Queensland                       75
KwaZulu-Natal                    48
Florida                          46
Hawaii                           46
                                 ..
North of Pernambuco, Brazil       1
Northern Bahamas                  1
Northern Java                     1
Northwest of Papua New Guinea     1
 Manila Bay                       1
Length: 400, dtype: int64

In [19]:
# Area frequencies among attacks flagged non-fatal ('N').
attacks.loc[attacks['Fatal (Y/N)'] == 'N', ['Area']].value_counts()


Area               
Florida                918
New South Wales        332
California             242
Hawaii                 213
Queensland             207
                      ... 
New Providence           1
New Providence           1
Norfolk                  1
Norfolk Island           1
 Kikori River mouth      1
Length: 523, dtype: int64

In [20]:
# Shark-type frequencies for the USA records, as a flat table.
usa_attacks.value_counts(subset='Shark type').reset_index()


Unnamed: 0,Shark type,0
0,white,129
1,tiger,102
2,Shark involvement not confirmed,53
3,bull,46
4,4',39
...,...,...
377,5' to 8',1
378,5' to 7',1
379,5' to 6' spinner or bull,1
380,possibly a juvenile blacktip,1


In [21]:
def count_species(df):
    """Return how often each distinct 'Shark type' value occurs in ``df``."""
    shark_types = df['Shark type']
    return shark_types.value_counts()


In [22]:
# Number of records per Area, largest first.
attack_counts = (
    attacks
    .groupby('Area')
    .size()
    .reset_index(name='counts')
    .sort_values(by='counts', ascending=False)
)
attack_counts


Unnamed: 0,Area,counts
263,Florida,1037
497,New South Wales,486
589,Queensland,311
315,Hawaii,298
158,California,290
...,...,...
326,Illeginni Atoll,1
327,Illinois,1
328,Imperia Province,1
329,In Convoy OB 274,1


In [23]:
# Shark-type frequencies for USA records in the 'northern_winter' bucket.
usa_attacks_winter = get_data_by_season(usa_attacks, 'northern_winter')
usa_attacks_winter.value_counts(subset='Shark type').reset_index()


Unnamed: 0,Shark type,0
0,tiger,53
1,white,43
2,Shark involvement not confirmed,13
3,Shark involvement prior to death was not confi...,11
4,spinner,10
...,...,...
146,>2.4 m [8'],1
147,>6 m [20'] white,1
148,>6.7 m [22'],1
149,C. maculpinnis or C. limbatus,1


In [24]:
# Shark-type frequencies for USA records in the 'northern_summer' bucket.
usa_attacks_summer = get_data_by_season(usa_attacks, 'northern_summer')
usa_attacks_summer.value_counts(subset='Shark type').reset_index()


Unnamed: 0,Shark type,0
0,white,85
1,tiger,47
2,bull,41
3,Shark involvement not confirmed,39
4,4',30
...,...,...
287,9.5',1
288,9-foot,1
289,"9'2"" white",1
290,9',1


In [25]:
# Keep only the rows whose Country is 'AUSTRALIA', then tally their shark types.
australia_attacks = get_data_by_country(attacks,'AUSTRALIA')
australia_attacks['Shark type'].value_counts()

white                                     108
bronze whaler                              49
tiger                                      46
wobbegong                                  42
grey nurse                                 33
                                         ... 
2' to 3'                                    1
1.8 m to 2.4 m  [6' to 8']                  1
3.7 m to 4.3 m [12' to 14']                 1
thought to involve a >2 m [6.75'] bull      1
2.7 m  [9']                                 1
Name: Shark type, Length: 239, dtype: int64

In [26]:
# NOTE(review): the variable says "summer" but the filter is 'northern_winter'.
# Presumably intentional: the name refers to the local (southern-hemisphere)
# season, while the Season labels are defined from the northern viewpoint.
australia_attacks_summer = get_data_by_season(australia_attacks, 'northern_winter')
australia_attacks_summer.value_counts('Shark type').reset_index()


Unnamed: 0,Shark type,0
0,white,65
1,bronze whaler,30
2,tiger,28
3,grey nurse,26
4,wobbegong,25
...,...,...
182,"Blue pointer, 12'",1
183,Blue pointer,1
184,76 cm [2.5'] carpet,1
185,7-gill,1


In [27]:
# NOTE(review): the variable says "winter" but the filter is 'northern_summer'.
# Presumably intentional: the name refers to the local (southern-hemisphere)
# season, while the Season labels are defined from the northern viewpoint.
australia_attacks_winter = get_data_by_season(australia_attacks, 'northern_summer')
australia_attacks_winter.value_counts('Shark type').reset_index()

Unnamed: 0,Shark type,0
0,white,41
1,bronze whaler,19
2,tiger,16
3,wobbegong,15
4,Invalid,8
...,...,...
94,Shark involvement not confirmed; thought to be...,1
95,Shark involvement prior to death not confirmed,1
96,Shark involvement prior to death suspected but...,1
97,Shark seen feeding on turtle scraps thrown ove...,1


In [28]:
 # Tally shark types within each (Area, Activity) group of the USA records.
 usa_attacks.groupby(['Area','Activity']).apply(count_species)

Area         Activity                                            
Alabama      Fishing                                6', 100-lb       1
             Fishing                                bull             1
             Fishing in Alabama Deep Fishing Rodeo  tiger            1
             Swimming                               bull             1
             Swimming                               bull             1
                                                                    ..
Virginia     Swimming                               sand             1
             Swimming                               bull             1
Wake Island  Spearfishing                           blacktip reef    1
Washington   Surfing                                white            1
             Surfing (lying prone on his board)     white            1
Name: Shark type, Length: 954, dtype: int64

In [29]:
# Shark-type tallies per (Area, Area Total) for the USA records, ordered by
# 'Area Total' and then by the 'Shark type' column, both descending.
by_shark_type = (
    usa_attacks
    .groupby(['Area', 'Area Total'])
    .apply(count_species)
    .reset_index()
    .sort_values(by=['Area Total', 'Shark type'], ascending=False)
)
by_shark_type

Unnamed: 0,Area,Area Total,level_2,Shark type
104,Florida,1037.0,nurse,35
105,Florida,1037.0,bull,26
106,Florida,1037.0,Shark involvement not confirmed,23
107,Florida,1037.0,4',21
108,Florida,1037.0,3',19
...,...,...,...,...
400,Missouri,1.0,nurse,1
420,New Mexico,1.0,sandtiger,1
437,North & South Carolina,1.0,blue,1
542,South Carolina,1.0,2.4 m [8'],1


In [32]:
# Recomputes the same per-(Area, Area Total) shark-type tallies as the cell
# before it, then displays only the first 50 rows.
by_shark_type= usa_attacks.groupby(['Area','Area Total']).apply(count_species).reset_index().sort_values(by=["Area Total",'Shark type'],ascending=False)
by_shark_type[:50]

Unnamed: 0,Area,Area Total,level_2,Shark type
104,Florida,1037.0,nurse,35
105,Florida,1037.0,bull,26
106,Florida,1037.0,Shark involvement not confirmed,23
107,Florida,1037.0,4',21
108,Florida,1037.0,3',19
109,Florida,1037.0,1.2 m [4'],17
110,Florida,1037.0,blacktip,15
111,Florida,1037.0,4' to 5',15
112,Florida,1037.0,6',15
113,Florida,1037.0,Invalid,14


In [31]:
# `by_shark_type` is grouped only by Area / Area Total, so it has no 'Season'
# column and filtering on it raised AttributeError. Build a season-aware
# breakdown under its own name (leaving `by_shark_type` untouched for the
# cells that follow), then keep the 'northern_winter' rows.
usa_by_shark_type_season = (
    usa_attacks
    .groupby(['Area', 'Area Total', 'Season'])
    .apply(count_species)
    .reset_index()
    .sort_values(by=['Area Total', 'Shark type'], ascending=False)
)
usa_by_shark_type_season[usa_by_shark_type_season['Season'] == 'northern_winter']

In [None]:
# For each Area, keep the single row whose 'Shark type' column is largest.
by_shark_type.groupby(['Area']).apply(
    lambda area_rows: area_rows.loc[area_rows['Shark type'].idxmax()]
)

In [None]:
# NOTE(review): exact duplicate of the previous cell (per-Area row with the
# largest 'Shark type' value); one of the two can likely be removed.
by_shark_type.groupby(['Area']).apply(lambda df: df.loc[df['Shark type'].idxmax()] )

In [None]:
# Shark-type tallies per (Area, Area Total, Season) for the Australian
# records; note this rebinds `by_shark_type`, which previously held USA data.
by_shark_type = (
    australia_attacks
    .groupby(['Area', 'Area Total', 'Season'])
    .apply(count_species)
    .reset_index()
    .sort_values(by=['Area Total', 'Shark type'], ascending=False)
)

by_shark_type[by_shark_type['Season'] == 'northern_summer']