In [179]:
import pandas as pd
import re
attacks = pd.read_csv("input/attacks.csv",encoding='utf-8')
attacks.Activity.value_counts().head(10)

Surfing         971
Swimming        869
Fishing         431
Spearfishing    333
Bathing         162
Wading          149
Diving          127
Standing         99
Snorkeling       89
Scuba diving     76
Name: Activity, dtype: int64

In [180]:
attacks['Country']= attacks['Country'].map(lambda x: x.replace('?',  "" ) if type(x)==str else x )

In [181]:
attacks['Country'].value_counts()[:50]

USA                     2229
AUSTRALIA               1338
SOUTH AFRICA             579
PAPUA NEW GUINEA         134
NEW ZEALAND              128
BRAZIL                   112
BAHAMAS                  109
MEXICO                    89
ITALY                     71
FIJI                      62
PHILIPPINES               61
REUNION                   60
NEW CALEDONIA             53
CUBA                      46
MOZAMBIQUE                45
SPAIN                     44
INDIA                     40
EGYPT                     38
JAPAN                     34
CROATIA                   34
PANAMA                    32
SOLOMON ISLANDS           30
IRAN                      29
JAMAICA                   27
GREECE                    25
FRENCH POLYNESIA          25
HONG KONG                 24
ENGLAND                   23
INDONESIA                 23
COSTA RICA                17
PACIFIC OCEAN             17
ATLANTIC OCEAN            17
BERMUDA                   16
VIETNAM                   15
TONGA         

In [182]:
attacks['Area Total']=attacks['Area'].groupby(attacks['Area']).transform('count')

In [183]:

def get_month(date):
    try:
        
        month = re.findall(r'jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec|summer|fall|', date, flags=re.IGNORECASE) 
        month =list(filter(None, month))
        if len(month):
            return month[0].lower()
        return None
    except TypeError:
        return None


attacks['Month']= attacks['Date'].map(get_month)


In [184]:
def get_season(month):
    northern_summer = ['apr','may','jun','jul','aug','sep','summer']
    northern_winter = ['jan','feb','mar','oct','nov','dec', 'fall']

    if month in northern_summer:
        return 'northern_summer'
    if month in northern_winter:
        return 'northern_winter'
    
attacks['Season']= attacks['Month'].map(get_season)


In [185]:
def get_data_by_country(df,country):
    return df[df['Country']==country]

In [186]:
def get_data_by_season(df,season):
    return df[df['Season']==season]

In [187]:
def fix_species_name(row):
    try:
        shark_type = re.search(r'^(.*) shark', row['Species '])
        if(shark_type):          
            return shark_type.group(1).lower()
    except TypeError as e:
        return row['Species ']
    return row['Species ']

In [188]:
attacks['Shark type']= attacks.apply(fix_species_name, axis='columns')


In [189]:

usa_attacks = get_data_by_country(attacks,'USA')

usa_attacks.groupby('Month')['Area'].count()

Month
apr       157
aug       325
dec        64
fall        3
feb        57
jan        51
jul       358
jun       243
mar        97
may       155
nov       118
oct       206
sep       299
summer      5
Name: Area, dtype: int64

In [190]:
usa_attacks.groupby('Season')['Area'].count()

Season
northern_summer    1542
northern_winter     596
Name: Area, dtype: int64

In [191]:
attacks[attacks['Fatal (Y/N)']=='Y'][['Shark type']].value_counts()


Shark type                                                    
white                                                             98
tiger                                                             60
bull                                                              23
3.7 m [12']                                                       10
3 m [10']                                                          9
                                                                  ..
fishermen recovered partial remains from                           1
galapagos                                                          1
hand found in gut of 2.9 m to 3.3 m [9'7" to 10'11"] galapagos     1
his hand was found in a 2.4 m [8'] tiger                           1
 tiger                                                             1
Length: 157, dtype: int64

In [192]:
attacks[attacks['Fatal (Y/N)']=='N'][['Shark type']].value_counts()


Shark type                     
white                              324
tiger                              169
bull                               101
bronze whaler                       53
nurse                               49
                                  ... 
7.5'                                 1
70 kg                                1
76 cm [2.5'] carpet                  1
8' bull shark or caribbean reef      1
 "gummy"                             1
Length: 638, dtype: int64

In [193]:
attacks[attacks['Fatal (Y/N)']=='Y'][['Area']].value_counts()


Area                         
New South Wales                  99
Queensland                       75
KwaZulu-Natal                    48
Florida                          46
Hawaii                           46
                                 ..
North of Pernambuco, Brazil       1
Northern Bahamas                  1
Northern Java                     1
Northwest of Papua New Guinea     1
 Manila Bay                       1
Length: 400, dtype: int64

In [194]:
attacks[attacks['Fatal (Y/N)']=='N'][['Area']].value_counts()


Area               
Florida                918
New South Wales        332
California             242
Hawaii                 213
Queensland             207
                      ... 
New Providence           1
New Providence           1
Norfolk                  1
Norfolk Island           1
 Kikori River mouth      1
Length: 523, dtype: int64

In [195]:
usa_attacks.value_counts('Shark type').reset_index()


Unnamed: 0,Shark type,0
0,white,129
1,tiger,102
2,bull,46
3,4',39
4,nurse,38
...,...,...
336,7.5',1
337,8' bull shark or caribbean reef,1
338,8' great hammerhead,1
339,8' to 10',1


In [196]:
def count_species(df):
        return df['Shark type'].value_counts()


In [197]:
attack_counts = attacks.groupby('Area').size().reset_index(name='counts').sort_values(by='counts',ascending=False)
attack_counts


Unnamed: 0,Area,counts
263,Florida,1037
497,New South Wales,486
589,Queensland,311
315,Hawaii,298
158,California,290
...,...,...
326,Illeginni Atoll,1
327,Illinois,1
328,Imperia Province,1
329,In Convoy OB 274,1


In [198]:
usa_attacks_winter = get_data_by_season(usa_attacks, 'northern_winter')
usa_attacks_winter.value_counts('Shark type').reset_index()


Unnamed: 0,Shark type,0
0,tiger,53
1,white,43
2,spinner,10
3,6',9
4,4',9
...,...,...
132,7 m [23'] white,1
133,7' to 8',1
134,8',1
135,8' to 10',1


In [199]:
usa_attacks_summer = get_data_by_season(usa_attacks, 'northern_summer')
usa_attacks_summer.value_counts('Shark type').reset_index()


Unnamed: 0,Shark type,0
0,white,85
1,tiger,47
2,bull,41
3,4',30
4,nurse,29
...,...,...
255,8' bull shark or caribbean reef,1
256,7.5',1
257,7' female,1
258,"68""",1


In [200]:
australia_attacks = get_data_by_country(attacks,'AUSTRALIA')
australia_attacks['Shark type'].value_counts()

white                                       108
bronze whaler                                49
tiger                                        46
wobbegong                                    42
grey nurse                                   33
                                           ... 
100-kg [221-lb] dead blue                     1
"a very large                                 1
1.5 m [5'] "whaler                            1
remains recovered from 5.5 m [18'] white      1
2 to 2.5 m                                    1
Name: Shark type, Length: 208, dtype: int64

In [201]:
australia_attacks_summer = get_data_by_season(australia_attacks, 'northern_winter')
australia_attacks_summer.value_counts('Shark type').reset_index()


Unnamed: 0,Shark type,0
0,white,65
1,bronze whaler,30
2,tiger,28
3,grey nurse,26
4,wobbegong,25
...,...,...
162,6 m [20'],1
163,5.5' to 6',1
164,5.5 m [18'] white,1
165,5.5 m [18''] white,1


In [202]:
australia_attacks_winter = get_data_by_season(australia_attacks, 'northern_summer')
australia_attacks_winter.value_counts('Shark type').reset_index()

Unnamed: 0,Shark type,0
0,white,41
1,bronze whaler,19
2,tiger,16
3,wobbegong,15
4,grey nurse,7
...,...,...
77,3+ m,1
78,3',1
79,3 m to 4 m white,1
80,"3 m [10'], 270- kg [595-lb]",1


In [203]:
 usa_attacks.groupby(['Area','Activity']).apply(count_species)

Area         Activity                                            
Alabama      Fishing                                6', 100-lb       1
             Fishing                                bull             1
             Fishing in Alabama Deep Fishing Rodeo  tiger            1
             Swimming                               bull             1
             Swimming                               bull             1
                                                                    ..
Virginia     Swimming                               sand             1
             Swimming                               bull             1
Wake Island  Spearfishing                           blacktip reef    1
Washington   Surfing                                white            1
             Surfing (lying prone on his board)     white            1
Name: Shark type, Length: 812, dtype: int64

In [205]:
by_shark_type= usa_attacks.groupby(['Area','Area Total']).apply(count_species).reset_index().sort_values(by=["Area Total",'Shark type'],ascending=False)
by_shark_type

Unnamed: 0,Area,Area Total,level_2,Shark type
89,Florida,1037.0,nurse,35
90,Florida,1037.0,bull,26
91,Florida,1037.0,4',21
92,Florida,1037.0,3',19
93,Florida,1037.0,1.2 m [4'],17
...,...,...,...,...
345,Missouri,1.0,nurse,1
363,New Mexico,1.0,sandtiger,1
374,North & South Carolina,1.0,blue,1
467,South Carolina,1.0,2.4 m [8'],1


In [136]:
by_shark_type= usa_attacks.groupby(['Area','Area Total','Activity']).apply(count_species).reset_index().sort_values(by=["Area Total",'Shark type'],ascending=False)
by_shark_type[:50]

Unnamed: 0,Area,Area Total,Activity,level_3,Shark type
323,Florida,1037.0,Surfing,4' to 5',13
324,Florida,1037.0,Surfing,5',10
325,Florida,1037.0,Surfing,blacktip,10
326,Florida,1037.0,Surfing,4',10
327,Florida,1037.0,Surfing,3',9
328,Florida,1037.0,Surfing,6',8
329,Florida,1037.0,Surfing,blacktip or spinner,7
330,Florida,1037.0,Surfing,1.2 m [4'],7
395,Florida,1037.0,Swimming,bull,7
331,Florida,1037.0,Surfing,1.2 m to 1.5 m [4' to 5'],6


In [636]:
by_shark_type[by_shark_type.Season=='northern_winter']

Unnamed: 0,Area,Area Total,Season,level_3,Shark type
228,Florida,1037.0,northern_winter,spinner,10
229,Florida,1037.0,northern_winter,6',8
230,Florida,1037.0,northern_winter,1.2 m [4'],7
231,Florida,1037.0,northern_winter,blacktip,7
232,Florida,1037.0,northern_winter,4' to 5',7
...,...,...,...,...,...
541,US Virgin Islands,5.0,northern_winter,nurse,1
376,Maine,1.0,northern_winter,porbeagle,1
390,Missouri,1.0,northern_winter,nurse,1
409,New Mexico,1.0,northern_winter,sandtiger,1


In [38]:
by_shark_type.groupby(['Area','Season']).apply(lambda df: df.loc[df['Shark type'].idxmax()] )

Unnamed: 0_level_0,Unnamed: 1_level_0,Area,Area Total,Season,level_3,Shark type
Area,Season,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Alabama,northern_summer,Alabama,15.0,northern_summer,bull,3
Alabama,northern_winter,Alabama,15.0,northern_winter,"6', 100-lb",1
Alaska,northern_summer,Alaska,1.0,northern_summer,salmon,1
California,northern_summer,California,290.0,northern_summer,white,67
California,northern_winter,California,290.0,northern_winter,white,33
Delaware,northern_summer,Delaware,7.0,northern_summer,5',1
Florida,northern_summer,Florida,1037.0,northern_summer,nurse,28
Florida,northern_winter,Florida,1037.0,northern_winter,spinner,10
Florida,northern_summer,Florida,1.0,northern_summer,3.7 m [12'],1
Georgia,northern_summer,Georgia,14.0,northern_summer,small,1


In [500]:
by_shark_type.groupby(['Area']).apply(lambda df: df.loc[df['Shark type'].idxmax()] )

Unnamed: 0_level_0,Area,Area Total,Season,level_3,Shark type
Area,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
New South Wales,New South Wales,486.0,northern_winter,white,23
Northern Territory,Northern Territory,23.0,northern_summer,2 m [6.75'] copper,1
Queensland,Queensland,311.0,northern_winter,bull,11
Queensland,Queensland,1.0,northern_winter,tiger,1
South Australia,South Australia,104.0,northern_winter,white,14
Tasmania,Tasmania,41.0,northern_summer,white,4
Territory of Cocos (Keeling) Islands,Territory of Cocos (Keeling) Islands,1.0,northern_summer,bronze whaler,1
Torres Strait,Torres Strait,70.0,northern_summer,tiger,3
Torres Strait,Torres Strait,2.0,northern_summer,tiger,1
Victoria,Victoria,90.0,northern_winter,white,7


In [39]:
by_shark_type= australia_attacks.groupby(['Area','Area Total','Season']).apply(count_species).reset_index().sort_values(by=["Area Total",'Shark type'],ascending=False)

by_shark_type[by_shark_type.Season=='northern_summer']

Unnamed: 0,Area,Area Total,Season,level_3,Shark type
0,New South Wales,486.0,northern_summer,wobbegong,11
1,New South Wales,486.0,northern_summer,white,10
2,New South Wales,486.0,northern_summer,bronze whaler,7
3,New South Wales,486.0,northern_summer,grey nurse,3
4,New South Wales,486.0,northern_summer,6',3
...,...,...,...,...,...
104,Northern Territory,23.0,northern_summer,bronze whaler,1
310,Westerm Australia,3.0,northern_summer,tawny nurse,1
311,Westerm Australia,3.0,northern_summer,white,1
267,Torres Strait,2.0,northern_summer,tiger,1
