<img src="https://www.bisnia.es/wp-content/uploads/2019/10/limpieza-datos-870x466.jpg">

# Data Cleanup

### Import the necessary libraries

In [1]:
import pandas as pd
import src.limpieza_texto as lt
import re
import seaborn as sns

### Import the dataframe

In [2]:
# Read the raw attacks file, decoding it as ISO-8859-1 (Latin-1) instead of pandas' default UTF-8.
original_data = pd.read_csv("data/attacks.csv",encoding = "ISO-8859-1")

In [3]:
original_data.sample(5)

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,...,Species,Investigator or Source,pdf,href formula,href,Case Number.1,Case Number.2,original order,Unnamed: 22,Unnamed: 23
18504,,,,,,,,,,,...,,,,,,,,,,
11233,,,,,,,,,,,...,,,,,,,,,,
22399,,,,,,,,,,,...,,,,,,,,,,
4472,1950.06.25,25-Jun-1950,1950.0,Unprovoked,USA,New York,"Beach 103rd Street, Rockaway",Swimming,Joseph Salengo,M,...,"""sand shark""","New York Times, 6/25/1950, p.1. col.2 & p.38",1950.06.25-Salango.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,1950.06.25,1950.06.25,1831.0,,
19130,,,,,,,,,,,...,,,,,,,,,,


There are some rows where there is no data.

In [4]:
original_data.isnull().sum()

Case Number               17021
Date                      19421
Year                      19423
Type                      19425
Country                   19471
Area                      19876
Location                  19961
Activity                  19965
Name                      19631
Sex                       19986
Age                       22252
Injury                    19449
Fatal (Y/N)               19960
Time                      22775
Species                   22259
Investigator or Source    19438
pdf                       19421
href formula              19422
href                      19421
Case Number.1             19421
Case Number.2             19421
original order            19414
Unnamed: 22               25722
Unnamed: 23               25721
dtype: int64

### The first step is to delete the completely empty rows.
We use the function `dropna` with `how="all"` to delete the rows in which every value is missing.

In [5]:
# how="all" removes only the rows in which every column is NaN; partially filled rows are kept.
data_filtered_1= original_data.dropna(axis=0, how="all")
print(data_filtered_1.shape)
data_filtered_1.tail()

(8703, 24)


Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,...,Species,Investigator or Source,pdf,href formula,href,Case Number.1,Case Number.2,original order,Unnamed: 22,Unnamed: 23
8698,0,,,,,,,,,,...,,,,,,,,,,
8699,0,,,,,,,,,,...,,,,,,,,,,
8700,0,,,,,,,,,,...,,,,,,,,,,
8701,0,,,,,,,,,,...,,,,,,,,,,
25722,xx,,,,,,,,,,...,,,,,,,,,,


As we can see, there are some rows where the `Case Number` value is `0` and the rest of the values are `NaN`.
First we create a new variable holding only the rows of the dataframe where the `Case Number` value is different from `0`.

In [6]:
# lt.valores_distintos is a project helper from src/limpieza_texto; presumably it keeps the rows
# whose "Case Number" differs from "0" -- TODO confirm against the helper's source.
data_filtered_2 = lt.valores_distintos(data_filtered_1, "Case Number", "0")
data_filtered_2.tail(5)

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,...,Species,Investigator or Source,pdf,href formula,href,Case Number.1,Case Number.2,original order,Unnamed: 22,Unnamed: 23
6298,ND.0004,Before 1903,0.0,Unprovoked,AUSTRALIA,Western Australia,,Pearl diving,Ahmun,M,...,,"H. Taunton; N. Bartlett, pp. 233-234",ND-0004-Ahmun.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,ND.0004,ND.0004,5.0,,
6299,ND.0003,1900-1905,0.0,Unprovoked,USA,North Carolina,Ocracoke Inlet,Swimming,Coast Guard personnel,M,...,,"F. Schwartz, p.23; C. Creswell, GSAF",ND-0003-Ocracoke_1900-1905.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,ND.0003,ND.0003,4.0,,
6300,ND.0002,1883-1889,0.0,Unprovoked,PANAMA,,"Panama Bay 8ºN, 79ºW",,Jules Patterson,M,...,,"The Sun, 10/20/1938",ND-0002-JulesPatterson.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,ND.0002,ND.0002,3.0,,
6301,ND.0001,1845-1853,0.0,Unprovoked,CEYLON (SRI LANKA),Eastern Province,"Below the English fort, Trincomalee",Swimming,male,M,...,,S.W. Baker,ND-0001-Ceylon.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,ND.0001,ND.0001,2.0,,
25722,xx,,,,,,,,,,...,,,,,,,,,,


As we can see, there is a row at the end of the dataframe where the `Case Number` value is `xx` and the rest of the values of the row are `NaN`, so I delete that row using the method `.drop()` and selecting the value of the index.

In [7]:
# 25722 is the index label of the trailing row whose "Case Number" is "xx" (shown by the previous tail()).
# NOTE(review): the label is hard-coded; drop() raises KeyError if that label is absent from the input.
data_filtered_3 = data_filtered_2.drop(index= 25722)

In [8]:
data_filtered_3.tail()

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,...,Species,Investigator or Source,pdf,href formula,href,Case Number.1,Case Number.2,original order,Unnamed: 22,Unnamed: 23
6297,ND.0005,Before 1903,0.0,Unprovoked,AUSTRALIA,Western Australia,Roebuck Bay,Diving,male,M,...,,"H. Taunton; N. Bartlett, p. 234",ND-0005-RoebuckBay.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,ND.0005,ND.0005,6.0,,
6298,ND.0004,Before 1903,0.0,Unprovoked,AUSTRALIA,Western Australia,,Pearl diving,Ahmun,M,...,,"H. Taunton; N. Bartlett, pp. 233-234",ND-0004-Ahmun.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,ND.0004,ND.0004,5.0,,
6299,ND.0003,1900-1905,0.0,Unprovoked,USA,North Carolina,Ocracoke Inlet,Swimming,Coast Guard personnel,M,...,,"F. Schwartz, p.23; C. Creswell, GSAF",ND-0003-Ocracoke_1900-1905.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,ND.0003,ND.0003,4.0,,
6300,ND.0002,1883-1889,0.0,Unprovoked,PANAMA,,"Panama Bay 8ºN, 79ºW",,Jules Patterson,M,...,,"The Sun, 10/20/1938",ND-0002-JulesPatterson.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,ND.0002,ND.0002,3.0,,
6301,ND.0001,1845-1853,0.0,Unprovoked,CEYLON (SRI LANKA),Eastern Province,"Below the English fort, Trincomalee",Swimming,male,M,...,,S.W. Baker,ND-0001-Ceylon.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,ND.0001,ND.0001,2.0,,


There are also some columns called `pdf`, `href formula`, `Unnamed: 22` and `Unnamed: 23`, which I consider not useful.

The column `original order` is also not useful because it is the same as the index but inverted.

I used a personal function called `.elim_columnas()` to delete all the columns I have selected in a list called `eliminar`.

In [9]:
# Columns to discard from the cleaned frame.
eliminar = ["Unnamed: 22","Unnamed: 23", "pdf", "href formula", "original order"]
# Build the next stage directly from data_filtered_3 (the previous stage); the former
# placeholder assignment from original_data was overwritten immediately and has been removed.
data_filtered_4 = lt.elim_columnas(data_filtered_3, eliminar)
data_filtered_4

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,Age,Injury,Fatal (Y/N),Time,Species,Investigator or Source,href,Case Number.1,Case Number.2
0,2018.06.25,25-Jun-2018,2018.0,Boating,USA,California,"Oceanside, San Diego County",Paddling,Julie Wolfe,F,57,"No injury to occupant, outrigger canoe and pad...",N,18h00,White shark,"R. Collier, GSAF",http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.25,2018.06.25
1,2018.06.18,18-Jun-2018,2018.0,Unprovoked,USA,Georgia,"St. Simon Island, Glynn County",Standing,Adyson McNeely,F,11,Minor injury to left thigh,N,14h00 -15h00,,"K.McMurray, TrackingSharks.com",http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.18,2018.06.18
2,2018.06.09,09-Jun-2018,2018.0,Invalid,USA,Hawaii,"Habush, Oahu",Surfing,John Denges,M,48,Injury to left lower leg from surfboard skeg,N,07h45,,"K.McMurray, TrackingSharks.com",http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.09,2018.06.09
3,2018.06.08,08-Jun-2018,2018.0,Unprovoked,AUSTRALIA,New South Wales,Arrawarra Headland,Surfing,male,M,,Minor injury to lower leg,N,,2 m shark,"B. Myatt, GSAF",http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.08,2018.06.08
4,2018.06.04,04-Jun-2018,2018.0,Provoked,MEXICO,Colima,La Ticla,Free diving,Gustavo Ramos,M,,Lacerations to leg & hand shark PROVOKED INCIDENT,N,,"Tiger shark, 3m",A .Kipper,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.04,2018.06.04
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6297,ND.0005,Before 1903,0.0,Unprovoked,AUSTRALIA,Western Australia,Roebuck Bay,Diving,male,M,,FATAL,Y,,,"H. Taunton; N. Bartlett, p. 234",http://sharkattackfile.net/spreadsheets/pdf_di...,ND.0005,ND.0005
6298,ND.0004,Before 1903,0.0,Unprovoked,AUSTRALIA,Western Australia,,Pearl diving,Ahmun,M,,FATAL,Y,,,"H. Taunton; N. Bartlett, pp. 233-234",http://sharkattackfile.net/spreadsheets/pdf_di...,ND.0004,ND.0004
6299,ND.0003,1900-1905,0.0,Unprovoked,USA,North Carolina,Ocracoke Inlet,Swimming,Coast Guard personnel,M,,FATAL,Y,,,"F. Schwartz, p.23; C. Creswell, GSAF",http://sharkattackfile.net/spreadsheets/pdf_di...,ND.0003,ND.0003
6300,ND.0002,1883-1889,0.0,Unprovoked,PANAMA,,"Panama Bay 8ºN, 79ºW",,Jules Patterson,M,,FATAL,Y,,,"The Sun, 10/20/1938",http://sharkattackfile.net/spreadsheets/pdf_di...,ND.0002,ND.0002


#### At the end of the dataframe there are two similar columns (`Case Number.1` and `Case Number.2`), so the next step is to check whether the information differs between them.
I have created a function called `check_columns` that finds the rows whose values are not equal in the selected `columns`.

In [10]:
lt.check_columns(data_filtered_4, "Case Number.1", "Case Number.2").sample(3)

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,Age,Injury,Fatal (Y/N),Time,Species,Investigator or Source,href,Case Number.1,Case Number.2
314,2016.01.24.b,24-Jan-2016,2016.0,Unprovoked,USA,Texas,Off Surfside,Spearfishing,Keith Love,M,,"Bruised ribs & tail bone, speargun broken and ...",N,09h30 / 10h00,Bull sharks x 2,K. Love,http://sharkattackfile.net/spreadsheets/pdf_di...,2015.01.24.b,2016.01.24.b
334,2015.12.23,07-Nov-2015,2015.0,Invalid,USA,Florida,"Paradise Beach, Melbourne, Brevard County",Surfing,Ryla Underwood,F,9.0,Lower left leg injured,,11h00,Shark involvement not confirmed,"Fox25Orlando, 11/7/2015",http://sharkattackfile.net/spreadsheets/pdf_di...,2015.11.07,2015.12.23
5386,1911.07.31.R,Reported 31-Jul-1911,1911.0,Unprovoked,SPAIN,Málaga,Ceuta,Bathing,a soldier,M,,FATAL,Y,,,C. Moore. GSAF,http://sharkattackfile.net/spreadsheets/pdf_di...,1911.07.31.T,1911.07.31.R


After analyzing these two columns, you can see that they hold the same value as the `Case Number` column, differing only in a few characters, so I decided to delete them.

In [11]:
# Duplicate case-number columns to discard.
eliminar_2 = ["Case Number.1", "Case Number.2"]
# Build the next stage directly from data_filtered_4 (the previous stage); the former
# placeholder assignment from original_data was overwritten immediately and has been removed.
data_filtered_5 = lt.elim_columnas(data_filtered_4, eliminar_2)
data_filtered_5.sample(3)

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,Age,Injury,Fatal (Y/N),Time,Species,Investigator or Source,href
2159,1998.10.04,04-Oct-1998,1998.0,Unprovoked,BRAZIL,Pernambuco,"Boa Viagem Beach, Recife",Surfing,Júlio César de Barros Correia,M,17,Right leg bitten,N,,,"Folha de S.Paul, 6/10/1998",http://sharkattackfile.net/spreadsheets/pdf_di...
2425,1994.07.24,24-Jul-1994,1994.0,Unprovoked,BRAZIL,Pernambuco,"Boa Viagem, Recife",Surfing,Carlos Frederico Gomes Martins,M,15,Left foot severed,N,,,C.F.G. Martins; JCOnline,http://sharkattackfile.net/spreadsheets/pdf_di...
4350,1954.01.15,15-Jan-1954,1954.0,Unprovoked,PAPUA NEW GUINEA,Madang Province,"Singour, 60 miles south of Madang",Crouching in the water,Ramlen,M,26,Back & thighs lacerated,N,Late afternoon,14' shark,"Cairns Post, 1/21/1954",http://sharkattackfile.net/spreadsheets/pdf_di...


In [12]:
# Drop any row that is entirely NaN across the remaining columns.
data_filtered_6 = data_filtered_5.dropna(axis=0, how="all")
data_filtered_6

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,Age,Injury,Fatal (Y/N),Time,Species,Investigator or Source,href
0,2018.06.25,25-Jun-2018,2018.0,Boating,USA,California,"Oceanside, San Diego County",Paddling,Julie Wolfe,F,57,"No injury to occupant, outrigger canoe and pad...",N,18h00,White shark,"R. Collier, GSAF",http://sharkattackfile.net/spreadsheets/pdf_di...
1,2018.06.18,18-Jun-2018,2018.0,Unprovoked,USA,Georgia,"St. Simon Island, Glynn County",Standing,Adyson McNeely,F,11,Minor injury to left thigh,N,14h00 -15h00,,"K.McMurray, TrackingSharks.com",http://sharkattackfile.net/spreadsheets/pdf_di...
2,2018.06.09,09-Jun-2018,2018.0,Invalid,USA,Hawaii,"Habush, Oahu",Surfing,John Denges,M,48,Injury to left lower leg from surfboard skeg,N,07h45,,"K.McMurray, TrackingSharks.com",http://sharkattackfile.net/spreadsheets/pdf_di...
3,2018.06.08,08-Jun-2018,2018.0,Unprovoked,AUSTRALIA,New South Wales,Arrawarra Headland,Surfing,male,M,,Minor injury to lower leg,N,,2 m shark,"B. Myatt, GSAF",http://sharkattackfile.net/spreadsheets/pdf_di...
4,2018.06.04,04-Jun-2018,2018.0,Provoked,MEXICO,Colima,La Ticla,Free diving,Gustavo Ramos,M,,Lacerations to leg & hand shark PROVOKED INCIDENT,N,,"Tiger shark, 3m",A .Kipper,http://sharkattackfile.net/spreadsheets/pdf_di...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6297,ND.0005,Before 1903,0.0,Unprovoked,AUSTRALIA,Western Australia,Roebuck Bay,Diving,male,M,,FATAL,Y,,,"H. Taunton; N. Bartlett, p. 234",http://sharkattackfile.net/spreadsheets/pdf_di...
6298,ND.0004,Before 1903,0.0,Unprovoked,AUSTRALIA,Western Australia,,Pearl diving,Ahmun,M,,FATAL,Y,,,"H. Taunton; N. Bartlett, pp. 233-234",http://sharkattackfile.net/spreadsheets/pdf_di...
6299,ND.0003,1900-1905,0.0,Unprovoked,USA,North Carolina,Ocracoke Inlet,Swimming,Coast Guard personnel,M,,FATAL,Y,,,"F. Schwartz, p.23; C. Creswell, GSAF",http://sharkattackfile.net/spreadsheets/pdf_di...
6300,ND.0002,1883-1889,0.0,Unprovoked,PANAMA,,"Panama Bay 8ºN, 79ºW",,Jules Patterson,M,,FATAL,Y,,,"The Sun, 10/20/1938",http://sharkattackfile.net/spreadsheets/pdf_di...


## Hypothesis 1

I conducted a study on how many `researchers` there are and how much research they have carried out. Is there a shark attack specialist?

In [13]:
data_filtered_6["Investigator or Source"].unique()

array(['R. Collier, GSAF', 'K.McMurray, TrackingSharks.com',
       'B. Myatt, GSAF', ..., 'F. Schwartz, p.23; C. Creswell, GSAF',
       'The Sun, 10/20/1938', 'S.W. Baker'], dtype=object)

In [14]:
data_filtered_6["Investigator or Source"].value_counts()

C. Moore, GSAF                       105
C. Creswell, GSAF                     92
S. Petersohn, GSAF                    82
R. Collier                            55
R. Collier, GSAF                      48
                                    ... 
D.R. Nelson                            1
R. Fernicola, GSAF                     1
AAP, 6/20/2002                         1
Norway Heritage                        1
Orange County Register, 3/13/2016      1
Name: Investigator or Source, Length: 4969, dtype: int64

In [15]:
# Build the boolean mask from the same frame it is applied to, so the selection never
# relies on two different frames happening to share an index.
data_filtered_6[data_filtered_6["Investigator or Source"] == "C. Moore, GSAF"].sample(10)

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,Age,Injury,Fatal (Y/N),Time,Species,Investigator or Source,href
4903,1935.07.01,01-Jul-1935,1935.0,Unprovoked,CROATIA,Adriatic Sea,"Susak / Fiume (Rijeka, Istria)",Swimming,Mira Kudlich,F,22.0,FATAL,Y,15h00,,"C. Moore, GSAF",http://sharkattackfile.net/spreadsheets/pdf_di...
4434,1951.09.03.R,Reported 03-Sep-1951,1951.0,Unprovoked,ITALY,Salerno Province,Salerno,Fishing for squid,Luca Caputo,M,,3 fingers severed when he used his hand as bait,N,Night,,"C. Moore, GSAF",http://sharkattackfile.net/spreadsheets/pdf_di...
5939,1864.09.18.R,Reported 18-Sep-1864,1864.0,Provoked,FRANCE,Alpes Maritime,Antibes,Dragging a shark,fisherman,M,,Knee bitten PROVOKED INCIDENT,N,,1.5 m shark,"C. Moore, GSAF",http://sharkattackfile.net/spreadsheets/pdf_di...
4574,1947.07.00,Jul-1947,1947.0,Invalid,GREECE,Carpathian Sea,Dodecanese Islands,Jumped overboard,Nickolas Doulis,M,,Shark involvement unconfirmed,,,Questionable,"C. Moore, GSAF",http://sharkattackfile.net/spreadsheets/pdf_di...
5703,1890.06.02.R,Reported 02-Jun-1890,1890.0,Unprovoked,EGYPT,,Port Said,Swimming,male,M,,FATAL,Y,,,"C. Moore, GSAF",http://sharkattackfile.net/spreadsheets/pdf_di...
4504,1949.08.17,17-Aug-1949,1949.0,Invalid,ITALY,Tuscany,Elba Island,Swimming,Domenico Murolo,M,17.0,No injury,,,2 m shark,"C. Moore, GSAF",http://sharkattackfile.net/spreadsheets/pdf_di...
3522,1967.07.05,05-Jul-1967,1967.0,Unprovoked,TURKEY,Mugla Province,Kucukada Island,Spearfishing,Gungor Guven,M,36.0,FATAL,Y,13h40,,"C. Moore, GSAF",http://sharkattackfile.net/spreadsheets/pdf_di...
5324,1915.05.15.R,Reported 15-May-1915,1915.0,Invalid,EGYPT,,Alexandria,Fell overboard,male,M,,Shark involvement not confirmed,,,Shark involvement prior to death unconfirmed,"C. Moore, GSAF",http://sharkattackfile.net/spreadsheets/pdf_di...
3794,1962.08.31.R,Reported 31-Aug-1962,1962.0,Provoked,ISRAEL,Sharon,2 km north of Apollonia,Fishing,fisherman,M,,"Details unknown, possibly a PROVOKED INCIDENT",UNKNOWN,,2.5 m [8.25'] shark,"C. Moore, GSAF",http://sharkattackfile.net/spreadsheets/pdf_di...
4338,1954.07.01.R,Reported 01-Jul-1954,1954.0,Invalid,CROATIA,,Pula,,male,,,Human remains found in shark,,,Shark involvement prior to death was not confi...,"C. Moore, GSAF",http://sharkattackfile.net/spreadsheets/pdf_di...


In [16]:
import src.limpieza_texto as lt

In [17]:
lt.valores_iguales(data_filtered_6, "Investigator or Source", "C. Moore, GSAF")

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,Age,Injury,Fatal (Y/N),Time,Species,Investigator or Source,href
136,2017.06.07.R,Reported 07-Jun-2017,2017.0,Unprovoked,UNITED KINGDOM,South Devon,Bantham Beach,Surfing,Rich Thomson,M,30,"Bruise to leg, cuts to hand sustained when he ...",N,,"3m shark, probably a smooth hound","C. Moore, GSAF",http://sharkattackfile.net/spreadsheets/pdf_di...
2718,1988.08.22.a,22-Aug-1988,1988.0,Unprovoked,ITALY,Manfredonia,Ippocampo,,male,M,16,Survived,N,,,"C. Moore, GSAF",http://sharkattackfile.net/spreadsheets/pdf_di...
3183,1976.06.02.R,Reported 02-Jun-1976,1976.0,Provoked,ITALY,Reggio Calabria Province,Bovalino,Fishing,Francisco Pelle,M,46,Shark rammed boat PROVOKED INCIDENT,N,,"Blue shark, 2m","C. Moore, GSAF",http://sharkattackfile.net/spreadsheets/pdf_di...
3229,1975.04.25,25-Apr-1975,1975.0,Invalid,ITALY,Genoa Province,Cervara,Scuba diving,Walter Sansoni,M,37,The press reported this as an attack by a whit...,,,Invalid,"C. Moore, GSAF",http://sharkattackfile.net/spreadsheets/pdf_di...
3513,1967.08.25,25-Aug-1967,1967.0,Unprovoked,ITALY,Liguria,"Marinella Sarzana, La Spezia",Spearfishing on Scuba,Gian Paolo Porta Casucci,M,,Minor injuries to face & forearm,N,,,"C. Moore, GSAF",http://sharkattackfile.net/spreadsheets/pdf_di...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6155,1742.12.17,17-Dec-1742,1742.0,Unprovoked,,,Carlisle Bay,Swimming,2 impressed seamen,M,,FATAL,Y,,,"C. Moore, GSAF",http://sharkattackfile.net/spreadsheets/pdf_di...
6156,1738.04.06.R,Reported 06-Apr-1738,1738.0,Unprovoked,ITALY,Sicily,Strait of Messina,Swimming,male,M,,FATAL,Y,,,"C. Moore, GSAF",http://sharkattackfile.net/spreadsheets/pdf_di...
6193,ND-0134,Between 1951-1963,0.0,Unprovoked,GREECE,,,Swimming,Martha Hatagouei,F,,FATAL,Y,,,"C. Moore, GSAF",http://sharkattackfile.net/spreadsheets/pdf_di...
6196,ND-0130,Before 1876,0.0,Unprovoked,LEBANON,,,Collecting fish,Kahlifeh,M,,Posterior thigh bitten,N,,,"C. Moore, GSAF",http://sharkattackfile.net/spreadsheets/pdf_di...


### Last step: export the dataset to the data folder as `midatasetlimpio.csv`

In [18]:
# `shrk` is a second name bound to the same DataFrame object as data_filtered_6 (no copy is made).
shrk = data_filtered_6
shrk.sample(5)

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,Age,Injury,Fatal (Y/N),Time,Species,Investigator or Source,href
4093,1959.05.30,30-May-1959,1959.0,Provoked,SOUTH AFRICA,Eastern Cape Province,"Bird Island, Algoa Bay",Spearfishing,Tony Dicks,M,23.0,"No injury, diver shot shark & it bit his spear...",N,,"White shark, 2.7 m [9'], 280-lb","C. Middleton; M. Levine, GSAF",http://sharkattackfile.net/spreadsheets/pdf_di...
4607,1946.01.24.a,24-Jan-1946,1946.0,Unprovoked,SOUTH AFRICA,KwaZulu-Natal,"Battery Beach, Durban",Swimming,"Manduray, a lifesaver",M,,Foot severely lacerated,N,Afternoon,,"J. R. Varma; M. Levine, GSAF",http://sharkattackfile.net/spreadsheets/pdf_di...
2713,1988.10.00,Oct-1988,1988.0,Unprovoked,USA,Florida,"Sanibel Island, Lee County",Wading,Sally Jo Scott,F,23.0,Leg lacerated,N,,1.2 m [4'] shark,C.L. Call,http://sharkattackfile.net/spreadsheets/pdf_di...
1540,2005.10.11,11-Oct-2005,2005.0,Unprovoked,GRAND CAYMAN,East Wall,Jack McKenney's Canyon,Scuba diving,Lea Ann Hughes,F,57.0,No injury,N,15h00,Caribbean reef sharks,"L.A.Hughes; G. Holt, Scubaradio.com",http://sharkattackfile.net/spreadsheets/pdf_di...
5291,1917.05.05,Reported 05-May-1917,1917.0,Unprovoked,KUWAIT,,,Diving for pearls,a young Arab,M,,Torso bitten,N,,,"Denton Journal, 5/5/1917",http://sharkattackfile.net/spreadsheets/pdf_di...


In [55]:
def mejores(columna,n_ind):
    """Return the most frequent values of one column of the global `shrk` frame.

    Parameters
    ----------
    columna : column label looked up in `shrk`.
    n_ind : number of leading entries to keep from the frequency ranking.

    Returns
    -------
    list
        The first `n_ind` values of the ranking, most frequent first.
    """
    # value_counts orders the values by descending relative frequency.
    frecuencias = shrk[columna].value_counts(normalize=True)
    return list(frecuencias.index[:n_ind])

# Activity labels of interest (presumably the "risky" ones, going by the name -- confirm with the analysis).
# Each label appears exactly once; the original list contained "Diving" twice.
rsky_list = ["Surfing", "Swimming", "Fishing", "Spearfishing", "Bathing", "Wading", "Diving", "Standing", "Snorkeling", "Scuba diving", "Body boarding"]

def df_filtered_best(data_frame, columna, n_ind):
    """Keep only the rows whose `columna` value is among the `n_ind` most frequent ones.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Frame to filter; it is not modified.
    columna : str
        Label of the column whose value frequencies are ranked.
    n_ind : int
        Number of top-ranked values to keep.

    Returns
    -------
    pandas.DataFrame
        The matching rows grouped by value, most frequent value first; inside
        each group the rows keep their original order and index labels.
        When there is no top value (``n_ind`` is 0, or the column has no
        non-NaN values) an empty frame with the same columns is returned
        instead of raising ``IndexError``.
    """
    # value_counts() sorts by descending frequency and ignores NaN; normalising
    # the counts is unnecessary because only the ranking order is used.
    top = list(data_frame[columna].value_counts().index[:n_ind])

    if not top:
        # Nothing to select: return an empty slice that preserves columns and dtypes.
        return data_frame.iloc[0:0]

    # Collect one sub-frame per top value and concatenate once, instead of
    # growing the result with a concat on every loop iteration.
    return pd.concat([data_frame[data_frame[columna] == valor] for valor in top])

In [87]:
# Forward slashes work on every OS and avoid the invalid "\D" / "\m" escape sequences of the
# previous literal; the lowercase folder name matches the folder the raw file was read from.
shrk.to_csv("data/midatasetlimpio.csv")