In [1]:
import os
import pandas as pd
import re
import numpy as np
df = pd.read_csv("attacks.csv", encoding="Latin1")

# Analizamos la información

In [2]:
df.shape

(25723, 24)

In [3]:
df.head()

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,...,Species,Investigator or Source,pdf,href formula,href,Case Number.1,Case Number.2,original order,Unnamed: 22,Unnamed: 23
0,2018.06.25,25-Jun-2018,2018.0,Boating,USA,California,"Oceanside, San Diego County",Paddling,Julie Wolfe,F,...,White shark,"R. Collier, GSAF",2018.06.25-Wolfe.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.25,2018.06.25,6303.0,,
1,2018.06.18,18-Jun-2018,2018.0,Unprovoked,USA,Georgia,"St. Simon Island, Glynn County",Standing,Adyson McNeely,F,...,,"K.McMurray, TrackingSharks.com",2018.06.18-McNeely.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.18,2018.06.18,6302.0,,
2,2018.06.09,09-Jun-2018,2018.0,Invalid,USA,Hawaii,"Habush, Oahu",Surfing,John Denges,M,...,,"K.McMurray, TrackingSharks.com",2018.06.09-Denges.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.09,2018.06.09,6301.0,,
3,2018.06.08,08-Jun-2018,2018.0,Unprovoked,AUSTRALIA,New South Wales,Arrawarra Headland,Surfing,male,M,...,2 m shark,"B. Myatt, GSAF",2018.06.08-Arrawarra.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.08,2018.06.08,6300.0,,
4,2018.06.04,04-Jun-2018,2018.0,Provoked,MEXICO,Colima,La Ticla,Free diving,Gustavo Ramos,M,...,"Tiger shark, 3m",A .Kipper,2018.06.04-Ramos.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.04,2018.06.04,6299.0,,


In [4]:
df.columns

Index(['Case Number', 'Date', 'Year', 'Type', 'Country', 'Area', 'Location',
       'Activity', 'Name', 'Sex ', 'Age', 'Injury', 'Fatal (Y/N)', 'Time',
       'Species ', 'Investigator or Source', 'pdf', 'href formula', 'href',
       'Case Number.1', 'Case Number.2', 'original order', 'Unnamed: 22',
       'Unnamed: 23'],
      dtype='object')

# Comprobamos valores nulos

In [5]:
df.isnull().sum()

Case Number               17021
Date                      19421
Year                      19423
Type                      19425
Country                   19471
Area                      19876
Location                  19961
Activity                  19965
Name                      19631
Sex                       19986
Age                       22252
Injury                    19449
Fatal (Y/N)               19960
Time                      22775
Species                   22259
Investigator or Source    19438
pdf                       19421
href formula              19422
href                      19421
Case Number.1             19421
Case Number.2             19421
original order            19414
Unnamed: 22               25722
Unnamed: 23               25721
dtype: int64

La mayoría de columnas tiene unos 19.000 valores nulos aproximadamente, por tanto decidimos eliminar aquellas columnas que tengan más de 20.000 valores nulos

In [6]:
morethan20000nulls = df.isnull().sum()[df.isnull().sum() > 20000].index
morethan20000nulls

Index(['Age', 'Time', 'Species ', 'Unnamed: 22', 'Unnamed: 23'], dtype='object')

In [7]:
#Definimos función para eliminar columnas
def dropcolumns (df, list):
    df_clean = df.drop(columns=list)
    return df_clean

In [8]:
df_clean = dropcolumns (df, morethan20000nulls)
df_clean.shape

(25723, 19)

Ahora veremos si hay filas en las que todos sus datos sean nulos y las eliminaremos

In [9]:
df_clean = df_clean.dropna(how="all")
df_clean.shape

(8703, 19)

# Columnas que no vamos a utilizar

A pesar de que hemos reducido bastante la información, seguimos teniendo columnas que no vamos a necesitar, por lo que las eliminaremos también

In [10]:
df_clean.columns

Index(['Case Number', 'Date', 'Year', 'Type', 'Country', 'Area', 'Location',
       'Activity', 'Name', 'Sex ', 'Injury', 'Fatal (Y/N)',
       'Investigator or Source', 'pdf', 'href formula', 'href',
       'Case Number.1', 'Case Number.2', 'original order'],
      dtype='object')

In [11]:
unusefull_columns = ['Case Number', 'Name', 'Sex ', 'Investigator or Source', 'pdf', 'href formula', 'href',
       'Case Number.1', 'Case Number.2', 'original order']
df_clean = dropcolumns (df_clean, unusefull_columns)
df_clean.shape

(8703, 9)

Y volvemos a comprobar si hay filas en las que todos los valores sean nulos y las eliminamos

In [12]:
df_clean = df_clean.dropna(how="all")
df_clean.shape

(6302, 9)

# Analizamos columnas

Ahora que ya hemos elimados los valores nulos, y aquellas columnas que no vamos a necesitar podemos comenzar a ver información más detallada de cada una de las columnas

In [15]:
df_clean.columns

Index(['Date', 'Year', 'Type', 'Country', 'Area', 'Location', 'Activity',
       'Injury', 'Fatal (Y/N)'],
      dtype='object')

In [16]:
#Columna "Year"
df_clean["Year"].value_counts().reset_index().sort_values('index', ascending=False)

Unnamed: 0,index,Year
37,2018.0,53
1,2017.0,136
2,2016.0,130
0,2015.0,143
4,2014.0,127
...,...,...
208,1543.0,1
194,500.0,1
224,77.0,1
196,5.0,1


Podemos ver que en el año 2018 hay muchos menos registros que en años anteriores, es posible que el año no esté completo, lo comprobaremos analizando la columna "Date"

In [17]:
list(df_clean["Date"].value_counts().items())

[('1957', 11),
 ('1942', 9),
 ('1956', 8),
 ('1958', 7),
 ('1941', 7),
 ('1950', 7),
 ('No date', 6),
 ('1949', 6),
 ('1954', 5),
 ('Oct-1960', 5),
 ('1955', 5),
 ('28-Jul-1995', 5),
 ('05-Oct-2003', 5),
 ('1970s', 5),
 ('12-Apr-2001', 5),
 ('No date, Before 1963', 5),
 ('1959', 5),
 ('1940', 5),
 ('Aug-1956', 5),
 ('14-Jun-2012', 4),
 ('1898', 4),
 ('Before 1906', 4),
 ('09-Jul-1994', 4),
 ('1945', 4),
 ('1952', 4),
 ('Reported 10-Oct-1906', 4),
 ('1876', 4),
 ('28-Dec-2014', 4),
 ('1904', 4),
 ('1960s', 4),
 ('09-Jan-2010', 4),
 ('20-Sep-2015', 4),
 ('1995', 4),
 ('29-Apr-2017', 4),
 ('1938', 4),
 ('15-Apr-2018', 4),
 ('Before 1958', 4),
 ('27-Jul-1952', 4),
 ('23-Jan-1970', 4),
 ('1960', 4),
 ('27-Dec-2008', 4),
 ('1961', 4),
 ('09-Feb-1996', 3),
 ('1913', 3),
 ('11-Jan-1962', 3),
 ('30-Sep-2007', 3),
 ('Nov-1960', 3),
 ('1926', 3),
 ('31-Oct-2003', 3),
 ('24-Jan-2009', 3),
 ('1941-1945', 3),
 ('16-Aug-1974', 3),
 ('17-Aug-2011', 3),
 ('Before 1962', 3),
 ('20-Jul-2017', 3),
 ('08-A

Observamos que en algunas celdas la información es incompleta, ya que muestra solo el año, o que incluye la palabra "Reported", eliminaremos información no necesaria y nos quedaremos solo con las celdas que contienen la fecha completa haciendo uso de Regex

#Creamos una nueva columna con el nombre "Date_ok" en la que la información esté expresada de forma homogenea
dates = list(df_clean["Date"])
df_clean ["Date_ok"] = pd.Series(re.findall(r'[0-9]{2}\-[a-zA-Z]*\-[0-9]{4}', str(dates)))
#La columna se añade por defecto al final del DataFrame, por lo que las reordenamos y eliminamos la columna "Date"
colnames = df_clean.columns.tolist()
colnames = colnames[-1:] + colnames [:-1]
df_clean = df_clean[colnames]

In [18]:
list(df_clean["Type"].value_counts().items())

[('Unprovoked', 4595),
 ('Provoked', 574),
 ('Invalid', 547),
 ('Sea Disaster', 239),
 ('Boating', 203),
 ('Boat', 137),
 ('Questionable', 2),
 ('Boatomg', 1)]

Parece que los valores "Boating", "Boat" y "Boatomg" hacen referencia a lo mismo, por lo que podemos agruparlos con el mismo nombre

In [19]:
#Comprobamos si existen valores nulos
df_clean["Type"].isnull().sum()

4

In [20]:
df_clean[df_clean.Type.isnull()][["Year","Country","Type"]]

Unnamed: 0,Year,Country,Type
85,2017.0,SAMOA,
382,2015.0,AUSTRALIA,
4867,1936.0,VIETNAM,
5705,1890.0,CEYLON,


In [23]:
#Rellenamos los 4 valores nulos como "Sin registros"
df_clean.loc[df_clean.Type.isnull(), "Type"] = ["Sin registros","Sin registros","Sin registros","Sin registros"]

ValueError: Must have equal len keys and value when setting with an iterable

In [22]:
#Agrupamos "Boating", "Boat" y "Boatomg" bajo "Boating"
df_clean.loc[df_clean["Type"].str.startswith("B"),"Type"] = "Boating"

In [24]:
df_clean["Type"].value_counts()

Unprovoked       4595
Provoked          574
Invalid           547
Boating           341
Sea Disaster      239
Sin registros       4
Questionable        2
Name: Type, dtype: int64

In [25]:
list(df_clean["Country"].value_counts().items())

[('USA', 2229),
 ('AUSTRALIA', 1338),
 ('SOUTH AFRICA', 579),
 ('PAPUA NEW GUINEA', 134),
 ('NEW ZEALAND', 128),
 ('BRAZIL', 112),
 ('BAHAMAS', 109),
 ('MEXICO', 89),
 ('ITALY', 71),
 ('FIJI', 62),
 ('PHILIPPINES', 61),
 ('REUNION', 60),
 ('NEW CALEDONIA', 53),
 ('CUBA', 46),
 ('MOZAMBIQUE', 45),
 ('SPAIN', 44),
 ('INDIA', 40),
 ('EGYPT', 38),
 ('JAPAN', 34),
 ('CROATIA', 34),
 ('PANAMA', 32),
 ('SOLOMON ISLANDS', 30),
 ('IRAN', 29),
 ('JAMAICA', 27),
 ('GREECE', 25),
 ('FRENCH POLYNESIA', 25),
 ('HONG KONG', 24),
 ('ENGLAND', 23),
 ('INDONESIA', 23),
 ('ATLANTIC OCEAN', 17),
 ('PACIFIC OCEAN', 17),
 ('COSTA RICA', 17),
 ('BERMUDA', 16),
 ('VIETNAM', 15),
 ('TONGA', 15),
 ('VANUATU', 14),
 ('SRI LANKA', 14),
 ('FRANCE', 13),
 ('MARSHALL ISLANDS', 13),
 ('TURKEY', 12),
 ('CANADA', 12),
 ('SOUTH ATLANTIC OCEAN', 12),
 ('IRAQ', 12),
 ('UNITED KINGDOM', 11),
 ('SENEGAL', 11),
 ('VENEZUELA', 11),
 ('NEW GUINEA', 10),
 ('KENYA', 10),
 ('MAURITIUS', 10),
 ('TAIWAN', 9),
 ('ECUADOR', 9),
 ('CO

In [26]:
list(df_clean["Activity"].value_counts().items())

[('Surfing', 971),
 ('Swimming', 869),
 ('Fishing', 431),
 ('Spearfishing', 333),
 ('Bathing', 162),
 ('Wading', 149),
 ('Diving', 127),
 ('Standing', 99),
 ('Snorkeling', 89),
 ('Scuba diving', 76),
 ('Body boarding', 61),
 ('Body surfing', 49),
 ('Swimming ', 47),
 ('Kayaking', 33),
 ('Treading water', 32),
 ('Pearl diving', 32),
 ('Fell overboard', 32),
 ('Free diving', 29),
 ('Boogie boarding', 29),
 ('Windsurfing', 19),
 ('Walking', 17),
 ('Boogie Boarding', 16),
 ('Shark fishing', 15),
 ('Floating', 14),
 ('Fishing ', 13),
 ('Canoeing', 13),
 ('Surf fishing', 12),
 ('Surf-skiing', 12),
 ('Surf skiing', 12),
 ('Rowing', 12),
 ('Fishing for sharks', 11),
 ('Kayak Fishing', 11),
 ('Scuba Diving', 10),
 ('Sponge diving', 10),
 ('Freediving', 10),
 ('Fell into the water', 9),
 ('Diving for trochus', 9),
 ('Sailing', 9),
 ('Sitting on surfboard', 9),
 ('Sea disaster', 8),
 ('Paddle boarding', 8),
 ('Spearfishing ', 7),
 ('Playing', 7),
 ('Floating on his back', 7),
 ('Surf skiing ', 7)

In [27]:
df_clean["Activity"].isnull().sum()

544

In [32]:
df_clean[df_clean.Activity.isnull()][["Year","Country","Type"]]

Unnamed: 0,Year,Country,Type
47,2018.0,AUSTRALIA,Unprovoked
108,2017.0,USA,Unprovoked
112,2017.0,USA,Unprovoked
114,2017.0,USA,Unprovoked
154,2017.0,USA,Unprovoked
...,...,...,...
6269,0.0,PACIFIC OCEAN,Sea Disaster
6277,0.0,AUSTRALIA,Unprovoked
6282,0.0,AUSTRALIA,Unprovoked
6292,0.0,USA,Unprovoked


In [33]:
df_clean = df[~df["Activity"].isnull()]

In [35]:
df_clean["Activity"].isnull().sum()

0

In [36]:
df_clean.loc[df_clean["Activity"].str.startswith("Surf"),"Activity"] = "Surfing"

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [37]:
list(df_clean["Activity"].value_counts().items())

[('Surfing', 1063),
 ('Swimming', 869),
 ('Fishing', 431),
 ('Spearfishing', 333),
 ('Bathing', 162),
 ('Wading', 149),
 ('Diving', 127),
 ('Standing', 99),
 ('Snorkeling', 89),
 ('Scuba diving', 76),
 ('Body boarding', 61),
 ('Body surfing', 49),
 ('Swimming ', 47),
 ('Kayaking', 33),
 ('Pearl diving', 32),
 ('Treading water', 32),
 ('Fell overboard', 32),
 ('Boogie boarding', 29),
 ('Free diving', 29),
 ('Windsurfing', 19),
 ('Walking', 17),
 ('Boogie Boarding', 16),
 ('Shark fishing', 15),
 ('Floating', 14),
 ('Canoeing', 13),
 ('Fishing ', 13),
 ('Rowing', 12),
 ('Fishing for sharks', 11),
 ('Kayak Fishing', 11),
 ('Freediving', 10),
 ('Scuba Diving', 10),
 ('Sponge diving', 10),
 ('Fell into the water', 9),
 ('Sailing', 9),
 ('Sitting on surfboard', 9),
 ('Diving for trochus', 9),
 ('Sea disaster', 8),
 ('Paddle boarding', 8),
 ('Boating', 7),
 ('Skindiving', 7),
 ('Diving for abalone', 7),
 ('Floating on his back', 7),
 ('Playing', 7),
 ('Spearfishing ', 7),
 ('Free diving for ab

In [38]:
df_clean.loc[df_clean["Activity"].str.endswith("surfing"),"Activity"] = "Surfing"
df_clean.loc[df_clean["Activity"].str.endswith("Surfing"),"Activity"] = "Surfing"

In [39]:
list(df_clean["Activity"].value_counts().items())

[('Surfing', 1166),
 ('Swimming', 869),
 ('Fishing', 431),
 ('Spearfishing', 333),
 ('Bathing', 162),
 ('Wading', 149),
 ('Diving', 127),
 ('Standing', 99),
 ('Snorkeling', 89),
 ('Scuba diving', 76),
 ('Body boarding', 61),
 ('Swimming ', 47),
 ('Kayaking', 33),
 ('Fell overboard', 32),
 ('Pearl diving', 32),
 ('Treading water', 32),
 ('Boogie boarding', 29),
 ('Free diving', 29),
 ('Walking', 17),
 ('Boogie Boarding', 16),
 ('Shark fishing', 15),
 ('Floating', 14),
 ('Canoeing', 13),
 ('Fishing ', 13),
 ('Rowing', 12),
 ('Fishing for sharks', 11),
 ('Kayak Fishing', 11),
 ('Scuba Diving', 10),
 ('Sponge diving', 10),
 ('Freediving', 10),
 ('Sailing', 9),
 ('Fell into the water', 9),
 ('Diving for trochus', 9),
 ('Sitting on surfboard', 9),
 ('Paddle boarding', 8),
 ('Sea disaster', 8),
 ('Free diving for abalone', 7),
 ('Diving for abalone', 7),
 ('Boating', 7),
 ('Skindiving', 7),
 ('Playing', 7),
 ('Spearfishing ', 7),
 ('Floating on his back', 7),
 ('Murder', 6),
 ('Fishing for ma