In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('employees.csv')
df.head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,8/6/1993,12:42 PM,97308,6.945,True,Marketing
1,Thomas,Male,3/31/1996,6:53 AM,61933,4.17,True,
2,Maria,Female,4/23/1993,11:17 AM,130590,11.858,False,Finance
3,Jerry,Male,3/4/2005,1:00 PM,138705,9.34,True,Finance
4,Larry,Male,1/24/1998,4:47 PM,101004,1.389,True,Client Services


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   First Name         933 non-null    object 
 1   Gender             855 non-null    object 
 2   Start Date         1000 non-null   object 
 3   Last Login Time    1000 non-null   object 
 4   Salary             1000 non-null   int64  
 5   Bonus %            1000 non-null   float64
 6   Senior Management  933 non-null    object 
 7   Team               957 non-null    object 
dtypes: float64(1), int64(1), object(6)
memory usage: 62.6+ KB


De cara já podemos verificar que exitem valores ausentes no nosso dataset, além de que algumas colunas estão com o tipo de dados 'string', o que não é nem um pouco vantajoso se pensarmos nas operações que gostariamos de realizar com 'datas' por exemplo, afim de criarmos algumas features ou até mesmo investigarmos melhor os nossos dados.

In [4]:
pd.to_datetime(df['Start Date'])

0     1993-08-06
1     1996-03-31
2     1993-04-23
3     2005-03-04
4     1998-01-24
         ...    
995   2014-11-23
996   1984-01-31
997   2013-05-20
998   2013-04-20
999   2012-05-15
Name: Start Date, Length: 1000, dtype: datetime64[ns]

In [5]:
df['Start Date'] = pd.to_datetime(df['Start Date'])

In [6]:
df.head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,12:42 PM,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,6:53 AM,61933,4.17,True,
2,Maria,Female,1993-04-23,11:17 AM,130590,11.858,False,Finance
3,Jerry,Male,2005-03-04,1:00 PM,138705,9.34,True,Finance
4,Larry,Male,1998-01-24,4:47 PM,101004,1.389,True,Client Services


In [7]:
df['Last Login Time'] = pd.to_datetime(df['Last Login Time'])

In [8]:
df.head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2023-07-08 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2023-07-08 06:53:00,61933,4.17,True,
2,Maria,Female,1993-04-23,2023-07-08 11:17:00,130590,11.858,False,Finance
3,Jerry,Male,2005-03-04,2023-07-08 13:00:00,138705,9.34,True,Finance
4,Larry,Male,1998-01-24,2023-07-08 16:47:00,101004,1.389,True,Client Services


In [9]:
df['Senior Management'] = df['Senior Management'].astype("bool")

df.head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2023-07-08 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2023-07-08 06:53:00,61933,4.17,True,
2,Maria,Female,1993-04-23,2023-07-08 11:17:00,130590,11.858,False,Finance


In [10]:
df['Gender'] = df['Gender'].astype('category')
df.head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2023-07-08 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2023-07-08 06:53:00,61933,4.17,True,
2,Maria,Female,1993-04-23,2023-07-08 11:17:00,130590,11.858,False,Finance


In [11]:
df.info() # otimizamos o uso na memória por parte do nosso dataframe

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   First Name         933 non-null    object        
 1   Gender             855 non-null    category      
 2   Start Date         1000 non-null   datetime64[ns]
 3   Last Login Time    1000 non-null   datetime64[ns]
 4   Salary             1000 non-null   int64         
 5   Bonus %            1000 non-null   float64       
 6   Senior Management  1000 non-null   bool          
 7   Team               957 non-null    object        
dtypes: bool(1), category(1), datetime64[ns](2), float64(1), int64(1), object(2)
memory usage: 49.1+ KB


### Explorando outra forma de convertermos colunas com datas no nosso dataframe

In [12]:
df_parse_dates = pd.read_csv("employees.csv", parse_dates = ["Start Date", "Last Login Time"]) # otimiza a conversão de datas já no import do dataset
df_parse_dates['Gender'] = df_parse_dates['Gender'].astype("category")
df_parse_dates['Senior Management'] = df_parse_dates['Senior Management'].astype("bool")

df_parse_dates.head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2023-07-08 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2023-07-08 06:53:00,61933,4.17,True,
2,Maria,Female,1993-04-23,2023-07-08 11:17:00,130590,11.858,False,Finance
3,Jerry,Male,2005-03-04,2023-07-08 13:00:00,138705,9.34,True,Finance
4,Larry,Male,1998-01-24,2023-07-08 16:47:00,101004,1.389,True,Client Services


# **Filter a Dataframe based on a condition**

In [13]:
df_conditions = pd.read_csv("employees.csv", parse_dates = ["Start Date", "Last Login Time"]) # otimiza a conversão de datas já no import do dataset
df_conditions['Gender'] = df_conditions['Gender'].astype("category")
df_conditions['Senior Management'] = df_conditions['Senior Management'].astype("bool")

df_conditions.head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2023-07-08 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2023-07-08 06:53:00,61933,4.17,True,
2,Maria,Female,1993-04-23,2023-07-08 11:17:00,130590,11.858,False,Finance
3,Jerry,Male,2005-03-04,2023-07-08 13:00:00,138705,9.34,True,Finance
4,Larry,Male,1998-01-24,2023-07-08 16:47:00,101004,1.389,True,Client Services


In [14]:
df_conditions[df_conditions["Gender"] == "Male"].head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2023-07-08 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2023-07-08 06:53:00,61933,4.17,True,
3,Jerry,Male,2005-03-04,2023-07-08 13:00:00,138705,9.34,True,Finance
4,Larry,Male,1998-01-24,2023-07-08 16:47:00,101004,1.389,True,Client Services
5,Dennis,Male,1987-04-18,2023-07-08 01:35:00,115163,10.125,False,Legal


In [15]:
df_conditions[df_conditions["Team"] == "Finance"].head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
2,Maria,Female,1993-04-23,2023-07-08 11:17:00,130590,11.858,False,Finance
3,Jerry,Male,2005-03-04,2023-07-08 13:00:00,138705,9.34,True,Finance
7,,Female,2015-07-20,2023-07-08 10:43:00,45906,11.598,True,Finance
14,Kimberly,Female,1999-01-14,2023-07-08 07:13:00,41426,14.543,True,Finance
46,Bruce,Male,2009-11-28,2023-07-08 22:47:00,114796,6.796,False,Finance


Podemos realizar a mesma operação acima de uma maneira mais elegante, uma vez que a sintaxe acima pode ser um pouco confusa. Veja o exemplo a seguir:

In [16]:
mask = df_conditions["Team"] == "Finance"
df_conditions[mask].head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
2,Maria,Female,1993-04-23,2023-07-08 11:17:00,130590,11.858,False,Finance
3,Jerry,Male,2005-03-04,2023-07-08 13:00:00,138705,9.34,True,Finance
7,,Female,2015-07-20,2023-07-08 10:43:00,45906,11.598,True,Finance
14,Kimberly,Female,1999-01-14,2023-07-08 07:13:00,41426,14.543,True,Finance
46,Bruce,Male,2009-11-28,2023-07-08 22:47:00,114796,6.796,False,Finance


In [17]:
# Como a nossa coluna "Senior Management" possui valores booleanos, então a sintaxe pra obter os valores verdadeiros ou falsos é mais simples

df_conditions[~df_conditions["Senior Management"]].head() 

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
2,Maria,Female,1993-04-23,2023-07-08 11:17:00,130590,11.858,False,Finance
5,Dennis,Male,1987-04-18,2023-07-08 01:35:00,115163,10.125,False,Legal
13,Gary,Male,2008-01-27,2023-07-08 23:40:00,109831,5.831,False,Sales
15,Lillian,Female,2016-06-05,2023-07-08 06:09:00,59414,1.256,False,Product
16,Jeremy,Male,2010-09-21,2023-07-08 05:56:00,90370,7.369,False,Human Resources


In [18]:
team_is_not_marketing = df_conditions["Team"] != "Marketing" # Obtemos todas as equipes que não são a de "Marketing".
df_conditions[team_is_not_marketing].head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
1,Thomas,Male,1996-03-31,2023-07-08 06:53:00,61933,4.17,True,
2,Maria,Female,1993-04-23,2023-07-08 11:17:00,130590,11.858,False,Finance
3,Jerry,Male,2005-03-04,2023-07-08 13:00:00,138705,9.34,True,Finance
4,Larry,Male,1998-01-24,2023-07-08 16:47:00,101004,1.389,True,Client Services
5,Dennis,Male,1987-04-18,2023-07-08 01:35:00,115163,10.125,False,Legal


In [19]:
df_conditions[df_conditions["Salary"] > 110_000]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
2,Maria,Female,1993-04-23,2023-07-08 11:17:00,130590,11.858,False,Finance
3,Jerry,Male,2005-03-04,2023-07-08 13:00:00,138705,9.340,True,Finance
5,Dennis,Male,1987-04-18,2023-07-08 01:35:00,115163,10.125,False,Legal
9,Frances,Female,2002-08-08,2023-07-08 06:51:00,139852,7.524,True,Business Development
12,Brandon,Male,1980-12-01,2023-07-08 01:08:00,112807,17.492,True,Human Resources
...,...,...,...,...,...,...,...,...
987,Gloria,Female,2014-12-08,2023-07-08 05:08:00,136709,10.331,True,Finance
991,Rose,Female,2002-08-25,2023-07-08 05:12:00,134505,11.051,True,Marketing
992,Anthony,Male,2011-10-16,2023-07-08 08:35:00,112769,11.625,True,Finance
995,Henry,,2014-11-23,2023-07-08 06:09:00,132483,16.655,False,Distribution


In [20]:
# Filtrando o dataset para obtermos todos os funcionários que tenham um bônus menor do que 1.5%

menores_bonus = df_conditions["Bonus %"] < 1.5
df_conditions[menores_bonus].head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
4,Larry,Male,1998-01-24,2023-07-08 16:47:00,101004,1.389,True,Client Services
15,Lillian,Female,2016-06-05,2023-07-08 06:09:00,59414,1.256,False,Product
58,Theresa,Female,2010-04-11,2023-07-08 07:18:00,72670,1.481,True,Engineering
77,Charles,Male,2004-09-14,2023-07-08 20:13:00,107391,1.26,True,Marketing
175,Willie,Male,1998-02-17,2023-07-08 20:20:00,146651,1.451,True,Engineering


In [21]:
# Agora vamos obter todos os funcionários que começaram antes ou em 1º de janeiro de 1985

old_funcionarios = df_conditions["Start Date"] <= "1985-01-01"
df_conditions[old_funcionarios].head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
10,Louise,Female,1980-08-12,2023-07-08 09:01:00,63241,15.132,True,
12,Brandon,Male,1980-12-01,2023-07-08 01:08:00,112807,17.492,True,Human Resources
18,Diana,Female,1981-10-23,2023-07-08 10:27:00,132940,19.082,False,Client Services
28,Terry,Male,1981-11-27,2023-07-08 18:30:00,124008,13.464,True,Client Services
37,Linda,Female,1981-10-19,2023-07-08 20:49:00,57427,9.557,True,Client Services


# **Filter with more than one condition (AND OPERATOR)**

In [22]:
df = pd.read_csv("employees.csv", parse_dates = ["Start Date", "Last Login Time"])
df["Gender"] = df["Gender"].astype("category")
df["Senior Management"] = df["Senior Management"].astype("bool")
df.head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2023-07-08 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2023-07-08 06:53:00,61933,4.17,True,
2,Maria,Female,1993-04-23,2023-07-08 11:17:00,130590,11.858,False,Finance


### As boas práticas nos dizem para armazenarmos condições muito grandes em variáveis diferentes que representem muito bem a regrade negócio envolvida no filtro condicional, isso pode ajudar muito na legibilidade do código.

In [23]:
genero_masculino = df["Gender"] == "Male"
time_de_marketing = df["Team"] == "Marketing"

df[genero_masculino & time_de_marketing].head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2023-07-08 12:42:00,97308,6.945,True,Marketing
21,Matthew,Male,1995-09-05,2023-07-08 02:12:00,100612,13.645,False,Marketing
26,Craig,Male,2000-02-27,2023-07-08 07:45:00,37598,7.757,True,Marketing
74,Thomas,Male,1995-06-04,2023-07-08 14:24:00,62096,17.029,False,Marketing
77,Charles,Male,2004-09-14,2023-07-08 20:13:00,107391,1.26,True,Marketing


In [24]:
# Testando condicionais

# Não podemos criar condicionais sem os parênteses, como por exemplo: df["Gender"] == "Male" & df["Team"] == "Marketing"
# Pois devemos adicionar os parênteses para indicar a ordem correta das operações lógicas
# A sintaxe abaixo seria a forma correta da sintaxe fornecida acima:

(df["Gender"] == "Male") & (df["Team"] == "Marketing")

0       True
1      False
2      False
3      False
4      False
       ...  
995    False
996    False
997    False
998    False
999    False
Length: 1000, dtype: bool

# **Filter with More than One Condition (OR)**

In [25]:
df.head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2023-07-08 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2023-07-08 06:53:00,61933,4.17,True,
2,Maria,Female,1993-04-23,2023-07-08 11:17:00,130590,11.858,False,Finance


In [26]:
gerencia_senior = df["Senior Management"]
old_funcionario = df["Start Date"] < '1990-01-01'

df[(gerencia_senior) | (old_funcionario)].head() # tinha esqucido que o 'operator' aqui era o pipe

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2023-07-08 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2023-07-08 06:53:00,61933,4.17,True,
3,Jerry,Male,2005-03-04,2023-07-08 13:00:00,138705,9.34,True,Finance
4,Larry,Male,1998-01-24,2023-07-08 16:47:00,101004,1.389,True,Client Services
5,Dennis,Male,1987-04-18,2023-07-08 01:35:00,115163,10.125,False,Legal


In [27]:
nome_is_robert = df["First Name"] == "Robert"
equipe_de_servicos = df["Team"] == "Client Services"
funcionario_desde_junho_de_2016 = df["Start Date"] > "2016-06-01"

df[((nome_is_robert) & (equipe_de_servicos)) | (funcionario_desde_junho_de_2016)]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
15,Lillian,Female,2016-06-05,2023-07-08 06:09:00,59414,1.256,False,Product
98,Tina,Female,2016-06-16,2023-07-08 19:47:00,100705,16.961,True,Marketing
387,Robert,Male,1994-10-29,2023-07-08 04:26:00,123294,19.894,False,Client Services
451,Terry,,2016-07-15,2023-07-08 00:29:00,140002,19.49,True,Marketing


# **The .isin() Method**

In [28]:
df.head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2023-07-08 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2023-07-08 06:53:00,61933,4.17,True,
2,Maria,Female,1993-04-23,2023-07-08 11:17:00,130590,11.858,False,Finance


In [29]:
df["Team"].unique()

array(['Marketing', nan, 'Finance', 'Client Services', 'Legal', 'Product',
       'Engineering', 'Business Development', 'Human Resources', 'Sales',
       'Distribution'], dtype=object)

In [30]:
df[df["Team"].isin(["Legal", "Product", "Sales"])].head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
5,Dennis,Male,1987-04-18,2023-07-08 01:35:00,115163,10.125,False,Legal
6,Ruby,Female,1987-08-17,2023-07-08 16:20:00,65476,10.012,True,Product
11,Julie,Female,1997-10-26,2023-07-08 15:19:00,102508,12.637,True,Legal
13,Gary,Male,2008-01-27,2023-07-08 23:40:00,109831,5.831,False,Sales
15,Lillian,Female,2016-06-05,2023-07-08 06:09:00,59414,1.256,False,Product


# **The .isnull() and .notnull() Methods**

In [31]:
df.head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2023-07-08 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2023-07-08 06:53:00,61933,4.17,True,
2,Maria,Female,1993-04-23,2023-07-08 11:17:00,130590,11.858,False,Finance


In [32]:
equipes_nulas = df["Team"].isnull()
df[equipes_nulas].head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
1,Thomas,Male,1996-03-31,2023-07-08 06:53:00,61933,4.17,True,
10,Louise,Female,1980-08-12,2023-07-08 09:01:00,63241,15.132,True,
23,,Male,2012-06-14,2023-07-08 16:19:00,125792,5.042,True,
32,,Male,1998-08-21,2023-07-08 14:27:00,122340,6.417,True,
91,James,,2005-01-26,2023-07-08 23:00:00,128771,8.309,False,


#### Em resumo, verificamos que podemos utilizar qualquer série de valores booleanos para fins de filtragem de dados;

In [33]:
equipes_nao_nulas = df["Team"].notnull()
df[equipes_nao_nulas].head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2023-07-08 12:42:00,97308,6.945,True,Marketing
2,Maria,Female,1993-04-23,2023-07-08 11:17:00,130590,11.858,False,Finance
3,Jerry,Male,2005-03-04,2023-07-08 13:00:00,138705,9.34,True,Finance
4,Larry,Male,1998-01-24,2023-07-08 16:47:00,101004,1.389,True,Client Services
5,Dennis,Male,1987-04-18,2023-07-08 01:35:00,115163,10.125,False,Legal


## **The .between() method**

In [45]:
df = pd.read_csv("employees.csv", parse_dates=["Start Date", "Last Login Time"])
df["Gender"] = df["Gender"].astype("category")
df["Team"] = df["Team"].astype("category")
df["Senior Management"] = df["Senior Management"].astype("bool")
df["Bonus %"] = df["Bonus %"].round(2)
df.head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2023-07-08 12:42:00,97308,6.94,True,Marketing
1,Thomas,Male,1996-03-31,2023-07-08 06:53:00,61933,4.17,True,
2,Maria,Female,1993-04-23,2023-07-08 11:17:00,130590,11.86,False,Finance
3,Jerry,Male,2005-03-04,2023-07-08 13:00:00,138705,9.34,True,Finance
4,Larry,Male,1998-01-24,2023-07-08 16:47:00,101004,1.39,True,Client Services


In [46]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   First Name         933 non-null    object        
 1   Gender             855 non-null    category      
 2   Start Date         1000 non-null   datetime64[ns]
 3   Last Login Time    1000 non-null   datetime64[ns]
 4   Salary             1000 non-null   int64         
 5   Bonus %            1000 non-null   float64       
 6   Senior Management  1000 non-null   bool          
 7   Team               957 non-null    category      
dtypes: bool(1), category(2), datetime64[ns](2), float64(1), int64(1), object(1)
memory usage: 42.6+ KB


In [54]:
mask = df["Salary"].between(60_000, 70_000, inclusive="both")
df[mask].head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
1,Thomas,Male,1996-03-31,2023-07-08 06:53:00,61933,4.17,True,
6,Ruby,Female,1987-08-17,2023-07-08 16:20:00,65476,10.01,True,Product
10,Louise,Female,1980-08-12,2023-07-08 09:01:00,63241,15.13,True,
20,Lois,,1995-04-22,2023-07-08 19:18:00,64714,4.93,True,Legal
41,Christine,,2015-06-28,2023-07-08 01:08:00,66582,11.31,True,Business Development


In [55]:
mask_bonus = df["Bonus %"].between(2, 5)
df[mask_bonus].head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
1,Thomas,Male,1996-03-31,2023-07-08 06:53:00,61933,4.17,True,
20,Lois,,1995-04-22,2023-07-08 19:18:00,64714,4.93,True,Legal
40,Michael,Male,2008-10-10,2023-07-08 11:25:00,99283,2.66,True,Distribution
49,Chris,,1980-01-24,2023-07-08 12:13:00,113590,3.06,False,Sales
60,Paula,,2005-11-23,2023-07-08 14:01:00,48866,4.27,False,Distribution


In [56]:
mask_date = df["Start Date"].between("1991-01-01", "1992-01-01")
df[mask_date].head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
27,Scott,,1991-07-11,2023-07-08 18:58:00,122367,5.22,False,Legal
75,Bonnie,Female,1991-07-02,2023-07-08 01:27:00,104897,5.12,True,Human Resources
88,Donna,Female,1991-11-27,2023-07-08 13:59:00,64088,6.16,True,Legal
116,,Male,1991-06-22,2023-07-08 20:58:00,76189,18.99,True,Legal
148,Patrick,,1991-07-14,2023-07-08 02:24:00,124488,14.84,True,Sales


In [62]:
df[df["Last Login Time"].between("08:30AM", "12:00PM")].head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
2,Maria,Female,1993-04-23,2023-07-08 11:17:00,130590,11.86,False,Finance
7,,Female,2015-07-20,2023-07-08 10:43:00,45906,11.6,True,Finance
10,Louise,Female,1980-08-12,2023-07-08 09:01:00,63241,15.13,True,
18,Diana,Female,1981-10-23,2023-07-08 10:27:00,132940,19.08,False,Client Services
33,Jean,Female,1993-12-18,2023-07-08 09:07:00,119082,16.18,False,Business Development


## **The .duplicated() method**

In [68]:
df = pd.read_csv("employees.csv", parse_dates=["Start Date", "Last Login Time"])
df["Gender"] = df["Gender"].astype("category")
df["Team"] = df["Team"].astype("category")
df["Bonus %"] = df["Bonus %"].round(2)
df["Senior Management"] = df["Senior Management"].astype("bool")
df.sort_values("First Name", inplace=True) # ordenando os valores para ver nomes duplicados no dataset
df.head(7)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
101,Aaron,Male,2012-02-17,2023-07-08 10:20:00,61602,11.85,True,Marketing
327,Aaron,Male,1994-01-29,2023-07-08 18:48:00,58755,5.1,True,Marketing
440,Aaron,Male,1990-07-22,2023-07-08 14:53:00,52119,11.34,True,Client Services
937,Aaron,,1986-01-22,2023-07-08 19:39:00,63126,18.42,False,Client Services
137,Adam,Male,2011-05-21,2023-07-08 01:45:00,95327,15.12,False,Distribution
141,Adam,Male,1990-12-24,2023-07-08 20:57:00,110194,14.73,True,Product
302,Adam,Male,2007-07-05,2023-07-08 11:59:00,71276,5.03,True,Human Resources


In [69]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000 entries, 101 to 951
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   First Name         933 non-null    object        
 1   Gender             855 non-null    category      
 2   Start Date         1000 non-null   datetime64[ns]
 3   Last Login Time    1000 non-null   datetime64[ns]
 4   Salary             1000 non-null   int64         
 5   Bonus %            1000 non-null   float64       
 6   Senior Management  1000 non-null   bool          
 7   Team               957 non-null    category      
dtypes: bool(1), category(2), datetime64[ns](2), float64(1), int64(1), object(1)
memory usage: 50.3+ KB


In [71]:
df[df["First Name"].duplicated()]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
327,Aaron,Male,1994-01-29,2023-07-08 18:48:00,58755,5.10,True,Marketing
440,Aaron,Male,1990-07-22,2023-07-08 14:53:00,52119,11.34,True,Client Services
937,Aaron,,1986-01-22,2023-07-08 19:39:00,63126,18.42,False,Client Services
141,Adam,Male,1990-12-24,2023-07-08 20:57:00,110194,14.73,True,Product
302,Adam,Male,2007-07-05,2023-07-08 11:59:00,71276,5.03,True,Human Resources
...,...,...,...,...,...,...,...,...
902,,Male,2001-05-23,2023-07-08 19:52:00,103877,6.32,True,Distribution
925,,Female,2000-08-23,2023-07-08 16:19:00,95866,19.39,True,Sales
946,,Female,1985-09-15,2023-07-08 01:50:00,133472,16.94,True,Distribution
947,,Male,2012-07-30,2023-07-08 15:07:00,107351,5.33,True,Marketing


### O conceito por trás do método "duplicated()" denota que o primeiro registro de uma série de registros duplicados, será marcado como "não duplicado", e apenas os registros subsequentes serão considerados DUPLICADOS.

In [74]:
df[df["First Name"].duplicated(keep = False)] # obtendo todas as linhas com valores duplicados

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
101,Aaron,Male,2012-02-17,2023-07-08 10:20:00,61602,11.85,True,Marketing
327,Aaron,Male,1994-01-29,2023-07-08 18:48:00,58755,5.10,True,Marketing
440,Aaron,Male,1990-07-22,2023-07-08 14:53:00,52119,11.34,True,Client Services
937,Aaron,,1986-01-22,2023-07-08 19:39:00,63126,18.42,False,Client Services
137,Adam,Male,2011-05-21,2023-07-08 01:45:00,95327,15.12,False,Distribution
...,...,...,...,...,...,...,...,...
902,,Male,2001-05-23,2023-07-08 19:52:00,103877,6.32,True,Distribution
925,,Female,2000-08-23,2023-07-08 16:19:00,95866,19.39,True,Sales
946,,Female,1985-09-15,2023-07-08 01:50:00,133472,16.94,True,Distribution
947,,Male,2012-07-30,2023-07-08 15:07:00,107351,5.33,True,Marketing


In [75]:
df["First Name"].drop_duplicates() # remove as linhas duplicadas mantendo apenas o primeiro registro de cada ocorrência

101      Aaron
137       Adam
300       Alan
372     Albert
988      Alice
        ...   
433      Wanda
177      Wayne
820    William
450     Willie
7          NaN
Name: First Name, Length: 201, dtype: object

In [76]:
df[~df["First Name"].duplicated(keep = False)] # obtem todos os nomes únicos

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
8,Angela,Female,2005-11-22,2023-07-08 06:29:00,95570,18.52,True,Engineering
688,Brian,Male,2007-04-07,2023-07-08 22:47:00,93901,17.82,True,Legal
190,Carol,Female,1996-03-19,2023-07-08 03:39:00,57783,9.13,False,Finance
887,David,Male,2009-12-05,2023-07-08 08:48:00,92242,15.41,False,Legal
5,Dennis,Male,1987-04-18,2023-07-08 01:35:00,115163,10.12,False,Legal
495,Eugene,Male,1984-05-24,2023-07-08 10:54:00,81077,2.12,False,Sales
33,Jean,Female,1993-12-18,2023-07-08 09:07:00,119082,16.18,False,Business Development
832,Keith,Male,2003-02-12,2023-07-08 15:02:00,120672,19.47,False,Legal
291,Tammy,Female,1984-11-11,2023-07-08 10:30:00,132839,17.46,True,Client Services


In [79]:
df[~df["First Name"].duplicated(keep = "first")] # aqui está uma forma de remover valores duplicados do dataset

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
101,Aaron,Male,2012-02-17,2023-07-08 10:20:00,61602,11.85,True,Marketing
137,Adam,Male,2011-05-21,2023-07-08 01:45:00,95327,15.12,False,Distribution
300,Alan,Male,1988-06-26,2023-07-08 03:54:00,111786,3.59,True,Engineering
372,Albert,Male,1997-02-01,2023-07-08 16:20:00,67827,19.72,True,Engineering
988,Alice,Female,2004-10-05,2023-07-08 09:34:00,47638,11.21,False,Human Resources
...,...,...,...,...,...,...,...,...
433,Wanda,Female,2008-07-20,2023-07-08 13:44:00,65362,7.13,True,Legal
177,Wayne,Male,2012-04-07,2023-07-08 08:00:00,102652,14.08,True,Distribution
820,William,Male,1993-11-18,2023-07-08 12:27:00,54058,5.18,True,Human Resources
450,Willie,Male,2009-08-22,2023-07-08 13:03:00,55038,19.69,False,Legal


## **The .drop_duplicates() method**

In [81]:
df = pd.read_csv("employees.csv", parse_dates=["Start Date", "Last Login Time"])
df["Gender"] = df["Gender"].astype("category")
df["Team"] = df["Team"].astype("category")
df["Bonus %"] = df["Bonus %"].round(2)
df["Senior Management"] = df["Senior Management"].astype("bool")
df.sort_values("First Name", inplace=True) # ordenando os valores para ver nomes duplicados no dataset
df.head(7)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
101,Aaron,Male,2012-02-17,2023-07-08 10:20:00,61602,11.85,True,Marketing
327,Aaron,Male,1994-01-29,2023-07-08 18:48:00,58755,5.1,True,Marketing
440,Aaron,Male,1990-07-22,2023-07-08 14:53:00,52119,11.34,True,Client Services
937,Aaron,,1986-01-22,2023-07-08 19:39:00,63126,18.42,False,Client Services
137,Adam,Male,2011-05-21,2023-07-08 01:45:00,95327,15.12,False,Distribution
141,Adam,Male,1990-12-24,2023-07-08 20:57:00,110194,14.73,True,Product
302,Adam,Male,2007-07-05,2023-07-08 11:59:00,71276,5.03,True,Human Resources


In [82]:
len(df)

1000

In [83]:
len(df.drop_duplicates())

1000

### Nenhuma linha foi removida do nosso dataframe utilizando o método acima, porque para que elas fossem removidas, deveriamos ter registros em nosso dataset onde os dados presentes em uma linha, fossem exatamente iguais aos dados presentes em outra linha qualquer, ou seja, precisariamos ter os mesmos valores para todas as colunas em registros diferentes.

In [84]:
df.drop_duplicates(subset = ["First Name"], keep = "first") 

# aqui estamos removendo valores duplicados apenas na coluna 'First Name', manetendo apenas um registro para cada valor duplicado;
# estamos mantendo o registro pelo indice, então se alterarmos o parâmetro "keep", iremos obter um registro diferente para o 'First Name == "Aaron"';
# Utilizar o parâmetro 'keep = False' irá remover todos os valores duplicados sem manter nenhuma ocorrência que possua o valor duplicado;
# Utilizar mais de um argumento para o parâmetro 'subset' implica em avaliar se existem valores duplicados no registro observado
# para as duas colunas em questão.

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
101,Aaron,Male,2012-02-17,2023-07-08 10:20:00,61602,11.85,True,Marketing
137,Adam,Male,2011-05-21,2023-07-08 01:45:00,95327,15.12,False,Distribution
300,Alan,Male,1988-06-26,2023-07-08 03:54:00,111786,3.59,True,Engineering
372,Albert,Male,1997-02-01,2023-07-08 16:20:00,67827,19.72,True,Engineering
988,Alice,Female,2004-10-05,2023-07-08 09:34:00,47638,11.21,False,Human Resources
...,...,...,...,...,...,...,...,...
433,Wanda,Female,2008-07-20,2023-07-08 13:44:00,65362,7.13,True,Legal
177,Wayne,Male,2012-04-07,2023-07-08 08:00:00,102652,14.08,True,Distribution
820,William,Male,1993-11-18,2023-07-08 12:27:00,54058,5.18,True,Human Resources
450,Willie,Male,2009-08-22,2023-07-08 13:03:00,55038,19.69,False,Legal
