In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('employees.csv')
df.head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,8/6/1993,12:42 PM,97308,6.945,True,Marketing
1,Thomas,Male,3/31/1996,6:53 AM,61933,4.17,True,
2,Maria,Female,4/23/1993,11:17 AM,130590,11.858,False,Finance
3,Jerry,Male,3/4/2005,1:00 PM,138705,9.34,True,Finance
4,Larry,Male,1/24/1998,4:47 PM,101004,1.389,True,Client Services


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   First Name         933 non-null    object 
 1   Gender             855 non-null    object 
 2   Start Date         1000 non-null   object 
 3   Last Login Time    1000 non-null   object 
 4   Salary             1000 non-null   int64  
 5   Bonus %            1000 non-null   float64
 6   Senior Management  933 non-null    object 
 7   Team               957 non-null    object 
dtypes: float64(1), int64(1), object(6)
memory usage: 62.6+ KB


De cara já podemos verificar que exitem valores ausentes no nosso dataset, além de que algumas colunas estão com o tipo de dados 'string', o que não é nem um pouco vantajoso se pensarmos nas operações que gostariamos de realizar com 'datas' por exemplo, afim de criarmos algumas features ou até mesmo investigarmos melhor os nossos dados.

In [4]:
pd.to_datetime(df['Start Date'])

0     1993-08-06
1     1996-03-31
2     1993-04-23
3     2005-03-04
4     1998-01-24
         ...    
995   2014-11-23
996   1984-01-31
997   2013-05-20
998   2013-04-20
999   2012-05-15
Name: Start Date, Length: 1000, dtype: datetime64[ns]

In [5]:
df['Start Date'] = pd.to_datetime(df['Start Date'])

In [6]:
df.head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,12:42 PM,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,6:53 AM,61933,4.17,True,
2,Maria,Female,1993-04-23,11:17 AM,130590,11.858,False,Finance
3,Jerry,Male,2005-03-04,1:00 PM,138705,9.34,True,Finance
4,Larry,Male,1998-01-24,4:47 PM,101004,1.389,True,Client Services


In [7]:
df['Last Login Time'] = pd.to_datetime(df['Last Login Time'])

In [8]:
df.head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2023-06-27 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2023-06-27 06:53:00,61933,4.17,True,
2,Maria,Female,1993-04-23,2023-06-27 11:17:00,130590,11.858,False,Finance
3,Jerry,Male,2005-03-04,2023-06-27 13:00:00,138705,9.34,True,Finance
4,Larry,Male,1998-01-24,2023-06-27 16:47:00,101004,1.389,True,Client Services


In [9]:
df['Senior Management'] = df['Senior Management'].astype("bool")

df.head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2023-06-27 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2023-06-27 06:53:00,61933,4.17,True,
2,Maria,Female,1993-04-23,2023-06-27 11:17:00,130590,11.858,False,Finance


In [10]:
df['Gender'] = df['Gender'].astype('category')
df.head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2023-06-27 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2023-06-27 06:53:00,61933,4.17,True,
2,Maria,Female,1993-04-23,2023-06-27 11:17:00,130590,11.858,False,Finance


In [11]:
df.info() # otimizamos o uso na memória por parte do nosso dataframe

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   First Name         933 non-null    object        
 1   Gender             855 non-null    category      
 2   Start Date         1000 non-null   datetime64[ns]
 3   Last Login Time    1000 non-null   datetime64[ns]
 4   Salary             1000 non-null   int64         
 5   Bonus %            1000 non-null   float64       
 6   Senior Management  1000 non-null   bool          
 7   Team               957 non-null    object        
dtypes: bool(1), category(1), datetime64[ns](2), float64(1), int64(1), object(2)
memory usage: 49.1+ KB


### Explorando outra forma de convertermos colunas com datas no nosso dataframe

In [12]:
df_parse_dates = pd.read_csv("employees.csv", parse_dates = ["Start Date", "Last Login Time"]) # otimiza a conversão de datas já no import do dataset
df_parse_dates['Gender'] = df_parse_dates['Gender'].astype("category")
df_parse_dates['Senior Management'] = df_parse_dates['Senior Management'].astype("bool")

df_parse_dates.head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2023-06-27 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2023-06-27 06:53:00,61933,4.17,True,
2,Maria,Female,1993-04-23,2023-06-27 11:17:00,130590,11.858,False,Finance
3,Jerry,Male,2005-03-04,2023-06-27 13:00:00,138705,9.34,True,Finance
4,Larry,Male,1998-01-24,2023-06-27 16:47:00,101004,1.389,True,Client Services


# **Filter a Dataframe based on a condition**

In [13]:
df_conditions = pd.read_csv("employees.csv", parse_dates = ["Start Date", "Last Login Time"]) # otimiza a conversão de datas já no import do dataset
df_conditions['Gender'] = df_conditions['Gender'].astype("category")
df_conditions['Senior Management'] = df_conditions['Senior Management'].astype("bool")

df_conditions.head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2023-06-27 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2023-06-27 06:53:00,61933,4.17,True,
2,Maria,Female,1993-04-23,2023-06-27 11:17:00,130590,11.858,False,Finance
3,Jerry,Male,2005-03-04,2023-06-27 13:00:00,138705,9.34,True,Finance
4,Larry,Male,1998-01-24,2023-06-27 16:47:00,101004,1.389,True,Client Services


In [14]:
df_conditions[df_conditions["Gender"] == "Male"].head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2023-06-27 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2023-06-27 06:53:00,61933,4.17,True,
3,Jerry,Male,2005-03-04,2023-06-27 13:00:00,138705,9.34,True,Finance
4,Larry,Male,1998-01-24,2023-06-27 16:47:00,101004,1.389,True,Client Services
5,Dennis,Male,1987-04-18,2023-06-27 01:35:00,115163,10.125,False,Legal


In [25]:
df_conditions[df_conditions["Team"] == "Finance"].head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
2,Maria,Female,1993-04-23,2023-06-27 11:17:00,130590,11.858,False,Finance
3,Jerry,Male,2005-03-04,2023-06-27 13:00:00,138705,9.34,True,Finance
7,,Female,2015-07-20,2023-06-27 10:43:00,45906,11.598,True,Finance
14,Kimberly,Female,1999-01-14,2023-06-27 07:13:00,41426,14.543,True,Finance
46,Bruce,Male,2009-11-28,2023-06-27 22:47:00,114796,6.796,False,Finance


Podemos realizar a mesma operação acima de uma maneira mais elegante, uma vez que a sintaxe acima pode ser um pouco confusa. Veja o exemplo a seguir:

In [26]:
mask = df_conditions["Team"] == "Finance"
df_conditions[mask].head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
2,Maria,Female,1993-04-23,2023-06-27 11:17:00,130590,11.858,False,Finance
3,Jerry,Male,2005-03-04,2023-06-27 13:00:00,138705,9.34,True,Finance
7,,Female,2015-07-20,2023-06-27 10:43:00,45906,11.598,True,Finance
14,Kimberly,Female,1999-01-14,2023-06-27 07:13:00,41426,14.543,True,Finance
46,Bruce,Male,2009-11-28,2023-06-27 22:47:00,114796,6.796,False,Finance


In [27]:
# Como a nossa coluna "Senior Management" possui valores booleanos, então a sintaxe pra obter os valores verdadeiros ou falsos é mais simples

df_conditions[~df_conditions["Senior Management"]].head() 

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
2,Maria,Female,1993-04-23,2023-06-27 11:17:00,130590,11.858,False,Finance
5,Dennis,Male,1987-04-18,2023-06-27 01:35:00,115163,10.125,False,Legal
13,Gary,Male,2008-01-27,2023-06-27 23:40:00,109831,5.831,False,Sales
15,Lillian,Female,2016-06-05,2023-06-27 06:09:00,59414,1.256,False,Product
16,Jeremy,Male,2010-09-21,2023-06-27 05:56:00,90370,7.369,False,Human Resources


In [28]:
team_is_not_marketing = df_conditions["Team"] != "Marketing" # Obtemos todas as equipes que não são a de "Marketing".
df_conditions[team_is_not_marketing].head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
1,Thomas,Male,1996-03-31,2023-06-27 06:53:00,61933,4.17,True,
2,Maria,Female,1993-04-23,2023-06-27 11:17:00,130590,11.858,False,Finance
3,Jerry,Male,2005-03-04,2023-06-27 13:00:00,138705,9.34,True,Finance
4,Larry,Male,1998-01-24,2023-06-27 16:47:00,101004,1.389,True,Client Services
5,Dennis,Male,1987-04-18,2023-06-27 01:35:00,115163,10.125,False,Legal


In [29]:
df_conditions[df_conditions["Salary"] > 110_000]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
2,Maria,Female,1993-04-23,2023-06-27 11:17:00,130590,11.858,False,Finance
3,Jerry,Male,2005-03-04,2023-06-27 13:00:00,138705,9.340,True,Finance
5,Dennis,Male,1987-04-18,2023-06-27 01:35:00,115163,10.125,False,Legal
9,Frances,Female,2002-08-08,2023-06-27 06:51:00,139852,7.524,True,Business Development
12,Brandon,Male,1980-12-01,2023-06-27 01:08:00,112807,17.492,True,Human Resources
...,...,...,...,...,...,...,...,...
987,Gloria,Female,2014-12-08,2023-06-27 05:08:00,136709,10.331,True,Finance
991,Rose,Female,2002-08-25,2023-06-27 05:12:00,134505,11.051,True,Marketing
992,Anthony,Male,2011-10-16,2023-06-27 08:35:00,112769,11.625,True,Finance
995,Henry,,2014-11-23,2023-06-27 06:09:00,132483,16.655,False,Distribution


In [31]:
# Filtrando o dataset para obtermos todos os funcionários que tenham um bônus menor do que 1.5%

menores_bonus = df_conditions["Bonus %"] < 1.5
df_conditions[menores_bonus].head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
4,Larry,Male,1998-01-24,2023-06-27 16:47:00,101004,1.389,True,Client Services
15,Lillian,Female,2016-06-05,2023-06-27 06:09:00,59414,1.256,False,Product
58,Theresa,Female,2010-04-11,2023-06-27 07:18:00,72670,1.481,True,Engineering
77,Charles,Male,2004-09-14,2023-06-27 20:13:00,107391,1.26,True,Marketing
175,Willie,Male,1998-02-17,2023-06-27 20:20:00,146651,1.451,True,Engineering


In [34]:
# Agora vamos obter todos os funcionários que começaram antes ou em 1º de janeiro de 1985

old_funcionarios = df_conditions["Start Date"] <= "1985-01-01"
df_conditions[old_funcionarios].head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
10,Louise,Female,1980-08-12,2023-06-27 09:01:00,63241,15.132,True,
12,Brandon,Male,1980-12-01,2023-06-27 01:08:00,112807,17.492,True,Human Resources
18,Diana,Female,1981-10-23,2023-06-27 10:27:00,132940,19.082,False,Client Services
28,Terry,Male,1981-11-27,2023-06-27 18:30:00,124008,13.464,True,Client Services
37,Linda,Female,1981-10-19,2023-06-27 20:49:00,57427,9.557,True,Client Services


# **Filter with more than one condition (AND OPERATOR)**

In [35]:
df = pd.read_csv("employees.csv", parse_dates = ["Start Date", "Last Login Time"])
df["Gender"] = df["Gender"].astype("category")
df["Senior Management"] = df["Senior Management"].astype("bool")
df.head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2023-06-27 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2023-06-27 06:53:00,61933,4.17,True,
2,Maria,Female,1993-04-23,2023-06-27 11:17:00,130590,11.858,False,Finance


### As boas práticas nos dizem para armazenarmos condições muito grandes em variáveis diferentes que representem muito bem a regrade negócio envolvida no filtro condicional, isso pode ajudar muito na legibilidade do código.

In [37]:
genero_masculino = df["Gender"] == "Male"
time_de_marketing = df["Team"] == "Marketing"

df[genero_masculino & time_de_marketing].head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2023-06-27 12:42:00,97308,6.945,True,Marketing
21,Matthew,Male,1995-09-05,2023-06-27 02:12:00,100612,13.645,False,Marketing
26,Craig,Male,2000-02-27,2023-06-27 07:45:00,37598,7.757,True,Marketing
74,Thomas,Male,1995-06-04,2023-06-27 14:24:00,62096,17.029,False,Marketing
77,Charles,Male,2004-09-14,2023-06-27 20:13:00,107391,1.26,True,Marketing


In [44]:
# Testando condicionais

# Não podemos criar condicionais sem os parênteses, como por exemplo: df["Gender"] == "Male" & df["Team"] == "Marketing"
# Pois devemos adicionar os parênteses para indicar a ordem correta das operações lógicas
# A sintaxe abaixo seria a forma correta da sintaxe fornecida acima:

(df["Gender"] == "Male") & (df["Team"] == "Marketing")

0       True
1      False
2      False
3      False
4      False
       ...  
995    False
996    False
997    False
998    False
999    False
Length: 1000, dtype: bool