In [1]:
import pandas as pd

# Import Data and Memory Optimization

In [2]:
# import data
employees = pd.read_csv('data/employees.csv') # parse_dates parameter will negate having to convert date types
employees.head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,8/6/1993,12:42 PM,97308,6.945,True,Marketing
1,Thomas,Male,3/31/1996,6:53 AM,61933,4.17,True,
2,Maria,Female,4/23/1993,11:17 AM,130590,11.858,False,Finance
3,Jerry,Male,3/4/2005,1:00 PM,138705,9.34,True,Finance
4,Larry,Male,1/24/1998,4:47 PM,101004,1.389,True,Client Services


In [3]:
# info method
employees.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   First Name         933 non-null    object 
 1   Gender             855 non-null    object 
 2   Start Date         1000 non-null   object 
 3   Last Login Time    1000 non-null   object 
 4   Salary             1000 non-null   int64  
 5   Bonus %            1000 non-null   float64
 6   Senior Management  933 non-null    object 
 7   Team               957 non-null    object 
dtypes: float64(1), int64(1), object(6)
memory usage: 62.6+ KB


In [4]:
# convert start date to datetime data type
employees['Start Date'] = pd.to_datetime(employees['Start Date'])
employees.head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,12:42 PM,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,6:53 AM,61933,4.17,True,
2,Maria,Female,1993-04-23,11:17 AM,130590,11.858,False,Finance
3,Jerry,Male,2005-03-04,1:00 PM,138705,9.34,True,Finance
4,Larry,Male,1998-01-24,4:47 PM,101004,1.389,True,Client Services


In [5]:
# convert senior management to boolean type
employees['Senior Management'] = employees['Senior Management'].astype('bool')
employees.head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,12:42 PM,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,6:53 AM,61933,4.17,True,
2,Maria,Female,1993-04-23,11:17 AM,130590,11.858,False,Finance
3,Jerry,Male,2005-03-04,1:00 PM,138705,9.34,True,Finance
4,Larry,Male,1998-01-24,4:47 PM,101004,1.389,True,Client Services


In [6]:
# convert gender to category type (memory optimization)
employees['Gender'] = employees['Gender'].astype('category')
employees.head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,12:42 PM,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,6:53 AM,61933,4.17,True,
2,Maria,Female,1993-04-23,11:17 AM,130590,11.858,False,Finance
3,Jerry,Male,2005-03-04,1:00 PM,138705,9.34,True,Finance
4,Larry,Male,1998-01-24,4:47 PM,101004,1.389,True,Client Services


In [7]:
# info method (compare to previous info with 62.6KB memory usage)
employees.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   First Name         933 non-null    object        
 1   Gender             855 non-null    category      
 2   Start Date         1000 non-null   datetime64[ns]
 3   Last Login Time    1000 non-null   object        
 4   Salary             1000 non-null   int64         
 5   Bonus %            1000 non-null   float64       
 6   Senior Management  1000 non-null   bool          
 7   Team               957 non-null    object        
dtypes: bool(1), category(1), datetime64[ns](1), float64(1), int64(1), object(3)
memory usage: 49.1+ KB


# Filtering Data (One Condition)

In [8]:
# filtering based on one condition
employees[
    employees['Gender'] == 'Male'
].head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,12:42 PM,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,6:53 AM,61933,4.17,True,
3,Jerry,Male,2005-03-04,1:00 PM,138705,9.34,True,Finance
4,Larry,Male,1998-01-24,4:47 PM,101004,1.389,True,Client Services
5,Dennis,Male,1987-04-18,1:35 AM,115163,10.125,False,Legal


In [9]:
# filtering based on one condition (alternate approach)
finance_filter = employees['Team'] == 'Finance'

employees[finance_filter].head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
2,Maria,Female,1993-04-23,11:17 AM,130590,11.858,False,Finance
3,Jerry,Male,2005-03-04,1:00 PM,138705,9.34,True,Finance
7,,Female,2015-07-20,10:43 AM,45906,11.598,True,Finance
14,Kimberly,Female,1999-01-14,7:13 AM,41426,14.543,True,Finance
46,Bruce,Male,2009-11-28,10:47 PM,114796,6.796,False,Finance


In [10]:
# filtering based on one condition (on a boolean variable)
employees[
    # employees['Senior Management']
    employees['Senior Management'] == True # == True is not required but its better to be explicit
].head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,12:42 PM,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,6:53 AM,61933,4.17,True,
3,Jerry,Male,2005-03-04,1:00 PM,138705,9.34,True,Finance
4,Larry,Male,1998-01-24,4:47 PM,101004,1.389,True,Client Services
6,Ruby,Female,1987-08-17,4:20 PM,65476,10.012,True,Product


In [11]:
# filtering based on one condition (not equal)
employees[
    employees['Team'] != 'Marketing'
].head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
1,Thomas,Male,1996-03-31,6:53 AM,61933,4.17,True,
2,Maria,Female,1993-04-23,11:17 AM,130590,11.858,False,Finance
3,Jerry,Male,2005-03-04,1:00 PM,138705,9.34,True,Finance
4,Larry,Male,1998-01-24,4:47 PM,101004,1.389,True,Client Services
5,Dennis,Male,1987-04-18,1:35 AM,115163,10.125,False,Legal


In [12]:
# filtering based on one condition (greater than)
employees[
    employees['Salary'] > 110000
].head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
2,Maria,Female,1993-04-23,11:17 AM,130590,11.858,False,Finance
3,Jerry,Male,2005-03-04,1:00 PM,138705,9.34,True,Finance
5,Dennis,Male,1987-04-18,1:35 AM,115163,10.125,False,Legal
9,Frances,Female,2002-08-08,6:51 AM,139852,7.524,True,Business Development
12,Brandon,Male,1980-12-01,1:08 AM,112807,17.492,True,Human Resources


In [13]:
# filtering based on one condition (less than)
employees[
    employees['Bonus %'] < 1.5
].head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
4,Larry,Male,1998-01-24,4:47 PM,101004,1.389,True,Client Services
15,Lillian,Female,2016-06-05,6:09 AM,59414,1.256,False,Product
58,Theresa,Female,2010-04-11,7:18 AM,72670,1.481,True,Engineering
77,Charles,Male,2004-09-14,8:13 PM,107391,1.26,True,Marketing
175,Willie,Male,1998-02-17,8:20 PM,146651,1.451,True,Engineering


In [14]:
# filtering based on one condition (greater than/equal to a date)
employees[
    employees['Start Date'] <= '1985-01-01'
].head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
10,Louise,Female,1980-08-12,9:01 AM,63241,15.132,True,
12,Brandon,Male,1980-12-01,1:08 AM,112807,17.492,True,Human Resources
18,Diana,Female,1981-10-23,10:27 AM,132940,19.082,False,Client Services
28,Terry,Male,1981-11-27,6:30 PM,124008,13.464,True,Client Services
37,Linda,Female,1981-10-19,8:49 PM,57427,9.557,True,Client Services


# Filtering Data (> One Condition)

In [15]:
# filtering based on two conditions (and)
# when using multiple conditions, it's preferable to store conditions in variables
emp_male = employees['Gender'] == 'Male'
team_marketing = employees['Team'] == 'Marketing'

employees[
    emp_male & team_marketing
].head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,12:42 PM,97308,6.945,True,Marketing
21,Matthew,Male,1995-09-05,2:12 AM,100612,13.645,False,Marketing
26,Craig,Male,2000-02-27,7:45 AM,37598,7.757,True,Marketing
74,Thomas,Male,1995-06-04,2:24 PM,62096,17.029,False,Marketing
77,Charles,Male,2004-09-14,8:13 PM,107391,1.26,True,Marketing


In [16]:
# filtering based on two conditions (or)
snr_manager = employees['Senior Management'] == True
start_date_1990 = employees['Start Date'] < '1990-01-01'

employees[
    snr_manager | start_date_1990
].head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,12:42 PM,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,6:53 AM,61933,4.17,True,
3,Jerry,Male,2005-03-04,1:00 PM,138705,9.34,True,Finance
4,Larry,Male,1998-01-24,4:47 PM,101004,1.389,True,Client Services
5,Dennis,Male,1987-04-18,1:35 AM,115163,10.125,False,Legal


In [17]:
# filtering based on multiple conditions (and/or)
name_robert = employees['First Name'] == 'Robert'
clt_services = employees['Team'] == 'Client Services'
start_date_2016 = employees['Start Date'] > '2016-06-01'

employees[
    (name_robert & clt_services) | start_date_2016
].head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
15,Lillian,Female,2016-06-05,6:09 AM,59414,1.256,False,Product
98,Tina,Female,2016-06-16,7:47 PM,100705,16.961,True,Marketing
387,Robert,Male,1994-10-29,4:26 AM,123294,19.894,False,Client Services
451,Terry,,2016-07-15,12:29 AM,140002,19.49,True,Marketing


# Checking for Inclusion

In [18]:
# isin method for multiple conditions
teams = employees['Team'].isin(
    ['Legal', 'Sales', 'Product']
)

employees[teams].head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
5,Dennis,Male,1987-04-18,1:35 AM,115163,10.125,False,Legal
6,Ruby,Female,1987-08-17,4:20 PM,65476,10.012,True,Product
11,Julie,Female,1997-10-26,3:19 PM,102508,12.637,True,Legal
13,Gary,Male,2008-01-27,11:40 PM,109831,5.831,False,Sales
15,Lillian,Female,2016-06-05,6:09 AM,59414,1.256,False,Product


In [19]:
# isnull method
team_nulls = employees['Team'].isnull()

employees[team_nulls].head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
1,Thomas,Male,1996-03-31,6:53 AM,61933,4.17,True,
10,Louise,Female,1980-08-12,9:01 AM,63241,15.132,True,
23,,Male,2012-06-14,4:19 PM,125792,5.042,True,
32,,Male,1998-08-21,2:27 PM,122340,6.417,True,
91,James,,2005-01-26,11:00 PM,128771,8.309,False,


In [20]:
# notnull method
gender_not_null = employees['Gender'].notnull()

employees[gender_not_null].head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,12:42 PM,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,6:53 AM,61933,4.17,True,
2,Maria,Female,1993-04-23,11:17 AM,130590,11.858,False,Finance
3,Jerry,Male,2005-03-04,1:00 PM,138705,9.34,True,Finance
4,Larry,Male,1998-01-24,4:47 PM,101004,1.389,True,Client Services


In [21]:
# between method (checking for inclusion within a range of values)
salary_range = employees['Salary'].between(60000, 70000) # inclusive

employees[salary_range].head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
1,Thomas,Male,1996-03-31,6:53 AM,61933,4.17,True,
6,Ruby,Female,1987-08-17,4:20 PM,65476,10.012,True,Product
10,Louise,Female,1980-08-12,9:01 AM,63241,15.132,True,
20,Lois,,1995-04-22,7:18 PM,64714,4.934,True,Legal
41,Christine,,2015-06-28,1:08 AM,66582,11.308,True,Business Development


In [22]:
# between method (checking for inclusion within a range of values)
bonus_range = employees['Bonus %'].between(2.0, 5.0)

employees[bonus_range].head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
1,Thomas,Male,1996-03-31,6:53 AM,61933,4.17,True,
20,Lois,,1995-04-22,7:18 PM,64714,4.934,True,Legal
40,Michael,Male,2008-10-10,11:25 AM,99283,2.665,True,Distribution
49,Chris,,1980-01-24,12:13 PM,113590,3.055,False,Sales
60,Paula,,2005-11-23,2:01 PM,48866,4.271,False,Distribution


In [23]:
# between method (checking for inclusion within a range of values)
dates_range = employees['Start Date'].between('1991-01-01', '1992-01-01')

employees[dates_range].head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
27,Scott,,1991-07-11,6:58 PM,122367,5.218,False,Legal
75,Bonnie,Female,1991-07-02,1:27 AM,104897,5.118,True,Human Resources
88,Donna,Female,1991-11-27,1:59 PM,64088,6.155,True,Legal
116,,Male,1991-06-22,8:58 PM,76189,18.988,True,Legal
148,Patrick,,1991-07-14,2:24 AM,124488,14.837,True,Sales


In [24]:
# between method (checking for inclusion within a range of values)
login_range = employees['Last Login Time'].between('0830', '1200') # pandas allows flexibility with formatting

employees[login_range].head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
2,Maria,Female,1993-04-23,11:17 AM,130590,11.858,False,Finance
7,,Female,2015-07-20,10:43 AM,45906,11.598,True,Finance
13,Gary,Male,2008-01-27,11:40 PM,109831,5.831,False,Sales
18,Diana,Female,1981-10-23,10:27 AM,132940,19.082,False,Client Services
24,John,Male,1992-07-01,10:08 PM,97950,13.873,False,Client Services


# Duplicate Data (Identify and Delete)

In [25]:
# check for duplicate data using duplicate method
employees_sorted = employees.sort_values('First Name').copy()

# below code marks duplicates with a boolean value (True)
employees_sorted['First Name'].duplicated(keep = 'first') # first is not a duplicate and subsequent are
employees_sorted['First Name'].duplicated(keep = 'last') # last is not a duplicate and prior are
employees_sorted['First Name'].duplicated(keep = False) # all are marked as duplicates

# return data frame with the specified condition
employees_sorted[
    ~employees_sorted['First Name'].duplicated(keep = False)
] # this returns all unique first_name records due to the tilde operator (reverses boolean values)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
8,Angela,Female,2005-11-22,6:29 AM,95570,18.523,True,Engineering
688,Brian,Male,2007-04-07,10:47 PM,93901,17.821,True,Legal
190,Carol,Female,1996-03-19,3:39 AM,57783,9.129,False,Finance
887,David,Male,2009-12-05,8:48 AM,92242,15.407,False,Legal
5,Dennis,Male,1987-04-18,1:35 AM,115163,10.125,False,Legal
495,Eugene,Male,1984-05-24,10:54 AM,81077,2.117,False,Sales
33,Jean,Female,1993-12-18,9:07 AM,119082,16.18,False,Business Development
832,Keith,Male,2003-02-12,3:02 PM,120672,19.467,False,Legal
291,Tammy,Female,1984-11-11,10:30 AM,132839,17.463,True,Client Services


In [26]:
# drop_duplicates method (by deafult it checks for duplicates across entire row)
employees_sorted.drop_duplicates(subset = ['First Name'], keep = 'first').head() # keeps the first occurence of a duplicate

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
101,Aaron,Male,2012-02-17,10:20 AM,61602,11.849,True,Marketing
137,Adam,Male,2011-05-21,1:45 AM,95327,15.12,False,Distribution
300,Alan,Male,1988-06-26,3:54 AM,111786,3.592,True,Engineering
372,Albert,Male,1997-02-01,4:20 PM,67827,19.717,True,Engineering
988,Alice,Female,2004-10-05,9:34 AM,47638,11.209,False,Human Resources


In [27]:
# drop ALL duplicates in a specified column
employees_sorted.drop_duplicates(subset = ['First Name'], keep = False) # drops any first_name with a duplicate entry

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
8,Angela,Female,2005-11-22,6:29 AM,95570,18.523,True,Engineering
688,Brian,Male,2007-04-07,10:47 PM,93901,17.821,True,Legal
190,Carol,Female,1996-03-19,3:39 AM,57783,9.129,False,Finance
887,David,Male,2009-12-05,8:48 AM,92242,15.407,False,Legal
5,Dennis,Male,1987-04-18,1:35 AM,115163,10.125,False,Legal
495,Eugene,Male,1984-05-24,10:54 AM,81077,2.117,False,Sales
33,Jean,Female,1993-12-18,9:07 AM,119082,16.18,False,Business Development
832,Keith,Male,2003-02-12,3:02 PM,120672,19.467,False,Legal
291,Tammy,Female,1984-11-11,10:30 AM,132839,17.463,True,Client Services


In [28]:
# drop ALL duplicates in a specified column when there are no unique values
employees_sorted.drop_duplicates(subset = ['Team'], keep = False) # will return empty data frame

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team


In [29]:
# dropping duplicates across multiple columns
# two records for Aaron since there is a different team
employees_sorted.drop_duplicates(subset = ['First Name', 'Team'], keep = 'first').head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
101,Aaron,Male,2012-02-17,10:20 AM,61602,11.849,True,Marketing
440,Aaron,Male,1990-07-22,2:53 PM,52119,11.343,True,Client Services
137,Adam,Male,2011-05-21,1:45 AM,95327,15.12,False,Distribution
141,Adam,Male,1990-12-24,8:57 PM,110194,14.727,True,Product
302,Adam,Male,2007-07-05,11:59 AM,71276,5.027,True,Human Resources


# Distinct Values

In [30]:
# identifying unique values
employees['Gender'].unique()

['Male', 'Female', NaN]
Categories (2, object): ['Female', 'Male']

In [31]:
# identifying unique values
employees['Team'].unique()

array(['Marketing', nan, 'Finance', 'Client Services', 'Legal', 'Product',
       'Engineering', 'Business Development', 'Human Resources', 'Sales',
       'Distribution'], dtype=object)

In [32]:
# returning count of unique values  (drops NaN by default)
employees['Team'].nunique(dropna = False)

11