In [1]:
import numpy as np
import pandas as pd

# Pandas
* Series
* Dataframe

In [2]:
# Create series contains ages of 5 staffs
age_lst = [25,27,24,28,30]
age_ser = pd.Series(age_lst)
age_ser

0    25
1    27
2    24
3    28
4    30
dtype: int64

In [3]:
# Create series with index
age_ser = pd.Series([25,27,24,28,30,np.nan],
                    index=['em1','em2','em3','em4','em5','em6'])
age_ser

em1    25.0
em2    27.0
em3    24.0
em4    28.0
em5    30.0
em6     NaN
dtype: float64

## Attribute

In [4]:
#Number of elements in series
print(age_ser.size)
#Number of elements and dimension of series
print(age_ser.shape)
#Index of series
print(age_ser.index)
#Values of series
print(age_ser.values)
#Data type of elements in a series
print(age_ser.dtype)

6
(6,)
Index(['em1', 'em2', 'em3', 'em4', 'em5', 'em6'], dtype='object')
[25. 27. 24. 28. 30. nan]
float64


## Method

In [5]:
#Info of series
print(age_ser.info())
print("-"*6)
#5 first elements 
print(age_ser.head())
print("-"*6)
#5 last elements
print(age_ser.tail())
print("-"*6)
#3 first elements
print(age_ser.head(3))
print("-"*6)
#General information
print(age_ser.describe())

<class 'pandas.core.series.Series'>
Index: 6 entries, em1 to em6
Series name: None
Non-Null Count  Dtype  
--------------  -----  
5 non-null      float64
dtypes: float64(1)
memory usage: 268.0+ bytes
None
------
em1    25.0
em2    27.0
em3    24.0
em4    28.0
em5    30.0
dtype: float64
------
em2    27.0
em3    24.0
em4    28.0
em5    30.0
em6     NaN
dtype: float64
------
em1    25.0
em2    27.0
em3    24.0
dtype: float64
------
count     5.000000
mean     26.800000
std       2.387467
min      24.000000
25%      25.000000
50%      27.000000
75%      28.000000
max      30.000000
dtype: float64


## Query
* Using loc when query by label
* Using iloc when query by index

In [6]:
#Get the value of element having index = 2 and label = 'em3'
print(age_ser.loc['em3'])
print(age_ser.iloc[2])

#Get the value of the last element
print(age_ser[-1])

24.0
24.0
nan


  print(age_ser[-1])


## Query many elements at the same time
* series.loc[label list]
* series.loc[start:stop]
* series.iloc[start:stop]
* series.iloc[index list]

In [7]:
# Get the values of index 1 and 2
print(age_ser.iloc[1:3])

# Get the values of index 0,1,4
print(age_ser.iloc[[0,1,4]])

# Get the values of label em1, em3
print(age_ser.loc[['em1','em3']])

# Get the values of label from em1 to em3
print(age_ser.loc['em1':'em3'])

em2    27.0
em3    24.0
dtype: float64
em1    25.0
em2    27.0
em5    30.0
dtype: float64
em1    25.0
em3    24.0
dtype: float64
em1    25.0
em2    27.0
em3    24.0
dtype: float64


### Query by condition
series[condition]

In [8]:
# Value > 25
print(age_ser[age_ser > 25])

# Value > 25 and < 30
print(age_ser[(age_ser> 25) & (age_ser<30)])

# Value <= 35 or value >= 30
print(age_ser[(age_ser <= 25) | (age_ser >= 30)])

# Value is not > 25
print(age_ser[~(age_ser >25)])

em2    27.0
em4    28.0
em5    30.0
dtype: float64
em2    27.0
em4    28.0
dtype: float64
em1    25.0
em3    24.0
em5    30.0
dtype: float64
em1    25.0
em3    24.0
em6     NaN
dtype: float64


## Duplicate value

In [9]:
# phone_ser 
phone_ser = pd.Series(['0912846759', '0914963258', '0978254361', '0335469512', '0914963258','0914963258', '0914963258'])
phone_ser

0    0912846759
1    0914963258
2    0978254361
3    0335469512
4    0914963258
5    0914963258
6    0914963258
dtype: object

In [10]:
#duplicate value
phone_ser.duplicated() #default keep = 'first', it means first occurence is True
phone_ser.duplicated(keep='last') #last occurence is True
phone_ser.duplicated(keep=False) #all ocurrence are True
phone_ser.drop_duplicates()
phone_ser.drop_duplicates(keep='last')

0    0912846759
2    0978254361
3    0335469512
6    0914963258
dtype: object

## Dataframe

In [11]:
#Create df from list
lst = [1,5,7,8,10]
df = pd.DataFrame(lst,index=['a','b','c','d','e'],columns=['A'])
df

Unnamed: 0,A
a,1
b,5
c,7
d,8
e,10


In [12]:
#Create df from list nest list
lst = [['An',19],['Tuan',20],['Binh',22]]
df = pd.DataFrame(lst,index=['a','b','c'],columns=['name','age'])
df

Unnamed: 0,name,age
a,An,19
b,Tuan,20
c,Binh,22


In [13]:
#Create df from dictionary
emp_df = pd.DataFrame({'name':['Tom','Mike','Rose','Bill','Dick', 'John', 'Tom'],
                       'age':[25,27,24,28,30,27,24],
                       'salary':[1200, 1500, 3000, 1200, 1500, 1300, 1250]},
                      index=['hv1','hv2','hv3','hv4','hv5','hv6','hv7'],
                     columns=['name','salary','age'])
emp_df

Unnamed: 0,name,salary,age
hv1,Tom,1200,25
hv2,Mike,1500,27
hv3,Rose,3000,24
hv4,Bill,1200,28
hv5,Dick,1500,30
hv6,John,1300,27
hv7,Tom,1250,24


### Attributes of dataframe

In [14]:
#Size of df
emp_df.size

#Shape of df
emp_df.shape

#Index of df
emp_df.index

#Columns of df
emp_df.columns

#Label index
emp_df.axes

#Value in df
emp_df.values #return array 2D

#Data type of cols
emp_df.dtypes

name      object
salary     int64
age        int64
dtype: object

### Methods of dataframe
Basically, there are the same as series

In [15]:
#Information
emp_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7 entries, hv1 to hv7
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   name    7 non-null      object
 1   salary  7 non-null      int64 
 2   age     7 non-null      int64 
dtypes: int64(2), object(1)
memory usage: 224.0+ bytes


In [16]:
emp_df.head()
emp_df.tail()
emp_df.describe() #statistic of all columns which are int,float type

Unnamed: 0,salary,age
count,7.0,7.0
mean,1564.285714,26.428571
std,645.958129,2.225395
min,1200.0,24.0
25%,1225.0,24.5
50%,1300.0,27.0
75%,1500.0,27.5
max,3000.0,30.0


In [17]:
#statistic of all columns which are object type
emp_df.describe(include='object')

Unnamed: 0,name
count,7
unique,6
top,Tom
freq,2


In [18]:
#statistic of all columns no matter which type is
emp_df.describe(include='all')

Unnamed: 0,name,salary,age
count,7,7.0,7.0
unique,6,,
top,Tom,,
freq,2,,
mean,,1564.285714,26.428571
std,,645.958129,2.225395
min,,1200.0,24.0
25%,,1225.0,24.5
50%,,1300.0,27.0
75%,,1500.0,27.5


### Read file
In this file, I only use the csv, and excel

In [19]:
em_df = pd.read_csv("data/emp.csv")
emp_df.head()

Unnamed: 0,name,salary,age
hv1,Tom,1200,25
hv2,Mike,1500,27
hv3,Rose,3000,24
hv4,Bill,1200,28
hv5,Dick,1500,30


In [20]:
#Using the first col as index
euro = pd.read_csv("data/euro2012.csv",index_col=0)
euro.head()

Unnamed: 0,Team,Goals,Shots on target,Shots off target,Shooting Accuracy,% Goals-to-shots,Total shots (inc. Blocked),Hit Woodwork,Penalty goals,Penalties not scored,...,Saves made,Saves-to-shots ratio,Fouls Won,Fouls Conceded,Offsides,Yellow Cards,Red Cards,Subs on,Subs off,Players Used
0,Croatia,4,13,12,51.9%,16.0%,32,0,0,0,...,13,81.3%,41,62,2,9,0,9,9,16
1,Czech Republic,4,13,18,41.9%,12.9%,39,0,0,0,...,9,60.1%,53,73,8,7,0,11,11,19
2,Denmark,4,10,10,50.0%,20.0%,27,1,0,0,...,10,66.7%,25,38,8,4,0,7,7,15
3,England,5,11,18,50.0%,17.2%,40,0,0,0,...,22,88.1%,43,45,6,5,0,11,11,16
4,France,3,22,24,37.9%,6.5%,65,1,0,0,...,6,54.6%,36,51,5,6,0,11,11,19


In [21]:
#When reading file tsv, it is read as csv and if the file has no header, let create cols list and assign to names
cols = ['col1','col2','col3','col4']
shark = pd.read_csv('data/shark.tsv',delimiter='\t',nrows=10,names=cols)
shark.head()

Unnamed: 0,col1,col2,col3,col4
0,Shark Attacks,USA,Australia,South Africa
1,Jan,28,94,68
2,Feb,27,78,32
3,Mar,66,63,34
4,Apr,103,54,25


In [23]:
#Reading excel file
#If it is nessessary, install openpyxl and using parameter engine = 'openpyxl'ArithmeticError
em_excel = pd.read_excel('data/employees.xlsx', engine='openpyxl')
em_excel.head()

Unnamed: 0,EMPLOYEE_ID,FIRST_NAME,LAST_NAME,EMAIL,PHONE_NUMBER,HIRE_DATE,JOB_ID,SALARY,COMMISSION_PCT,MANAGER_ID,DEPARTMENT_ID
0,100,Steven,King,SKING,515.123.4567,2003-06-17 00:00:00,AD_PRES,24000,,,90.0
1,101,Neena,Kochhar,NKOCHHAR,515.123.4568,2005-09-21 00:00:00,AD_VP,17000,,100.0,90.0
2,102,Lex,De Haan,LDEHAAN,515.123.4569,2001-01-13 00:00:00,AD_VP,17000,,100.0,90.0
3,103,Alexander,Hunold,AHUNOLD,590.423.4567,2006-01-03 00:00:00,IT_PROG,9000,,102.0,60.0
4,104,Bruce,Ernst,BERNST,590.423.4568,2007-05-21 00:00:00,IT_PROG,6000,,103.0,60.0


In [None]:
#skiprows to skip the number of first rows, and skipfooter is to skip the number of row in the end
can = pd.read_excel("data/Canada.xlxs", skiprows=20,skipfooter=20,sheet_name=1)
can.head()

### Practice with dataframe

In [26]:
# Get the shape of df em_df
em_excel.shape

(107, 11)

In [27]:
#Ger information of dataframe
em_excel.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 107 entries, 0 to 106
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   EMPLOYEE_ID     107 non-null    int64  
 1   FIRST_NAME      107 non-null    object 
 2   LAST_NAME       107 non-null    object 
 3   EMAIL           107 non-null    object 
 4   PHONE_NUMBER    107 non-null    object 
 5   HIRE_DATE       107 non-null    object 
 6   JOB_ID          107 non-null    object 
 7   SALARY          107 non-null    int64  
 8   COMMISSION_PCT  35 non-null     float64
 9   MANAGER_ID      106 non-null    float64
 10  DEPARTMENT_ID   106 non-null    float64
dtypes: float64(3), int64(2), object(6)
memory usage: 9.3+ KB


In [28]:
#Get list of cols and data types
em_excel.dtypes

EMPLOYEE_ID         int64
FIRST_NAME         object
LAST_NAME          object
EMAIL              object
PHONE_NUMBER       object
HIRE_DATE          object
JOB_ID             object
SALARY              int64
COMMISSION_PCT    float64
MANAGER_ID        float64
DEPARTMENT_ID     float64
dtype: object

In [29]:
#Get information of all columns which are number
em_excel.describe()

Unnamed: 0,EMPLOYEE_ID,SALARY,COMMISSION_PCT,MANAGER_ID,DEPARTMENT_ID
count,107.0,107.0,35.0,106.0,106.0
mean,153.0,6461.831776,0.222857,124.764151,63.207547
std,31.032241,3909.579731,0.085184,20.315395,20.91011
min,100.0,2100.0,0.1,100.0,10.0
25%,126.5,3100.0,0.15,108.0,50.0
50%,153.0,6200.0,0.2,122.0,50.0
75%,179.5,8900.0,0.3,145.0,80.0
max,206.0,24000.0,0.4,205.0,110.0


In [30]:
#Get information of all columns which are object
em_excel.describe(include='object')

Unnamed: 0,FIRST_NAME,LAST_NAME,EMAIL,PHONE_NUMBER,HIRE_DATE,JOB_ID
count,107,107,107,107,107,107
unique,91,102,107,107,98,19
top,Peter,King,SKING,515.123.4567,2002-06-07 00:00:00,SA_REP
freq,3,2,1,1,4,30


In [31]:
em_excel.head()

Unnamed: 0,EMPLOYEE_ID,FIRST_NAME,LAST_NAME,EMAIL,PHONE_NUMBER,HIRE_DATE,JOB_ID,SALARY,COMMISSION_PCT,MANAGER_ID,DEPARTMENT_ID
0,100,Steven,King,SKING,515.123.4567,2003-06-17 00:00:00,AD_PRES,24000,,,90.0
1,101,Neena,Kochhar,NKOCHHAR,515.123.4568,2005-09-21 00:00:00,AD_VP,17000,,100.0,90.0
2,102,Lex,De Haan,LDEHAAN,515.123.4569,2001-01-13 00:00:00,AD_VP,17000,,100.0,90.0
3,103,Alexander,Hunold,AHUNOLD,590.423.4567,2006-01-03 00:00:00,IT_PROG,9000,,102.0,60.0
4,104,Bruce,Ernst,BERNST,590.423.4568,2007-05-21 00:00:00,IT_PROG,6000,,103.0,60.0


In [33]:
#Name of staff in the first row
em_excel.loc[0,'FIRST_NAME']

#or
em_excel.iloc[0,1]

'Steven'

#### Query many elements
* df.iloc[start:stop:step, start:stop:step] not include stop
* df.iloc[start:stop:step, cols index list] not include stop
* df.iloc[row index list, start:stop:step]
* df.loc[row label list, col label list]
* df.loc[start:stop:step, cols label list] #include stop

In [34]:
## 'FIRST_NAME', 'LAST_NAME', 'SALARY' của các nhân viên ở dòng 1, 4
em_excel.loc[[1,4],['FIRST_NAME', 'LAST_NAME', 'SALARY']]

Unnamed: 0,FIRST_NAME,LAST_NAME,SALARY
1,Neena,Kochhar,17000
4,Bruce,Ernst,6000


In [36]:
## info from row 2 to row 5 (use index) and cols EMPLOYEE_ID,LAST_NAME,JOB_ID of index 0,2,6
em_excel.iloc[2:6,[0,2,6]]

Unnamed: 0,EMPLOYEE_ID,LAST_NAME,JOB_ID
2,102,De Haan,AD_VP
3,103,Hunold,IT_PROG
4,104,Ernst,IT_PROG
5,105,Austin,IT_PROG


In [37]:
## info of employee has index 1,2,4 and cols has index from 0 to 2(inclue 2)
em_excel.iloc[[1,2,4],:3]

Unnamed: 0,EMPLOYEE_ID,FIRST_NAME,LAST_NAME
1,101,Neena,Kochhar
2,102,Lex,De Haan
4,104,Bruce,Ernst


In [39]:
# show info from row 2 to 6(use label) and cols EMPLOYEE_ID, LAST_NAME, JOB_ID
em_excel.loc[2:6,["EMPLOYEE_ID", "LAST_NAME", "JOB_ID"]]

Unnamed: 0,EMPLOYEE_ID,LAST_NAME,JOB_ID
2,102,De Haan,AD_VP
3,103,Hunold,IT_PROG
4,104,Ernst,IT_PROG
5,105,Austin,IT_PROG
6,106,Pataballa,IT_PROG


In [40]:
# show infor from row 2 to 6 (use iloc) and cols from 1 to 3
em_excel.iloc[2:6,1:3]

Unnamed: 0,FIRST_NAME,LAST_NAME
2,Lex,De Haan
3,Alexander,Hunold
4,Bruce,Ernst
5,David,Austin


### Take values of a column or many cols
* df.loc[:,"col"]
* df["col"]
* df.iloc[:, col index]
* df.col
* df[cols list]

In [41]:
em_excel.loc[:,"EMAIL"] #or em_excel["EMAIL"]

0         SKING
1      NKOCHHAR
2       LDEHAAN
3       AHUNOLD
4        BERNST
         ...   
102        PFAY
103     SMAVRIS
104       HBAER
105    SHIGGINS
106      WGIETZ
Name: EMAIL, Length: 107, dtype: object

In [42]:
em_excel["EMAIL"]

0         SKING
1      NKOCHHAR
2       LDEHAAN
3       AHUNOLD
4        BERNST
         ...   
102        PFAY
103     SMAVRIS
104       HBAER
105    SHIGGINS
106      WGIETZ
Name: EMAIL, Length: 107, dtype: object

In [43]:
em_excel.iloc[:,3]

0         SKING
1      NKOCHHAR
2       LDEHAAN
3       AHUNOLD
4        BERNST
         ...   
102        PFAY
103     SMAVRIS
104       HBAER
105    SHIGGINS
106      WGIETZ
Name: EMAIL, Length: 107, dtype: object

In [45]:
em_excel.EMAIL

0         SKING
1      NKOCHHAR
2       LDEHAAN
3       AHUNOLD
4        BERNST
         ...   
102        PFAY
103     SMAVRIS
104       HBAER
105    SHIGGINS
106      WGIETZ
Name: EMAIL, Length: 107, dtype: object

In [44]:
em_excel[['EMPLOYEE_ID','EMAIL']]

Unnamed: 0,EMPLOYEE_ID,EMAIL
0,100,SKING
1,101,NKOCHHAR
2,102,LDEHAAN
3,103,AHUNOLD
4,104,BERNST
...,...,...
102,202,PFAY
103,203,SMAVRIS
104,204,HBAER
105,205,SHIGGINS


### Query row or many rows
df.loc[row label]
df.iloc[row index]

In [46]:
# Get info of row 1
em_excel.loc[1]

EMPLOYEE_ID                       101
FIRST_NAME                      Neena
LAST_NAME                     Kochhar
EMAIL                        NKOCHHAR
PHONE_NUMBER             515.123.4568
HIRE_DATE         2005-09-21 00:00:00
JOB_ID                          AD_VP
SALARY                          17000
COMMISSION_PCT                    NaN
MANAGER_ID                      100.0
DEPARTMENT_ID                    90.0
Name: 1, dtype: object

In [48]:
em_excel.iloc[1]

EMPLOYEE_ID                       101
FIRST_NAME                      Neena
LAST_NAME                     Kochhar
EMAIL                        NKOCHHAR
PHONE_NUMBER             515.123.4568
HIRE_DATE         2005-09-21 00:00:00
JOB_ID                          AD_VP
SALARY                          17000
COMMISSION_PCT                    NaN
MANAGER_ID                      100.0
DEPARTMENT_ID                    90.0
Name: 1, dtype: object

In [49]:
#Get info from row 2 to 5
em_excel.loc[2:5]

Unnamed: 0,EMPLOYEE_ID,FIRST_NAME,LAST_NAME,EMAIL,PHONE_NUMBER,HIRE_DATE,JOB_ID,SALARY,COMMISSION_PCT,MANAGER_ID,DEPARTMENT_ID
2,102,Lex,De Haan,LDEHAAN,515.123.4569,2001-01-13 00:00:00,AD_VP,17000,,100.0,90.0
3,103,Alexander,Hunold,AHUNOLD,590.423.4567,2006-01-03 00:00:00,IT_PROG,9000,,102.0,60.0
4,104,Bruce,Ernst,BERNST,590.423.4568,2007-05-21 00:00:00,IT_PROG,6000,,103.0,60.0
5,105,David,Austin,DAUSTIN,590.423.4569,2005-06-25 00:00:00,IT_PROG,4800,,103.0,60.0


In [50]:
em_excel.iloc[2:6]

Unnamed: 0,EMPLOYEE_ID,FIRST_NAME,LAST_NAME,EMAIL,PHONE_NUMBER,HIRE_DATE,JOB_ID,SALARY,COMMISSION_PCT,MANAGER_ID,DEPARTMENT_ID
2,102,Lex,De Haan,LDEHAAN,515.123.4569,2001-01-13 00:00:00,AD_VP,17000,,100.0,90.0
3,103,Alexander,Hunold,AHUNOLD,590.423.4567,2006-01-03 00:00:00,IT_PROG,9000,,102.0,60.0
4,104,Bruce,Ernst,BERNST,590.423.4568,2007-05-21 00:00:00,IT_PROG,6000,,103.0,60.0
5,105,David,Austin,DAUSTIN,590.423.4569,2005-06-25 00:00:00,IT_PROG,4800,,103.0,60.0


In [51]:
# Get info of row 2,3,5
em_excel.loc[[2,3,5]]

Unnamed: 0,EMPLOYEE_ID,FIRST_NAME,LAST_NAME,EMAIL,PHONE_NUMBER,HIRE_DATE,JOB_ID,SALARY,COMMISSION_PCT,MANAGER_ID,DEPARTMENT_ID
2,102,Lex,De Haan,LDEHAAN,515.123.4569,2001-01-13 00:00:00,AD_VP,17000,,100.0,90.0
3,103,Alexander,Hunold,AHUNOLD,590.423.4567,2006-01-03 00:00:00,IT_PROG,9000,,102.0,60.0
5,105,David,Austin,DAUSTIN,590.423.4569,2005-06-25 00:00:00,IT_PROG,4800,,103.0,60.0


### Query by condition
df[condition]

In [52]:
# Get data when JOB_ID == IT_PROG
em_excel[em_excel.JOB_ID == 'IT_PROG']

Unnamed: 0,EMPLOYEE_ID,FIRST_NAME,LAST_NAME,EMAIL,PHONE_NUMBER,HIRE_DATE,JOB_ID,SALARY,COMMISSION_PCT,MANAGER_ID,DEPARTMENT_ID
3,103,Alexander,Hunold,AHUNOLD,590.423.4567,2006-01-03 00:00:00,IT_PROG,9000,,102.0,60.0
4,104,Bruce,Ernst,BERNST,590.423.4568,2007-05-21 00:00:00,IT_PROG,6000,,103.0,60.0
5,105,David,Austin,DAUSTIN,590.423.4569,2005-06-25 00:00:00,IT_PROG,4800,,103.0,60.0
6,106,Valli,Pataballa,VPATABAL,590.423.4560,2006-02-05 00:00:00,IT_PROG,4800,,103.0,60.0
7,107,Diana,Lorentz,DLORENTZ,590.423.5567,2007-02-07 00:00:00,IT_PROG,4200,,103.0,60.0


In [54]:
# Get data when JOB_ID == IT_PROG and only show col job id and salary
em_excel[em_excel.JOB_ID == 'IT_PROG'][['JOB_ID', 'SALARY']]

Unnamed: 0,JOB_ID,SALARY
3,IT_PROG,9000
4,IT_PROG,6000
5,IT_PROG,4800
6,IT_PROG,4800
7,IT_PROG,4200


In [55]:
#using loc and condition
em_excel.loc[(em_excel.JOB_ID == 'IT_PROG'),['JOB_ID', 'SALARY']]

Unnamed: 0,JOB_ID,SALARY
3,IT_PROG,9000
4,IT_PROG,6000
5,IT_PROG,4800
6,IT_PROG,4800
7,IT_PROG,4200


In [57]:
# find the row has salary is max
em_excel[(em_excel.SALARY == em_excel.SALARY.max())]

Unnamed: 0,EMPLOYEE_ID,FIRST_NAME,LAST_NAME,EMAIL,PHONE_NUMBER,HIRE_DATE,JOB_ID,SALARY,COMMISSION_PCT,MANAGER_ID,DEPARTMENT_ID
0,100,Steven,King,SKING,515.123.4567,2003-06-17 00:00:00,AD_PRES,24000,,,90.0


In [58]:
em_excel[(em_excel['DEPARTMENT_ID'] == 60) & (em_excel['SALARY'] > 5000)]

Unnamed: 0,EMPLOYEE_ID,FIRST_NAME,LAST_NAME,EMAIL,PHONE_NUMBER,HIRE_DATE,JOB_ID,SALARY,COMMISSION_PCT,MANAGER_ID,DEPARTMENT_ID
3,103,Alexander,Hunold,AHUNOLD,590.423.4567,2006-01-03 00:00:00,IT_PROG,9000,,102.0,60.0
4,104,Bruce,Ernst,BERNST,590.423.4568,2007-05-21 00:00:00,IT_PROG,6000,,103.0,60.0


In [60]:
em_excel.dtypes

EMPLOYEE_ID         int64
FIRST_NAME         object
LAST_NAME          object
EMAIL              object
PHONE_NUMBER       object
HIRE_DATE          object
JOB_ID             object
SALARY              int64
COMMISSION_PCT    float64
MANAGER_ID        float64
DEPARTMENT_ID     float64
dtype: object

In [61]:
em_excel.EMPLOYEE_ID.unique()

array([100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112,
       113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125,
       126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138,
       139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151,
       152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164,
       165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177,
       178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190,
       191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203,
       204, 205, 206])

In [62]:
em_excel.loc[em_excel['EMPLOYEE_ID'] == 110, 'SALARY']

10    8200
Name: SALARY, dtype: int64

In [63]:
#set new index, using employee_ID to be index
em_excel.set_index('EMPLOYEE_ID', inplace=True)
em_excel

Unnamed: 0_level_0,FIRST_NAME,LAST_NAME,EMAIL,PHONE_NUMBER,HIRE_DATE,JOB_ID,SALARY,COMMISSION_PCT,MANAGER_ID,DEPARTMENT_ID
EMPLOYEE_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
100,Steven,King,SKING,515.123.4567,2003-06-17 00:00:00,AD_PRES,24000,,,90.0
101,Neena,Kochhar,NKOCHHAR,515.123.4568,2005-09-21 00:00:00,AD_VP,17000,,100.0,90.0
102,Lex,De Haan,LDEHAAN,515.123.4569,2001-01-13 00:00:00,AD_VP,17000,,100.0,90.0
103,Alexander,Hunold,AHUNOLD,590.423.4567,2006-01-03 00:00:00,IT_PROG,9000,,102.0,60.0
104,Bruce,Ernst,BERNST,590.423.4568,2007-05-21 00:00:00,IT_PROG,6000,,103.0,60.0
...,...,...,...,...,...,...,...,...,...,...
202,Pat,Fay,PFAY,603.123.6666,2005-08-17 00:00:00,MK_REP,6000,,201.0,20.0
203,Susan,Mavris,SMAVRIS,515.123.7777,2002-06-07 00:00:00,HR_REP,6500,,101.0,40.0
204,Hermann,Baer,HBAER,515.123.8888,2002-06-07 00:00:00,PR_REP,10000,,101.0,70.0
205,Shelley,Higgins,SHIGGINS,515.123.8080,2002-06-07 00:00:00,AC_MGR,12008,,101.0,110.0


In [64]:
# reset index
em_excel.reset_index(inplace=True)
em_excel

Unnamed: 0,EMPLOYEE_ID,FIRST_NAME,LAST_NAME,EMAIL,PHONE_NUMBER,HIRE_DATE,JOB_ID,SALARY,COMMISSION_PCT,MANAGER_ID,DEPARTMENT_ID
0,100,Steven,King,SKING,515.123.4567,2003-06-17 00:00:00,AD_PRES,24000,,,90.0
1,101,Neena,Kochhar,NKOCHHAR,515.123.4568,2005-09-21 00:00:00,AD_VP,17000,,100.0,90.0
2,102,Lex,De Haan,LDEHAAN,515.123.4569,2001-01-13 00:00:00,AD_VP,17000,,100.0,90.0
3,103,Alexander,Hunold,AHUNOLD,590.423.4567,2006-01-03 00:00:00,IT_PROG,9000,,102.0,60.0
4,104,Bruce,Ernst,BERNST,590.423.4568,2007-05-21 00:00:00,IT_PROG,6000,,103.0,60.0
...,...,...,...,...,...,...,...,...,...,...,...
102,202,Pat,Fay,PFAY,603.123.6666,2005-08-17 00:00:00,MK_REP,6000,,201.0,20.0
103,203,Susan,Mavris,SMAVRIS,515.123.7777,2002-06-07 00:00:00,HR_REP,6500,,101.0,40.0
104,204,Hermann,Baer,HBAER,515.123.8888,2002-06-07 00:00:00,PR_REP,10000,,101.0,70.0
105,205,Shelley,Higgins,SHIGGINS,515.123.8080,2002-06-07 00:00:00,AC_MGR,12008,,101.0,110.0


## Duplicated Values in df

In [65]:
data = {
    'fullname': ['John Smith', 'Mary Johnson', 'John Smith', 'Robert Williams', 'Sarah Davis', 'Michael Wilson','John Smith'],
    'job_id': ['IT_PROG', 'SALES_REP', 'IT_PROG', 'HR_REP', 'IT_PROG', 'AC_ACCOUNT', 'IT_PROG'],
    'income': [1000, 1500, 1000, 1200, 1000, 1100, 1200]
}

emp = pd.DataFrame(data)

emp

Unnamed: 0,fullname,job_id,income
0,John Smith,IT_PROG,1000
1,Mary Johnson,SALES_REP,1500
2,John Smith,IT_PROG,1000
3,Robert Williams,HR_REP,1200
4,Sarah Davis,IT_PROG,1000
5,Michael Wilson,AC_ACCOUNT,1100
6,John Smith,IT_PROG,1200


In [66]:
emp.duplicated()

0    False
1    False
2     True
3    False
4    False
5    False
6    False
dtype: bool

In [67]:
emp.duplicated(keep='last')

0     True
1    False
2    False
3    False
4    False
5    False
6    False
dtype: bool

In [68]:
emp.duplicated(keep=False)

0     True
1    False
2     True
3    False
4    False
5    False
6    False
dtype: bool

In [69]:
emp.duplicated(keep=False).sum()

np.int64(2)

In [70]:
#duplicate on fullname and job_id cols
emp.duplicated(subset=['fullname','job_id'])

0    False
1    False
2     True
3    False
4    False
5    False
6     True
dtype: bool

In [71]:
#show duplicated rows on job_id, income cols
emp[emp.duplicated(subset=['fullname','job_id'], keep=False)]

Unnamed: 0,fullname,job_id,income
0,John Smith,IT_PROG,1000
2,John Smith,IT_PROG,1000
6,John Smith,IT_PROG,1200


In [72]:
emp.drop_duplicates(inplace=True)
emp

Unnamed: 0,fullname,job_id,income
0,John Smith,IT_PROG,1000
1,Mary Johnson,SALES_REP,1500
3,Robert Williams,HR_REP,1200
4,Sarah Davis,IT_PROG,1000
5,Michael Wilson,AC_ACCOUNT,1100
6,John Smith,IT_PROG,1200


In [73]:
#delete duplicate on job_id, salary cols
emp.drop_duplicates(subset=['fullname','job_id'], inplace=True)
emp

Unnamed: 0,fullname,job_id,income
0,John Smith,IT_PROG,1000
1,Mary Johnson,SALES_REP,1500
3,Robert Williams,HR_REP,1200
4,Sarah Davis,IT_PROG,1000
5,Michael Wilson,AC_ACCOUNT,1100
