In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the CSV once into `data_origin` and leave that frame untouched;
# all later cells operate on the independent copy `data`.
data_origin = pd.read_csv("Data_pandas.csv")
data = data_origin.copy()

In [3]:
# Preview the first five rows of the working copy.
data.head(5)

Unnamed: 0,ID,Name,LogInTime,LogOutTime,PurchaseNum,Location,PurchaseAmount,Saving,Purchase
0,101,Jack,2/10/18 13:10,2/10/18 14:30,1,Home,150.0,1500,Yes
1,101,Jack,2/15/18 18:00,2/15/18 19:00,0,Home,,1350,No
2,101,Jack,2/15/18 20:00,2/15/18 22:30,0,Home,,1350,No
3,101,Jack,3/11/18 10:50,3/11/18 15:00,2,Outside,190.0,1350,Yes
4,101,Jack,3/11/18 15:15,3/11/18 18:00,1,Home,30.0,1160,Yes


# Selecting a subset of data from a DataFrame

## Using positions to select data (`iloc[]`)

### All examples below select the 2nd through 5th rows and the 3rd through 5th columns.

#### Select the data using a range (slice) of integer positions


In [4]:
# Positional slices are end-exclusive: row positions 1-4 and column positions 2-4.
data.iloc[1:5,2:5] 

Unnamed: 0,LogInTime,LogOutTime,PurchaseNum
1,2/15/18 18:00,2/15/18 19:00,0
2,2/15/18 20:00,2/15/18 22:30,0
3,3/11/18 10:50,3/11/18 15:00,2
4,3/11/18 15:15,3/11/18 18:00,1


#### Select the data using lists of integer positions


In [5]:
# Explicit lists of integer positions; this picks the same rows and columns
# as the slice form `data.iloc[1:5, 2:5]`.
data.iloc[[1,2,3,4],[2,3,4]]

Unnamed: 0,LogInTime,LogOutTime,PurchaseNum
1,2/15/18 18:00,2/15/18 19:00,0
2,2/15/18 20:00,2/15/18 22:30,0
3,3/11/18 10:50,3/11/18 15:00,2
4,3/11/18 15:15,3/11/18 18:00,1


## Using column names & row index labels to select data (`loc[]`)

In [6]:
# Set new index: replace the default integer index with string labels
# "index_0", "index_1", ... (one per row) for the label-based `loc[]` selections.
new_index = ["index_" + str(i) for i in range(len(data))]
data.index = new_index

#### Select the data using a range (slice) of index labels and column names

In [7]:
# Unlike positional slices, label slices include both endpoints:
# "index_4" and "PurchaseNum" are part of the result.
data.loc["index_1":"index_4","LogInTime":"PurchaseNum"] 

Unnamed: 0,LogInTime,LogOutTime,PurchaseNum
index_1,2/15/18 18:00,2/15/18 19:00,0
index_2,2/15/18 20:00,2/15/18 22:30,0
index_3,3/11/18 10:50,3/11/18 15:00,2
index_4,3/11/18 15:15,3/11/18 18:00,1


#### Select the data using lists of index labels and column names

In [8]:
# Explicit label lists: first the row labels, then the column names.
data.loc[
    ["index_1", "index_2", "index_3", "index_4"],
    ["LogInTime", "LogOutTime", "PurchaseNum"],
]

Unnamed: 0,LogInTime,LogOutTime,PurchaseNum
index_1,2/15/18 18:00,2/15/18 19:00,0
index_2,2/15/18 20:00,2/15/18 22:30,0
index_3,3/11/18 10:50,3/11/18 15:00,2
index_4,3/11/18 15:15,3/11/18 18:00,1


## Using a mix of position & column names/row index to select data

In [9]:
# Rows by position first (positions 1-4), then columns by name (inclusive label slice).
data.iloc[1:5].loc[:, "LogInTime":"PurchaseNum"]

Unnamed: 0,LogInTime,LogOutTime,PurchaseNum
index_1,2/15/18 18:00,2/15/18 19:00,0
index_2,2/15/18 20:00,2/15/18 22:30,0
index_3,3/11/18 10:50,3/11/18 15:00,2
index_4,3/11/18 15:15,3/11/18 18:00,1


In [10]:
# Rows by label first (inclusive label slice), then columns by position (positions 2-4).
data.loc["index_1":"index_4"].iloc[:, 2:5]

Unnamed: 0,LogInTime,LogOutTime,PurchaseNum
index_1,2/15/18 18:00,2/15/18 19:00,0
index_2,2/15/18 20:00,2/15/18 22:30,0
index_3,3/11/18 10:50,3/11/18 15:00,2
index_4,3/11/18 15:15,3/11/18 18:00,1


## Using boolean masks to select data

In [11]:
# Row mask as a plain NumPy array: True where more than two purchases were made.
np.asarray(data["PurchaseNum"].gt(2))

array([False, False, False, False, False,  True, False, False, False,
       False, False, False, False, False,  True, False,  True, False,
       False])

In [12]:
# Boolean mask over the column labels: True for the three columns named in the list.
data.columns.isin(["LogInTime","LogOutTime","PurchaseNum"])

array([False, False,  True,  True,  True, False, False, False, False])

In [13]:
# Combine a row mask and a column mask: rows with more than two purchases,
# restricted to the three listed columns.
data.loc[
    data["PurchaseNum"] > 2,
    data.columns.isin(["LogInTime", "LogOutTime", "PurchaseNum"]),
]

Unnamed: 0,LogInTime,LogOutTime,PurchaseNum
index_5,5/10/18 18:00,5/10/18 21:00,4
index_14,6/7/18 15:25,6/7/18 21:45,4
index_16,6/20/18 17:30,6/20/18 23:00,3


In [14]:
# Same row mask, but `~` inverts the column mask: keep every column
# except the three listed ones.
data.loc[
    data["PurchaseNum"] > 2,
    ~data.columns.isin(["LogInTime", "LogOutTime", "PurchaseNum"]),
]

Unnamed: 0,ID,Name,Location,PurchaseAmount,Saving,Purchase
index_5,101,Jack,Home,100.0,1130,Yes
index_14,103,Tom,Outside,200.0,3420,Yes
index_16,103,Tom,Work,60.0,3125,Yes


# Basic Stats function for DataFrame 

## For categorical variable column

### `.value_counts()` returns the count of each level of a categorical variable

In [15]:
# value_counts() sorts by count in descending order by default,
# so head(1) shows the most frequent level.
data["Location"].value_counts().head(1)

Home    9
Name: Location, dtype: int64

### `.unique()` returns the unique values of a categorical variable

In [16]:
# Count the distinct levels once and report the number.
# (A bare expression that is not the last line of a cell is never displayed,
# so the count is stored in a variable instead of being evaluated twice.)
n_levels = len(data["Location"].unique())
print("There are %r levels in Location." % n_levels)

There are 3 levels in Location.


In [17]:
# The distinct levels themselves, returned as a NumPy array.
data["Location"].unique()

array(['Home', 'Outside', 'Work'], dtype=object)

## For continuous variable column

In [18]:
data["PurchaseAmount"].min()  # Minimum value; missing (NaN) entries are skipped

20.0

In [19]:
data["PurchaseAmount"].mean()  # Average of the non-missing values

78.2

In [20]:
data["PurchaseAmount"].max()  # Maximum value; missing (NaN) entries are skipped

200.0

In [21]:
data["PurchaseAmount"].median()  # Median of the non-missing values

50.0

In [22]:
data["PurchaseAmount"].quantile(0.5)  # The 50th percentile is equal to the median

50.0

In [23]:
data["PurchaseAmount"].quantile(0.25)  # 25th percentile (first quartile)

40.0

In [24]:
data["PurchaseAmount"].quantile(0.75)  # 75th percentile (third quartile)

97.5

In [25]:
# Count missing (True) vs. present (False) entries in PurchaseAmount.
# Select the column first so the NaN mask is built for that column only,
# not for the whole frame.
data["PurchaseAmount"].isna().value_counts()

False    15
True      4
Name: PurchaseAmount, dtype: int64

In [26]:
# Series.where keeps the values where the condition is True and puts NaN
# everywhere else, so the result has the same index and length as the input.
# This differs from np.where, which returns positions (one-argument form)
# or chooses element-wise between two arrays (three-argument form).
data["PurchaseAmount"].where(data["PurchaseAmount"] > 50)


index_0     150.0
index_1       NaN
index_2       NaN
index_3     190.0
index_4       NaN
index_5     100.0
index_6       NaN
index_7       NaN
index_8       NaN
index_9      80.0
index_10      NaN
index_11      NaN
index_12      NaN
index_13      NaN
index_14    200.0
index_15     95.0
index_16     60.0
index_17      NaN
index_18      NaN
Name: PurchaseAmount, dtype: float64

# Handling missing data in DataFrame

In [27]:
# Separate copy for the missing-data examples, independent of `data`.
data_missing = data.copy()

In [28]:
# Returns a new frame in which every cell equal to 1 becomes 5;
# `data_missing` itself is left unchanged (no in-place modification).
data_missing.replace(to_replace=1, value=5)

Unnamed: 0,ID,Name,LogInTime,LogOutTime,PurchaseNum,Location,PurchaseAmount,Saving,Purchase
index_0,101,Jack,2/10/18 13:10,2/10/18 14:30,5,Home,150.0,1500,Yes
index_1,101,Jack,2/15/18 18:00,2/15/18 19:00,0,Home,,1350,No
index_2,101,Jack,2/15/18 20:00,2/15/18 22:30,0,Home,,1350,No
index_3,101,Jack,3/11/18 10:50,3/11/18 15:00,2,Outside,190.0,1350,Yes
index_4,101,Jack,3/11/18 15:15,3/11/18 18:00,5,Home,30.0,1160,Yes
index_5,101,Jack,5/10/18 18:00,5/10/18 21:00,4,Home,100.0,1130,Yes
index_6,101,Jack,7/12/18 17:40,7/12/18 18:20,2,Work,28.0,1030,Yes
index_7,101,Jack,6/5/18 15:25,6/5/18 19:45,0,Home,,1002,No
index_8,102,Amy,5/1/18 14:15,5/1/18 18:00,0,Work,,800,No
index_9,102,Amy,6/10/18 16:00,6/10/18 23:00,2,Home,80.0,800,Yes


In [29]:
# Drop every column that contains at least one missing value
# (returns a new frame; `data_missing` is not modified).
data_missing.dropna(axis="columns").head(3)

Unnamed: 0,ID,Name,LogInTime,LogOutTime,PurchaseNum,Location,Saving,Purchase
index_0,101,Jack,2/10/18 13:10,2/10/18 14:30,1,Home,1500,Yes
index_1,101,Jack,2/15/18 18:00,2/15/18 19:00,0,Home,1350,No
index_2,101,Jack,2/15/18 20:00,2/15/18 22:30,0,Home,1350,No


In [30]:
# Drop every row that contains at least one missing value
# (returns a new frame; `data_missing` is not modified).
data_missing.dropna(axis="index").head(3)

Unnamed: 0,ID,Name,LogInTime,LogOutTime,PurchaseNum,Location,PurchaseAmount,Saving,Purchase
index_0,101,Jack,2/10/18 13:10,2/10/18 14:30,1,Home,150.0,1500,Yes
index_3,101,Jack,3/11/18 10:50,3/11/18 15:00,2,Outside,190.0,1350,Yes
index_4,101,Jack,3/11/18 15:15,3/11/18 18:00,1,Home,30.0,1160,Yes


In [31]:
# Element-wise mask of the whole frame: True marks a missing value.
data_missing.isna().head(3)

Unnamed: 0,ID,Name,LogInTime,LogOutTime,PurchaseNum,Location,PurchaseAmount,Saving,Purchase
index_0,False,False,False,False,False,False,False,False,False
index_1,False,False,False,False,False,False,True,False,False
index_2,False,False,False,False,False,False,True,False,False


In [32]:
# Tally missing (True) vs. present (False) entries in PurchaseAmount.
data_missing["PurchaseAmount"].isna().value_counts()

False    15
True      4
Name: PurchaseAmount, dtype: int64

In [33]:
# Display the frame again: the replace/dropna calls above were not in-place,
# so the missing values are still present here.
data_missing

Unnamed: 0,ID,Name,LogInTime,LogOutTime,PurchaseNum,Location,PurchaseAmount,Saving,Purchase
index_0,101,Jack,2/10/18 13:10,2/10/18 14:30,1,Home,150.0,1500,Yes
index_1,101,Jack,2/15/18 18:00,2/15/18 19:00,0,Home,,1350,No
index_2,101,Jack,2/15/18 20:00,2/15/18 22:30,0,Home,,1350,No
index_3,101,Jack,3/11/18 10:50,3/11/18 15:00,2,Outside,190.0,1350,Yes
index_4,101,Jack,3/11/18 15:15,3/11/18 18:00,1,Home,30.0,1160,Yes
index_5,101,Jack,5/10/18 18:00,5/10/18 21:00,4,Home,100.0,1130,Yes
index_6,101,Jack,7/12/18 17:40,7/12/18 18:20,2,Work,28.0,1030,Yes
index_7,101,Jack,6/5/18 15:25,6/5/18 19:45,0,Home,,1002,No
index_8,102,Amy,5/1/18 14:15,5/1/18 18:00,0,Work,,800,No
index_9,102,Amy,6/10/18 16:00,6/10/18 23:00,2,Home,80.0,800,Yes


In [34]:
# Fill every missing cell with the constant 100000 (returns a new frame).
data_missing.fillna(100000).head(3)

Unnamed: 0,ID,Name,LogInTime,LogOutTime,PurchaseNum,Location,PurchaseAmount,Saving,Purchase
index_0,101,Jack,2/10/18 13:10,2/10/18 14:30,1,Home,150.0,1500,Yes
index_1,101,Jack,2/15/18 18:00,2/15/18 19:00,0,Home,100000.0,1350,No
index_2,101,Jack,2/15/18 20:00,2/15/18 22:30,0,Home,100000.0,1350,No


In [35]:
# Forward fill: each missing value takes the last valid value above it.
# `ffill()` is the direct equivalent of `fillna(method="ffill")`, whose
# `method=` keyword is deprecated in recent pandas releases.
data_missing.ffill().head(3)

Unnamed: 0,ID,Name,LogInTime,LogOutTime,PurchaseNum,Location,PurchaseAmount,Saving,Purchase
index_0,101,Jack,2/10/18 13:10,2/10/18 14:30,1,Home,150.0,1500,Yes
index_1,101,Jack,2/15/18 18:00,2/15/18 19:00,0,Home,150.0,1350,No
index_2,101,Jack,2/15/18 20:00,2/15/18 22:30,0,Home,150.0,1350,No


In [36]:
# Backward fill: each missing value takes the next valid value below it.
# `bfill()` is the direct equivalent of `fillna(method="bfill")`, whose
# `method=` keyword is deprecated in recent pandas releases.
data_missing.bfill().head(3)

Unnamed: 0,ID,Name,LogInTime,LogOutTime,PurchaseNum,Location,PurchaseAmount,Saving,Purchase
index_0,101,Jack,2/10/18 13:10,2/10/18 14:30,1,Home,150.0,1500,Yes
index_1,101,Jack,2/15/18 18:00,2/15/18 19:00,0,Home,190.0,1350,No
index_2,101,Jack,2/15/18 20:00,2/15/18 22:30,0,Home,190.0,1350,No
