# DataFrame 基礎

In [1]:
import pandas as pd


# Build the sample `member` dataset: a dict that maps each column name to its list of values.
data = dict(
    uid=[1, 2, 3, 4, 5],
    name=["Howard", "Lily", "Kai", "Jojo", "Ivan"],
    age=[25, 21, 35, 18, 15],
)

# dict -> DataFrame: every key becomes a column.
member = pd.DataFrame(data=data)
member

Unnamed: 0,uid,name,age
0,1,Howard,25
1,2,Lily,21
2,3,Kai,35
3,4,Jojo,18
4,5,Ivan,15


In [2]:
# Show the first five rows (head() defaults to 5).
member.head()

Unnamed: 0,uid,name,age
0,1,Howard,25
1,2,Lily,21
2,3,Kai,35
3,4,Jojo,18
4,5,Ivan,15


In [3]:
# Print a per-column summary: non-null counts, dtypes, and memory usage.
member.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   uid     5 non-null      int64 
 1   name    5 non-null      object
 2   age     5 non-null      int64 
dtypes: int64(2), object(1)
memory usage: 252.0+ bytes


In [4]:
# shape is a (number of rows, number of columns) tuple, i.e. (R, C).
member.shape

(5, 3)

In [5]:
# Plain assignment does not copy: `test` is just another name for the same
# DataFrame, so the column rename below changes `member` as well.
test = member

# Not the last expression in the cell, so this value is not displayed.
test.columns

test.columns = ["num", "word", "fix"]

In [6]:
# Rebuild `member` from `data` to avoid errors later on, then work on an
# independent copy so changes to `test` leave `member` untouched.
member = pd.DataFrame(data)
test = member.copy()

In [7]:
# Rename columns.
# Rename all of them at once by assigning a full list of names to .columns.
test.columns = ["numdddd", "1", "2"]
test.columns

Index(['numdddd', '1', '2'], dtype='object')

In [8]:
# Rename only specific columns.
# rename() returns a new DataFrame; the result of this first call is discarded,
# so `test` only changes once the result is assigned back on the next line.
test.rename(columns={"numdddd": "name"})
test = test.rename(columns={"numdddd": "name"})
test

Unnamed: 0,name,1,2
0,1,Howard,25
1,2,Lily,21
2,3,Kai,35
3,4,Jojo,18
4,5,Ivan,15


In [9]:
# Rebuild the `member` DataFrame from `data`.
member = pd.DataFrame(data)

In [10]:
# Select a single column; the result is a Series.
member["name"]

0    Howard
1      Lily
2       Kai
3      Jojo
4      Ivan
Name: name, dtype: object

In [11]:
# Select several columns by passing a list of column names; the result is a DataFrame.
member[["name", "age"]]

Unnamed: 0,name,age
0,Howard,25
1,Lily,21
2,Kai,35
3,Jojo,18
4,Ivan,15


In [12]:
# Select several columns, storing the column names in a list first.
colname = ["name", "age"]
member[colname]

Unnamed: 0,name,age
0,Howard,25
1,Lily,21
2,Kai,35
3,Jojo,18
4,Ivan,15


In [13]:
### Exercise: select the uid and age columns

member[["uid", "age"]]


Unnamed: 0,uid,age
0,1,25
1,2,21
2,3,35
3,4,18
4,5,15


In [14]:
# Reverse the column order: slice the column labels with [::-1], then select in that order.
collist = member.columns[::-1]
df = member[collist]
df

Unnamed: 0,age,name,uid
0,25,Howard,1
1,21,Lily,2
2,35,Kai,3
3,18,Jojo,4
4,15,Ivan,5


In [15]:
# Reverse the row order.
# member.index holds row *labels*, so select with the label-based .loc rather
# than the position-based .iloc; labels and positions only coincide while the
# frame still has its default 0..n-1 index.
indexlist = member.index[::-1]
df = member.loc[indexlist]
df

Unnamed: 0,uid,name,age
4,5,Ivan,15
3,4,Jojo,18
2,3,Kai,35
1,2,Lily,21
0,1,Howard,25


In [16]:
# Change the column order with reindex(columns=...).
df = member.reindex(columns=["age", "uid", "name"])
df

Unnamed: 0,age,uid,name
0,25,1,Howard
1,21,2,Lily
2,35,3,Kai
3,18,4,Jojo
4,15,5,Ivan


In [17]:
# Exercise: reorder the columns of the df above as uid, age, name

df.reindex(columns=["uid", "age", "name"])

Unnamed: 0,uid,age,name
0,1,25,Howard
1,2,21,Lily
2,3,35,Kai
3,4,18,Jojo
4,5,15,Ivan


## 常見計算

In [18]:
# Display the age column used by the calculations in this section.
member["age"]

0    25
1    21
2    35
3    18
4    15
Name: age, dtype: int64

In [19]:
# Mean member age
member["age"].mean()

np.float64(22.8)

In [20]:
# Maximum member age
member["age"].max()

np.int64(35)

In [21]:
# Minimum member age
member["age"].min()

np.int64(15)

In [22]:
# Other common summary statistics (count, mean, std, min, quartiles, max)
member["age"].describe()

count     5.000000
mean     22.800000
std       7.758866
min      15.000000
25%      18.000000
50%      21.000000
75%      25.000000
max      35.000000
Name: age, dtype: float64

In [23]:
# Sort in ascending order (the default)
member["age"].sort_values()

4    15
3    18
1    21
0    25
2    35
Name: age, dtype: int64

In [24]:
# Sort in descending order
member["age"].sort_values(ascending=False)

2    35
0    25
1    21
3    18
4    15
Name: age, dtype: int64

In [25]:
# Remove a column with drop(); the result is stored in member2 rather than
# assigned back to member.
member2 = member.drop(columns=["uid"])
member2

Unnamed: 0,name,age
0,Howard,25
1,Lily,21
2,Kai,35
3,Jojo,18
4,Ivan,15


In [26]:
# Exercise: remove both uid and age

member.drop(columns=["uid", "age"])


Unnamed: 0,name
0,Howard
1,Lily
2,Kai
3,Jojo
4,Ivan


In [27]:
# Convert the DataFrame to a list of lists: each row becomes one inner list.
# to_numpy() is the accessor the pandas documentation recommends over .values.
df_tolist = member.to_numpy().tolist()

In [28]:
# Convert the list of rows back to a DataFrame.
# No column names are passed here, so the original names (uid, name, age) are not restored.
list_df = pd.DataFrame(df_tolist)

In [29]:
# Converting the dtype of a whole column: start by checking the current dtype.
member["age"].dtype

dtype('int64')

In [30]:
# Cast age to float64 and write the converted Series back into the column.
member["age"] = member["age"].astype("float64")
member["age"]

0    25.0
1    21.0
2    35.0
3    18.0
4    15.0
Name: age, dtype: float64

In [31]:
# Cast age back to an integer dtype.
member["age"] = member["age"].astype("int")
member["age"]

0    25
1    21
2    35
3    18
4    15
Name: age, dtype: int64

In [32]:
# Cast age to strings to show that numeric aggregation then stops working.
member["age"] = member["age"].astype("str")

# The failure is the point of this cell, so catch it and print only the message.
try:
    member["age"].mean()
except Exception as e:
    print(e)


Could not convert string '2521351815' to numeric


In [33]:
# Convert age back to integers so the mean can be computed again.
member["age"] = member["age"].astype("int")
member["age"].mean()

np.float64(22.8)

## 進階 - pandas的條件選擇

In [34]:
# Load the transaction dataset (the file path is relative to the working directory).
transaction = pd.read_csv("transaction.csv")

In [35]:
# A comparison on a column returns a boolean Series: True/False for every row.
transaction["product"] == "lemon"

0    False
1    False
2    False
3    False
4    False
5    False
6    False
7    False
8    False
9    False
Name: product, dtype: bool

In [36]:
# Show only the rows that satisfy the condition by indexing with the boolean Series.
transaction[transaction["product"] == "lemon"]

Unnamed: 0,key,tid,uid,product,quantity,price


In [37]:
# Rows whose product is "apple".
transaction[transaction["product"] == "apple"]

Unnamed: 0,key,tid,uid,product,quantity,price
0,1,T0001,1,apple,5,10
9,10,T0005,5,apple,5,10


In [38]:
### Exercise: show only the transactions whose 'tid' is T0003

transaction[transaction["tid"] == "T0003"]

Unnamed: 0,key,tid,uid,product,quantity,price
2,3,T0003,4,origin,4,5
3,4,T0003,3,cherry,3,60
4,5,T0003,3,guava,2,20


In [39]:
# Arithmetic (+ - * /) works on whole columns at once; store the result as a new column.
transaction["sum"] = transaction["quantity"] * transaction["price"]

In [40]:
# One-hot encoding: turn a categorical column into one boolean indicator column per category.
pd.get_dummies(transaction["product"])

Unnamed: 0,apple,banana,cherry,guava,origin
0,True,False,False,False,False
1,False,True,False,False,False
2,False,False,False,False,True
3,False,False,True,False,False
4,False,False,False,True,False
5,False,True,False,False,False
6,False,False,False,False,True
7,False,False,False,True,False
8,False,False,False,False,True
9,True,False,False,False,False


In [41]:
# pivot_table usage: which user (rows) bought which product (columns), and how many in total.
pivot_table = pd.pivot_table(
    transaction, values="quantity", index=["uid"], columns=["product"], aggfunc="sum"
)
pivot_table

product,apple,banana,cherry,guava,origin
uid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,5.0,,,,
2,,10.0,,2.0,5.0
3,,,3.0,2.0,
4,,,,,4.0
5,5.0,6.0,,,4.0


In [42]:
# Exercise: build a pivot table from member with pd.pivot_table, using
# index=name, columns=age, aggfunc=count.
# This answer uses the DataFrame.pivot_table method form of the same function.

member.pivot_table(index="name", columns="age", aggfunc="count")

Unnamed: 0_level_0,uid,uid,uid,uid,uid
age,15,18,21,25,35
name,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Howard,,,,1.0,
Ivan,1.0,,,,
Jojo,,1.0,,,
Kai,,,,,1.0
Lily,,,1.0,,


In [43]:
# Split values into groups (binning).
# cut(data to bin, number of equal-width bins, labels to use for the bins)
transaction["label"] = pd.cut(
    transaction["price"], 3, labels=["Cheap", "medium", "expensive"]
)

transaction["label"]

0        Cheap
1       medium
2        Cheap
3    expensive
4        Cheap
5       medium
6        Cheap
7        Cheap
8        Cheap
9        Cheap
Name: label, dtype: category
Categories (3, object): ['Cheap' < 'medium' < 'expensive']

In [44]:
# Same binning, but stored in a separate object instead of a new column.
priceclass = pd.cut(transaction["price"], 3, labels=["Cheap", "medium", "expensive"])

# NOTE: the binned Series is still named "price", so the concatenated frame
# ends up with two columns called "price" (the original values and the labels).
newdata = pd.concat([transaction, priceclass], axis=1)

newdata

Unnamed: 0,key,tid,uid,product,quantity,price,sum,label,price.1
0,1,T0001,1,apple,5,10,50,Cheap,Cheap
1,2,T0002,5,banana,6,34,204,medium,medium
2,3,T0003,4,origin,4,5,20,Cheap,Cheap
3,4,T0003,3,cherry,3,60,180,expensive,expensive
4,5,T0003,3,guava,2,20,40,Cheap,Cheap
5,6,T0004,2,banana,10,34,340,medium,medium
6,7,T0004,2,origin,5,5,25,Cheap,Cheap
7,8,T0004,2,guava,2,20,40,Cheap,Cheap
8,9,T0005,5,origin,4,5,20,Cheap,Cheap
9,10,T0005,5,apple,5,10,50,Cheap,Cheap


In [45]:
# Bin by explicit edges instead of equal-width bins, using three labels:
# "Cheap" for (0, 20], "medium" for (20, 40], "expensive" for (40, max price].
criterials = [0, 20, 40, transaction["price"].max()]
transaction["pricelabel"] = pd.cut(
    transaction["price"], criterials, labels=["Cheap", "medium", "expensive"]
)

transaction["pricelabel"]

0        Cheap
1       medium
2        Cheap
3    expensive
4        Cheap
5       medium
6        Cheap
7        Cheap
8        Cheap
9        Cheap
Name: pricelabel, dtype: category
Categories (3, object): ['Cheap' < 'medium' < 'expensive']

In [46]:
# merge combines two DataFrames on a key, similar to a SQL join
# (here: a left join of transaction with member on uid).
newdata = pd.merge(transaction, member, how="left", on=["uid"])

newdata

Unnamed: 0,key,tid,uid,product,quantity,price,sum,label,pricelabel,name,age
0,1,T0001,1,apple,5,10,50,Cheap,Cheap,Howard,25
1,2,T0002,5,banana,6,34,204,medium,medium,Ivan,15
2,3,T0003,4,origin,4,5,20,Cheap,Cheap,Jojo,18
3,4,T0003,3,cherry,3,60,180,expensive,expensive,Kai,35
4,5,T0003,3,guava,2,20,40,Cheap,Cheap,Kai,35
5,6,T0004,2,banana,10,34,340,medium,medium,Lily,21
6,7,T0004,2,origin,5,5,25,Cheap,Cheap,Lily,21
7,8,T0004,2,guava,2,20,40,Cheap,Cheap,Lily,21
8,9,T0005,5,origin,4,5,20,Cheap,Cheap,Ivan,15
9,10,T0005,5,apple,5,10,50,Cheap,Cheap,Ivan,15


In [47]:
# Exercise: convert the dict below to a DataFrame and merge it with member
dicsex = {"uid": [1, 2, 3, 4, 5], "sex": ["F", "M", "M", "F", "F"]}

dicsex_df = pd.DataFrame(dicsex)
dicsex_df_merged = pd.merge(member, dicsex_df, how="left", on=["uid"])

dicsex_df_merged

Unnamed: 0,uid,name,age,sex
0,1,Howard,25,F
1,2,Lily,21,M
2,3,Kai,35,M
3,4,Jojo,18,F
4,5,Ivan,15,F


In [48]:
# Exercise: convert the dict below to a DataFrame and concat it with member
dicsex = {"uid": [1, 2, 3, 4, 5], "sex": ["F", "M", "M", "F", "F"]}

dicsex_df = pd.DataFrame(dicsex)
# axis=1 places the frames side by side; unlike merge, the uid column of both
# frames is kept, so the result has two uid columns.
dicsex_df_concat = pd.concat([member, dicsex_df], axis=1)

dicsex_df_concat

Unnamed: 0,uid,name,age,uid.1,sex
0,1,Howard,25,1,F
1,2,Lily,21,2,M
2,3,Kai,35,3,M
3,4,Jojo,18,4,F
4,5,Ivan,15,5,F


In [49]:
# Arbitrary selection by position: every row, first two columns.
transaction.iloc[:, :2]

Unnamed: 0,key,tid
0,1,T0001
1,2,T0002
2,3,T0003
3,4,T0003
4,5,T0003
5,6,T0004
6,7,T0004
7,8,T0004
8,9,T0005
9,10,T0005


In [50]:
# Selection by label: every row, only the product and price columns.
transaction.loc[:, ["product", "price"]]

Unnamed: 0,product,price
0,apple,10
1,banana,34
2,origin,5
3,cherry,60
4,guava,20
5,banana,34
6,origin,5
7,guava,20
8,origin,5
9,apple,10


In [51]:
# Exercise: take the first 3 rows of transaction

transaction.head(3)

Unnamed: 0,key,tid,uid,product,quantity,price,sum,label,pricelabel
0,1,T0001,1,apple,5,10,50,Cheap,Cheap
1,2,T0002,5,banana,6,34,204,medium,medium
2,3,T0003,4,origin,4,5,20,Cheap,Cheap


In [52]:
# Remove duplicate rows (rows are compared across all columns); returns a new DataFrame.
transaction.drop_duplicates()

Unnamed: 0,key,tid,uid,product,quantity,price,sum,label,pricelabel
0,1,T0001,1,apple,5,10,50,Cheap,Cheap
1,2,T0002,5,banana,6,34,204,medium,medium
2,3,T0003,4,origin,4,5,20,Cheap,Cheap
3,4,T0003,3,cherry,3,60,180,expensive,expensive
4,5,T0003,3,guava,2,20,40,Cheap,Cheap
5,6,T0004,2,banana,10,34,340,medium,medium
6,7,T0004,2,origin,5,5,25,Cheap,Cheap
7,8,T0004,2,guava,2,20,40,Cheap,Cheap
8,9,T0005,5,origin,4,5,20,Cheap,Cheap
9,10,T0005,5,apple,5,10,50,Cheap,Cheap


In [53]:
# Compare rows on tid only: the first row of each tid is kept, later ones are dropped.
transaction.drop_duplicates(subset=["tid"])

Unnamed: 0,key,tid,uid,product,quantity,price,sum,label,pricelabel
0,1,T0001,1,apple,5,10,50,Cheap,Cheap
1,2,T0002,5,banana,6,34,204,medium,medium
2,3,T0003,4,origin,4,5,20,Cheap,Cheap
5,6,T0004,2,banana,10,34,340,medium,medium
8,9,T0005,5,origin,4,5,20,Cheap,Cheap


In [54]:
# Same de-duplication, but applied to `transaction` itself instead of returning a new frame.
transaction.drop_duplicates(subset=["tid"], inplace=True)  # inplace=True modifies the original data directly

transaction

Unnamed: 0,key,tid,uid,product,quantity,price,sum,label,pricelabel
0,1,T0001,1,apple,5,10,50,Cheap,Cheap
1,2,T0002,5,banana,6,34,204,medium,medium
2,3,T0003,4,origin,4,5,20,Cheap,Cheap
5,6,T0004,2,banana,10,34,340,medium,medium
8,9,T0005,5,origin,4,5,20,Cheap,Cheap


In [55]:
# Grouping with groupby.
# Re-read the CSV first: the previous cell dropped rows from `transaction` in place.
transaction = pd.read_csv("transaction.csv")

# groupby() on its own returns a DataFrameGroupBy object; an aggregation still has to be applied.
transaction.groupby("product")

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x1109d4740>

In [56]:
# Grouped aggregation: sum every remaining column per product.
# NOTE: this also "sums" the string column tid by concatenation and adds up the
# key/uid identifiers; select the columns of interest (or pass numeric_only=True)
# when only meaningful numeric totals are wanted.
transaction.groupby("product").sum()

Unnamed: 0_level_0,key,tid,uid,quantity,price
product,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
apple,11,T0001T0005,6,10,20
banana,8,T0002T0004,7,16,68
cherry,4,T0003,3,3,60
guava,13,T0003T0004,5,4,40
origin,19,T0003T0004T0005,11,13,15


In [57]:
# Number of rows in each product group.
transaction.groupby("product").size()

product
apple     2
banana    2
cherry    1
guava     2
origin    3
dtype: int64

In [58]:
# Sum only a specific column within each group.
transaction.groupby("product")["quantity"].sum()

product
apple     10
banana    16
cherry     3
guava      4
origin    13
Name: quantity, dtype: int64

In [59]:
# Distinct values of a specific column (uid) seen for each product.
transaction.groupby("product")["uid"].unique()

product
apple        [1, 5]
banana       [5, 2]
cherry          [3]
guava        [3, 2]
origin    [4, 2, 5]
Name: uid, dtype: object

In [60]:
### Exercise: how do you compute the mean quantity for each product?

transaction.groupby("product")["quantity"].mean()


product
apple     5.000000
banana    8.000000
cherry    3.000000
guava     2.000000
origin    4.333333
Name: quantity, dtype: float64

In [61]:
### Compute the total sales amount for each product
# Step 1: compute the sales amount of each transaction row (quantity * price)
transaction["sale"] = transaction["quantity"] * transaction["price"]

transaction


Unnamed: 0,key,tid,uid,product,quantity,price,sale
0,1,T0001,1,apple,5,10,50
1,2,T0002,5,banana,6,34,204
2,3,T0003,4,origin,4,5,20
3,4,T0003,3,cherry,3,60,180
4,5,T0003,3,guava,2,20,40
5,6,T0004,2,banana,10,34,340
6,7,T0004,2,origin,5,5,25
7,8,T0004,2,guava,2,20,40
8,9,T0005,5,origin,4,5,20
9,10,T0005,5,apple,5,10,50


In [62]:
# Step 2: compute the total sales amount for each product
transaction.groupby("product")["sale"].sum()

product
apple     100
banana    544
cherry    180
guava      80
origin     65
Name: sale, dtype: int64

In [63]:
# Exercise: compute the mean, minimum, and maximum sales amount for each product

transaction.groupby("product")["sale"].agg(["mean", "min", "max"])

Unnamed: 0_level_0,mean,min,max
product,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
apple,50.0,50,50
banana,272.0,204,340
cherry,180.0,180,180
guava,40.0,40,40
origin,21.666667,20,25
