### Pandas按行遍历DataFrame的3种方法

In [1]:
import pandas as pd
import numpy as np
import collections

In [2]:
# Build the benchmark frame from a seeded generator so that
# Restart Kernel -> Run All regenerates exactly the same data, and every
# iteration method in this notebook is exercised on identical input.
SEED = 42
N_ROWS = 1_000_000

rng = np.random.default_rng(SEED)
df = pd.DataFrame(
    rng.random(size=(N_ROWS, 4)),  # uniform floats in [0, 1)
    columns=list('ABCD')
)
df.head(3)

Unnamed: 0,A,B,C,D
0,0.890946,0.622711,0.802276,0.061784
1,0.870922,0.954206,0.678069,0.944474
2,0.619009,0.086155,0.686046,0.109616


In [3]:
# Sanity-check the frame's dimensions, returned as a (rows, columns) tuple.
df.shape

(1000000, 4)

### 1. df.iterrows()

#### 使用方式

In [4]:
# iterrows() yields (index label, row-as-Series) pairs; inspect only the
# first pair and then leave the loop.
for label, series in df.iterrows():
    print(label, series)
    # Individual cells are read from the row Series by column name.
    a, b, c, d = series["A"], series["B"], series["C"], series["D"]
    print(label, a, b, c, d)
    break

0 A    0.890946
B    0.622711
C    0.802276
D    0.061784
Name: 0, dtype: float64
0 0.8909462120125907 0.6227111800602092 0.8022760436120219 0.061783819225730174


#### 时间耗费

In [5]:
%%time
# Benchmark body for iterrows(): for every row, add A + B into the bucket
# keyed by the (A, B) pair. The statements are kept minimal on purpose,
# since the enclosing %%time measures exactly this loop.
result = collections.defaultdict(int)  # missing keys start from 0
for idx, row in df.iterrows():
    # Each row arrives as a Series, so values are looked up by column label.
    result[(row["A"], row["B"])] += row["A"] + row["B"]

Wall time: 1min 12s


### 2. df.itertuples()

#### 使用方式

In [6]:
# itertuples() yields one namedtuple per row; the index is exposed as
# `.Index` and each column as an attribute. Show the first one only.
for record in df.itertuples():
    print(record)
    index = record.Index
    a, b, c, d = record.A, record.B, record.C, record.D
    print(index, a, b, c, d)
    break

Pandas(Index=0, A=0.8909462120125907, B=0.6227111800602092, C=0.8022760436120219, D=0.061783819225730174)
0 0.8909462120125907 0.6227111800602092 0.8022760436120219 0.061783819225730174


#### 时间耗费

In [7]:
%%time
# Benchmark body for itertuples(): the same accumulation as the iterrows()
# cell (sum A + B per (A, B) key), so the two wall times are comparable.
result = collections.defaultdict(int)  # missing keys start from 0
for row in df.itertuples():
    # Rows are namedtuples here, so columns are read as attributes.
    result[(row.A, row.B)] += row.A + row.B

Wall time: 1.78 s


### 3. for+zip

#### 使用方式

In [8]:
# Zipping the columns directly needs neither type checking nor the
# construction of a namedtuple per row.
# The drawback is that every column has to be named and unpacked explicitly.
col_a = df["A"]
col_b = df["B"]
for a_val, b_val in zip(col_a, col_b):
    print(a_val, b_val)
    break

0.8909462120125907 0.6227111800602092


#### 时间耗费

In [9]:
%%time
# Benchmark body for for+zip: the same accumulation as the two previous
# timing cells (sum A + B per (A, B) key), iterating the two columns in
# lockstep instead of materialising a row object.
result = collections.defaultdict(int)  # missing keys start from 0
for A, B in zip(df["A"], df["B"]):
    result[(A, B)] += A + B

Wall time: 1.01 s
