### Pandas按行遍历DataFrame的3种方法

In [1]:
import pandas as pd
import numpy as np
import collections

In [2]:
# Benchmark data: 100,000 rows x 4 float columns named "A".."D", drawn
# uniformly from [0, 1). No seed is set in this cell, so the exact values
# differ from run to run.
df = pd.DataFrame(
    np.random.random((100000, 4)),
    columns=["A", "B", "C", "D"],
)
# Preview the first three rows.
df.head(3)

Unnamed: 0,A,B,C,D
0,0.329292,0.975468,0.133584,0.224582
1,0.535746,0.451585,0.71325,0.40977
2,0.735287,0.667472,0.950622,0.245938


In [3]:
# Confirm the frame's dimensions as a (rows, columns) tuple.
df.shape

(100000, 4)

### 1. df.iterrows()

#### 使用方式

In [4]:
# iterrows() yields (index label, row) pairs, where each row is a Series
# keyed by column name. Only the first pair is shown.
for label, series in df.iterrows():
    print(label, series)
    print(label, series["A"], series["B"], series["C"], series["D"])
    break

0 A    0.329292
B    0.975468
C    0.133584
D    0.224582
Name: 0, dtype: float64
0 0.3292915092119043 0.9754683984716609 0.1335841433264423 0.22458227907355865


#### 时间耗费

In [5]:
%%time
# Benchmark workload: for every row, add A + B into the bucket keyed by the
# (A, B) pair, reading each value through Series label lookup.
# The int default factory supplies 0, which the first += promotes to a float.
result = collections.defaultdict(int)
for label, series in df.iterrows():
    result[(series["A"], series["B"])] += series["A"] + series["B"]

CPU times: user 7.82 s, sys: 35.6 ms, total: 7.85 s
Wall time: 7.89 s


### 2. df.itertuples()

#### 使用方式

In [6]:
# itertuples() yields one namedtuple per row; the index is exposed as
# .Index and each column as an attribute. Only the first tuple is shown.
for record in df.itertuples():
    print(record)
    print(record.Index, record.A, record.B, record.C, record.D)
    break

Pandas(Index=0, A=0.3292915092119043, B=0.9754683984716609, C=0.1335841433264423, D=0.22458227907355865)
0 0.3292915092119043 0.9754683984716609 0.1335841433264423 0.22458227907355865


#### 时间耗费

In [7]:
%%time
# Same workload as the iterrows() benchmark, but each row arrives as a
# namedtuple and values are read by attribute access.
# The int default factory supplies 0, which the first += promotes to a float.
result = collections.defaultdict(int)
for record in df.itertuples():
    result[(record.A, record.B)] += record.A + record.B

CPU times: user 168 ms, sys: 8.35 ms, total: 177 ms
Wall time: 178 ms


### 3. for+zip

#### 使用方式

In [8]:
# Zipping the column Series directly needs neither per-row type checking
# nor namedtuple construction.
# The downside is that every column has to be listed by hand.
for a, b in zip(df["A"], df["B"]):
    print(a, b)
    break

0.3292915092119043 0.9754683984716609


#### 时间耗费

In [9]:
%%time
# Same workload as the other two benchmarks, but the two columns are
# iterated in lockstep with zip(), so each step yields plain scalar values.
# The int default factory supplies 0, which the first += promotes to a float.
result = collections.defaultdict(int)
for a, b in zip(df["A"], df["B"]):
    result[(a, b)] += a + b

CPU times: user 82.2 ms, sys: 7.05 ms, total: 89.2 ms
Wall time: 89.9 ms
