# 第5章 : DataFrameを自在に操作する

In [3]:
import numpy as np
import pandas as pd

In [4]:
# Read the CSV as-is: with no index_col, the index stored in the file comes
# back as an extra "Unnamed: 0" column next to the automatic index.
df = pd.read_csv("../04/sample_with_index.csv")
df.head(2)

Unnamed: 0.1,Unnamed: 0,Date,Price,Quantity,Width,Height,Quality,Score,Difference,Color,Shape
0,0,1997-07-05,2291,25,2.94665,5.305868,45.8933,52.762659,0.276266,green,triangle
1,1,1997-07-06,506,16,1.915208,0.679004,50.611735,31.453719,-1.854628,blue,


In [5]:
# index_col=0 makes the first column (the "Unnamed: 0" one) the index, so no
# separate automatic index is added.
df = pd.read_csv("../04/sample_with_index.csv", index_col=0)
df.head(2)

Unnamed: 0,Date,Price,Quantity,Width,Height,Quality,Score,Difference,Color,Shape
0,1997-07-05,2291,25,2.94665,5.305868,45.8933,52.762659,0.276266,green,triangle
1,1997-07-06,506,16,1.915208,0.679004,50.611735,31.453719,-1.854628,blue,


In [6]:
# This CSV was saved without the index numbers, so no "Unnamed" column shows up.
df = pd.read_csv("../04/sample_without_index.csv")
df.head(2)

Unnamed: 0,Date,Price,Quantity,Width,Height,Quality,Score,Difference,Color,Shape
0,1997-07-05,2291,25,2.94665,5.305868,45.8933,52.762659,0.276266,green,triangle
1,1997-07-06,506,16,1.915208,0.679004,50.611735,31.453719,-1.854628,blue,


In [7]:
# A pickle brings the index back as the index (it is not dropped), so no
# "Unnamed: 0" column appears.
# NOTE: unpickling can execute arbitrary code; only load pickle files that come
# from a trusted source.
df_p = pd.read_pickle("../04/sample1.pkl")
df_p.head()

Unnamed: 0,Date,Price,Quantity,Width,Height,Quality,Score,Difference,Color,Shape
0,1997-07-05,2291,25,2.94665,5.305868,45.8933,52.762659,0.276266,green,triangle
1,1997-07-06,506,16,1.915208,0.679004,50.611735,31.453719,-1.854628,blue,
2,1997-07-07,9629,32,7.869855,6.563335,43.830416,56.239011,0.623901,blue,square
3,1997-07-08,6161,67,6.375209,5.756029,41.358007,61.453113,1.145311,green,square
4,NaT,8570,55,0.390629,3.578136,55.739709,,1.03719,red,square


In [8]:
# Loaded from CSV, the Date column is read as object rather than datetime.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Date        97 non-null     object 
 1   Price       100 non-null    int64  
 2   Quantity    100 non-null    int64  
 3   Width       97 non-null     float64
 4   Height      97 non-null     float64
 5   Quality     96 non-null     float64
 6   Score       93 non-null     float64
 7   Difference  99 non-null     float64
 8   Color       96 non-null     object 
 9   Shape       95 non-null     object 
dtypes: float64(5), int64(2), object(3)
memory usage: 7.9+ KB


In [9]:
# The frame loaded from the pickle already has Date as datetime64[ns]
# (and Price/Quantity as int32 instead of the int64 seen in the CSV frame).
df_p.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   Date        97 non-null     datetime64[ns]
 1   Price       100 non-null    int32         
 2   Quantity    100 non-null    int32         
 3   Width       97 non-null     float64       
 4   Height      97 non-null     float64       
 5   Quality     96 non-null     float64       
 6   Score       93 non-null     float64       
 7   Difference  99 non-null     float64       
 8   Color       96 non-null     object        
 9   Shape       95 non-null     object        
dtypes: datetime64[ns](1), float64(5), int32(2), object(2)
memory usage: 7.2+ KB


In [10]:
# Convert the object-typed Date column to datetime.
df["Date"] = pd.to_datetime(df["Date"])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   Date        97 non-null     datetime64[ns]
 1   Price       100 non-null    int64         
 2   Quantity    100 non-null    int64         
 3   Width       97 non-null     float64       
 4   Height      97 non-null     float64       
 5   Quality     96 non-null     float64       
 6   Score       93 non-null     float64       
 7   Difference  99 non-null     float64       
 8   Color       96 non-null     object        
 9   Shape       95 non-null     object        
dtypes: datetime64[ns](1), float64(5), int64(2), object(2)
memory usage: 7.9+ KB


In [11]:
# Alternative: parse Date as datetime (not object) directly while reading.
# The result is only inspected here; it is not assigned to a variable.
pd.read_csv("../04/sample_with_index.csv", index_col=0, parse_dates=["Date"]).info()

<class 'pandas.core.frame.DataFrame'>
Index: 100 entries, 0 to 99
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   Date        97 non-null     datetime64[ns]
 1   Price       100 non-null    int64         
 2   Quantity    100 non-null    int64         
 3   Width       97 non-null     float64       
 4   Height      97 non-null     float64       
 5   Quality     96 non-null     float64       
 6   Score       93 non-null     float64       
 7   Difference  99 non-null     float64       
 8   Color       96 non-null     object        
 9   Shape       95 non-null     object        
dtypes: datetime64[ns](1), float64(5), int64(2), object(2)
memory usage: 8.6+ KB


In [12]:
# Preview the working frame; missing dates are displayed as NaT.
df.head()

Unnamed: 0,Date,Price,Quantity,Width,Height,Quality,Score,Difference,Color,Shape
0,1997-07-05,2291,25,2.94665,5.305868,45.8933,52.762659,0.276266,green,triangle
1,1997-07-06,506,16,1.915208,0.679004,50.611735,31.453719,-1.854628,blue,
2,1997-07-07,9629,32,7.869855,6.563335,43.830416,56.239011,0.623901,blue,square
3,1997-07-08,6161,67,6.375209,5.756029,41.358007,61.453113,1.145311,green,square
4,NaT,8570,55,0.390629,3.578136,55.739709,,1.03719,red,square


In [13]:
# df[3] does not work: a single key inside df[...] is looked up as a column
# label, and there is no column named 3.
# df[0, 3] and similar forms do not work either.
# A slice, however, selects rows (here the first three).
df[:3]

Unnamed: 0,Date,Price,Quantity,Width,Height,Quality,Score,Difference,Color,Shape
0,1997-07-05,2291,25,2.94665,5.305868,45.8933,52.762659,0.276266,green,triangle
1,1997-07-06,506,16,1.915208,0.679004,50.611735,31.453719,-1.854628,blue,
2,1997-07-07,9629,32,7.869855,6.563335,43.830416,56.239011,0.623901,blue,square


In [14]:
# A Series does accept a single key: this returns the Price at index label 0.
df["Price"][0]

np.int64(2291)

In [15]:
print(type(df["Price"])) # a single column name gives a Series
print(type(df[["Price"]])) # a list of column names gives a DataFrame

<class 'pandas.core.series.Series'>
<class 'pandas.core.frame.DataFrame'>


In [16]:
# Extract only specific columns by passing a list of column names.
df[["Price", "Quantity"]].head(3)

Unnamed: 0,Price,Quantity
0,2291,25
1,506,16
2,9629,32


In [17]:
# With loc, a slice written l:r INCLUDES r (rows 0 through 3, and columns
# Date through Quantity).
df.loc[0:3, "Date":"Quantity"]

Unnamed: 0,Date,Price,Quantity
0,1997-07-05,2291,25
1,1997-07-06,506,16
2,1997-07-07,9629,32
3,1997-07-08,6161,67


In [18]:
# loc also accepts a list of specific index labels and/or a single column
# name; a single column yields a Series.
df.loc[[1, 3, 5], "Width"]

1    1.915208
3    6.375209
5    9.456832
Name: Width, dtype: float64

In [20]:
# iloc selects by integer position (row/column number), not by label.
# With iloc, a slice written l:r EXCLUDES r.
df.iloc[0:3, 0:3]

Unnamed: 0,Date,Price,Quantity
0,1997-07-05,2291,25
1,1997-07-06,506,16
2,1997-07-07,9629,32


In [22]:
# iloc also takes lists of positions: rows 0, 5, 10 and columns 4, 5
# (Height and Quality in this frame).
df.iloc[[0, 5, 10], [4, 5]]

Unnamed: 0,Height,Quality
0,5.305868,45.8933
5,0.600447,53.12667
10,5.615089,58.913664


In [23]:
# When the column name is known but its position is not, look the position up
# with columns.get_loc and pass it to iloc.
df.iloc[0:3, df.columns.get_loc("Color")]

0    green
1     blue
2     blue
Name: Color, dtype: object

In [24]:
# Comparing a column with a scalar gives a boolean Series, one value per row.
df["Height"] >= 9.5

0     False
1     False
2     False
3     False
4     False
      ...  
95    False
96    False
97    False
98    False
99    False
Name: Height, Length: 100, dtype: bool

In [26]:
# Row extraction by condition: pass the boolean Series to df[...] to keep only
# the rows where it is True.
df[df["Height"] >= 9.5]

Unnamed: 0,Date,Price,Quantity,Width,Height,Quality,Score,Difference,Color,Shape
28,1997-08-02,3216,15,1.304293,9.843835,54.360744,42.44777,-0.755223,green,circle
30,1997-08-04,7645,12,5.807748,9.543379,43.975477,59.775418,0.977542,blue,square
44,1997-08-18,3263,73,5.736099,9.603458,,55.576744,0.557674,blue,triangle
59,1997-09-02,7409,46,3.089388,9.666302,51.585682,42.188711,-0.781129,blue,circle
82,NaT,4236,56,2.071701,9.561883,54.54825,50.572798,0.05728,blue,square


In [28]:
# When joining several conditions with logical operators, wrap each condition
# in (): & binds more tightly than the comparison operators.
df[(df["Height"] >= 9.5) & (df["Price"] > 4000)]

Unnamed: 0,Date,Price,Quantity,Width,Height,Quality,Score,Difference,Color,Shape
30,1997-08-04,7645,12,5.807748,9.543379,43.975477,59.775418,0.977542,blue,square
59,1997-09-02,7409,46,3.089388,9.666302,51.585682,42.188711,-0.781129,blue,circle
82,NaT,4236,56,2.071701,9.561883,54.54825,50.572798,0.05728,blue,square


In [29]:
# Conditions can be nested: Height >= 9.5 AND (Price > 4000 OR Shape is "triangle").
df[(df["Height"] >= 9.5) & ((df["Price"] > 4000) | (df["Shape"] == "triangle"))]

Unnamed: 0,Date,Price,Quantity,Width,Height,Quality,Score,Difference,Color,Shape
30,1997-08-04,7645,12,5.807748,9.543379,43.975477,59.775418,0.977542,blue,square
44,1997-08-18,3263,73,5.736099,9.603458,,55.576744,0.557674,blue,triangle
59,1997-09-02,7409,46,3.089388,9.666302,51.585682,42.188711,-0.781129,blue,circle
82,NaT,4236,56,2.071701,9.561883,54.54825,50.572798,0.05728,blue,square


In [30]:
# Improve readability: name each condition first, then combine the names.
condition1 = df["Height"] >= 9.5
condition2 = df["Price"] > 4000
condition3 = df["Shape"] == "triangle"
df[condition1 & (condition2 | condition3)]

Unnamed: 0,Date,Price,Quantity,Width,Height,Quality,Score,Difference,Color,Shape
30,1997-08-04,7645,12,5.807748,9.543379,43.975477,59.775418,0.977542,blue,square
44,1997-08-18,3263,73,5.736099,9.603458,,55.576744,0.557674,blue,triangle
59,1997-09-02,7409,46,3.089388,9.666302,51.585682,42.188711,-0.781129,blue,circle
82,NaT,4236,56,2.071701,9.561883,54.54825,50.572798,0.05728,blue,square


In [33]:
# filter with axis=1: keep only the columns whose name contains "or".
df.filter(like="or", axis=1).head(3)

Unnamed: 0,Score,Color
0,52.762659,green
1,31.453719,blue
2,56.239011,blue


In [34]:
# filter with axis=0: the same substring match applied to the index labels
# (keeps rows whose label contains "0").
df.filter(like="0", axis=0).head(3)

Unnamed: 0,Date,Price,Quantity,Width,Height,Quality,Score,Difference,Color,Shape
0,1997-07-05,2291,25,2.94665,5.305868,45.8933,52.762659,0.276266,green,triangle
10,1997-07-15,5916,54,2.977257,5.615089,58.913664,71.71257,2.171257,green,
20,1997-07-25,3445,48,7.972677,0.679787,45.954514,40.177057,-0.982294,green,square


In [35]:
# query extracts rows from a condition written as a string; column names are
# used directly, and the comparisons need no surrounding parentheses here.
df.query("9 < Height & Width < 3")

Unnamed: 0,Date,Price,Quantity,Width,Height,Quality,Score,Difference,Color,Shape
28,1997-08-02,3216,15,1.304293,9.843835,54.360744,42.44777,-0.755223,green,circle
82,NaT,4236,56,2.071701,9.561883,54.54825,50.572798,0.05728,blue,square
97,1997-10-10,5113,62,2.384461,9.218051,54.261728,43.816027,-0.618397,blue,square


In [36]:
# query also supports `in` with a list: rows whose Color is red or blue.
df.query("Color in ['red', 'blue']")

Unnamed: 0,Date,Price,Quantity,Width,Height,Quality,Score,Difference,Color,Shape
1,1997-07-06,506,16,1.915208,0.679004,50.611735,31.453719,-1.854628,blue,
2,1997-07-07,9629,32,7.869855,6.563335,43.830416,56.239011,0.623901,blue,square
4,NaT,8570,55,0.390629,3.578136,55.739709,,1.037190,red,square
6,1997-07-11,3723,41,8.640421,8.772905,52.750418,48.883017,-0.111698,blue,circle
7,1997-07-12,4664,78,0.511937,6.524186,51.512058,46.378987,-0.362101,red,square
...,...,...,...,...,...,...,...,...,...,...
95,1997-10-08,4285,50,9.133029,5.902204,41.603027,61.307264,1.130726,blue,circle
96,1997-10-09,9714,76,1.711943,7.144745,57.420940,55.600397,0.560040,red,triangle
97,1997-10-10,5113,62,2.384461,9.218051,54.261728,43.816027,-0.618397,blue,square
98,1997-10-11,5916,57,7.212739,6.769230,54.642241,47.752013,-0.224799,blue,circle


In [37]:
# Boolean-mask form of a membership test: keep rows whose Color is red or blue.
df[df["Color"].isin(["red", "blue"])]

Unnamed: 0,Date,Price,Quantity,Width,Height,Quality,Score,Difference,Color,Shape
1,1997-07-06,506,16,1.915208,0.679004,50.611735,31.453719,-1.854628,blue,
2,1997-07-07,9629,32,7.869855,6.563335,43.830416,56.239011,0.623901,blue,square
4,NaT,8570,55,0.390629,3.578136,55.739709,,1.037190,red,square
6,1997-07-11,3723,41,8.640421,8.772905,52.750418,48.883017,-0.111698,blue,circle
7,1997-07-12,4664,78,0.511937,6.524186,51.512058,46.378987,-0.362101,red,square
...,...,...,...,...,...,...,...,...,...,...
95,1997-10-08,4285,50,9.133029,5.902204,41.603027,61.307264,1.130726,blue,circle
96,1997-10-09,9714,76,1.711943,7.144745,57.420940,55.600397,0.560040,red,triangle
97,1997-10-10,5113,62,2.384461,9.218051,54.261728,43.816027,-0.618397,blue,square
98,1997-10-11,5916,57,7.212739,6.769230,54.642241,47.752013,-0.224799,blue,circle


In [38]:
# Update element values through loc / iloc (row label 0, column Price).
df.loc[0, "Price"] = 10_000
df.head(2)

Unnamed: 0,Date,Price,Quantity,Width,Height,Quality,Score,Difference,Color,Shape
0,1997-07-05,10000,25,2.94665,5.305868,45.8933,52.762659,0.276266,green,triangle
1,1997-07-06,506,16,1.915208,0.679004,50.611735,31.453719,-1.854628,blue,


In [41]:
# Overwrite several columns of one row with the same value.
df.loc[0, ["Width", "Height"]] = 8.88
df.head(2)

Unnamed: 0,Date,Price,Quantity,Width,Height,Quality,Score,Difference,Color,Shape
0,1997-07-05,10000,25,8.88,8.88,45.8933,52.762659,0.276266,green,triangle
1,1997-07-06,506,16,1.915208,0.679004,50.611735,31.453719,-1.854628,blue,


In [42]:
# Overwrite several columns of one row, each with its own value.
df.loc[0, ["Width", "Height"]] = [1.11, 7.77]
df.head(2)

Unnamed: 0,Date,Price,Quantity,Width,Height,Quality,Score,Difference,Color,Shape
0,1997-07-05,10000,25,1.11,7.77,45.8933,52.762659,0.276266,green,triangle
1,1997-07-06,506,16,1.915208,0.679004,50.611735,31.453719,-1.854628,blue,


In [44]:
# A scalar fills every cell of the selected rows x columns block.
df.loc[[0, 1], ["Width", "Height"]] = 1.23
df.head(2)

Unnamed: 0,Date,Price,Quantity,Width,Height,Quality,Score,Difference,Color,Shape
0,1997-07-05,10000,25,1.23,1.23,45.8933,52.762659,0.276266,green,triangle
1,1997-07-06,506,16,1.23,1.23,50.611735,31.453719,-1.854628,blue,


In [46]:
# A flat list with one value per column is applied to each selected row:
# both rows get Width=7.77 and Height=4.44.
df.loc[[0, 1], ["Width", "Height"]] = [7.77, 4.44]
df.head(2)

Unnamed: 0,Date,Price,Quantity,Width,Height,Quality,Score,Difference,Color,Shape
0,1997-07-05,10000,25,7.77,4.44,45.8933,52.762659,0.276266,green,triangle
1,1997-07-06,506,16,7.77,4.44,50.611735,31.453719,-1.854628,blue,


In [47]:
# A nested list (one inner list per row) sets every cell individually.
df.loc[[0, 1], ["Width", "Height"]] = [[1.11, 2.22], [3.33, 4.44]]
df.head(2)

Unnamed: 0,Date,Price,Quantity,Width,Height,Quality,Score,Difference,Color,Shape
0,1997-07-05,10000,25,1.11,2.22,45.8933,52.762659,0.276266,green,triangle
1,1997-07-06,506,16,3.33,4.44,50.611735,31.453719,-1.854628,blue,


In [48]:
# Conditional update: a boolean mask picks the rows, the label picks the column.
df.loc[df["Price"] == 10_000, "Price"] = 9999
df.head()

Unnamed: 0,Date,Price,Quantity,Width,Height,Quality,Score,Difference,Color,Shape
0,1997-07-05,9999,25,1.11,2.22,45.8933,52.762659,0.276266,green,triangle
1,1997-07-06,506,16,3.33,4.44,50.611735,31.453719,-1.854628,blue,
2,1997-07-07,9629,32,7.869855,6.563335,43.830416,56.239011,0.623901,blue,square
3,1997-07-08,6161,67,6.375209,5.756029,41.358007,61.453113,1.145311,green,square
4,NaT,8570,55,0.390629,3.578136,55.739709,,1.03719,red,square
