pandas(パンダス)は，`.csv`ファイルなどを使用する際に，便利な機能が多いです．  
是非，使い方を習得しましょう．

In [1]:
# Create a Series: a one-dimensional sequence of numbers in which every
# value carries its own label.
import pandas as pd

s = pd.Series(
    data=[1, 2, 3, 4, 5],
    index=list("abcde"),  # one label per value: a, b, c, d, e
)
print(s)

a    1
b    2
c    3
d    4
e    5
dtype: int64


In [2]:
# Build a table (DataFrame) from a plain nested list.
import pandas as pd
import numpy as np

## Test scores: one inner list per row of the table.
points = [[91, 80], [50, 90], [80, 85]]

# No labels are given, so rows and columns are numbered from 0.
df = pd.DataFrame(data=points)
print(df)

    0   1
0  91  80
1  50  90
2  80  85


In [3]:
# Build a table (DataFrame) whose rows and columns carry names.
import pandas as pd
import numpy as np

## Student names, subjects, and scores (one row of scores per student).
names = ['taro', 'jiro', 'hanako']
subjects = ['math', 'japanese']
points = [[91, 80], [50, 90], [80, 85]]

## Create the table: rows are labelled by student, columns by subject.
df = pd.DataFrame(data=points, index=names, columns=subjects)
print(df)

        math  japanese
taro      91        80
jiro      50        90
hanako    80        85


In [16]:
# Saving a table to disk and loading it back.
# pandas is imported here, as in every other cell, so that this cell also
# runs on a fresh kernel instead of relying on an earlier cell's import.
import pandas as pd

## Student names, subjects, and scores.
names = ['taro', 'jiro', 'hanako']
subjects = ['math', 'japanese']
points = [
    [91, 80],
    [50, 90],
    [80, 85],
]

## Create the table.
df = pd.DataFrame(points, columns=subjects, index=names)

## Save as CSV (the saved test.csv can be opened and edited in Excel etc.).
df.to_csv("test.csv", sep=",")
### When the data contains Japanese text: df.to_csv("xxx.csv", encoding="shift_jis")

## Try editing the file in Excel at this point.

## Load it back (to check the edited file).
## index_col=0 uses the first column as row labels and header=0 uses the
## first line as column names, which restores the layout written above.
df2 = pd.read_csv('test.csv', index_col=0, header=0)
print(df2)

## Extra: saving with pickle.
df.to_pickle("test.pkl")
## Load it back.
## NOTE: unpickling can execute arbitrary code; only load files you trust.
df3 = pd.read_pickle("test.pkl")
print(df3)

        math  japanese
taro      91        80
jiro      50        90
hanako    80        85
        math  japanese
taro      91        80
jiro      50        90
hanako    80        85


In [5]:
# Pulling the pieces (values, column names, row names) out of a DataFrame.
import pandas as pd
import numpy as np

## Student names, subjects, and scores.
names = ['taro', 'jiro', 'hanako']
subjects = ['math', 'japanese']
points = [[91, 80], [50, 90], [80, 85]]

## Create the table.
df = pd.DataFrame(data=points, index=names, columns=subjects)

## Inspect it.
# Every score at once, as a two-dimensional array.
print("--- 全員の点数の取得 ---")
print(df.values)

# Column names, then row names: show the whole label object followed by
# its first entry.
for heading, labels in (
    ("--- 列の名前の取得 ---", df.columns),
    ("--- 行の名前の取得 ---", df.index),
):
    print()
    print(heading)
    print(labels)
    print(labels[0])

# The same labels converted to plain Python lists.
print()
print("--- リスト化 ---")
print(df.columns.to_list())
print(df.index.to_list())

--- 全員の点数の取得 ---
[[91 80]
 [50 90]
 [80 85]]

--- 列の名前の取得 ---
Index(['math', 'japanese'], dtype='object')
math

--- 行の名前の取得 ---
Index(['taro', 'jiro', 'hanako'], dtype='object')
taro

--- リスト化 ---
['math', 'japanese']
['taro', 'jiro', 'hanako']


In [6]:
# Renaming the columns (subject names) and the index (person names).
import pandas as pd
import numpy as np

## Student names, subjects, and scores.
names = ['taro', 'jiro', 'hanako']
subjects = ['math', 'japanese']
points = [[91, 80], [50, 90], [80, 85]]

## Create the table and show it before any renaming.
df = pd.DataFrame(data=points, index=names, columns=subjects)
print("--- 変更前 ----")
print(df, end="\n\n")

## Assigning a new list replaces the labels in place; the data is untouched.
## Rename the index ...
df.index = ['goro', 'jiro', 'hanako']
## ... and the columns.
df.columns = ['english', 'math']

print("--- 変更後 ---")
print(df)

--- 変更前 ----
        math  japanese
taro      91        80
jiro      50        90
hanako    80        85

--- 変更後 ---
        english  math
goro         91    80
jiro         50    90
hanako       80    85


In [7]:
# Selecting and extracting a single element.
import pandas as pd
import numpy as np

## Student names, subjects, and scores.
names = ['taro', 'jiro', 'hanako']
subjects = ['math', 'japanese']
points = [[91, 80], [50, 90], [80, 85]]

## Create the table.
df = pd.DataFrame(data=points, index=names, columns=subjects)

## Address the cell by row and column name.
score_by_label = df.at['taro', 'japanese']
print(score_by_label)

## Address the same cell by row and column number.
score_by_position = df.iat[0, 1]
print(score_by_position)

80
80


In [8]:
# Selecting and extracting several elements at once.
import pandas as pd
import numpy as np

## Student names, subjects, and scores.
names = ['taro', 'jiro', 'hanako']
subjects = ['math', 'japanese']
points = [[91, 80], [50, 90], [80, 85]]

## Create the table.
df = pd.DataFrame(data=points, index=names, columns=subjects)

## One column (subject): item access and attribute access give the same result.
print(df['math'])
print(df.math)
print()

## Selection by column / index label.
print(df.loc[:, 'math'])                # one column
print(df.loc['jiro', :])                # one row
print(df.loc['taro':'jiro', :])         # label slice; the output includes 'jiro'
row_labels = ['taro', 'jiro']
column_labels = ['math', 'japanese']
print(df.loc[row_labels, column_labels])  # explicit lists of labels
print()

## Selection by row / column number.
print(df.iloc[[0, 2], [0, 1]])
print()

## Row extraction by condition.
math_above_80 = df['math'] > 80
japanese_below_90 = df['japanese'] < 90
print(df.query('math > 80'))
print(df[math_above_80 & japanese_below_90])  # and
print(df[math_above_80 | japanese_below_90])  # or
print(df[~math_above_80])                     # not

## Besides these, `df.groupby` groups the data by arbitrary rows or columns.

taro      91
jiro      50
hanako    80
Name: math, dtype: int64
taro      91
jiro      50
hanako    80
Name: math, dtype: int64

taro      91
jiro      50
hanako    80
Name: math, dtype: int64
math        50
japanese    90
Name: jiro, dtype: int64
      math  japanese
taro    91        80
jiro    50        90
      math  japanese
taro    91        80
jiro    50        90

        math  japanese
taro      91        80
hanako    80        85

      math  japanese
taro    91        80
      math  japanese
taro    91        80
        math  japanese
taro      91        80
hanako    80        85
        math  japanese
jiro      50        90
hanako    80        85


In [9]:
# Modifying elements.
import pandas as pd
import numpy as np

## Student names, subjects, and scores.
names = ['taro', 'jiro', 'hanako']
subjects = ['math', 'japanese']
points = [
    [91, 80],
    [50, 90],
    [80, 85],
]

## Create the table.
df = pd.DataFrame(points, columns=subjects, index=names)

## Replace a column (every student's score in one subject).
df.loc[:, 'japanese'] = [10, 50, 90]
print(df)

## Replace a row (one student's scores).
df.loc['hanako', :] = [100, 100]
print(df)

## Replace everything.
## The new values are fractional while the columns hold integers. Convert
## the table to float first: writing floats into an int64 column through
## .loc relies on silent upcasting, which newer pandas versions warn about
## or reject.
df = df.astype(float)
df.loc[:] = np.arange(6).reshape(3, 2) * 100 / 6
print(df)

## Transpose (swap rows and columns).
df_t = df.T
print(df_t)

        math  japanese
taro      91        10
jiro      50        50
hanako    80        90
        math  japanese
taro      91        10
jiro      50        50
hanako   100       100
             math   japanese
taro     0.000000  16.666667
jiro    33.333333  50.000000
hanako  66.666667  83.333333
               taro       jiro     hanako
math       0.000000  33.333333  66.666667
japanese  16.666667  50.000000  83.333333


In [10]:
# Building DataFrames from various kinds of containers.
import pandas as pd
import numpy as np

## Start from an empty table.
df = pd.DataFrame(index=[], columns=[])
print(df)

## Add rows one at a time.
## DataFrame.append was removed in pandas 2.0. Assigning to the next unused
## row label with .loc grows the table by one row and yields the same
## 0, 1, 2, ... index that append(..., ignore_index=True) produced.
df = pd.DataFrame(index=[], columns=['japanese', 'math', 'english', 'sports'])
for i in range(4):
    # Four random scores in [0, 100), labelled with the column names.
    tempdf = pd.Series(np.random.randint(0, 100, 4), index=df.columns)
    df.loc[len(df)] = tempdf
print(df)

## Build from a dict: each key becomes a column.
data = {
    "math": [100, 60, 80],
    "english": [40, 20, 50],
    "phisics": [90, 85, 70],
}
df = pd.DataFrame(data, index=['taro', 'jiro', 'hanako'])
print(df)

## Build from a dict of dicts: cells without an entry become missing values (NaN).
data = {
    "math": {"taro": 100, "jiro": 60, "hanako": 80},
    "english": {"taro": 40, "hanako": 50},
    "phisics": {"jiro": 85, "hanako": 70},
}
df = pd.DataFrame(data)
print(df)

Empty DataFrame
Columns: []
Index: []
  japanese math english sports
0       57    6      57     29
1       50   32      76     42
2       12   62      89      1
3       10   84      52     11
        math  english  phisics
taro     100       40       90
jiro      60       20       85
hanako    80       50       70
        math  english  phisics
taro     100     40.0      NaN
jiro      60      NaN     85.0
hanako    80     50.0     70.0


辞書を読み込めるので，`json`ファイルから読み込み可能．
**pd.read_json()**



In [11]:
# Handling missing values.
import pandas as pd
import numpy as np
Nan = np.nan

## Student names, subjects, and scores (with gaps).
names = ['taro', 'jiro', 'hanako']
subjects = ['math', 'japanese', 'programming']
points = [
    [91, Nan, 50],
    [50, Nan, 40],
    [Nan, Nan, Nan],
]
df = pd.DataFrame(data=points, index=names, columns=subjects)
print(df)

## Drop the rows in which every value is missing.
print(df.dropna(how='all'), end="\n\n")

## Drop the columns in which every value is missing.
print(df.dropna(axis=1, how='all'), end="\n\n")

## A second set of scores in which only some cells are missing.
points = [
    [91, Nan, 50],
    [50, 80, 40],
    [60, Nan, Nan],
]
df2 = pd.DataFrame(data=points, index=names, columns=subjects)
print(df2, end="\n\n")

## Drop the rows that contain at least one missing value.
print(df2.dropna(), end="\n\n")  # how='any' works as well

## Drop the columns that contain at least one missing value.
print(df2.dropna(axis=1), end="\n\n")

## Replace the missing values (here with 0).
print(df2.fillna(0), end="\n\n")

## Extract the rows that contain at least one missing value.
print(df2)
has_missing = df2.isnull().any(axis=1)  # one True/False flag per row
print(df2[has_missing], end="\n\n")

### Operations such as df.mean() leave NaN out of the calculation.
#print( df2.mean() )

        math  japanese  programming
taro    91.0       NaN         50.0
jiro    50.0       NaN         40.0
hanako   NaN       NaN          NaN
      math  japanese  programming
taro  91.0       NaN         50.0
jiro  50.0       NaN         40.0

        math  programming
taro    91.0         50.0
jiro    50.0         40.0
hanako   NaN          NaN

        math  japanese  programming
taro      91       NaN         50.0
jiro      50      80.0         40.0
hanako    60       NaN          NaN

      math  japanese  programming
jiro    50      80.0         40.0

        math
taro      91
jiro      50
hanako    60

        math  japanese  programming
taro      91       0.0         50.0
jiro      50      80.0         40.0
hanako    60       0.0          0.0

        math  japanese  programming
taro      91       NaN         50.0
jiro      50      80.0         40.0
hanako    60       NaN          NaN
        math  japanese  programming
taro      91       NaN         50.0
hanako    60       NaN          NaN

In [12]:
# Removing rows and columns.
import numpy as np
import pandas as pd

## Student names, subjects, and scores.
names = ['taro', 'jiro', 'hanako']
subjects = ['math', 'japanese', 'programming']
points = [[91, 80, 100], [50, 90, 100], [80, 85, 100]]

## Create the table.
df = pd.DataFrame(data=points, index=names, columns=subjects)

## Remove a row. The result is only printed; df is not reassigned, so every
## call below starts from the full table again.
print(df.drop(index='taro'), end="\n\n")

## Remove a column.
print(df.drop(columns='math'), end="\n\n")

## Remove several rows and columns in one call (columns chosen by position).
unwanted_columns = df.columns[[0, 2]]
print(df.drop(index=['taro', 'hanako'], columns=unwanted_columns))

        math  japanese  programming
jiro      50        90          100
hanako    80        85          100

        japanese  programming
taro          80          100
jiro          90          100
hanako        85          100

      japanese
jiro        90


In [13]:
# Adding rows and columns, and appending another table.
import pandas as pd
import numpy as np

## Student names, subjects, and scores.
names = ['taro', 'jiro', 'hanako']
subjects = ['grade', 'math', 'japanese', 'programming']
points = [
    ['A', 91, 80, 100],
    ['B', 50, 90, 100],
    ['S', 80, 85, 100],
]

## Create the table.
df = pd.DataFrame(points, columns=subjects, index=names)

### Columns (subjects)
## Add a column from a scalar or from a list.
df['chemistry'] = 30  # df.assign() also works (and leaves the original DataFrame unchanged)
print(df)
print()

df['phisics'] = [40, 45, 45]
print(df)
print()

## Build new columns out of existing ones.
df['science'] = df['phisics'] + df['chemistry']
df['grade'] = df['grade'].str.lower()  # upper-case to lower-case conversion
print(df)
print()

## Insert a column at a chosen position (here: position 1).
## The Series has no entry for 'jiro', so that cell becomes NaN.
df.insert(1, 'history', pd.Series(['D', 'A'], index=['taro', 'hanako']))
print(df)
print()

### Rows (names)
## Add a row: assigning to a new label creates it, filled with the scalar.
df.loc['goro'] = 0
print(df)
print()

## Append another DataFrame.
## DataFrame.append was removed in pandas 2.0, so pd.concat is used instead.
## The new table's width is taken from df.columns rather than a hard-coded
## 2 x 8 shape, so it stays valid if columns are added or removed above.
df2 = pd.DataFrame(
    np.full((2, len(df.columns)), 100.0),
    index=['amuro', 'char'],
    columns=df.columns,
)
print(pd.concat([df, df2]))

       grade  math  japanese  programming  chemistry
taro       A    91        80          100         30
jiro       B    50        90          100         30
hanako     S    80        85          100         30

       grade  math  japanese  programming  chemistry  phisics
taro       A    91        80          100         30       40
jiro       B    50        90          100         30       45
hanako     S    80        85          100         30       45

       grade  math  japanese  programming  chemistry  phisics  science
taro       a    91        80          100         30       40       70
jiro       b    50        90          100         30       45       75
hanako     s    80        85          100         30       45       75

       grade history  math  japanese  programming  chemistry  phisics  science
taro       a       D    91        80          100         30       40       70
jiro       b     NaN    50        90          100         30       45       75
hanako     s       A    80        85          100         30       45       75

In [14]:
# Concatenating DataFrames.
import pandas as pd
import numpy as np

## Student names and two sets of scores.
names = ['taro', 'jiro']
points1 = [[80, 100], [90, 100]]
points2 = [[60, 50], [90, 40]]

# df and df3 share the same subjects but describe different students.
science_subjects = ['phisics', 'chemistry']
df = pd.DataFrame(data=points1, index=names, columns=science_subjects)
df3 = pd.DataFrame(data=points2, index=["hanako", "maruko"], columns=science_subjects)

## Join side by side (adds subjects): df2 has the same students as df.
df2 = pd.DataFrame(data=points2, index=names, columns=['history', 'econoics'])
side_by_side = pd.concat([df, df2], axis=1)
print(side_by_side, end="\n\n")

## Join vertically (adds students).
stacked = pd.concat([df, df3])
print(stacked)

      phisics  chemistry  history  econoics
taro       80        100       60        50
jiro       90        100       90        40

        phisics  chemistry
taro         80        100
jiro         90        100
hanako       60         50
maruko       90         40


$[+ \alpha]$ `df.describe()`は，DataFrameの様々な統計量を容易に確認できる便利な関数です．  