In [1]:
import pandas as pd
import numpy as np

# ファイルの読み込み

In [2]:
frame = pd.read_csv('data/data1.csv')
frame

Unnamed: 0,time,sensor1,sensor2
0,0,1,5
1,1,3,6
2,2,4,7
3,3,3,5
4,4,8,9


In [3]:
frame = pd.read_csv('data/data1_noheader.csv', header=None)
frame

Unnamed: 0,0,1,2
0,0,1,5
1,1,3,6
2,2,4,7
3,3,3,5
4,4,8,9


In [4]:
frame = pd.read_csv('data/data1_noheader.csv', names=['a','b','c'])
frame

Unnamed: 0,a,b,c
0,0,1,5
1,1,3,6
2,2,4,7
3,3,3,5
4,4,8,9


In [5]:
pd.read_pickle('data/data3.pickle')

Unnamed: 0,a,b,c
0,0,1,5
1,1,3,6
2,2,4,7
3,3,3,5
4,4,8,9


# 大容量のデータの読み込み

In [6]:
pd.read_csv('data/data2.csv', nrows=5)

Unnamed: 0,one,two,three,four,key
0,0.467976,-0.038649,-0.295344,-1.824726,L
1,-0.358893,1.404453,0.704965,-0.200638,B
2,-0.50184,0.659254,-0.421691,-0.057688,G
3,0.204886,1.074134,1.388361,-0.982404,R
4,0.354628,-0.133116,0.283763,-0.837063,Q


In [7]:
pd.read_csv('data/data2.csv', chunksize= 10)

<pandas.io.parsers.TextFileReader at 0x183fc9bb948>

# データ書き出し

In [8]:
frame.to_csv('data/out1.csv')
frame.to_pickle('data/out2.pickle')

# 欠損値の削除、穴埋め

In [12]:
data = pd.Series([1,np.nan, 3,np.nan,5])
print(data)

print('\n欠損値の削除')
print(data.dropna())

0    1.0
1    NaN
2    3.0
3    NaN
4    5.0
dtype: float64

欠損値の削除
0    1.0
2    3.0
4    5.0
dtype: float64


In [17]:
frame = pd.DataFrame([[1.,2.,3.],
                    [4.,np.nan,6.],
                    [np.nan,np.nan,np.nan]])
print(frame)

print('\n欠損値を含む行、列の削除')
print(frame.dropna())

print('\n全てが欠損値の行のみ削除')
print(frame.dropna(how='all'))

     0    1    2
0  1.0  2.0  3.0
1  4.0  NaN  6.0
2  NaN  NaN  NaN

欠損値の削除
     0    1    2
0  1.0  2.0  3.0

全てが欠損値の行のみ削除
     0    1    2
0  1.0  2.0  3.0
1  4.0  NaN  6.0


In [20]:
frame = pd.DataFrame([[1.,2.,3.],
                    [4.,np.nan,6.],
                    [np.nan,np.nan,np.nan]])
print(frame)

print('\n欠損値を0で穴埋め')
print(frame.fillna(0.))

print('\n欠損値をDictionaryで穴埋め')
print(frame.fillna({0:10, 1:20, 2:30}))

     0    1    2
0  1.0  2.0  3.0
1  4.0  NaN  6.0
2  NaN  NaN  NaN

欠損値を0で穴埋め
     0    1    2
0  1.0  2.0  3.0
1  4.0  0.0  6.0
2  0.0  0.0  0.0

欠損値をDictionaryで穴埋め
      0     1     2
0   1.0   2.0   3.0
1   4.0  20.0   6.0
2  10.0  20.0  30.0


In [36]:
frame = pd.DataFrame([[1,2,3],
                    [4,-127,6],
                    [-127,-127,-127]])
print(frame)

print('\n指定値を欠損値として置き換え')
print(frame.replace(-127, np.nan))

     0    1    2
0    1    2    3
1    4 -127    6
2 -127 -127 -127

指定値を欠損値として置き換え
     0    1    2
0  1.0  2.0  3.0
1  4.0  NaN  6.0
2  NaN  NaN  NaN


# 重複の除去

In [33]:
data = pd.DataFrame({'sensor1':[1.,2.,2.,4.,2.], 'sensor2': [6.,2.,2.,4.,2.]})
print(data)

print('\n重複する行をBoolで表示')
print(data.duplicated())

print('\n重複する行を削除')
print(data.drop_duplicates())

   sensor1  sensor2
0      1.0      6.0
1      2.0      2.0
2      2.0      2.0
3      4.0      4.0
4      2.0      2.0

重複する行をBoolで表示
0    False
1    False
2     True
3    False
4     True
dtype: bool

重複する行を削除
   sensor1  sensor2
0      1.0      6.0
1      2.0      2.0
3      4.0      4.0


# 外れ値の検出と除去

In [52]:
data = pd.DataFrame(np.random.randn(5,3))
print(data)
print('\n所定以上の値を外れ値として限界値に丸める')
data[(np.abs(data) > 1)] = np.sign(data)
data

          0         1         2
0  0.803816  0.070844 -2.343208
1 -0.283637 -0.007649  1.723306
2 -0.306617 -0.511262  1.544639
3 -1.319703 -0.190599 -0.441576
4 -0.753192 -1.403927  0.168618

所定以上の値を外れ値として限界値に丸める


Unnamed: 0,0,1,2
0,0.803816,0.070844,-1.0
1,-0.283637,-0.007649,1.0
2,-0.306617,-0.511262,1.0
3,-1.0,-0.190599,-0.441576
4,-0.753192,-1.0,0.168618
