In [2]:
import pandas as pd
import numpy as np

# ファイルの読み込み

In [2]:
# Load the sensor log. With default arguments pandas takes the column names
# from the first line of the file.
sensor_csv = 'data/data1.csv'
frame = pd.read_csv(sensor_csv)
frame

Unnamed: 0,time,sensor1,sensor2
0,0,1,5
1,1,3,6
2,2,4,7
3,3,3,5
4,4,8,9


In [12]:
# Reload the sensor log and print summary statistics; include='all' asks
# describe() to report on every column rather than only the numeric ones.
frame = pd.read_csv('data/data1.csv')
summary = frame.describe(include='all')
print(summary)

           time   sensor1  sensor2
count  5.000000  5.000000  5.00000
mean   2.000000  3.800000  6.40000
std    1.581139  2.588436  1.67332
min    0.000000  1.000000  5.00000
25%    1.000000  3.000000  5.00000
50%    2.000000  3.000000  6.00000
75%    3.000000  4.000000  7.00000
max    4.000000  8.000000  9.00000


In [3]:
# This file has no header line. header=None stops pandas from consuming the
# first data row as column names and numbers the columns 0, 1, 2, ... instead.
noheader_csv = 'data/data1_noheader.csv'
frame = pd.read_csv(noheader_csv, header=None)
frame

Unnamed: 0,0,1,2
0,0,1,5
1,1,3,6
2,2,4,7
3,3,3,5
4,4,8,9


In [4]:
# Same headerless file, but this time the column labels are supplied
# explicitly through `names`.
column_names = ['a', 'b', 'c']
frame = pd.read_csv('data/data1_noheader.csv', names=column_names)
frame

Unnamed: 0,a,b,c
0,0,1,5
1,1,3,6
2,2,4,7
3,3,3,5
4,4,8,9


In [5]:
# Restore a pickled pandas object from disk and display it.
# NOTE: unpickling can execute arbitrary code, so only load pickle files
# that come from a trusted source.
pickle_path = 'data/data3.pickle'
pd.read_pickle(pickle_path)

Unnamed: 0,a,b,c
0,0,1,5
1,1,3,6
2,2,4,7
3,3,3,5
4,4,8,9


# 大容量のデータの読み込み

In [6]:
# Preview a large CSV cheaply: nrows=5 makes pandas stop after the first five
# data rows instead of parsing the whole file.
data = pd.read_csv('data/data2.csv', nrows=5)
# A bare assignment renders nothing in Jupyter; end the cell with the frame so
# the preview is actually displayed.
data

Unnamed: 0,one,two,three,four,key
0,0.467976,-0.038649,-0.295344,-1.824726,L
1,-0.358893,1.404453,0.704965,-0.200638,B
2,-0.50184,0.659254,-0.421691,-0.057688,G
3,0.204886,1.074134,1.388361,-0.982404,R
4,0.354628,-0.133116,0.283763,-0.837063,Q


In [15]:
# Load the full file and summarise every column; include='all' adds the
# non-numeric `key` column (count / unique / top / freq) to the report.
large_csv = 'data/data2.csv'
data = pd.read_csv(large_csv)
data.describe(include='all')

Unnamed: 0,one,two,three,four,key
count,10000.0,10000.0,10000.0,10000.0,10000
unique,,,,,36
top,,,,,E
freq,,,,,368
mean,0.04575,0.000871,-0.026463,0.015985,
std,0.948825,1.003829,1.037273,0.982409,
min,-3.726864,-3.465356,-3.234391,-3.173509,
25%,-0.618617,-0.706643,-0.727791,-0.676291,
50%,0.041638,0.018972,-0.03234,-0.005338,
75%,0.701536,0.708405,0.626904,0.659369,


In [18]:
# Count how often each distinct value occurs in the `key` column,
# most frequent first.
data = pd.read_csv('data/data2.csv')
key_column = data['key']
key_column.value_counts()

E    368
X    364
L    346
O    343
Q    340
M    338
J    337
F    335
K    334
H    330
V    328
I    327
U    326
P    324
A    320
D    320
R    318
Y    314
G    308
S    308
N    306
W    305
T    304
B    302
Z    288
C    286
4    171
6    166
7    164
8    162
3    162
5    157
2    152
0    151
9    150
1    146
Name: key, dtype: int64

In [7]:
# With `chunksize` set, read_csv does not return a DataFrame. It returns a
# reader object that yields DataFrames of at most `chunk_rows` rows each time
# it is iterated, so the whole file never has to sit in memory at once.
chunk_rows = 10
pd.read_csv('data/data2.csv', chunksize=chunk_rows)

<pandas.io.parsers.TextFileReader at 0x183fc9bb948>

# データ書き出し

In [8]:
# Persist `frame` (created in an earlier cell) in two formats.
# With default arguments to_csv also writes the row index as the first column.
csv_out = 'data/out1.csv'
pickle_out = 'data/out2.pickle'
frame.to_csv(csv_out)
frame.to_pickle(pickle_out)

# 欠損値の削除、穴埋め

In [4]:
# A small series with two gaps, used to demonstrate missing-value handling.
data = pd.Series([1, np.nan, 3, np.nan, 5])
print(data)

# Each caption is printed immediately before the result it describes.
for caption, result in (
    ('\n欠損値かどうかを確認', data.isnull()),  # element-wise "is NaN" flags
    ('\n欠損値の削除', data.dropna()),  # copy of the series without NaN entries
):
    print(caption)
    print(result)

0    1.0
1    NaN
2    3.0
3    NaN
4    5.0
dtype: float64

欠損値かどうかを確認
0    False
1     True
2    False
3     True
4    False
dtype: bool

欠損値の削除
0    1.0
2    3.0
4    5.0
dtype: float64


In [7]:
# 3x3 frame: row 0 is complete, row 1 has one gap, row 2 is entirely NaN.
frame = pd.DataFrame([
    [1., 2., 3.],
    [4., np.nan, 6.],
    [np.nan, np.nan, np.nan],
])
print(frame)

# Boolean mask that is True for every row holding at least one NaN.
rows_with_gap = frame.isnull().any(axis=1)
print('\n欠損値を含む行のみを確認')
print(frame[rows_with_gap])

# NOTE: with default arguments dropna() works row-wise only: every row that
# contains a NaN is removed, while all columns are kept.
print('\n欠損値を含む行、列の削除')
complete_rows = frame.dropna()
print(complete_rows)

# how='all' is stricter: a row is removed only when every value in it is NaN.
print('\n全てが欠損値の行のみ削除')
not_entirely_empty = frame.dropna(how='all')
print(not_entirely_empty)

     0    1    2
0  1.0  2.0  3.0
1  4.0  NaN  6.0
2  NaN  NaN  NaN

欠損値を含む行のみを確認
     0   1    2
1  4.0 NaN  6.0
2  NaN NaN  NaN

欠損値を含む行、列の削除
     0    1    2
0  1.0  2.0  3.0

全てが欠損値の行のみ削除
     0    1    2
0  1.0  2.0  3.0
1  4.0  NaN  6.0


In [20]:
# Same 3x3 frame with gaps, this time used to demonstrate filling them.
frame = pd.DataFrame([
    [1., 2., 3.],
    [4., np.nan, 6.],
    [np.nan, np.nan, np.nan],
])
print(frame)

# A scalar fills every gap with the same value.
print('\n欠損値を0で穴埋め')
zero_filled = frame.fillna(0.)
print(zero_filled)

# A dict maps column label -> fill value, so each column gets its own default.
print('\n欠損値をDictionaryで穴埋め')
fill_by_column = {0: 10, 1: 20, 2: 30}
print(frame.fillna(fill_by_column))

     0    1    2
0  1.0  2.0  3.0
1  4.0  NaN  6.0
2  NaN  NaN  NaN

欠損値を0で穴埋め
     0    1    2
0  1.0  2.0  3.0
1  4.0  0.0  6.0
2  0.0  0.0  0.0

欠損値をDictionaryで穴埋め
      0     1     2
0   1.0   2.0   3.0
1   4.0  20.0   6.0
2  10.0  20.0  30.0


In [36]:
# Integer frame in which -127 is used as a placeholder for "no reading".
frame = pd.DataFrame([
    [1, 2, 3],
    [4, -127, 6],
    [-127, -127, -127],
])
print(frame)

# Swap the placeholder for NaN so pandas treats those cells as missing.
# replace() returns a new frame; `frame` itself is left unchanged.
print('\n指定値を欠損値として置き換え')
with_nan = frame.replace(to_replace=-127, value=np.nan)
print(with_nan)

     0    1    2
0    1    2    3
1    4 -127    6
2 -127 -127 -127

指定値を欠損値として置き換え
     0    1    2
0  1.0  2.0  3.0
1  4.0  NaN  6.0
2  NaN  NaN  NaN


# 重複の除去

In [8]:
# Two sensor columns in which rows 1, 2 and 4 are identical.
data = pd.DataFrame({
    'sensor1': [1., 2., 2., 4., 2.],
    'sensor2': [6., 2., 2., 4., 2.],
})
print(data)

# duplicated() flags a row only when an identical row appears earlier in the
# frame, so the first occurrence (row 1) is not flagged.
repeat_mask = data.duplicated()
print('\n重複する行のみ表示')
print(data.loc[repeat_mask])

# drop_duplicates() keeps the first occurrence and preserves the original
# index labels of the surviving rows.
print('\n重複する行を削除')
deduplicated = data.drop_duplicates()
print(deduplicated)

   sensor1  sensor2
0      1.0      6.0
1      2.0      2.0
2      2.0      2.0
3      4.0      4.0
4      2.0      2.0

重複する行のみ表示
   sensor1  sensor2
2      2.0      2.0
4      2.0      2.0

重複する行を削除
   sensor1  sensor2
0      1.0      6.0
1      2.0      2.0
3      4.0      4.0


# インデックスの振り直し

In [11]:
# Two sensor columns in which rows 1, 2 and 4 are identical.
data = pd.DataFrame({
    'sensor1': [1., 2., 2., 4., 2.],
    'sensor2': [6., 2., 2., 4., 2.],
})
print(data)

# Reassign instead of using inplace=True: each step then yields a new frame
# and no object is mutated behind the reader's back.
print('\n重複する行を削除')
data = data.drop_duplicates()
print(data)

# Dropping rows leaves gaps in the index (0, 1, 3). reset_index renumbers the
# rows from 0, and drop=True discards the old labels instead of keeping them
# as an extra column.
print('\nインデックスを振りなおす')
data = data.reset_index(drop=True)
print(data)

   sensor1  sensor2
0      1.0      6.0
1      2.0      2.0
2      2.0      2.0
3      4.0      4.0
4      2.0      2.0

重複する行を削除
   sensor1  sensor2
0      1.0      6.0
1      2.0      2.0
3      4.0      4.0

インデックスを振りなおす
   sensor1  sensor2
0      1.0      6.0
1      2.0      2.0
2      4.0      4.0


# 外れ値の検出と除去

In [52]:
# Demonstrate capping outliers at a fixed limit.
# A dedicated, seeded generator makes the sample identical on every run
# without touching NumPy's global random state.
rng = np.random.RandomState(0)
OUTLIER_LIMIT = 1.0  # absolute value beyond which an entry counts as an outlier

data = pd.DataFrame(rng.randn(5, 3))
print(data)
print('\n所定以上の値を外れ値として限界値に丸める')
# clip() caps every value at +/-OUTLIER_LIMIT. Unlike assigning np.sign(data)
# to the out-of-range cells, this stays correct when the limit is not 1.
data = data.clip(lower=-OUTLIER_LIMIT, upper=OUTLIER_LIMIT)
data

          0         1         2
0  0.803816  0.070844 -2.343208
1 -0.283637 -0.007649  1.723306
2 -0.306617 -0.511262  1.544639
3 -1.319703 -0.190599 -0.441576
4 -0.753192 -1.403927  0.168618

所定以上の値を外れ値として限界値に丸める


Unnamed: 0,0,1,2
0,0.803816,0.070844,-1.0
1,-0.283637,-0.007649,1.0
2,-0.306617,-0.511262,1.0
3,-1.0,-0.190599,-0.441576
4,-0.753192,-1.0,0.168618


In [20]:
# Demonstrate joining two frames side by side.
# A dedicated, seeded generator makes both samples identical on every run
# without touching NumPy's global random state.
rng = np.random.RandomState(0)

print('\n結合前')
data1 = pd.DataFrame(rng.randn(5, 3))
print(data1)
data2 = pd.DataFrame(rng.randn(5, 3))
print(data2)

# axis=1 places the frames next to each other, aligning rows on the index.
# Each frame keeps its own column labels, so the result carries the labels
# 0, 1, 2 twice.
print('\n結合後')
data = pd.concat([data1, data2], axis=1)
print(data)


結合前
          0         1         2
0 -0.970429  0.976698  0.120852
1 -1.014045 -2.087784  0.675438
2  1.053432  0.152857  0.892146
3  1.121543 -0.284786  0.470989
4 -1.303786 -0.107382 -0.542098
          0         1         2
0 -0.033789  1.243303 -0.336750
1 -0.541188  1.474833 -0.434581
2 -0.479493 -0.392412 -0.459585
3  1.031489  0.759068  0.826408
4 -0.733451 -0.487153 -1.155012

結合後
          0         1         2         0         1         2
0 -0.970429  0.976698  0.120852 -0.033789  1.243303 -0.336750
1 -1.014045 -2.087784  0.675438 -0.541188  1.474833 -0.434581
2  1.053432  0.152857  0.892146 -0.479493 -0.392412 -0.459585
3  1.121543 -0.284786  0.470989  1.031489  0.759068  0.826408
4 -1.303786 -0.107382 -0.542098 -0.733451 -0.487153 -1.155012
