## pandas

In [2]:
import pandas as pd
import numpy as np

In [7]:
# Only the last expression of a cell is echoed, so the recorded output is the
# NumPy version; the pandas version above it is evaluated but not displayed.
pd.__version__
np.__version__

'1.16.4'

In [None]:
np?
pd?

In [3]:
# Five evenly spaced sample values; wrapping them in a Series attaches the
# default 0..4 integer index.
data = list(range(100, 600, 100))
pd.Series(data)

0    100
1    200
2    300
3    400
4    500
dtype: int64

In [4]:
# Three ways to index the default-indexed Series: a single key, a slice, and
# a reversed slice. Only the last line (the reversed Series) is displayed.
pd.Series(data)[0]
pd.Series(data)[0:3]
pd.Series(data)[::-1]

4    500
3    400
2    300
1    200
0    100
dtype: int64

In [5]:
# Bind the Series to a name and inspect its two parts: the underlying values
# and the index (the index is the expression that gets displayed).
d = pd.Series(data)
d.values
d.index

RangeIndex(start=0, stop=5, step=1)

In [8]:
# Same values, now with explicit string labels instead of the default 0..4 index.
d = pd.Series(data, index=['a', 'b', 'c', 'd', 'e'])
d
# Further expressions to try on the labelled Series:
# d[0]
# d.values
# d.index

200

In [22]:
# Arithmetic with a scalar is applied element-wise and keeps the labels.
# Only the final expression (the squares) is displayed.
d + 100
d * 2
d // 2
d ** 2

a     10000
b     40000
c     90000
d    160000
e    250000
dtype: int64

In [29]:
# Label lookup and slicing with [], followed by attribute-style access to the
# same labels. The displayed value is the last expression, d.c.
d['a']
d[::-1]
d[::2]
d.a
d.b
d.c

300

In [30]:
d[d>300]

d    400
e    500
dtype: int64

In [31]:
d > 300

a    False
b    False
c    False
d     True
e     True
dtype: bool

In [32]:
# Iterating over a Series yields its values, not its index labels.
for i in d:
    print(i)

100
200
300
400
500


In [39]:
# Yearly figures keyed by year label; a dict becomes a Series whose index is
# the dict keys, in insertion order.
dic = dict([
    ('2015년', 1000000),
    ('2016년', 2000000),
    ('2017년', 3000000),
    ('2018년', 4000000),
    ('2019년', 11000000),
    ('2020년', 30000000),
])
# Label-based slice: everything from '2018년' through the end.
pd.Series(dic).loc['2018년':]

2018년     4000000
2019년    11000000
2020년    30000000
dtype: int64

In [41]:
pd.Series(dic)[-3:]

2018년     4000000
2019년    11000000
2020년    30000000
dtype: int64

In [43]:
# Same yearly figures, built from parallel key / value lists.
dic = dict(zip(
    ['2015년', '2016년', '2017년', '2018년', '2019년', '2020년'],
    [1000000, 2000000, 3000000, 4000000, 11000000, 30000000],
))
# Passing index= keeps only the listed keys, in the listed order.
pd.Series(data=dic, index=['2017년', '2019년', '2020년'])

2017년     3000000
2019년    11000000
2020년    30000000
dtype: int64

## Series의 key, value, index

In [49]:
# Membership tests on a Series look at the index labels, not the values, so
# the amount 1000000 is reported as absent (the displayed False).
s = pd.Series(dic)
'2015년' in s
1000000 in s

False

In [51]:
# A plain dict behaves the same way: `in` checks keys, so looking up a value
# gives False (the displayed result).
'2015년' in dic
1000000 in dic

False

In [52]:
s.keys()

Index(['2015년', '2016년', '2017년', '2018년', '2019년', '2020년'], dtype='object')

In [59]:
list(s.items())

[('2015년', 1000000),
 ('2016년', 2000000),
 ('2017년', 3000000),
 ('2018년', 4000000),
 ('2019년', 11000000),
 ('2020년', 30000000)]

In [62]:
dic.keys()
dic.values()
dic.items()

dict_items([('2015년', 1000000), ('2016년', 2000000), ('2017년', 3000000), ('2018년', 4000000), ('2019년', 11000000), ('2020년', 30000000)])

In [63]:
s

2015년     1000000
2016년     2000000
2017년     3000000
2018년     4000000
2019년    11000000
2020년    30000000
dtype: int64

In [65]:
s[['2017년', '2020년']]  # fancy indexing: select several labels at once with a list

2017년     3000000
2020년    30000000
dtype: int64

In [9]:
data = ['a', 'b', 'c', 'd', 'e']

In [10]:
pd.Series(data)[::2]

0    a
2    c
4    e
dtype: object

In [11]:
# Even with an integer index, a [] slice is positional: 1:4 picks the 2nd-4th
# elements (labels 3, 5, 7 in the output), not the labels 1 through 4.
pd.Series(data, index=[1, 3, 5, 7, 9])[1:4]

3    b
5    c
7    d
dtype: object

In [17]:
pd.Series(data, index=[1, 3, 5, 7, 9])
# pd.Series(data, index=[1, 3, 5, 7, 9])[3] -> b! Used this way, a single key follows the explicit index only

1    a
3    b
5    c
7    d
9    e
dtype: object

In [73]:
pd.Series(data, index=[1, 3, 5, 7, 9]).loc[1:4]  # .loc follows the explicit index labels only

1    a
3    b
dtype: object

In [74]:
pd.Series(data, index=[1, 3, 5, 7, 9]).iloc[1:4]  # .iloc follows the implicit (positional) index only

3    b
5    c
7    d
dtype: object

## 결측값(NaN, None) 처리

In [85]:
data = [1, 2, 3, None]

# sum(data)

In [89]:
import numpy as np
import pandas as pd

# np.array(np.sum(data))
# The Series (the displayed expression) turns the None into NaN, and the
# column dtype becomes float64 as a result.
np.array(data)
pd.Series(data)

0    1.0
1    2.0
2    3.0
3    NaN
dtype: float64

In [93]:
# Element 3 is the missing value; the displayed result shows that even
# multiplying it by 0 still gives nan.
pd.Series(data)[3] + 100
pd.Series(data)[3] * 100
pd.Series(data)[3] * 0

nan

In [96]:
# Aggregations skip the missing value rather than returning NaN; the
# displayed result is the last call, s.min().
s = pd.Series(data)
s.sum()
s.max()
s.min()

1.0

In [98]:
s.isnull()

0    False
1    False
2    False
3     True
dtype: bool

In [18]:
# Three real values followed by four missing ones.
data = [1, 2, 3] + [None] * 4
s = pd.Series(data=data)

# Boolean mask of the missing entries; summing the mask counts them.
s.isnull()
s.isnull().sum()

4

In [19]:
s.notnull()
s.notnull().sum()

3

In [20]:
s.dropna()  # drop the missing values

0    1.0
1    2.0
2    3.0
dtype: float64

In [25]:
s.fillna(0)  # replace the missing values with 0
s.fillna(s.mean())  # replace the missing values with the mean

0    1.0
1    2.0
2    3.0
3    2.0
4    2.0
5    2.0
6    2.0
dtype: float64

In [28]:
# b is an explicit copy of the first three elements, so writing to b leaves
# a untouched (a is displayed to confirm).
a = np.arange(10, 50, 10)
b = np.copy(a[:3])
b[0] = 1000
a

array([10, 20, 30, 40])

## MultiIndex

In [106]:
# Revenue per year.
매출 = dict(zip(
    ['2015년', '2016년', '2017년', '2018년', '2019년', '2020년'],
    [1000000, 2000000, 3000000, 4000000, 11000000, 30000000],
))
# Net profit per year, keyed by the same year labels.
순익 = dict(zip(
    ['2015년', '2016년', '2017년', '2018년', '2019년', '2020년'],
    [100001, 200001, 300001, 400001, 1100001, 3000001],
))

In [None]:
# 매출'2015년':1000000,
#     '2016년':2000000,
#     '2017년':3000000,
#     '2018년':4000000,
#     '2019년':11000000,
#     '2020년':30000000,
# 순익'2015년':100001,
#     '2016년':200001,
#     '2017년':300001,
#     '2018년':400001,
#     '2019년':1100001,
#     '2020년':3000001,

In [111]:
# Pair every year with its category label to form the (category, year)
# tuples that become the two MultiIndex levels. Iterating a dict yields its
# keys, so no intermediate list of repeated labels is needed.
indexOne = [('매출', year) for year in 매출]
indexTwo = [('순익', year) for year in 순익]
index = indexOne + indexTwo
index

[('매출', '2015년'),
 ('매출', '2016년'),
 ('매출', '2017년'),
 ('매출', '2018년'),
 ('매출', '2019년'),
 ('매출', '2020년'),
 ('순익', '2015년'),
 ('순익', '2016년'),
 ('순익', '2017년'),
 ('순익', '2018년'),
 ('순익', '2019년'),
 ('순익', '2020년')]

In [None]:
# 매출.keys()
# ['매출' for i in range(len(매출.keys()))]

In [112]:
index = pd.MultiIndex.from_tuples(index)
index

MultiIndex(levels=[['매출', '순익'], ['2015년', '2016년', '2017년', '2018년', '2019년', '2020년']],
           codes=[[0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1], [0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5]])

In [116]:
# Flatten both dicts' values into one list, revenue first and then profit,
# the same order in which the index tuples were built.
값 = [*매출.values(), *순익.values()]

매출.values()
순익.values()
값

[1000000,
 2000000,
 3000000,
 4000000,
 11000000,
 30000000,
 100001,
 200001,
 300001,
 400001,
 1100001,
 3000001]

In [118]:
result = pd.Series(값, index=index)
result

매출  2015년     1000000
    2016년     2000000
    2017년     3000000
    2018년     4000000
    2019년    11000000
    2020년    30000000
순익  2015년      100001
    2016년      200001
    2017년      300001
    2018년      400001
    2019년     1100001
    2020년     3000001
dtype: int64

In [122]:
result['매출'].sum()
result['순익'][-3:].sum()

4500003

In [123]:
result['순익'][-3:]

2018년     400001
2019년    1100001
2020년    3000001
dtype: int64

## 연산과 집계함수

In [124]:
# Two aligned integer Series (default 0..4 index) used by the arithmetic and
# aggregation examples that follow.
s = pd.Series(range(100, 600, 100))
ss = pd.Series(range(10, 60, 10))

In [125]:
s + 100

0    200
1    300
2    400
3    500
4    600
dtype: int64

In [127]:
s.add(100)

0    200
1    300
2    400
3    500
4    600
dtype: int64

In [128]:
s + ss

0    110
1    220
2    330
3    440
4    550
dtype: int64

In [129]:
s.add(ss)

0    110
1    220
2    330
3    440
4    550
dtype: int64

In [130]:
s - ss

0     90
1    180
2    270
3    360
4    450
dtype: int64

In [131]:
s.sub(ss)

0     90
1    180
2    270
3    360
4    450
dtype: int64

In [132]:
s * ss

0     1000
1     4000
2     9000
3    16000
4    25000
dtype: int64

In [133]:
s.mul(ss)

0     1000
1     4000
2     9000
3    16000
4    25000
dtype: int64

In [134]:
s // ss

0    10
1    10
2    10
3    10
4    10
dtype: int64

In [135]:
s / ss

0    10.0
1    10.0
2    10.0
3    10.0
4    10.0
dtype: float64

In [136]:
s.floordiv(ss)

0    10
1    10
2    10
3    10
4    10
dtype: int64

In [137]:
s.div(ss)

0    10.0
1    10.0
2    10.0
3    10.0
4    10.0
dtype: float64

In [138]:
s % ss

0    0
1    0
2    0
3    0
4    0
dtype: int64

In [139]:
s.mod(ss)

0    0
1    0
2    0
3    0
4    0
dtype: int64

In [140]:
s.mod(3)

0    1
1    2
2    0
3    1
4    2
dtype: int64

In [141]:
s

0    100
1    200
2    300
3    400
4    500
dtype: int64

In [143]:
s ** 3

0      1000000
1      8000000
2     27000000
3     64000000
4    125000000
dtype: int64

In [144]:
s.pow(3)

0      1000000
1      8000000
2     27000000
3     64000000
4    125000000
dtype: int64

In [151]:
s.count()

5

In [161]:
# Basic aggregates of s. Only the last expression is displayed.
s.min()
s.max()
s.mean()  # mean
s.median()  # median
s.sum()
s.std()  # standard deviation
s.var()  # variance
# Mean absolute deviation. Series.mad() was removed in pandas 2.0, so it is
# computed directly from its definition: the mean of |x - mean(x)|.
(s - s.mean()).abs().mean()

25000.0

In [166]:
s.describe()  # shows the basic summary statistics in one call

count      5.000000
mean     300.000000
std      158.113883
min      100.000000
25%      200.000000
50%      300.000000
75%      400.000000
max      500.000000
dtype: float64

In [169]:
s.head(3)

0    100
1    200
2    300
dtype: int64

In [170]:
s.tail(3)

2    300
3    400
4    500
dtype: int64

## 데이터 결합

In [3]:
import numpy as np

# Three independent 2x5 arrays holding 0..9, used as concatenation inputs.
a, b, c = (np.arange(10).reshape(2, 5) for _ in range(3))

a

array([[0, 1, 2, 3, 4],
       [5, 6, 7, 8, 9]])

In [4]:
np.concatenate([a, b, c])

array([[0, 1, 2, 3, 4],
       [5, 6, 7, 8, 9],
       [0, 1, 2, 3, 4],
       [5, 6, 7, 8, 9],
       [0, 1, 2, 3, 4],
       [5, 6, 7, 8, 9]])

In [7]:
np.concatenate([a, b, c], axis=1)

array([[0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4],
       [5, 6, 7, 8, 9, 5, 6, 7, 8, 9, 5, 6, 7, 8, 9]])

In [9]:
import pandas as pd

# Three identical Series labelled 1..5, used as concatenation inputs.
a, b, c = (
    pd.Series(['A', 'B', 'C', 'D', 'E'], index=range(1, 6))
    for _ in range(3)
)

a

1    A
2    B
3    C
4    D
5    E
dtype: object

In [12]:
np.concatenate([a, b, c])

array(['A', 'B', 'C', 'D', 'E', 'A', 'B', 'C', 'D', 'E', 'A', 'B', 'C',
       'D', 'E'], dtype=object)

In [13]:
pd.concat([a, b, c])

1    A
2    B
3    C
4    D
5    E
1    A
2    B
3    C
4    D
5    E
1    A
2    B
3    C
4    D
5    E
dtype: object

In [14]:
pd.concat([a, b, c])[5]

5    E
5    E
5    E
dtype: object

In [19]:
pd.concat([a, b, c], verify_integrity=False)

1    A
2    B
3    C
4    D
5    E
1    A
2    B
3    C
4    D
5    E
1    A
2    B
3    C
4    D
5    E
dtype: object

In [30]:
# axis=1 places the three Series side by side, so the result is a table with
# one column per input (columns 0, 1, 2 in the output) rather than a Series.
d = pd.concat([a, b, c], verify_integrity=False, ignore_index=True, copy=False, axis=1)
type(d)
d

Unnamed: 0,0,1,2
1,A,A,A
2,B,B,B
3,C,C,C
4,D,D,D
5,E,E,E


In [24]:
# Series.append() was removed in pandas 2.0; pd.concat() is the supported way
# to stack one Series under another and produces the same result.
pd.concat([a, b])

1    A
2    B
3    C
4    D
5    E
1    A
2    B
3    C
4    D
5    E
dtype: object

In [35]:
# a carries one extra row (label 6) that b and c lack.
a = pd.Series(['A', 'B', 'C', 'D', 'E', 'F'], index=range(1, 7))
b, c = (
    pd.Series(['A', 'B', 'C', 'D', 'E'], index=range(1, 6))
    for _ in range(2)
)

# Column-wise concat aligns on the index; every label is kept here, so row 6
# is missing in the last two columns.
d = pd.concat(objs=[a, b, c], axis=1)
type(d)
d

Unnamed: 0,0,1,2
1,A,A,A
2,B,B,B
3,C,C,C
4,D,D,D
5,E,E,E
6,F,,


In [36]:
pd.concat([a, b, c], axis=1, join='inner')

Unnamed: 0,0,1,2
1,A,A,A
2,B,B,B
3,C,C,C
4,D,D,D
5,E,E,E


In [37]:
pd.concat([a, b, c], axis=1, join='outer')

Unnamed: 0,0,1,2
1,A,A,A
2,B,B,B
3,C,C,C
4,D,D,D
5,E,E,E
6,F,,


## DataFrame

In [None]:
연차  연도      매출    순익   직원수
1     2015   1000000  100001        1
2     2016   2000000  200001        2
3     2017   3000000  300001        4
4     2018   4000000  400001        8
5     2019   8000000  800001       16
6     2020  16000000 1600001       32

In [39]:
# Column-oriented company records: one list per column, one position per year.
rawData = dict([
    ('연차', [1, 2, 3, 4, 5, 6]),
    ('연도', [2015, 2016, 2017, 2018, 2019, 2020]),
    ('매출', [1000000, 2000000, 3000000, 4000000, 8000000, 16000000]),
    ('순익', [100001, 200001, 300001, 400001, 800001, 1600001]),
    ('직원수', [1, 2, 4, 8, 16, 32]),
])

In [40]:
pd.DataFrame(rawData)

Unnamed: 0,연차,연도,매출,순익,직원수
0,1,2015,1000000,100001,1
1,2,2016,2000000,200001,2
2,3,2017,3000000,300001,4
3,4,2018,4000000,400001,8
4,5,2019,8000000,800001,16
5,6,2020,16000000,1600001,32


In [42]:
# pd.DataFrame(rawData)[0]
pd.DataFrame(rawData)['연도']

0    2015
1    2016
2    2017
3    2018
4    2019
5    2020
Name: 연도, dtype: int64

In [47]:
pd.DataFrame(rawData).iloc[0:3]
pd.DataFrame(rawData).iloc[-3:]

Unnamed: 0,연차,연도,매출,순익,직원수
3,4,2018,4000000,400001,8
4,5,2019,8000000,800001,16
5,6,2020,16000000,1600001,32


In [54]:
pd.DataFrame(rawData, columns=['연차','매출','순익','직원수'], index=rawData['연도'])

Unnamed: 0,연차,매출,순익,직원수
2015,1,1000000,100001,1
2016,2,2000000,200001,2
2017,3,3000000,300001,4
2018,4,4000000,400001,8
2019,5,8000000,800001,16
2020,6,16000000,1600001,32


In [123]:
%%writefile rawData.csv
1, 2, 3, 4, 5, 6, 7
연차,1, 2, 3, 4, 5, 6
연도,2015, 2016, 2017, 2018, 2019, 2020
매출,1000000, 2000000, 3000000, 4000000, 8000000, 16000000
순익,100001, 200001, 300001, 400001, 800001, 1600001
직원수,1, 2, 4, 8, 16, 32

Overwriting rawData.csv


In [124]:
pd.read_csv('rawData.csv')

Unnamed: 0,1,2,3,4,5,6,7
0,연차,1,2,3,4,5,6
1,연도,2015,2016,2017,2018,2019,2020
2,매출,1000000,2000000,3000000,4000000,8000000,16000000
3,순익,100001,200001,300001,400001,800001,1600001
4,직원수,1,2,4,8,16,32


In [120]:
pd.read_csv('rawData.csv').columns

Index(['1', ' 2', ' 3', ' 4', ' 5', ' 6', ' 7'], dtype='object')

In [121]:
pd.read_csv('rawData.csv').index

RangeIndex(start=0, stop=5, step=1)

## DataFrame의 데이터 조작

In [1]:
import pandas as pd

# Column name -> list of per-year values, zipped together from parallel lists.
rawData = dict(zip(
    ['연차', '연도', '매출', '순익', '직원수'],
    [
        [1, 2, 3, 4, 5, 6],
        [2015, 2016, 2017, 2018, 2019, 2020],
        [1000000, 2000000, 3000000, 4000000, 8000000, 16000000],
        [100001, 200001, 300001, 400001, 800001, 1600001],
        [1, 2, 4, 8, 16, 32],
    ],
))

# One row per year, one column per dict key.
df = pd.DataFrame(data=rawData)

In [2]:
df

Unnamed: 0,연차,연도,매출,순익,직원수
0,1,2015,1000000,100001,1
1,2,2016,2000000,200001,2
2,3,2017,3000000,300001,4
3,4,2018,4000000,400001,8
4,5,2019,8000000,800001,16
5,6,2020,16000000,1600001,32


In [4]:
df['매출']
df.매출

0     1000000
1     2000000
2     3000000
3     4000000
4     8000000
5    16000000
Name: 매출, dtype: int64

In [7]:
# Net profit as a percentage of revenue, stored as a new column.
df['순이익율'] = df['순익'].div(df['매출']).mul(100)
df

Unnamed: 0,연차,연도,매출,순익,직원수,순이익율
0,1,2015,1000000,100001,1,10.0001
1,2,2016,2000000,200001,2,10.00005
2,3,2017,3000000,300001,4,10.000033
3,4,2018,4000000,400001,8,10.000025
4,5,2019,8000000,800001,16,10.000013
5,6,2020,16000000,1600001,32,10.000006


In [8]:
df['test'] = 100
df

Unnamed: 0,연차,연도,매출,순익,직원수,순이익율,test
0,1,2015,1000000,100001,1,10.0001,100
1,2,2016,2000000,200001,2,10.00005,100
2,3,2017,3000000,300001,4,10.000033,100
3,4,2018,4000000,400001,8,10.000025,100
4,5,2019,8000000,800001,16,10.000013,100
5,6,2020,16000000,1600001,32,10.000006,100


In [10]:
import numpy as np

df['testTwo'] = np.nan
df

Unnamed: 0,연차,연도,매출,순익,직원수,순이익율,test,testTwo
0,1,2015,1000000,100001,1,10.0001,100,
1,2,2016,2000000,200001,2,10.00005,100,
2,3,2017,3000000,300001,4,10.000033,100,
3,4,2018,4000000,400001,8,10.000025,100,
4,5,2019,8000000,800001,16,10.000013,100,
5,6,2020,16000000,1600001,32,10.000006,100,


In [11]:
df['testThree'] = None
df

Unnamed: 0,연차,연도,매출,순익,직원수,순이익율,test,testTwo,testThree
0,1,2015,1000000,100001,1,10.0001,100,,
1,2,2016,2000000,200001,2,10.00005,100,,
2,3,2017,3000000,300001,4,10.000033,100,,
3,4,2018,4000000,400001,8,10.000025,100,,
4,5,2019,8000000,800001,16,10.000013,100,,
5,6,2020,16000000,1600001,32,10.000006,100,,


In [13]:
df[['test', 'testTwo', 'testThree']] = 1000
df

Unnamed: 0,연차,연도,매출,순익,직원수,순이익율,test,testTwo,testThree
0,1,2015,1000000,100001,1,10.0001,1000,1000,1000
1,2,2016,2000000,200001,2,10.00005,1000,1000,1000
2,3,2017,3000000,300001,4,10.000033,1000,1000,1000
3,4,2018,4000000,400001,8,10.000025,1000,1000,1000
4,5,2019,8000000,800001,16,10.000013,1000,1000,1000
5,6,2020,16000000,1600001,32,10.000006,1000,1000,1000


In [14]:
del df['test']
df

Unnamed: 0,연차,연도,매출,순익,직원수,순이익율,testTwo,testThree
0,1,2015,1000000,100001,1,10.0001,1000,1000
1,2,2016,2000000,200001,2,10.00005,1000,1000
2,3,2017,3000000,300001,4,10.000033,1000,1000
3,4,2018,4000000,400001,8,10.000025,1000,1000
4,5,2019,8000000,800001,16,10.000013,1000,1000
5,6,2020,16000000,1600001,32,10.000006,1000,1000


In [15]:
df.drop(['testTwo'], axis='columns', inplace=True)  # inplace=True applies the change directly to the original df
df

Unnamed: 0,연차,연도,매출,순익,직원수,순이익율,testThree
0,1,2015,1000000,100001,1,10.0001,1000
1,2,2016,2000000,200001,2,10.00005,1000
2,3,2017,3000000,300001,4,10.000033,1000
3,4,2018,4000000,400001,8,10.000025,1000
4,5,2019,8000000,800001,16,10.000013,1000
5,6,2020,16000000,1600001,32,10.000006,1000


In [16]:
df.drop(['testThree'], axis='columns', inplace=True)  # inplace=True applies the change directly to the original df
df

Unnamed: 0,연차,연도,매출,순익,직원수,순이익율
0,1,2015,1000000,100001,1,10.0001
1,2,2016,2000000,200001,2,10.00005
2,3,2017,3000000,300001,4,10.000033
3,4,2018,4000000,400001,8,10.000025
4,5,2019,8000000,800001,16,10.000013
5,6,2020,16000000,1600001,32,10.000006


In [17]:
df.drop(df.columns[[0, 2]], axis='columns')

Unnamed: 0,연도,순익,직원수,순이익율
0,2015,100001,1,10.0001
1,2016,200001,2,10.00005
2,2017,300001,4,10.000033
3,2018,400001,8,10.000025
4,2019,800001,16,10.000013
5,2020,1600001,32,10.000006


In [35]:
df

Unnamed: 0,연차,연도,매출,순익,직원수,순이익율
0,1,2015,1000000,100001,1,10.0001
1,2,2016,2000000,200001,2,10.00005
2,3,2017,3000000,300001,4,10.000033
3,4,2018,4000000,400001,8,10.000025
4,5,2019,8000000,800001,16,10.000013
5,6,2020,16000000,1600001,32,10.000006


In [43]:
# DataFrame.append() was removed in pandas 2.0, so the new 2021 row is stacked
# on top of df with pd.concat() instead. The new row has no '순이익율' value,
# so that cell is NaN. sort=True orders the non-matching columns
# alphabetically, which is the column order shown in the recorded output.
dfTwo = pd.concat(
    [
        pd.DataFrame(np.array([[7, 2021, 160000000, 16000001, 60]]),
                     columns=['연차','연도','매출','순익','직원수']),
        df,
    ],
    ignore_index=True,
    sort=True,
)
# dfTwo.ndim
dfTwo

Unnamed: 0,매출,순이익율,순익,연도,연차,직원수
0,160000000,,16000001,2021,7,60
1,1000000,10.0001,100001,2015,1,1
2,2000000,10.00005,200001,2016,2,2
3,3000000,10.000033,300001,2017,3,4
4,4000000,10.000025,400001,2018,4,8
5,8000000,10.000013,800001,2019,5,16
6,16000000,10.000006,1600001,2020,6,32


In [44]:
dfTwo.drop([0], inplace=True)
dfTwo

Unnamed: 0,매출,순이익율,순익,연도,연차,직원수
1,1000000,10.0001,100001,2015,1,1
2,2000000,10.00005,200001,2016,2,2
3,3000000,10.000033,300001,2017,3,4
4,4000000,10.000025,400001,2018,4,8
5,8000000,10.000013,800001,2019,5,16
6,16000000,10.000006,1600001,2020,6,32


In [45]:
df

Unnamed: 0,연차,연도,매출,순익,직원수,순이익율
0,1,2015,1000000,100001,1,10.0001
1,2,2016,2000000,200001,2,10.00005
2,3,2017,3000000,300001,4,10.000033
3,4,2018,4000000,400001,8,10.000025
4,5,2019,8000000,800001,16,10.000013
5,6,2020,16000000,1600001,32,10.000006


In [47]:
df[df.매출 > 5000000]

Unnamed: 0,연차,연도,매출,순익,직원수,순이익율
4,5,2019,8000000,800001,16,10.000013
5,6,2020,16000000,1600001,32,10.000006


In [49]:
# df[df.매출 > 5000000, ['순익', '직원수']]  -> Error: plain [] does not take a (rows, columns) pair
# .loc accepts a boolean row mask and a list of column labels together.
df.loc[df.매출 > 5000000, ['순익', '직원수']]

Unnamed: 0,순익,직원수
4,800001,16
5,1600001,32


In [50]:
df[df.직원수 > 10]['순익'] - 10000

4     790001
5    1590001
Name: 순익, dtype: int64

In [52]:
# Pitfall: the right-hand side only contains the rows where 직원수 > 10, so
# assigning it to the whole column leaves every other row as NaN (see the
# output). The cell is also not idempotent: each re-run subtracts another
# 10000 from the rows that remain.
df['순익'] = df[df.직원수 > 10]['순익'] - 10000
df

Unnamed: 0,연차,연도,매출,순익,직원수,순이익율
0,1,2015,1000000,,1,10.0001
1,2,2016,2000000,,2,10.00005
2,3,2017,3000000,,4,10.000033
3,4,2018,4000000,,8,10.000025
4,5,2019,8000000,780001.0,16,10.000013
5,6,2020,16000000,1580001.0,32,10.000006


In [54]:
# Rebuild a clean frame so the adjustment starts from the original figures.
rawData = dict([
    ('연차', [1, 2, 3, 4, 5, 6]),
    ('연도', [2015, 2016, 2017, 2018, 2019, 2020]),
    ('매출', [1000000, 2000000, 3000000, 4000000, 8000000, 16000000]),
    ('순익', [100001, 200001, 300001, 400001, 800001, 1600001]),
    ('직원수', [1, 2, 4, 8, 16, 32]),
])
df = pd.DataFrame(data=rawData)

# Subtract 10000 from 순익 only where 직원수 exceeds 10; all other rows keep
# their original value.
df['순익'] = np.where(df['직원수'].gt(10), df['순익'].sub(10000), df['순익'])
df

Unnamed: 0,연차,연도,매출,순익,직원수
0,1,2015,1000000,100001,1
1,2,2016,2000000,200001,2
2,3,2017,3000000,300001,4
3,4,2018,4000000,400001,8
4,5,2019,8000000,790001,16
5,6,2020,16000000,1590001,32


In [57]:
df.loc[6] = df.loc[5] * 2
df

Unnamed: 0,연차,연도,매출,순익,직원수
0,1,2015,1000000,100001,1
1,2,2016,2000000,200001,2
2,3,2017,3000000,300001,4
3,4,2018,4000000,400001,8
4,5,2019,8000000,790001,16
5,6,2020,16000000,1590001,32
6,12,4040,32000000,3180002,64


In [58]:
df['순이익율'] = (df['순익'] / df['매출'])*100

In [60]:
df.loc[6] = df.loc[5] * 2
df

Unnamed: 0,연차,연도,매출,순익,직원수,순이익율
0,1.0,2015.0,1000000.0,100001.0,1.0,10.0001
1,2.0,2016.0,2000000.0,200001.0,2.0,10.00005
2,3.0,2017.0,3000000.0,300001.0,4.0,10.000033
3,4.0,2018.0,4000000.0,400001.0,8.0,10.000025
4,5.0,2019.0,8000000.0,790001.0,16.0,9.875012
5,6.0,2020.0,16000000.0,1590001.0,32.0,9.937506
6,12.0,4040.0,32000000.0,3180002.0,64.0,19.875013


In [61]:
# Correct the doubled year and seniority in row 6. A single .loc[row, column]
# assignment writes to df itself; the chained form df[column][row] = value
# assigns through an intermediate object, which raises SettingWithCopyWarning
# and has no effect when pandas Copy-on-Write is enabled.
df.loc[6, '연도'] = 2021
df.loc[6, '연차'] = 7
df

Unnamed: 0,연차,연도,매출,순익,직원수,순이익율
0,1.0,2015.0,1000000.0,100001.0,1.0,10.0001
1,2.0,2016.0,2000000.0,200001.0,2.0,10.00005
2,3.0,2017.0,3000000.0,300001.0,4.0,10.000033
3,4.0,2018.0,4000000.0,400001.0,8.0,10.000025
4,5.0,2019.0,8000000.0,790001.0,16.0,9.875012
5,6.0,2020.0,16000000.0,1590001.0,32.0,9.937506
6,7.0,2021.0,32000000.0,3180002.0,64.0,19.875013


In [62]:
df.dtypes

연차      float64
연도      float64
매출      float64
순익      float64
직원수     float64
순이익율    float64
dtype: object

In [63]:
df['연차'] = df['연차'].astype('int')
df['연도'] = df['연도'].astype('int')
df

Unnamed: 0,연차,연도,매출,순익,직원수,순이익율
0,1,2015,1000000.0,100001.0,1.0,10.0001
1,2,2016,2000000.0,200001.0,2.0,10.00005
2,3,2017,3000000.0,300001.0,4.0,10.000033
3,4,2018,4000000.0,400001.0,8.0,10.000025
4,5,2019,8000000.0,790001.0,16.0,9.875012
5,6,2020,16000000.0,1590001.0,32.0,9.937506
6,7,2021,32000000.0,3180002.0,64.0,19.875013


In [65]:
pd.Series([1, '2', '3', 'hojun', True, 10.1])

0        1
1        2
2        3
3    hojun
4     True
5     10.1
dtype: object

In [66]:
# With errors='ignore' the input comes back unchanged when an element cannot
# be parsed (the output is still dtype object).
# NOTE(review): errors='ignore' appears to be deprecated in recent pandas (2.2+) — confirm before relying on it.
pd.to_numeric(pd.Series([1, '2', '3', 'hojun', True, 10.1]), errors='ignore')

0        1
1        2
2        3
3    hojun
4     True
5     10.1
dtype: object

In [67]:
pd.to_numeric(pd.Series([1, '2', '3', 'hojun', True, 10.1]), errors='coerce')

0     1.0
1     2.0
2     3.0
3     NaN
4     1.0
5    10.1
dtype: float64

## MultiIndex

In [14]:
import numpy as np

np.random.rand(4, 2) # uniform distribution between 0 and 1, builds a 4x2 matrix
np.random.randint(10) # a single integer between 0 and 9
np.random.randint(10, 20, size=10)
np.random.randint(10, 20, size=(3, 5))
np.random.randn(4, 2) # standard normal (Gaussian) distribution, builds a 4x2 matrix
np.unique([1, 1, 1, 2, 2, 3]) # removes duplicate values
np.random.choice(10, 5, replace=False)  # picks 5 values; replace=False means no value is picked twice

array([6, 7, 3, 8, 1])

In [15]:
import pandas as pd

# Random scores from 50 to 99 for 2 grades x 2 classes (rows) and 3 subjects
# (columns). Passing a list of two label lists as `index` gives a two-level
# row index.
df = pd.DataFrame(
    data=np.random.randint(low=50, high=100, size=(4, 3)),
    index=[
        ['1학년', '1학년', '2학년', '2학년'],
        ['1반', '2반', '1반', '2반'],
    ],
    columns=['국', '영', '수'],
)

df

Unnamed: 0,Unnamed: 1,국,영,수
1학년,1반,77,83,60
1학년,2반,94,54,92
2학년,1반,81,70,75
2학년,2반,65,99,52


In [16]:
df = pd.DataFrame(np.random.randint(50, 100, size=(4, 3)))

df

Unnamed: 0,0,1,2
0,63,66,82
1,65,65,78
2,52,52,88
3,50,73,74


In [17]:
df.index

RangeIndex(start=0, stop=4, step=1)

In [18]:
df.index = ['1반', '2반', '1반', '2반']

df

Unnamed: 0,0,1,2
1반,63,66,82
2반,65,65,78
1반,52,52,88
2반,50,73,74


In [19]:
df.columns

RangeIndex(start=0, stop=3, step=1)

In [20]:
df.columns = ['국', '영', '수']

df

Unnamed: 0,국,영,수
1반,63,66,82
2반,65,65,78
1반,52,52,88
2반,50,73,74


In [21]:
df.index = [['1학년', '1학년', '2학년', '2학년'], ['1반', '2반', '1반', '2반']]

df

Unnamed: 0,Unnamed: 1,국,영,수
1학년,1반,63,66,82
1학년,2반,65,65,78
2학년,1반,52,52,88
2학년,2반,50,73,74


In [22]:
df.columns = [['언어', '언어', '수리'],['국', '영', '수']]

df

Unnamed: 0_level_0,Unnamed: 1_level_0,언어,언어,수리
Unnamed: 0_level_1,Unnamed: 1_level_1,국,영,수
1학년,1반,63,66,82
1학년,2반,65,65,78
2학년,1반,52,52,88
2학년,2반,50,73,74


In [23]:
df['언어']

Unnamed: 0,Unnamed: 1,국,영
1학년,1반,63,66
1학년,2반,65,65
2학년,1반,52,52
2학년,2반,50,73


In [24]:
df['수리']

Unnamed: 0,Unnamed: 1,수
1학년,1반,82
1학년,2반,78
2학년,1반,88
2학년,2반,74


In [25]:
df['언어']['국']

1학년  1반    63
     2반    65
2학년  1반    52
     2반    50
Name: 국, dtype: int32

In [26]:
df.loc['1학년']

Unnamed: 0_level_0,언어,언어,수리
Unnamed: 0_level_1,국,영,수
1반,63,66,82
2반,65,65,78


In [28]:
df.loc['1학년', '1반']

언어  국    63
    영    66
수리  수    82
Name: (1학년, 1반), dtype: int32

In [31]:
df.index = [['제주고', '제주고', '제주고', '제주고'], ['1학년', '1학년', '2학년', '2학년'], ['1반', '2반', '1반', '2반']]

df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,언어,언어,수리
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,국,영,수
제주고,1학년,1반,63,66,82
제주고,1학년,2반,65,65,78
제주고,2학년,1반,52,52,88
제주고,2학년,2반,50,73,74
