In [1]:
import pandas as pd
from pandas import Series, DataFrame

# 5-1. 산술연산
## 1) Series 산술연산
### - 같은 인덱스 라벨을 가진 항목끼리 산술 연산을 수행함.
### - 겹치는 인덱스가 없다면, 데이터는 NA가 된다. 
<img src="img/산술연산.jpg" alt="산술연산표" style="width: 800px;"/>

In [2]:
sr1 = Series([1,2,3], index = list('abc'))
sr2 =  Series([4,5,6], index = list('abd'))

In [3]:
#같은 인덱스 라벨을 가진 항목끼리 산술 연산을 수행함.
#겹치는 인덱스가 없다면, 데이터는 NA가 된다. 
sr1+sr2

a    5.0
b    7.0
c    NaN
d    NaN
dtype: float64

In [6]:
#sr1 + sr2와 동일하지만, 함수로 사용하는 경우에는 다양한 파라미터 설정이 가능해짐.
sr1.add(sr2)

a    5.0
b    7.0
c    NaN
d    NaN
dtype: float64

In [4]:
#산술 연산 시, 데이터가 없는 경우 fill_value인자의 값으로 간주함.
sr1.add(sr2, fill_value = 0)

a    5.0
b    7.0
c    3.0
d    6.0
dtype: float64

## 2) DataFrame 산술연산

In [5]:
arr1 = [[0, 1, 2],
       [3, 4, 5],
       [6, 7, 8]]
df1 = DataFrame(arr1, columns = list('bcd'), index = ['Ohio', 'Texas', 'Colorado'])
df1

Unnamed: 0,b,c,d
Ohio,0,1,2
Texas,3,4,5
Colorado,6,7,8


In [7]:
arr2 = [[ 0,  1,  2],
       [ 3,  4,  5],
       [ 6,  7,  8],
       [ 9, 10, 11]]
df2 = DataFrame(arr2, columns = list('bde'), index = ['Utah', 'Ohio', 'Texas', 'Oregon'])
df2

Unnamed: 0,b,d,e
Utah,0,1,2
Ohio,3,4,5
Texas,6,7,8
Oregon,9,10,11


In [8]:
# 컬럼과 로우 인덱스 모두 동일한 항목끼리 연산이 수행됨.
# 둘 중 하나라도 없으면 NaN 
df1+df2

Unnamed: 0,b,c,d,e
Colorado,,,,
Ohio,3.0,,6.0,
Oregon,,,,
Texas,9.0,,12.0,
Utah,,,,


In [9]:
df1.add(df2,fill_value = 0)
#fill_value = <값> : Missing Value를 <값>으로 간주함. 두 데이터프레임에 모두 없는 경우에는 결과도 Missing
# http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.add.html#pandas.DataFrame.add

Unnamed: 0,b,c,d,e
Colorado,6.0,7.0,8.0,
Ohio,3.0,1.0,6.0,5.0
Oregon,9.0,,10.0,11.0
Texas,9.0,4.0,12.0,8.0
Utah,0.0,,1.0,2.0


## DataFrame 산술 연산
<img src="img/데이터프레임add.jpg" alt="Add" style="width: 1000px;"/>

## 3) DataFrame과 Series 간의 연산

### 기본적으로 DataFrame과 Series 간의 산술연산은 Series의 색인을 DataFrame의 칼럼에 맞추고 아래로 전파한다.

In [10]:
df = DataFrame([[1,3, 5, 7],[2,4, 6, 8], [10, 20, 30, 40]], 
                          columns = list('ABCD'), index = ['X','Y','Z'])
df

Unnamed: 0,A,B,C,D
X,1,3,5,7
Y,2,4,6,8
Z,10,20,30,40


In [11]:
#A, B, C, D 컬럼 각각에 10, 20, 30, 40을 더하기
sr = Series([10,20,30,40],index = ['A','B','C','D'])

In [13]:
df + sr

Unnamed: 0,A,B,C,D
X,11,23,35,47
Y,12,24,36,48
Z,20,40,60,80


In [26]:
#실습. X,Y,Z 로우에 각각에 10, 100, 1000을 더하시오.
sr2 = pd.Series([10,100,1000],index = ['X','Y','Z'])
df.add(sr2,axis = 0)

Unnamed: 0,A,B,C,D
X,11,13,15,17
Y,102,104,106,108
Z,1010,1020,1030,1040


## 4) 함수 적용과 매핑

### apply() :각 로우나 컬럼의 1차원 배열에 함수를 적용
### applymap() : 각 항목에 함수를 적용

#### 4-1) apply() 함수

In [30]:
def diff(x):
    return x.max()-x.min()

In [31]:
diff(sr1)

2

In [32]:
df1

Unnamed: 0,b,c,d
Ohio,0,1,2
Texas,3,4,5
Colorado,6,7,8


In [33]:
df1.apply(diff)

b    6
c    6
d    6
dtype: int64

In [34]:
df1.apply(diff,axis=1)

Ohio        2
Texas       2
Colorado    2
dtype: int64

#### 4-2) applymap() 함수

In [36]:
def mul2(x):
    return x*2

In [37]:
df1

Unnamed: 0,b,c,d
Ohio,0,1,2
Texas,3,4,5
Colorado,6,7,8


In [38]:
df1.applymap(mul2)

Unnamed: 0,b,c,d
Ohio,0,2,4
Texas,6,8,10
Colorado,12,14,16


In [39]:
diff(df1)
#df1.apply(diff)

b    6
c    6
d    6
dtype: int64

In [40]:
mul2(df1)
#df1.apply(mul2)

Unnamed: 0,b,c,d
Ohio,0,2,4
Texas,6,8,10
Colorado,12,14,16


## 5. 컬럼 추가하기

In [41]:
df1

Unnamed: 0,b,c,d
Ohio,0,1,2
Texas,3,4,5
Colorado,6,7,8


In [45]:
df1['e'] = 10
df1

Unnamed: 0,b,c,d,e
Ohio,0,1,2,10
Texas,3,4,5,10
Colorado,6,7,8,10


In [46]:

df1['f'] = df1['d'] + df1['e']
df1

Unnamed: 0,b,c,d,e,f
Ohio,0,1,2,10,12
Texas,3,4,5,10,15
Colorado,6,7,8,10,18


In [51]:
#실습. df1에 'g'라는 새로운 컬럼을 추가하고, 값은 각 로우(row)에 있는 값들 중 가장 작은 값으로 할당
df1['g'] = df1.apply(min,axis = 1)
df1

Unnamed: 0,b,c,d,e,f,g
Ohio,0,1,2,10,12,0
Texas,3,4,5,10,15,3
Colorado,6,7,8,10,18,6


In [53]:
# 실습을 위한 데이터 읽기 (NC Dinos.xlsx의 첫번째 시트값만 읽어오기)
# 첫번째 시트는 2013년도 선수 기록 정보
NC13 = pd.read_excel('data/NC Dinos.xlsx',sheet_name = '2013')
NC13

Unnamed: 0,선수명,팀명,경기,타석,타수,안타,홈런,득점,타점,볼넷,삼진,도루,BABIP,타율,출루율,장타율,OPS,wOBA,WAR
0,모창민,NC,108,436,395,109,12,57,51,37,68,16,0.307,0.276,0.339,0.443,0.782,0.353,2.31
1,이호준,NC,126,508,442,123,20,46,87,60,109,2,0.324,0.278,0.362,0.475,0.837,0.373,1.85
2,김종호,NC,128,546,465,129,0,72,22,57,100,50,0.352,0.277,0.376,0.333,0.709,0.339,1.55
3,나성범,NC,104,458,404,98,14,55,64,33,95,12,0.279,0.243,0.319,0.416,0.735,0.329,1.5
4,조영훈,NC,120,426,380,107,6,38,39,39,56,4,0.316,0.282,0.35,0.413,0.763,0.348,0.83
5,이현곤,NC,91,161,139,38,0,10,9,16,14,2,0.304,0.273,0.361,0.324,0.685,0.327,0.52
6,이상호,NC,102,138,125,31,0,26,13,9,21,24,0.298,0.248,0.299,0.320,0.619,0.289,0.16
7,강진성,NC,3,3,2,1,0,1,0,1,0,0,0.500,0.500,0.667,1.000,1.667,0.671,0.1
8,조평호,NC,26,86,79,21,2,12,7,6,24,1,0.358,0.266,0.318,0.418,0.736,0.329,0.09
9,박민우,NC,32,48,42,11,0,10,6,5,7,9,0.306,0.262,0.333,0.286,0.619,0.296,0.07


In [54]:
# 실습. '연도' 라는 새로운 컬럼을 추가하고, 값을 2013 으로 초기화하기
NC13['연도'] = 2013
NC13

Unnamed: 0,선수명,팀명,경기,타석,타수,안타,홈런,득점,타점,볼넷,삼진,도루,BABIP,타율,출루율,장타율,OPS,wOBA,WAR,연도
0,모창민,NC,108,436,395,109,12,57,51,37,68,16,0.307,0.276,0.339,0.443,0.782,0.353,2.31,2013
1,이호준,NC,126,508,442,123,20,46,87,60,109,2,0.324,0.278,0.362,0.475,0.837,0.373,1.85,2013
2,김종호,NC,128,546,465,129,0,72,22,57,100,50,0.352,0.277,0.376,0.333,0.709,0.339,1.55,2013
3,나성범,NC,104,458,404,98,14,55,64,33,95,12,0.279,0.243,0.319,0.416,0.735,0.329,1.5,2013
4,조영훈,NC,120,426,380,107,6,38,39,39,56,4,0.316,0.282,0.35,0.413,0.763,0.348,0.83,2013
5,이현곤,NC,91,161,139,38,0,10,9,16,14,2,0.304,0.273,0.361,0.324,0.685,0.327,0.52,2013
6,이상호,NC,102,138,125,31,0,26,13,9,21,24,0.298,0.248,0.299,0.320,0.619,0.289,0.16,2013
7,강진성,NC,3,3,2,1,0,1,0,1,0,0,0.500,0.500,0.667,1.000,1.667,0.671,0.1,2013
8,조평호,NC,26,86,79,21,2,12,7,6,24,1,0.358,0.266,0.318,0.418,0.736,0.329,0.09,2013
9,박민우,NC,32,48,42,11,0,10,6,5,7,9,0.306,0.262,0.333,0.286,0.619,0.296,0.07,2013


In [55]:
# 실습. 안타와 홈런의 수를 합한 값을 '안타홈런'이라는 컬럼으로 추가
NC13['안타홈런'] = NC13['안타'] + NC13['홈런']
NC13

Unnamed: 0,선수명,팀명,경기,타석,타수,안타,홈런,득점,타점,볼넷,...,도루,BABIP,타율,출루율,장타율,OPS,wOBA,WAR,연도,안타홈런
0,모창민,NC,108,436,395,109,12,57,51,37,...,16,0.307,0.276,0.339,0.443,0.782,0.353,2.31,2013,121
1,이호준,NC,126,508,442,123,20,46,87,60,...,2,0.324,0.278,0.362,0.475,0.837,0.373,1.85,2013,143
2,김종호,NC,128,546,465,129,0,72,22,57,...,50,0.352,0.277,0.376,0.333,0.709,0.339,1.55,2013,129
3,나성범,NC,104,458,404,98,14,55,64,33,...,12,0.279,0.243,0.319,0.416,0.735,0.329,1.5,2013,112
4,조영훈,NC,120,426,380,107,6,38,39,39,...,4,0.316,0.282,0.35,0.413,0.763,0.348,0.83,2013,113
5,이현곤,NC,91,161,139,38,0,10,9,16,...,2,0.304,0.273,0.361,0.324,0.685,0.327,0.52,2013,38
6,이상호,NC,102,138,125,31,0,26,13,9,...,24,0.298,0.248,0.299,0.320,0.619,0.289,0.16,2013,31
7,강진성,NC,3,3,2,1,0,1,0,1,...,0,0.500,0.500,0.667,1.000,1.667,0.671,0.1,2013,1
8,조평호,NC,26,86,79,21,2,12,7,6,...,1,0.358,0.266,0.318,0.418,0.736,0.329,0.09,2013,23
9,박민우,NC,32,48,42,11,0,10,6,5,...,9,0.306,0.262,0.333,0.286,0.619,0.296,0.07,2013,11


In [56]:
def 안타홈런더하기(x):
    return x.홈런 + x.안타

In [57]:
NC13['안타홈런'] = NC13.apply(안타홈런더하기)

AttributeError: ("'Series' object has no attribute '홈런'", 'occurred at index 선수명')