In [1]:
import numpy as np
import pandas as pd

In [2]:
# Widen the pandas display limits (column count, total width, cell text width)
# in a single call; set_option accepts alternating option-name/value pairs.
pd.set_option(
    'display.max_columns', 50,
    'display.width', 100,
    'display.max_colwidth', 250,
)

## 데이터 프레임에 새로운 항목 추가하기
* 열을 추가할때는 `객체명[새로운컬럼명] = 값목록` 사용
* 행을 추가할때는 `pd.concat([객체명, 새로운행])` 사용 (`객체명.append()`는 pandas 2.0에서 제거됨)

In [3]:
# Create an empty DataFrame (no rows, no columns) and display it.
a = pd.DataFrame()
a

In [5]:
# Add columns: assigning a list to a new column label creates that column.
# Every list has three items, one per student row.
a['name'] = ['혜교', '수지', '지현']
a['kor'] = [76,87,98]
a['eng'] = [54,65,76]
a['mat'] = [43,76,90]
a

Unnamed: 0,name,kor,eng,mat
0,혜교,76,54,43
1,수지,87,65,76
2,지현,98,76,90


In [7]:
# Row addition: start from an empty frame and describe each student as a
# Series whose index labels (name/kor/eng/mat) identify the fields.
b = pd.DataFrame()
hekyo = pd.Series({'name': '혜교', 'kor': 76, 'eng': 54, 'mat': 43})
jihyun = pd.Series({'name': '지현', 'kor': 87, 'eng': 65, 'mat': 76})
suji = pd.Series({'name': '수지', 'kor': 99, 'eng': 98, 'mat': 97})

In [None]:
# Append the three student rows to b.
# DataFrame.append() was removed in pandas 2.0, and even before that it
# returned a new frame instead of modifying b in place, so the result must be
# rebound to b. The original call also passed an undefined name (`ignore`);
# the intended option is ignore_index=True, which renumbers the rows 0..n-1.
# Building all rows first and concatenating once avoids re-copying b per row.
new_rows = pd.DataFrame([hekyo, jihyun, suji])
b = pd.concat([b, new_rows], ignore_index=True)
b

## 데이터프레임에서 rank 다루기
* rank(정렬방법)
* 기본적으로 오름차순으로 처리
* 동일한 값들은 그것들의 평균값으로 순위지정
* method 속성을 이용해서 동일한 값의 순위를 다양하게 지정할 수 있음

In [8]:
a.rank()  # each column is ranked independently (ascending); the result holds rank positions, not the original values

Unnamed: 0,name,kor,eng,mat
0,3.0,1.0,1.0,1.0
1,1.0,2.0,2.0,2.0
2,2.0,3.0,3.0,3.0


In [8]:
# Descending rank: the largest value in each column receives rank 1.
a.rank(ascending=False)

Unnamed: 0,name,kor,eng,mat
0,1.0,3.0,3.0,3.0
1,3.0,2.0,2.0,2.0
2,2.0,1.0,1.0,1.0


In [9]:
# Sample Series containing duplicated values (7 and 4 each appear twice)
# so that the tie-handling behaviour of rank() can be observed.
data = [7, -2, 7, 4, 2, 0, 4]
df = pd.Series(data)

In [10]:
# values:            7, -2, 7, 4, 2, 0, 4
# sorted positions:  7   1  6  5  3  2  4
df.rank()  # default method: tied values receive the average of their positions

0    6.5
1    1.0
2    6.5
3    4.5
4    3.0
5    2.0
6    4.5
dtype: float64

In [12]:
# values:            7, -2, 7, 4, 2, 0, 4
# sorted positions:  7   1  6  5  3  2  4
# method='min': tied values all receive the lowest position in their group.
df.rank(method='min')

0    6.0
1    1.0
2    6.0
3    4.0
4    3.0
5    2.0
6    4.0
dtype: float64

In [13]:
# values:            7, -2, 7, 4, 2, 0, 4
# sorted positions:  7   1  6  5  3  2  4
# method='max': tied values all receive the highest position in their group.
df.rank(method='max')

0    7.0
1    1.0
2    7.0
3    5.0
4    3.0
5    2.0
6    5.0
dtype: float64

In [14]:
# values:            7, -2, 7, 4, 2, 0, 4
# resulting ranks:   6   1  7  4  3  2  5
# method='first': ties are ranked in the order they appear in the data.
df.rank(method='first')

0    6.0
1    1.0
2    7.0
3    4.0
4    3.0
5    2.0
6    5.0
dtype: float64

### 문자열 처리
* 문자열 관련 함수는 Series의 `.str` 접근자(accessor)를 통해 호출함 (문자열이 아닌 값은 결과가 NaN으로 처리됨)

In [15]:
# Sample Series mixing plain strings, a padded string, a missing value (NaN)
# and an integer, to show how the .str methods treat non-string entries.
data = ['Tom','  James  ', np.nan, 12345, 'Vin', 'Steve']
df = pd.Series(data)

In [16]:
# String length: len
# Non-string entries (NaN and the integer 12345) produce NaN.
df.str.len()

0    3.0
1    9.0
2    NaN
3    NaN
4    3.0
5    5.0
dtype: float64

In [19]:
# Case conversion: upper, lower, swapcase
df.str.lower()

0          tom
1      james  
2          NaN
3          NaN
4          vin
5        steve
dtype: object

In [18]:
# Convert every string element to upper case.
df.str.upper()

0          TOM
1      JAMES  
2          NaN
3          NaN
4          VIN
5        STEVE
dtype: object

In [20]:
# Swap case: upper-case letters become lower case and vice versa.
df.str.swapcase()

0          tOM
1      jAMES  
2          NaN
3          NaN
4          vIN
5        sTEVE
dtype: object

In [21]:
# Remove leading and trailing whitespace: strip
df.str.strip()

0      Tom
1    James
2      NaN
3      NaN
4      Vin
5    Steve
dtype: object

In [22]:
# Test whether each element contains a given substring: contains
# The element 12345 is an int, not a str, so it yields NaN rather than True.
df.str.contains('123')

0    False
1    False
2      NaN
3      NaN
4    False
5    False
dtype: object

In [23]:
# The match is case-sensitive: only 'Tom' contains an upper-case 'T'.
df.str.contains('T')

0     True
1    False
2      NaN
3      NaN
4    False
5    False
dtype: object

In [25]:
# Position of a substring: find
# Gives the index of the first occurrence, or -1 when the substring is absent.
df.str.find('T')

0    0.0
1   -1.0
2    NaN
3    NaN
4   -1.0
5   -1.0
dtype: float64

In [26]:
# Leading spaces count toward the index: 'es' starts at position 5 in '  James  '.
df.str.find('es')

0   -1.0
1    5.0
2    NaN
3    NaN
4   -1.0
5   -1.0
dtype: float64

In [27]:
# All occurrences as a list: findall
# Each string element yields a list of matches (empty when nothing matches).
df.str.findall('T')

0    [T]
1     []
2    NaN
3    NaN
4     []
5     []
dtype: object

In [28]:
# Only '  James  ' contains 'es', so it is the only non-empty list.
df.str.findall('es')

0      []
1    [es]
2     NaN
3     NaN
4      []
5      []
dtype: object

### label encoding
* 명목형 문자형값을 코드형 숫자값으로 변환하는 것을 의미
* 머신러닝에서는 문자를 다루는 것보다 숫자를 다루는 것이 더 낫기 때문
* 인코딩된 결과값에는 서수(순서)의 개념이 포함됨 - 순서가 없는 명목형 변수는 분석시 왜곡될 가능성 존재

In [43]:
# Sample frame for the label-encoding examples: three products described by
# colour, size, price and class label. Column names are supplied directly
# to the constructor.
df = pd.DataFrame(
    [
        ['green', 'M', 10.1, 'class1'],
        ['red', 'L', 13.5, 'class2'],
        ['blue', 'XL', 15.3, 'class1'],
    ],
    columns=['color', 'size', 'price', 'classlabel'],
)
df

Unnamed: 0,color,size,price,classlabel
0,green,M,10.1,class1
1,red,L,13.5,class2
2,blue,XL,15.3,class1


In [44]:
# Mapping of original value -> integer code.
# Sizes are ordinal, so the codes keep the order M < L < XL.
size_map = {'M':1, 'L':2, 'XL':3}
df['size2'] = df['size'].map(size_map)
df

Unnamed: 0,color,size,price,classlabel,size2
0,green,M,10.1,class1,1
1,red,L,13.5,class2,2
2,blue,XL,15.3,class1,3


In [45]:
# Colours are nominal: the integer codes are arbitrary labels and their
# numeric order carries no meaning.
color_map = {'green':2, 'red':3, 'blue':1}
df['color2'] = df['color'].map(color_map)
df

Unnamed: 0,color,size,price,classlabel,size2,color2
0,green,M,10.1,class1,1,2
1,red,L,13.5,class2,2,3
2,blue,XL,15.3,class1,3,1


In [46]:
# Encode the class label strings as integer codes.
label_map = {'class1':1, 'class2':2}
df['classlabel2'] = df['classlabel'].map(label_map)
df

Unnamed: 0,color,size,price,classlabel,size2,color2,classlabel2
0,green,M,10.1,class1,1,2,1
1,red,L,13.5,class2,2,3,2
2,blue,XL,15.3,class1,3,1,1


### onehot-encoding
* 하나의 True와 나머지가 False인 인코딩을 의미
* 컴퓨터는 문자보다 숫자를 더 잘 처리함
* 따라서, 단어/기호를 숫자형태로 바꾸는 기법 필요
* 범주형 데이터로 구성된 분류문제를 푸는 
  머신러닝/딥러닝에 주로 사용
* 예를 들어 과일이미지를 이용해서 과일이름을 판별하는 경우
  + 사과/복숭아/오렌지
  + 1   / 2   /  3   (레이블인코딩)
  + 100 / 010 / 001  (원핫인코딩)

In [47]:
# Rebuild the sample product frame from scratch for the one-hot encoding
# examples, passing the column names to the constructor.
df = pd.DataFrame(
    [
        ['green', 'M', 10.1, 'class1'],
        ['red', 'L', 13.5, 'class2'],
        ['blue', 'XL', 15.3, 'class1'],
    ],
    columns=['color', 'size', 'price', 'classlabel'],
)
df

Unnamed: 0,color,size,price,classlabel
0,green,M,10.1,class1
1,red,L,13.5,class2
2,blue,XL,15.3,class1


In [49]:
# One-hot encode colour: one 0/1 indicator column per distinct value.
df.color.str.get_dummies()

Unnamed: 0,blue,green,red
0,0,1,0
1,0,0,1
2,1,0,0


In [51]:
# Bracket access is used here because df.size is the DataFrame's
# element-count attribute, not the 'size' column.
df['size'].str.get_dummies()

Unnamed: 0,L,M,XL
0,0,1,0
1,1,0,0
2,0,0,1


In [53]:
# One-hot encode the class label into class1/class2 indicator columns.
df.classlabel.str.get_dummies()

Unnamed: 0,class1,class2
0,1,0
1,0,1
2,1,0


### 타이타닉 데이터에 대해 레이블 인코딩 처리

In [54]:
import seaborn as sns

In [55]:
# Load the 'titanic' example dataset through seaborn and preview the first rows.
titanic = sns.load_dataset('titanic')
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [77]:
# List the distinct values of 'class' (and their frequencies) before encoding.
titanic['class'].value_counts()

class
Third     491
First     216
Second    184
Name: count, dtype: int64

In [63]:
# Passenger class is ordinal, so the codes keep the order First < Second < Third.
class_map = {'First':1, 'Second':2, 'Third':3}
titanic['class2'] = titanic['class'].map(class_map)
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone,class2
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False,3
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False,1
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True,3
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False,1
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True,3


In [78]:
# List the distinct values of 'who' before encoding.
titanic['who'].value_counts()

who
man      537
woman    271
child     83
Name: count, dtype: int64

In [68]:
# 'who' is nominal: the integer codes are arbitrary labels with no implied order.
who_map = {'man':1, 'woman':2, 'child':3}
titanic['who2'] = titanic['who'].map(who_map)
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone,class2,who2
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False,3,1
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False,1,2
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True,3,2
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False,1,2
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True,3,1


In [79]:
# 'adult_male' is boolean; check how many rows are True vs False.
titanic['adult_male'].value_counts()

adult_male
True     537
False    354
Name: count, dtype: int64

In [80]:
# Encode the boolean flag as 1 (True) / 2 (False); the dict keys are the bool values themselves.
adult_male_map = {True:1, False:2}
titanic['adult_male2'] = titanic['adult_male'].map(adult_male_map)
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone,class2,who2,adult_male2
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False,3,1,1
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False,1,2,2
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True,3,2,2
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False,1,2,2
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True,3,1,1


In [84]:
# List the distinct embarkation towns; value_counts() leaves out missing values by default.
titanic['embark_town'].value_counts()

embark_town
Southampton    644
Cherbourg      168
Queenstown      77
Name: count, dtype: int64

In [85]:
# Encode the embarkation town as an integer code.
# The np.nan key is intended to assign code 4 to rows whose embark_town is missing.
et2_map = {'Cherbourg':1, 'Queenstown':2, 'Southampton':3, np.nan:4}
titanic['et2'] = titanic['embark_town'].map(et2_map)
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone,class2,who2,adult_male2,et2
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False,3,1,1,3
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False,1,2,2,1
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True,3,2,2,3
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False,1,2,2,3
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True,3,1,1,3


In [87]:
# Encode the 'alive' strings as 1 ('no') / 2 ('yes').
alive_map = {'no':1, 'yes':2}
titanic['alive2'] = titanic['alive'].map(alive_map)
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone,class2,who2,adult_male2,et2,alive2
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False,3,1,1,3,1
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False,1,2,2,1,2
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True,3,2,2,3,2
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False,1,2,2,3,2
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True,3,1,1,3,1


In [88]:
# Encode the boolean 'alone' flag as 1 (True) / 2 (False).
alone_map = {True:1, False:2}
titanic['alone2'] = titanic['alone'].map(alone_map)
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone,class2,who2,adult_male2,et2,alive2,alone2
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False,3,1,1,3,1,2
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False,1,2,2,1,2,2
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True,3,2,2,3,2,1
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False,1,2,2,3,2,2
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True,3,1,1,3,1,1
