## 함수 적용

### 데이터프레임 전체 동일 함수 적용

In [1]:
import pandas as pd

# Build a small 3x3 integer frame; each keyword argument becomes one column.
df = pd.DataFrame(dict(A=[1, 2, 3], B=[4, 5, 6], C=[7, 8, 9]))
df

Unnamed: 0,A,B,C
0,1,4,7
1,2,5,8
2,3,6,9


In [8]:
def sqrts(x):
    """Print ``x`` and a 20-character ``#`` separator, then return ``x`` squared.

    Despite the name, this computes a square (``x**2``), not a square root.
    The printed trace makes visible what is passed in on each call.
    """
    print(x, '#'*20, sep='\n')
    return pow(x, 2)



In [6]:
# Call the function directly on a scalar.
sqrts(7)

7


49

In [9]:
# DataFrame.apply calls sqrts once per column, passing each column as a Series
# (visible in the printed trace), and assembles the results into a new frame.
df.apply(sqrts)

0    1
1    2
2    3
Name: A, dtype: int64
####################
0    4
1    5
2    6
Name: B, dtype: int64
####################
0    7
1    8
2    9
Name: C, dtype: int64
####################


Unnamed: 0,A,B,C
0,1,16,49
1,4,25,64
2,9,36,81


### Series 단위로 함수 적용

In [10]:
import pandas as pd

# Numeric columns A-C plus a string column D holding fruit names.
df = pd.DataFrame(dict(
    A=[1, 2, 3],
    B=[4, 5, 6],
    C=[7, 8, 9],
    D=['사과', '배', '포도'],
))

df

Unnamed: 0,A,B,C,D
0,1,4,7,사과
1,2,5,8,배
2,3,6,9,포도


In [11]:
def square(x):
    """Return ``x`` raised to the second power."""
    return pow(x, 2)



In [13]:
# Series.apply calls square on each element of column A; the result is stored
# as a new column.
df['A**2'] = df['A'].apply(square)
df
# Assigning to 'A' instead of 'A**2' would update the existing column in place.

Unnamed: 0,A,B,C,D,A**2
0,1,4,7,사과,1
1,2,5,8,배,4
2,3,6,9,포도,9


In [14]:
# Same computation with an inline lambda instead of a named function.
df['A^2_lambda']= df['A'].apply(lambda x: x**2)
df

Unnamed: 0,A,B,C,D,A**2,A^2_lambda
0,1,4,7,사과,1,1
1,2,5,8,배,4,4
2,3,6,9,포도,9,9


In [18]:
def encoding(value):
    """Map a fruit name to its integer code, tracing each call.

    Codes: '사과' -> 0, '배' -> 1, '포도' -> 2. Any other value matches no
    entry and yields ``None``. The value and a separator line are printed
    first so that each individual call is visible in the output.
    """
    print(value, '#'*20, sep='\n')

    # Checked in order with ==, mirroring an if/elif chain.
    for label, code in (('사과', 0), ('배', 1), ('포도', 2)):
        if value == label:
            return code
    return None


# Encode column D element by element and store the codes in a new column.
df['D_encode'] = df['D'].apply(encoding)
df

사과
####################
배
####################
포도
####################


Unnamed: 0,A,B,C,D,A**2,A^2_lambda,사과,배,포도,D_encode
0,1,4,7,사과,1,1,0,1,2,0
1,2,5,8,배,4,4,0,1,2,1
2,3,6,9,포도,9,9,0,1,2,2


In [19]:
# Same encoding done with Series.map and a dict lookup instead of a function.
df["D_encode_map"] = df['D'].map({"사과":0,"배":1,"포도":2})
df

Unnamed: 0,A,B,C,D,A**2,A^2_lambda,사과,배,포도,D_encode,D_encode_map
0,1,4,7,사과,1,1,0,1,2,0,0
1,2,5,8,배,4,4,0,1,2,1,1
2,3,6,9,포도,9,9,0,1,2,2,2


### 예: 한글 전처리 적용
1. 한글 음절(가-힣)이 아닌 문자(문장부호, 숫자, 영문자, 낱자 자모 등)를 공백으로 바꿉니다.

In [21]:
import pandas as pd

# Sample Korean review sentences that also contain digits, Latin letters,
# punctuation and standalone jamo (ㅋㅋㅋ, ㅠㅠ).
review = pd.Series(['왜 3D로 나와서 제 심기를 불편하게 하죠??', 
                           '오늘 날씨가 좋네요.ㅋㅋㅋ', 
                           '한글 전처리가 어렵네요ㅠㅠ', 
                           '자연어처리 공부를 열심히 해봅시다!', 
                           '데이터분석을 즐겨봅시다.'])

review

0    왜 3D로 나와서 제 심기를 불편하게 하죠??
1               오늘 날씨가 좋네요.ㅋㅋㅋ
2               한글 전처리가 어렵네요ㅠㅠ
3          자연어처리 공부를 열심히 해봅시다!
4                데이터분석을 즐겨봅시다.
dtype: object

In [28]:
import re

# Preprocessing helper for Korean sentences (regular-expression based).

def preprocess(text):
    """Keep only Hangul syllables in ``text``.

    Every run of characters outside the range 가-힣 (digits, Latin letters,
    standalone jamo, punctuation, whitespace) is replaced by a single space.
    Leading or trailing runs therefore leave a space at that end.
    """
    return re.compile('[^가-힣]+').sub(' ', text)

# Use apply to run preprocess on every element of the review Series.
kor_preprocessed = review.apply(preprocess)
# Print the result.
print(kor_preprocessed)


0    왜 로 나와서 제 심기를 불편하게 하죠 
1               오늘 날씨가 좋네요 
2             한글 전처리가 어렵네요 
3       자연어처리 공부를 열심히 해봅시다 
4             데이터분석을 즐겨봅시다 
dtype: object


## 나이를 구간으로 표현하기

In [29]:
import seaborn as sns
# Load seaborn's "titanic" example dataset and preview the first rows.
df = sns.load_dataset("titanic") 
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [30]:
import math
# (age//10) * 10

def age_cat(age):
    """Round ``age`` down to the start of its ten-year bin (e.g. 24.0 -> 20.0)."""
    decade_index = age // 10
    return decade_index * 10

In [31]:
# Spot-check a few ages; an int input gives an int, float inputs give floats.
age_cat(7),age_cat(24.0),age_cat(36.0),age_cat(40.0)

(0, 20.0, 30.0, 40.0)

In [32]:
# Bin every passenger's age with age_cat and store it as a new column.
df['age_cat'] = df['age'].apply(age_cat)
df.head(5)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone,age_cat
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False,20.0
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False,30.0
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True,20.0
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False,30.0
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True,30.0


### 남자:0, 여자:1로 표현하기

* apply()함수 사용
* map()함수 사용

In [33]:
import seaborn as sns
# Reload the "titanic" dataset, replacing the previous df, and preview it.
df = sns.load_dataset("titanic") 
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [34]:
# Encode sex with a dict lookup via Series.map: male -> 0, female -> 1.
df["sex_label"] = df['sex'].map({"male":0,"female":1})
# Show the encoded column next to the original for comparison.
df[['sex_label','sex']].head()

Unnamed: 0,sex_label,sex
0,0,male
1,1,female
2,1,female
3,1,female
4,0,male


### 전화번호 전처리

In [35]:
import pandas as pd

# Sample phone numbers with inconsistent formatting: hyphens, spaces, slashes,
# stray trailing characters, or no separators at all.
phone_numbers = pd.Series(['010-5432-9876',
'01088881111',
'010-33337777',
'010-2468-1357.',
'010 9876 5432-',
'010-7777-5555',
'01011112222',
'010-4444-8888',
'010-1234-5678',
'010/8765/4321'])
phone_numbers

0     010-5432-9876
1       01088881111
2      010-33337777
3    010-2468-1357.
4    010 9876 5432-
5     010-7777-5555
6       01011112222
7     010-4444-8888
8     010-1234-5678
9     010/8765/4321
dtype: object

In [36]:
import re
# Alternative input to try: re.sub(r'[^\d]', '', '010 9876 5432-')
# Delete every character that is not a digit.
re.sub(r'[^\d]', '', '010/8765/4321')

'01087654321'

In [40]:
# Normalise a phone number string to a hyphen-separated layout.
def preprocess_phone_number(phone_number):
    """Strip every non-digit character and re-insert hyphens.

    The remaining digits are split into the first three, the next four and
    the rest, then joined with '-'. No length validation is performed, so
    an input with too few digits yields empty segments.
    """
    digits_only = re.sub(r'[^\d]', '', phone_number)
    head, middle, tail = digits_only[:3], digits_only[3:7], digits_only[7:]
    return f'{head}-{middle}-{tail}'
        

# Use apply to run preprocess_phone_number on every element of phone_numbers.
form_phone_number = phone_numbers.apply(preprocess_phone_number)
print(form_phone_number)

0    010-5432-9876
1    010-8888-1111
2    010-3333-7777
3    010-2468-1357
4    010-9876-5432
5    010-7777-5555
6    010-1111-2222
7    010-4444-8888
8    010-1234-5678
9    010-8765-4321
dtype: object
