In [26]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Demo frame: five rows of uniform random numbers in columns "A" and "B".
df = pd.DataFrame(
    data=np.random.rand(5, 2),
    columns=list("AB"),
)

df

Unnamed: 0,A,B
0,0.086796,0.649372
1,0.301754,0.957568
2,0.759831,0.909795
3,0.692002,0.515629
4,0.088481,0.043401


In [27]:
# Element-wise comparison: boolean Series marking the rows where "A" is below 0.5.
df["A"].lt(0.5)

0     True
1     True
2    False
3    False
4     True
Name: A, dtype: bool

In [28]:
# Combine both column conditions into a single row mask, then filter with it.
row_mask = df["A"].lt(0.5) & df["B"].gt(0.3)
df[row_mask]

Unnamed: 0,A,B
0,0.086796,0.649372
1,0.301754,0.957568


In [29]:
# Filter rows with a query expression string: "A" below 0.5 and "B" above 0.3.
df.query("A < 0.5 and B > 0.3")

Unnamed: 0,A,B
0,0.086796,0.649372
1,0.301754,0.957568


In [30]:
# Build each column condition separately, then combine them into the named mask.
a_is_small = df["A"] < 0.5
b_is_large = df["B"] > 0.3
condition = a_is_small & b_is_large
print(df[condition])

          A         B
0  0.086796  0.649372
1  0.301754  0.957568


In [33]:
# Searching with string conditions: sample data
data = dict(
    Animal=['Dog', 'Cat', 'Cat', 'Pig', 'Cat'],
    Name=['Happy', 'Sam', 'Tom', 'Mini', 'Rocky'],
)

pd.DataFrame(data)

Unnamed: 0,Animal,Name
0,Dog,Happy
1,Cat,Sam
2,Cat,Tom
3,Pig,Mini
4,Cat,Rocky


In [35]:
# Keep the sample data as a DataFrame and preview its first three rows.
df = pd.DataFrame(data=data)
df.head(n=3)

Unnamed: 0,Animal,Name
0,Dog,Happy
1,Cat,Sam
2,Cat,Tom


In [36]:
# Select the "Animal" column as a Series.
df.loc[:, 'Animal']

0    Dog
1    Cat
2    Cat
3    Pig
4    Cat
Name: Animal, dtype: object

In [37]:
df['Animal'].str.contains('Cat')
# str.contains(): checks, per element, whether the string contains the value passed as the argument (True/False)

0    False
1     True
2     True
3    False
4     True
Name: Animal, dtype: bool

In [38]:
# Same column, tested with str.match instead of str.contains.
# Per the pandas docs, str.match only matches at the start of each string.
df["Animal"].str.match("Cat")

0    False
1     True
2     True
3    False
4     True
Name: Animal, dtype: bool

In [39]:
# Use the boolean result of str.contains as a row filter.
is_cat = df["Animal"].str.contains("Cat")
df[is_cat]

Unnamed: 0,Animal,Name
1,Cat,Sam
2,Cat,Tom
4,Cat,Rocky


In [8]:
# Transforming a column with a function via Series.apply
df = pd.DataFrame(np.arange(5), columns=["Num"])


def square(x):
    """Return ``x`` multiplied by itself (``x ** 2``)."""
    return x ** 2


# Reuse the named helper for the new column. Previously the squares were
# computed once with `square`, discarded, and then recomputed with an
# equivalent anonymous lambda.
df["Square"] = df["Num"].apply(square)


In [9]:
# Display the frame, which now has the "Square" column next to "Num".
df

Unnamed: 0,Num,Square
0,0,0
1,1,1
2,2,4
3,3,9
4,4,16


In [10]:
# Sample phone numbers mixing ASCII digits, spelled-out Korean digits and
# "-" / "." separators. The frame is built in a single constructor call rather
# than by enlarging an empty frame one row at a time with df.loc[i] = ...
df = pd.DataFrame(
    {
        "phone": [
            "010-1234-1235",
            "공일공-일이삼사-1235",
            "010.1234.일이삼오",
            "공1공-1234.1이3오",
        ]
    }
)
# Placeholder column, filled in later by the normalization step.
df["preprocess_phone"] = ''

In [11]:
# Display the raw phone numbers next to the still-empty "preprocess_phone" column.
df

Unnamed: 0,phone,preprocess_phone
0,010-1234-1235,
1,공일공-일이삼사-1235,
2,010.1234.일이삼오,
3,공1공-1234.1이3오,


In [18]:
# Character translation table, built once instead of on every call:
# spelled-out Sino-Korean digits become ASCII digits, and the "-" / "."
# separators are removed.
_PHONE_TRANSLATION = str.maketrans({
    "공": "0",
    "일": "1",
    "이": "2",
    "삼": "3",
    "사": "4",
    "오": "5",
    "육": "6",
    "칠": "7",
    "팔": "8",
    "구": "9",
    "-": "",
    ".": "",
})


def get_preprocess_phone(phone):
    """Normalize a phone number string to ASCII digits only.

    Spelled-out Korean digits (공, 일, 이, 삼, 사, 오, 육, 칠, 팔, 구) are
    replaced by the corresponding ASCII digit, and "-" and "." separators are
    dropped. All other characters are kept as they are.

    Args:
        phone: Raw phone number as a ``str``.

    Returns:
        The normalized string, e.g. ``"공1공-1234.1이3오"`` -> ``"01012341235"``.
    """
    # Every key is a single character, so one translate() pass is equivalent
    # to chaining a str.replace() call per key.
    return phone.translate(_PHONE_TRANSLATION)
# Normalize every raw phone string and store the result next to the original.
normalized_phones = df["phone"].apply(get_preprocess_phone)
df["preprocess_phone"] = normalized_phones

df

Unnamed: 0,phone,preprocess_phone
0,010-1234-1235,1012341235
1,공일공-일이삼사-1235,1012341235
2,010.1234.일이삼오,1012341235
3,공1공-1234.1이3오,1012341235


In [14]:
# NOTE(review): this cell's execution count (In [14]) is lower than that of the
# normalization cell above it (In [18]), and its recorded output still shows an
# empty "preprocess_phone" column - it appears to have been run out of order.
# Re-run the notebook top to bottom to refresh the output.
df

Unnamed: 0,phone,preprocess_phone
0,010-1234-1235,
1,공일공-일이삼사-1235,
2,010.1234.일이삼오,
3,공1공-1234.1이3오,


In [40]:
# replace: sample labels keyed by their position written as a string
data = {
    '0': 'Male',
    '1': 'Male',
    '2': 'Male',
    '3': 'Female',
    '4': 'Female',
}
data

{'0': 'Male', '1': 'Male', '2': 'Male', '3': 'Female', '4': 'Female'}

In [41]:
# View of the dict's values only (the labels, without their keys).
data.values()

dict_values(['Male', 'Male', 'Male', 'Female', 'Female'])

In [43]:
# Collect the labels into a plain list, in the dict's own order.
data_list = [*data.values()]

In [44]:
# One-column frame holding the labels under the name "sex".
pd.DataFrame({'sex': data_list})

Unnamed: 0,sex
0,Male
1,Male
2,Male
3,Female
4,Female


In [45]:
# Rebuild the frame, then preview the label -> integer encoding.
# The result of replace() is only displayed here, not stored.
df = pd.DataFrame({'sex': data_list})

df['sex'].replace({'Male': 0, 'Female': 1})

0    0
1    0
2    0
3    1
4    1
Name: sex, dtype: int64

In [48]:
# Persist the encoding by assigning the replaced column back to the frame.
# Calling replace(..., inplace=True) on `df.sex` mutates an intermediate Series
# (chained assignment): recent pandas versions warn about it, and with
# Copy-on-Write enabled `df` itself is left unchanged.
df["sex"] = df["sex"].replace({"Male": 0, "Female": 1})
df

Unnamed: 0,sex
0,0
1,0
2,0
3,1
4,1


In [50]:
# Grouping rows
# Used when values should be aggregated conditionally (per group)
df = pd.DataFrame({
    'key': ['A', 'B', 'C', 'A', 'B', 'C'],
    'data': range(6),
})
# A bare `df.groupby('key')` statement was removed here: in the middle of a
# cell its result is neither stored nor displayed, so it had no effect.
df


Unnamed: 0,key,data
0,A,0
1,B,1
2,C,2
3,A,3
4,B,4
5,C,5


In [51]:
# Sum the remaining column within each "key" group.
grouped_by_key = df.groupby("key")
grouped_by_key.sum()


Unnamed: 0_level_0,data
key,Unnamed: 1_level_1
A,3
B,5
C,7


In [53]:
# Group on both columns at once. Every column is used as a grouping key here,
# so the displayed result carries only the (key, data) index levels.
df.groupby(by=["key", "data"]).sum()

key,data
A,0
A,3
B,1
B,4
C,2
C,5


In [54]:
# aggregate
# Used to compute several summary statistics in one go through groupby

data = dict(
    group=['A', 'B', 'A', 'B', 'A', 'B', 'A', 'B'],
    data1=[20, 30, 40, 20, 30, 40, 20, 50],
    data2=[1, 2, 3, 4, 5, 6, 7, 8],
)

pd.DataFrame(data)

Unnamed: 0,group,data1,data2
0,A,20,1
1,B,30,2
2,A,40,3
3,B,20,4
4,A,30,5
5,B,40,6
6,A,20,7
7,B,50,8


In [59]:
# Keep the sample data as a DataFrame and preview its first five rows.
df = pd.DataFrame(data=data)
df.head(n=5)

Unnamed: 0,group,data1,data2
0,A,20,1
1,B,30,2
2,A,40,3
3,B,20,4
4,A,30,5


In [62]:
# Apply the same three statistics to every non-key column of each group.
summary_stats = ['min', 'median', 'max']
df.groupby('group').agg(summary_stats)


Unnamed: 0_level_0,data1,data1,data1,data2,data2,data2
Unnamed: 0_level_1,min,median,max,min,median,max
group,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
A,20,25.0,40,1,4.0,7
B,20,35.0,50,2,5.0,8


In [64]:
# Choose a different aggregation per column: minimum of data1, sum of data2.
per_column_funcs = {'data1': 'min', 'data2': 'sum'}
df.groupby('group').agg(per_column_funcs)

Unnamed: 0_level_0,data1,data2
group,Unnamed: 1_level_1,Unnamed: 2_level_1
A,20,16
B,20,20
