In [323]:
import pandas as pd
import numpy as np
data = {
  'gender' : ['F', 'M', 'F', 'M'],
  'country' : ['KOREA', 'CHINA', 'JAPAN', 'USA']
}
df = pd.DataFrame(data)
df

Unnamed: 0,gender,country
0,F,KOREA
1,M,CHINA
2,F,JAPAN
3,M,USA


In [324]:
# 성별이 F인 데이터만 추출
df.query("gender == 'F'")

Unnamed: 0,gender,country
0,F,KOREA
2,F,JAPAN


In [325]:
# 성별이 M이고 USA인 데이터 추출
df.query("gender == 'M' & country == 'USA'")

Unnamed: 0,gender,country
3,M,USA


In [326]:
# '성별'이라는 파생변수를 추가
df['성별'] = np.where(df['gender'] == 'F', '여', '남')
df

Unnamed: 0,gender,country,성별
0,F,KOREA,여
1,M,CHINA,남
2,F,JAPAN,여
3,M,USA,남


In [327]:
# country의 이름을 국가로 변경
df = df.rename(columns={'country' : '국가'})

In [328]:
# 국가와 성별만 추출해서 표시
df[['국가', '성별']]
# df.loc[:, '국가', '성별'] #이름을 가지고 추출
df.iloc[: , 1:] # 인덱스 번호 가지고 추출


Unnamed: 0,국가,성별
0,KOREA,여
1,CHINA,남
2,JAPAN,여
3,USA,남


In [329]:
# gender를 삭제
df.drop(columns='gender')
# 원본에 반영하려면 대입연산자 혹은 inplace속성 사용을 해야 반영됨 

Unnamed: 0,국가,성별
0,KOREA,여
1,CHINA,남
2,JAPAN,여
3,USA,남


In [330]:
# mpg.csv을 데이터프레임으로 가져와서
# category와 cty만 추출해서 새 데이터프레임을 생성

# 새로 생성된 데이터프레임을 이용해서
# category가 suv인 cty의 평균과 category가 compact인 cty의 평균 
# 어떤게 더 높은지 구해보십쇼

mpg = pd.read_csv('data/mpg.csv')
new_mpg = mpg[['category', 'cty']]
new_mpg.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 234 entries, 0 to 233
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   category  234 non-null    object
 1   cty       234 non-null    int64 
dtypes: int64(1), object(1)
memory usage: 3.8+ KB


In [331]:
suv_mean = new_mpg.query("category == 'suv'")['cty'].mean()
compact_mean = new_mpg.query("category == 'compact'")['cty'].mean()
print(suv_mean, compact_mean)

if suv_mean > compact_mean:
  print("suv가 도시 평균 연비가 더 좋습니다.")
else :
  compact_mean > suv_mean
  print("compact가 도시 연비가 더 좋습니다.")


13.5 20.127659574468087
compact가 도시 연비가 더 좋습니다.


In [332]:
# 정렬
exam = pd.read_csv('data/exam.csv')
exam.head(2)

Unnamed: 0,id,nclass,math,english,science
0,1,1,50,98,50
1,2,1,60,97,60


In [333]:
# 수학 기준으로 오름차순 정렬
exam.sort_values('math') # sort_values로 정렬을 할 수 있고 ()안에 기준점을 정할 수 있음

Unnamed: 0,id,nclass,math,english,science
8,9,3,20,98,15
4,5,2,25,80,65
3,4,1,30,98,58
2,3,1,45,86,78
11,12,3,45,85,32
12,13,4,46,98,65
13,14,4,48,87,12
0,1,1,50,98,50
9,10,3,50,98,45
5,6,2,50,89,98


In [334]:
# 내림차순 
exam.sort_values('math', ascending=False)

Unnamed: 0,id,nclass,math,english,science
7,8,2,90,78,25
18,19,5,89,68,87
6,7,2,80,90,45
17,18,5,80,78,90
19,20,5,78,83,58
14,15,4,75,56,78
10,11,3,65,65,65
16,17,5,65,68,98
1,2,1,60,97,60
15,16,4,58,98,65


In [335]:
# 정렬 기준이 2개 이상
# 1차로 nclass 오름차순 후 2차로 math 오름차순 정렬
exam.sort_values(['nclass', 'math'])

Unnamed: 0,id,nclass,math,english,science
3,4,1,30,98,58
2,3,1,45,86,78
0,1,1,50,98,50
1,2,1,60,97,60
4,5,2,25,80,65
5,6,2,50,89,98
6,7,2,80,90,45
7,8,2,90,78,25
8,9,3,20,98,15
11,12,3,45,85,32


In [336]:
# nclass 오름차순, math 내림차순 정렬
exam.sort_values(['nclass', 'math'], ascending=[True, False])


Unnamed: 0,id,nclass,math,english,science
1,2,1,60,97,60
0,1,1,50,98,50
2,3,1,45,86,78
3,4,1,30,98,58
7,8,2,90,78,25
6,7,2,80,90,45
5,6,2,50,89,98
4,5,2,25,80,65
10,11,3,65,65,65
9,10,3,50,98,45


In [337]:

exam.sort_values(['nclass', 'math'], ascending=[True, False]).reset_index(drop=True)

Unnamed: 0,id,nclass,math,english,science
0,2,1,60,97,60
1,1,1,50,98,50
2,3,1,45,86,78
3,4,1,30,98,58
4,8,2,90,78,25
5,7,2,80,90,45
6,6,2,50,89,98
7,5,2,25,80,65
8,11,3,65,65,65
9,10,3,50,98,45


In [338]:
# mpg데이터에서 제조사가 audi인 자동차들 중 hwy(고속도로 연비)가 1~5등인 데이터만 추출
hwy = mpg.query("manufacturer == 'audi'")
hwy.sort_values(['hwy'], ascending=False).head()

Unnamed: 0,manufacturer,model,displ,year,cyl,trans,drv,cty,hwy,fl,category
2,audi,a4,2.0,2008,4,manual(m6),f,20,31,p,compact
3,audi,a4,2.0,2008,4,auto(av),f,21,30,p,compact
1,audi,a4,1.8,1999,4,manual(m5),f,21,29,p,compact
0,audi,a4,1.8,1999,4,auto(l5),f,18,29,p,compact
9,audi,a4 quattro,2.0,2008,4,manual(m6),4,20,28,p,compact


In [339]:
# 파생변수 만들기
# df['추가할파생변수이름'] = 계산식
exam = pd.read_csv('data/exam.csv')
exam.head(2)

Unnamed: 0,id,nclass,math,english,science
0,1,1,50,98,50
1,2,1,60,97,60


In [340]:
# 파생변수 추가해주는 함수
# .assign( 추가할변수명 = 계산식 )
exam.assign(total = exam['math'] + exam['english'] + exam['science']) 

Unnamed: 0,id,nclass,math,english,science,total
0,1,1,50,98,50,198
1,2,1,60,97,60,217
2,3,1,45,86,78,209
3,4,1,30,98,58,186
4,5,2,25,80,65,170
5,6,2,50,89,98,237
6,7,2,80,90,45,215
7,8,2,90,78,25,193
8,9,3,20,98,15,133
9,10,3,50,98,45,193


In [341]:
exam.assign(
  total = exam['math'] + exam['english'] + exam['science'],
  mean = (exam['math'] + exam['english'] + exam['science']) /3
  ) 

Unnamed: 0,id,nclass,math,english,science,total,mean
0,1,1,50,98,50,198,66.0
1,2,1,60,97,60,217,72.333333
2,3,1,45,86,78,209,69.666667
3,4,1,30,98,58,186,62.0
4,5,2,25,80,65,170,56.666667
5,6,2,50,89,98,237,79.0
6,7,2,80,90,45,215,71.666667
7,8,2,90,78,25,193,64.333333
8,9,3,20,98,15,133,44.333333
9,10,3,50,98,45,193,64.333333


In [342]:
# 함수/메서드 체이닝
exam.assign(test = np.where(exam['science'] >= 60, 'pass', 'fail'))\
  .sort_values('math').tail(3)

Unnamed: 0,id,nclass,math,english,science,test
6,7,2,80,90,45,fail
18,19,5,89,68,87,pass
7,8,2,90,78,25,fail


In [343]:
# mpg 데이터 이용하기
mpg = pd.read_csv('data/mpg.csv')
m1 = mpg.copy()
# 1. cty와 hwy를 더한 합산연비(total)를 추가 
# m1['total'] = m1['cty'] + m1['hwy']
m1 = m1.assign(total = m1['cty'] + m1['hwy'])



In [344]:
# 2. 합산연비를 이용해서 평균연비(mean)를 추가
# m1['mean'] = m1['total'] / 2
m1 = m1.assign(mean = m1['total'] / 2)
m1.head(2)

Unnamed: 0,manufacturer,model,displ,year,cyl,trans,drv,cty,hwy,fl,category,total,mean
0,audi,a4,1.8,1999,4,auto(l5),f,18,29,p,compact,47,23.5
1,audi,a4,1.8,1999,4,manual(m5),f,21,29,p,compact,50,25.0


In [345]:
# 3. 평균 연비가 높은 자동차 3개 출력
m1 = m1.sort_values('mean', ascending=False)
m1.head(3)

Unnamed: 0,manufacturer,model,displ,year,cyl,trans,drv,cty,hwy,fl,category,total,mean
221,volkswagen,new beetle,1.9,1999,4,manual(m5),f,35,44,d,subcompact,79,39.5
212,volkswagen,jetta,1.9,1999,4,manual(m5),f,33,44,d,compact,77,38.5
222,volkswagen,new beetle,1.9,1999,4,auto(l4),f,29,41,d,subcompact,70,35.0


In [346]:
# 4. 1~3번까지 작업한걸 체이닝으로 하나의 구문으로 해보기
# 그냥 평균연비가 높은 자동차 3개만 보고싶음
mpg.assign(mean = mpg['cty'] + mpg['hwy'] / 2).sort_values('mean', ascending=False).head(3)

Unnamed: 0,manufacturer,model,displ,year,cyl,trans,drv,cty,hwy,fl,category,mean
221,volkswagen,new beetle,1.9,1999,4,manual(m5),f,35,44,d,subcompact,57.0
212,volkswagen,jetta,1.9,1999,4,manual(m5),f,33,44,d,compact,55.0
222,volkswagen,new beetle,1.9,1999,4,auto(l4),f,29,41,d,subcompact,49.5


In [347]:
# 그룹별 통계
exam = pd.read_csv('data/exam.csv')
exam.head(3)

Unnamed: 0,id,nclass,math,english,science
0,1,1,50,98,50
1,2,1,60,97,60
2,3,1,45,86,78


In [348]:
exam['math'].mean()

np.float64(57.45)

In [349]:
exam.groupby('nclass')['math'].mean()

nclass
1    46.25
2    61.25
3    45.00
4    56.75
5    78.00
Name: math, dtype: float64

In [350]:
exam.groupby('nclass').agg(mean_math = ('math', 'mean'))

Unnamed: 0_level_0,mean_math
nclass,Unnamed: 1_level_1
1,46.25
2,61.25
3,45.0
4,56.75
5,78.0


In [351]:
exam = exam.groupby('nclass')[['math', 'english']].mean()

In [352]:
exam.groupby('nclass').agg(
  mean_math = ('math', 'mean'),
  sum_math = ('math', 'sum'),
  median_math = ('math', 'median'),
  count = ('math', 'count')
)

Unnamed: 0_level_0,mean_math,sum_math,median_math,count
nclass,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,46.25,46.25,46.25,1
2,61.25,61.25,61.25,1
3,45.0,45.0,45.0,1
4,56.75,56.75,56.75,1
5,78.0,78.0,78.0,1


In [353]:
mpg.groupby(['manufacturer', 'drv']).agg(mean_cty = ('cty', 'mean'))

Unnamed: 0_level_0,Unnamed: 1_level_0,mean_cty
manufacturer,drv,Unnamed: 2_level_1
audi,4,16.818182
audi,f,18.857143
chevrolet,4,12.5
chevrolet,f,18.8
chevrolet,r,14.1
dodge,4,12.0
dodge,f,15.818182
ford,4,13.307692
ford,r,14.75
honda,f,24.444444


In [354]:
# 제조사가 chevrolet인 데이터 중 drv별 cty의평균과 개수
mpg.query("manufacturer == 'chevrolet'").groupby('drv').agg(drv_mean = ('cty', 'mean'), count = ('cty', 'count'))

Unnamed: 0_level_0,drv_mean,count
drv,Unnamed: 1_level_1,Unnamed: 2_level_1
4,12.5,4
f,18.8,5
r,14.1,10


In [355]:
# mpg데이터를 이용해서 작업
mpg = pd.read_csv('data/mpg.csv')
mpg.head(2)

Unnamed: 0,manufacturer,model,displ,year,cyl,trans,drv,cty,hwy,fl,category
0,audi,a4,1.8,1999,4,auto(l5),f,18,29,p,compact
1,audi,a4,1.8,1999,4,manual(m5),f,21,29,p,compact


In [371]:
# category별 cty의 평균 구하기
mpg.groupby('category')['cty'].mean()


category
2seater       15.400000
compact       20.127660
midsize       18.756098
minivan       15.818182
pickup        13.000000
subcompact    20.371429
suv           13.500000
Name: cty, dtype: float64