In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
from pandas.testing import assert_frame_equal
import matplotlib.pyplot as plt
import seaborn as sns
p1 = Path.cwd() / 'back_data'

In [2]:
college = pd.read_csv(p1 / 'college.csv')
columns = college.columns

In [5]:
# index 형식(columns 포함)은 values 속성을 사용해 기저 numpy 배열에 접근 가능
# 스칼라, 리스트, 슬라이스로 아이템 선택 가능
# 기본 산술과 비교 연산자 적용 가능
# index 형식은 immutable -> 값을 변경하려면 리스트로 전환해야
# index 형식은 set와 유사(단 중복값 지원) -> 합집합, 교집합, 차집합, 대칭차집합 지원
columns + '_A'

Index(['INSTNM_A', 'CITY_A', 'STABBR_A', 'HBCU_A', 'MENONLY_A', 'WOMENONLY_A',
       'RELAFFIL_A', 'SATVRMID_A', 'SATMTMID_A', 'DISTANCEONLY_A', 'UGDS_A',
       'UGDS_WHITE_A', 'UGDS_BLACK_A', 'UGDS_HISP_A', 'UGDS_ASIAN_A',
       'UGDS_AIAN_A', 'UGDS_NHPI_A', 'UGDS_2MOR_A', 'UGDS_NRA_A',
       'UGDS_UNKN_A', 'PPTUG_EF_A', 'CURROPER_A', 'PCTPELL_A', 'PCTFLOAN_A',
       'UG25ABV_A', 'MD_EARN_WNE_P10_A', 'GRAD_DEBT_MDN_SUPP_A'],
      dtype='object')

In [6]:
# index 정렬 과정에서 양 쪽 index가 동일하지 않을 경우 카티션 곱 생성
# index가 고유하거나 둘 다 정확히 동일한 요소 및 순서를 가질 경우 카디션 곱 미생성
# index가 고유하지 않을 경우 요소는 동일하나 순서가 다르면 카티션 곱 생성
# 여러 열로 groupby 작업을 하거나 열 중 하나가 categorical 형식인 경우 카티션 곱 생성 -> observed=True 인자 활용 필요

In [7]:
# 리스트 언패킹과 list comprehension을 활용하여 3개 baseball 데이터셋 호출
bball14, bball15, bball16 = [pd.read_csv(p1 / f'baseball{year}.csv', index_col='playerID') for year in [14, 15, 16]]

In [9]:
# index 형식의 difference() 메서드 활용 -> 차집합의 개념
bball14.index.difference(bball15)

Index(['altuvjo01', 'cartech02', 'castrja01', 'corpoca01', 'dominma01',
       'fowlede01', 'gonzama01', 'grossro01', 'guzmaje01', 'hoeslj01',
       'krausma01', 'marisja01', 'preslal01', 'singljo02', 'springe01',
       'villajo01'],
      dtype='object', name='playerID')

In [12]:
hits14 = bball14['H']
hits15 = bball15['H']
hits16 = bball16['H']
# add() 메서드 활용하여 더하면서 fill_value=0 인자 전달하여 결측치 고려
hits_total = hits14.add(hits15, fill_value=0).add(hits16, fill_value=0)
hits_total.head(), hits_total.hasnans

(playerID
 altuvjo01    641.0
 bregmal01     53.0
 cartech02    193.0
 castrja01    243.0
 congeha01     46.0
 Name: H, dtype: float64,
 False)

In [13]:
# DataFrame이 다른 DataFrame이나 Series에서 새 열을 추가할 때 index가 먼저 정렬된 다음 새 열 생성

In [15]:
employee = pd.read_csv(p1 / 'employee.csv')
dept_sal = employee[['DEPARTMENT', 'BASE_SALARY']]
# sort_values() 메서드 활용하여 정렬
dept_sal = dept_sal.sort_values(by=['DEPARTMENT', 'BASE_SALARY'], ascending=[True, False])

In [20]:
# DEPARTMENT의 중복값을 제거하여 가장 큰 BASE_SALARY만 남기기 -> DEPARTMENT를 index로 설정
max_dept_sal = dept_sal.drop_duplicates(subset='DEPARTMENT').set_index('DEPARTMENT')
max_dept_sal.head()

Unnamed: 0_level_0,BASE_SALARY
DEPARTMENT,Unnamed: 1_level_1
Admn. & Regulatory Affairs,140416.0
City Controller's Office,64251.0
City Council,100000.0
Convention and Entertainment,38397.0
Dept of Neighborhoods (DON),89221.0


In [21]:
# index가 자동 정렬되므로, max_dept_sal을 기존 DataFrame에 추가하면 잘 삽입
# but DEPARTMENT에 중복된 부서가 있었다면 카티션 곱이 생성되었을 것
(employee
.set_index('DEPARTMENT')
.assign(MAX_DEPT_SALARY=max_dept_sal['BASE_SALARY'])
.head()
)

Unnamed: 0_level_0,UNIQUE_ID,POSITION_TITLE,BASE_SALARY,RACE,EMPLOYMENT_TYPE,GENDER,EMPLOYMENT_STATUS,HIRE_DATE,JOB_DATE,MAX_DEPT_SALARY
DEPARTMENT,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Municipal Courts Department,0,ASSISTANT DIRECTOR (EX LVL),121862.0,Hispanic/Latino,Full Time,Female,Active,2006-06-12,2012-10-13,121862.0
Library,1,LIBRARY ASSISTANT,26125.0,Hispanic/Latino,Full Time,Female,Active,2000-07-19,2010-09-18,107763.0
Houston Police Department-HPD,2,POLICE OFFICER,45279.0,White,Full Time,Male,Active,2015-02-03,2015-02-03,199596.0
Houston Fire Department (HFD),3,ENGINEER/OPERATOR,63166.0,White,Full Time,Male,Active,1982-02-08,1991-05-25,210588.0
General Services Department,4,ELECTRICIAN,56347.0,White,Full Time,Male,Active,1989-06-19,1994-10-22,89194.0


In [28]:
# groupby의 transform() 메서드 활용하여 복제 가능
max_sal = (employee
.groupby('DEPARTMENT')['BASE_SALARY']
# transform('max')를 통해 기존 인덱스를 유지하며 BASE_SALARY의 최대값 추출
.transform('max')
)
max_sal

0       121862.0
1       107763.0
2       199596.0
3       210588.0
4        89194.0
          ...   
1995    199596.0
1996    210588.0
1997    199596.0
1998    199596.0
1999    210588.0
Name: BASE_SALARY, Length: 2000, dtype: float64

In [30]:
(employee
# rename() 메서드를 통해 max_sal Series의 이름 지정. 인덱스를 기준으로 합치므로 left_index, right_index에 True 지정
.merge(max_sal.rename('MAX_DEPT_SALARY'), how='inner', left_index=True, right_index=True)
.head()
)

Unnamed: 0,UNIQUE_ID,POSITION_TITLE,DEPARTMENT,BASE_SALARY,RACE,EMPLOYMENT_TYPE,GENDER,EMPLOYMENT_STATUS,HIRE_DATE,JOB_DATE,MAX_DEPT_SALARY
0,0,ASSISTANT DIRECTOR (EX LVL),Municipal Courts Department,121862.0,Hispanic/Latino,Full Time,Female,Active,2006-06-12,2012-10-13,121862.0
1,1,LIBRARY ASSISTANT,Library,26125.0,Hispanic/Latino,Full Time,Female,Active,2000-07-19,2010-09-18,107763.0
2,2,POLICE OFFICER,Houston Police Department-HPD,45279.0,White,Full Time,Male,Active,2015-02-03,2015-02-03,199596.0
3,3,ENGINEER/OPERATOR,Houston Fire Department (HFD),63166.0,White,Full Time,Male,Active,1982-02-08,1991-05-25,210588.0
4,4,ELECTRICIAN,General Services Department,56347.0,White,Full Time,Male,Active,1989-06-19,1994-10-22,89194.0


In [31]:
college = pd.read_csv(p1 / 'college.csv', index_col='INSTNM')

In [34]:
# 'MD_EARN_WNE_P10', 'GRAD_DEBT_MDN_SUPP' 열의 경우 수치가 되면 좋지만 object 형식이므로 바꿀 필요
# value_counts() 메서드 활용하여 빈도 확인하면서 문제점 파악 -> PrivacySuppressed가 다수
cols = ['MD_EARN_WNE_P10', 'GRAD_DEBT_MDN_SUPP']
[college[col].value_counts() for col in cols]

[PrivacySuppressed    822
 38800                151
 21500                 97
 49200                 78
 27400                 46
                     ... 
 84000                  1
 66900                  1
 52800                  1
 67800                  1
 186500                 1
 Name: MD_EARN_WNE_P10, Length: 598, dtype: int64,
 PrivacySuppressed    1510
 9500                  514
 27000                 306
 25827.5               136
 25000                 124
                      ... 
 9604                    1
 19262                   1
 8099                    1
 8050                    1
 11061                   1
 Name: GRAD_DEBT_MDN_SUPP, Length: 2038, dtype: int64]

In [36]:
# pd.to_numeric() 함수 사용 가능 -> replace()로 바꾸고 형변환 하는 것 보다 이게 더 효율적
for col in cols:
    # errors 파라미터 활용 -> 'coerce', 'ignore', 'raise' 활용 가능
    # 'coerce'의 경우 숫자가 아닌 값을 NaN으로 변경
    college[col] = pd.to_numeric(college[col], errors='coerce')
# dtypes 속성을 통해 변화 확인
college.loc[:, cols].dtypes

MD_EARN_WNE_P10       float64
GRAD_DEBT_MDN_SUPP    float64
dtype: object

In [37]:
college_n = college.select_dtypes('number')
college_n.head()

Unnamed: 0_level_0,HBCU,MENONLY,WOMENONLY,RELAFFIL,SATVRMID,SATMTMID,DISTANCEONLY,UGDS,UGDS_WHITE,UGDS_BLACK,...,UGDS_2MOR,UGDS_NRA,UGDS_UNKN,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Alabama A & M University,1.0,0.0,0.0,0,424.0,420.0,0.0,4206.0,0.0333,0.9353,...,0.0,0.0059,0.0138,0.0656,1,0.7356,0.8284,0.1049,30300.0,33888.0
University of Alabama at Birmingham,0.0,0.0,0.0,0,570.0,565.0,0.0,11383.0,0.5922,0.26,...,0.0368,0.0179,0.01,0.2607,1,0.346,0.5214,0.2422,39700.0,21941.5
Amridge University,0.0,0.0,0.0,1,,,1.0,291.0,0.299,0.4192,...,0.0,0.0,0.2715,0.4536,1,0.6801,0.7795,0.854,40100.0,23370.0
University of Alabama in Huntsville,0.0,0.0,0.0,0,595.0,590.0,0.0,5451.0,0.6988,0.1255,...,0.0172,0.0332,0.035,0.2146,1,0.3072,0.4596,0.264,45500.0,24097.0
Alabama State University,1.0,0.0,0.0,0,425.0,430.0,0.0,4811.0,0.0158,0.9208,...,0.0098,0.0243,0.0137,0.0892,1,0.7347,0.7554,0.127,26600.0,33118.5


In [42]:
# 일부 열들은 오직 이진 값(0 또는 1)만 가지므로, 해당 열들은 제외
# nunique() 메서드 활용하여 고유값이 2개인 열 찾기
binary_only = college_n.nunique().eq(2)
binary_cols = binary_only[binary_only].index
# drop() 메서드 활용하여 고유값이 2개인 열 제외
college_n2 = college_n.drop(columns=binary_cols)
college_n2.head()

Unnamed: 0_level_0,SATVRMID,SATMTMID,UGDS,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,UGDS_AIAN,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN,PPTUG_EF,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
Alabama A & M University,424.0,420.0,4206.0,0.0333,0.9353,0.0055,0.0019,0.0024,0.0019,0.0,0.0059,0.0138,0.0656,0.7356,0.8284,0.1049,30300.0,33888.0
University of Alabama at Birmingham,570.0,565.0,11383.0,0.5922,0.26,0.0283,0.0518,0.0022,0.0007,0.0368,0.0179,0.01,0.2607,0.346,0.5214,0.2422,39700.0,21941.5
Amridge University,,,291.0,0.299,0.4192,0.0069,0.0034,0.0,0.0,0.0,0.0,0.2715,0.4536,0.6801,0.7795,0.854,40100.0,23370.0
University of Alabama in Huntsville,595.0,590.0,5451.0,0.6988,0.1255,0.0382,0.0376,0.0143,0.0002,0.0172,0.0332,0.035,0.2146,0.3072,0.4596,0.264,45500.0,24097.0
Alabama State University,425.0,430.0,4811.0,0.0158,0.9208,0.0121,0.0019,0.001,0.0006,0.0098,0.0243,0.0137,0.0892,0.7347,0.7554,0.127,26600.0,33118.5


In [48]:
# idxmax() 메서드 활용하여 각 열에서 최대값을 갖는 index 출력
max_cols = college_n2.idxmax()
# 불리언 배열을 통해 해당 학교들만 출력
college_n2.loc[max_cols.unique()].head()

Unnamed: 0_level_0,SATVRMID,SATMTMID,UGDS,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,UGDS_AIAN,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN,PPTUG_EF,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
California Institute of Technology,765.0,785.0,983.0,0.2787,0.0153,0.1221,0.4385,0.001,0.0,0.057,0.0875,0.0,0.0,0.1126,0.2303,0.0082,77800.0,11812.5
University of Phoenix-Arizona,,,151558.0,0.3098,0.1555,0.076,0.0082,0.0042,0.005,0.1131,0.0131,0.3152,0.0,0.6009,0.592,,,33000.0
Mr Leon's School of Hair Design-Moscow,,,16.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.625,0.625,0.2,,15710.0
Velvatex College of Beauty Culture,,,25.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.7692,0.0,0.52,,
Thunderbird School of Global Management,,,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,118900.0,


In [50]:
# 함수 생성을 활용하여 코드 재정렬
def remove_binary_cols(df):
    binary_only = df.nunique() == 2
    cols = binary_only[binary_only].index.tolist()
    return df.drop(columns=cols)

def select_rows_with_max_cols(df):
    max_cols = df.idxmax()
    unique = max_cols.unique()
    return df.loc[unique]

(college
   .assign(
       MD_EARN_WNE_P10=pd.to_numeric(college.MD_EARN_WNE_P10, errors='coerce'),
       GRAD_DEBT_MDN_SUPP=pd.to_numeric(college.GRAD_DEBT_MDN_SUPP, errors='coerce'))
   .select_dtypes('number')
   .pipe(remove_binary_cols)
   .pipe(select_rows_with_max_cols)
   .head()
)

Unnamed: 0_level_0,SATVRMID,SATMTMID,UGDS,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,UGDS_AIAN,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN,PPTUG_EF,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
California Institute of Technology,765.0,785.0,983.0,0.2787,0.0153,0.1221,0.4385,0.001,0.0,0.057,0.0875,0.0,0.0,0.1126,0.2303,0.0082,77800.0,11812.5
University of Phoenix-Arizona,,,151558.0,0.3098,0.1555,0.076,0.0082,0.0042,0.005,0.1131,0.0131,0.3152,0.0,0.6009,0.592,,,33000.0
Mr Leon's School of Hair Design-Moscow,,,16.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.625,0.625,0.2,,15710.0
Velvatex College of Beauty Culture,,,25.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.7692,0.0,0.52,,
Thunderbird School of Global Management,,,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,118900.0,


In [59]:
# 최대값이 공통일 경우 idxmax()는 가장 먼저 나온 최대값을 출력 -> 모든 공통 최대값을 출력하려면? 여러 메서드 체인으로 결합
# max() 메서드를 통해 각 열의 최대값 확인
college_n2.max()
# 해당 최대값이 기존 DataFrame과 일치하는지를 True / False로 출력 -> any(axis=1) 결합하여 하나라도 True이면 포함
college_n2_max = (college_n2
.eq(college_n2.max())
.any(axis=1)
)
college_n2_max.head()

INSTNM
Alabama A & M University               False
University of Alabama at Birmingham    False
Amridge University                     False
University of Alabama in Huntsville    False
Alabama State University               False
dtype: bool

In [62]:
# style.hightlight_max()로 시각적 확인
# 최대값이 0.0인 경우 모두 포함되는 문제 -> 추후 이걸 제외해야
college_n2[college_n2_max].head().style.highlight_max()

Unnamed: 0_level_0,SATVRMID,SATMTMID,UGDS,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,UGDS_AIAN,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN,PPTUG_EF,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
Thunderbird School of Global Management,,,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,118900.0,
Southwest University of Visual Arts-Tucson,,,161.0,0.4534,0.0435,0.3168,0.0062,0.0311,0.0,0.0,0.0,0.1491,0.2795,0.4469,0.4292,0.8657,27200.0,49750.0
ABC Beauty College Inc,,,38.0,0.2895,0.6579,0.0526,0.0,0.0,0.0,0.0,0.0,0.0,0.2105,0.9815,1.0,0.4688,,16500.0
Professional Cosmetology Education Center,,,47.0,0.7234,0.234,0.0,0.0,0.0213,0.0,0.0213,0.0,0.0,0.0,0.6308,1.0,0.38,13800.0,9833.0
Velvatex College of Beauty Culture,,,25.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.7692,0.0,0.52,,


In [63]:
# filter() 메서드를 활용해 'UGDS_'를 포함한 열들만 선택
college_ugds = college.filter(like='UGDS_')
college_ugds.head()

Unnamed: 0_level_0,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,UGDS_AIAN,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Alabama A & M University,0.0333,0.9353,0.0055,0.0019,0.0024,0.0019,0.0,0.0059,0.0138
University of Alabama at Birmingham,0.5922,0.26,0.0283,0.0518,0.0022,0.0007,0.0368,0.0179,0.01
Amridge University,0.299,0.4192,0.0069,0.0034,0.0,0.0,0.0,0.0,0.2715
University of Alabama in Huntsville,0.6988,0.1255,0.0382,0.0376,0.0143,0.0002,0.0172,0.0332,0.035
Alabama State University,0.0158,0.9208,0.0121,0.0019,0.001,0.0006,0.0098,0.0243,0.0137


In [66]:
# 각 학교별 최대 인종을 출력
highest_race = college_ugds.idxmax(axis=1)
# value_counts()를 활용해 최대 출현 분포를 반환 -> 백인이 가장 많은 학교의 비율이 약 67%
highest_race.value_counts(normalize=True)

UGDS_WHITE    0.670352
UGDS_BLACK    0.151586
UGDS_HISP     0.129473
UGDS_UNKN     0.023422
UGDS_ASIAN    0.012074
UGDS_AIAN     0.006110
UGDS_NRA      0.004073
UGDS_NHPI     0.001746
UGDS_2MOR     0.001164
dtype: float64

In [72]:
# 흑인 학생들의 비율이 가장 높은 학교의 경우 두 번째로 높은 인종의 비율 분포는?
# idxmax(axis=1) 메서드 및 eq() 메서드 활용하여 흑인이 가장 많은지 여부를 True/False로 반환
highest_race_black = college_ugds.idxmax(axis=1).eq('UGDS_BLACK')
# 불리언 배열의 필터링 활용 및 idxmax(axis=1) 메서드 다시 활용
(college_ugds[highest_race_black]
.drop(columns='UGDS_BLACK')
.idxmax(axis=1)
.value_counts(normalize=True)
)

UGDS_WHITE    0.661228
UGDS_HISP     0.230326
UGDS_UNKN     0.071977
UGDS_NRA      0.018234
UGDS_ASIAN    0.009597
UGDS_2MOR     0.006718
UGDS_AIAN     0.000960
UGDS_NHPI     0.000960
dtype: float64