In [41]:
import pandas as pd
import numpy as np
from pathlib import Path
from pandas.testing import assert_frame_equal
p1 = Path.cwd() / 'back_data'

In [3]:
movies = pd.read_csv(p1 / 'movie.csv')

In [7]:
# 원하는 열의 리스트를 전달(리스트를 사용하지 않고 여러 열을 전달하지 않도록 주의) -> 해당 열만 출력
# list comprehension 활용 가능 -> 순서가 꼬이는데 sorted(list)로 정렬 가능
movie_actor_director = movies[sorted([col for col in movies.columns if '_name' in col and ('actor' in col or 'director' in col)])]
movie_actor_director

Unnamed: 0,actor_1_name,actor_2_name,actor_3_name,director_name
0,CCH Pounder,Joel David Moore,Wes Studi,James Cameron
1,Johnny Depp,Orlando Bloom,Jack Davenport,Gore Verbinski
2,Christoph Waltz,Rory Kinnear,Stephanie Sigman,Sam Mendes
3,Tom Hardy,Christian Bale,Joseph Gordon-Levitt,Christopher Nolan
4,Doug Walker,Rob Walker,,Doug Walker
...,...,...,...,...
4911,Eric Mabius,Daphne Zuniga,Crystal Lowe,Scott Smith
4912,Natalie Zea,Valorie Curry,Sam Underwood,
4913,Eva Boehnke,Maxwell Moody,David Chandler,Benjamin Roberds
4914,Alan Ruck,Daniel Henney,Eliza Coupe,Daniel Hsia


In [8]:
# 열 이름만 전달하면 Series 반환, 단일 아이템 리스트 전달하면 DataFrame 반환
type(movies['director_name']), type(movies[['director_name']]), type(movies.loc[:, 'director_name']), type(movies.loc[:, ['director_name']])

(pandas.core.series.Series,
 pandas.core.frame.DataFrame,
 pandas.core.series.Series,
 pandas.core.frame.DataFrame)

In [9]:
# movies의 열 이름 변경
movies = movies.rename(columns=lambda col: str(col).replace('facebook_likes', 'fb').replace('_for_revies', ''))
movies.dtypes.value_counts()

float64    13
object     12
int64       3
dtype: int64

In [15]:
# select_dtypes()메서드를 통해 원하는 data type의 열만 선택
# include, exclude 파라미터 활용 가능 -> 리스트로 묶어서 전달도 가능
movies.select_dtypes(include='number').head()

Unnamed: 0,num_critic_for_reviews,duration,director_fb,actor_3_fb,actor_1_fb,gross,num_voted_users,cast_total_fb,facenumber_in_poster,num_user_for_reviews,budget,title_year,actor_2_fb,imdb_score,aspect_ratio,movie_fb
0,723.0,178.0,0.0,855.0,1000.0,760505847.0,886204,4834,0.0,3054.0,237000000.0,2009.0,936.0,7.9,1.78,33000
1,302.0,169.0,563.0,1000.0,40000.0,309404152.0,471220,48350,0.0,1238.0,300000000.0,2007.0,5000.0,7.1,2.35,0
2,602.0,148.0,0.0,161.0,11000.0,200074175.0,275868,11700,1.0,994.0,245000000.0,2015.0,393.0,6.8,2.35,85000
3,813.0,164.0,22000.0,23000.0,27000.0,448130642.0,1144337,106759,0.0,2701.0,250000000.0,2012.0,23000.0,8.5,2.35,164000
4,,,131.0,,131.0,,8,143,0.0,,,,12.0,7.1,,0


In [17]:
# filter() 메서드를 이용하여 열 선택 가능
# like 파라미터로 부분 문자열 찾기 가능, items 파라미터로 열 이름 리스트로 전달 가능, regex 파라미터로 정규표현식 사용 가능 -> 세 개의 파라미터는 상호 배타적
# items 파라미터의 경우 인덱서 연산을 복제한 것과 유사하나 KeyError 예외 상황 발생 안하는 차이
movies.filter(like='fb').head()
movies.filter(regex='.*\d.*')

Unnamed: 0,actor_3_fb,actor_2_name,actor_1_fb,actor_1_name,actor_3_name,actor_2_fb
0,855.0,Joel David Moore,1000.0,CCH Pounder,Wes Studi,936.0
1,1000.0,Orlando Bloom,40000.0,Johnny Depp,Jack Davenport,5000.0
2,161.0,Rory Kinnear,11000.0,Christoph Waltz,Stephanie Sigman,393.0
3,23000.0,Christian Bale,27000.0,Tom Hardy,Joseph Gordon-Levitt,23000.0
4,,Rob Walker,131.0,Doug Walker,,12.0
...,...,...,...,...,...,...
4911,318.0,Daphne Zuniga,637.0,Eric Mabius,Crystal Lowe,470.0
4912,319.0,Valorie Curry,841.0,Natalie Zea,Sam Underwood,593.0
4913,0.0,Maxwell Moody,0.0,Eva Boehnke,David Chandler,0.0
4914,489.0,Daniel Henney,946.0,Alan Ruck,Eliza Coupe,719.0


In [18]:
movies = pd.read_csv(p1 / 'movie.csv')

In [22]:
# 모든 DataFrame은 2차원 배열 (Series는 1차원)
# count()는 결측치가 아닌 개수로 DataFrame에서도 활용 가능
movies.shape, movies.size, movies.ndim, len(movies), movies.count().head()

((4916, 28),
 137648,
 2,
 4916,
 color                      4897
 director_name              4814
 num_critic_for_reviews     4867
 duration                   4901
 director_facebook_likes    4814
 dtype: int64)

In [24]:
# min(), max(), mean(), median(), std() 등 기본적 통계량 확인 가능 -> skipna=False 지정하여 결측치가 없는 수치열만 계산 가능(describe에서는 불가)
# DataFrame에서도 describe() 메서드 활용 가능 -> .T를 이용해 전치하여 보기 편하게 변경
# include 파라미터(데이터 타입), percentiles 파라미터(분위 수 지정) 활용 가능
movies.describe(percentiles=[0.2, 0.5, 0.8]).T

Unnamed: 0,count,mean,std,min,20%,50%,80%,max
num_critic_for_reviews,4867.0,137.9889,120.2394,1.0,39.0,108.0,219.0,813.0
duration,4901.0,107.0908,25.28602,7.0,91.0,103.0,121.0,511.0
director_facebook_likes,4814.0,691.0145,2832.954,0.0,3.0,48.0,272.0,23000.0
actor_3_facebook_likes,4893.0,631.2763,1625.875,0.0,94.4,366.0,697.0,23000.0
actor_1_facebook_likes,4909.0,6494.488,15106.99,0.0,510.0,982.0,13000.0,640000.0
gross,4054.0,47644510.0,67372550.0,162.0,2924634.2,25043962.0,74945462.8,760505800.0
num_voted_users,4916.0,82644.92,138322.2,5.0,5526.0,33132.5,120202.0,1689764.0
cast_total_facebook_likes,4916.0,9579.816,18164.32,0.0,1120.0,3049.0,16125.0,656730.0
facenumber_in_poster,4903.0,1.37732,2.023826,0.0,0.0,1.0,2.0,43.0
num_user_for_reviews,4895.0,267.6688,372.9348,1.0,47.0,153.0,384.0,5060.0


In [25]:
movies = movies.rename(columns=lambda col: str(col).replace('facebook_likes', 'fb').replace('_for_revies', ''))

In [33]:
# 메서드를 체인시켜 결측치 개수 파악 가능(DataFrame에서는 hasnans 메서드 활용 불가능)
movies.isna().sum().sum(), movies.isna().any().any(), movies.notna().all().all()

(2654, True, False)

In [37]:
# object 형식의 max()를 확인하기 위해서는 결측치를 없애야
# 메서드 체인이 길어지면 한 줄당 하나의 메서드 호출로 작성하는 것이 용이
(movies
.select_dtypes(include=['object'])
.fillna('')
.max()
)

color                                                          Color
director_name                                          Étienne Faure
actor_2_name                                           Zubaida Sahar
genres                                                       Western
actor_1_name                                           Óscar Jaenada
movie_title                                                 Æon Flux
actor_3_name                                           Óscar Jaenada
plot_keywords                                    zombie|zombie spoof
movie_imdb_link    http://www.imdb.com/title/tt5574490/?ref_=fn_t...
language                                                        Zulu
country                                                 West Germany
content_rating                                                     X
dtype: object

In [38]:
college = pd.read_csv(p1 / 'college.csv', index_col='INSTNM')
college_ugds = college.filter(like='UGDS_')

In [40]:
# pandas는 뱅커 반올림 연산 -> round(n) 활용 시 소수점 마지막 자리를 더해야
college_ugds.loc['Northwest-Shoals Community College'], (college_ugds.loc['Northwest-Shoals Community College'] + 0.0001).round(2)

(UGDS_WHITE    0.7912
 UGDS_BLACK    0.1250
 UGDS_HISP     0.0339
 UGDS_ASIAN    0.0036
 UGDS_AIAN     0.0088
 UGDS_NHPI     0.0006
 UGDS_2MOR     0.0012
 UGDS_NRA      0.0033
 UGDS_UNKN     0.0324
 Name: Northwest-Shoals Community College, dtype: float64,
 UGDS_WHITE    0.79
 UGDS_BLACK    0.13
 UGDS_HISP     0.03
 UGDS_ASIAN    0.00
 UGDS_AIAN     0.01
 UGDS_NHPI     0.00
 UGDS_2MOR     0.00
 UGDS_NRA      0.00
 UGDS_UNKN     0.03
 Name: Northwest-Shoals Community College, dtype: float64)

In [42]:
# NaN(np.nan)의 특성 : 자기 자신과 같지 않음 -> DataFrame의 동일성 확인을 위해서는 equals() 메서드 사용 필요
# assert_frame_equal() 함수 사용 가능 (from pandas.testing import assert_frame_equal) -> check_dtype=False 활용 가능
assert_frame_equal(college_ugds, college_ugds, check_dtype=False) == None

True

In [45]:
# 기본적으로 pandas의 axis는 0으로 설정 -> axis=1로 변경 가능
college_ugds.count().head(3), college_ugds.count(axis=1).head(3), college_ugds.median().head(3), college_ugds.sum(axis=1).head(3)

(UGDS_WHITE    6874
 UGDS_BLACK    6874
 UGDS_HISP     6874
 dtype: int64,
 INSTNM
 Alabama A & M University               9
 University of Alabama at Birmingham    9
 Amridge University                     9
 dtype: int64,
 UGDS_WHITE    0.55570
 UGDS_BLACK    0.10005
 UGDS_HISP     0.07140
 dtype: float64,
 INSTNM
 Alabama A & M University               1.0000
 University of Alabama at Birmingham    0.9999
 Amridge University                     1.0000
 dtype: float64)

In [47]:
# cumsum() 메서드를 활용하여 누적 가능 -> 데이터에 대한 다른 관점 확인 가능
college_ugds.cumsum(axis=1).head()

Unnamed: 0_level_0,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,UGDS_AIAN,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Alabama A & M University,0.0333,0.9686,0.9741,0.976,0.9784,0.9803,0.9803,0.9862,1.0
University of Alabama at Birmingham,0.5922,0.8522,0.8805,0.9323,0.9345,0.9352,0.972,0.9899,0.9999
Amridge University,0.299,0.7182,0.7251,0.7285,0.7285,0.7285,0.7285,0.7285,1.0
University of Alabama in Huntsville,0.6988,0.8243,0.8625,0.9001,0.9144,0.9146,0.9318,0.965,1.0
Alabama State University,0.0158,0.9366,0.9487,0.9506,0.9516,0.9522,0.962,0.9863,1.0


In [48]:
# dropna() 메서드를 통해 NaN이 있는 index 제거
college_ugds = college_ugds.dropna(how='all')
college_ugds.isna().sum()

UGDS_WHITE    0
UGDS_BLACK    0
UGDS_HISP     0
UGDS_ASIAN    0
UGDS_AIAN     0
UGDS_NHPI     0
UGDS_2MOR     0
UGDS_NRA      0
UGDS_UNKN     0
dtype: int64

In [51]:
diversity_metric = (college_ugds
# 0.15 이상을 True, False로 반환
.ge(0.15)
# True가 1인 점을 이용하여 sum() 메서드 활용
.sum(axis=1)
)
# 도수 확인
diversity_metric.value_counts()

1    3042
2    2884
3     876
4      63
0       7
5       2
dtype: int64

In [59]:
# 0.15를 넘는게 0개인 학교 확인
diversity_metric[diversity_metric.eq(0)].index.tolist()

['American Conservatory Theater',
 'Prince Institute-Rocky Mountains',
 'Lyme Academy College of Fine Arts',
 'Professional Business College',
 'Spanish-American Institute',
 'Taft University System',
 'Education and Technology Institute']

In [65]:
# 0.15를 넘는게 4개 이상인 학교 확인
diversity_metric[diversity_metric.ge(4)].index.tolist()
# index를 확인하는 것이므로 loc[] 형태로 연결해야
college_ugds.loc[diversity_metric[diversity_metric.ge(4)].index.tolist()]

Unnamed: 0_level_0,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,UGDS_AIAN,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
College of Alameda,0.1529,0.2128,0.2196,0.3021,0.0037,0.0041,0.0473,0.0076,0.0500
Golden Gate University-San Francisco,0.2697,0.1183,0.1618,0.1535,0.0041,0.0394,0.0394,0.0145,0.1992
Heald College-Concord,0.2179,0.1797,0.2472,0.0911,0.0081,0.0301,0.0520,0.0000,0.1740
ITT Technical Institute-National City,0.2482,0.1742,0.3243,0.1522,0.0150,0.0090,0.0430,0.0000,0.0340
Laney College,0.1740,0.2471,0.1943,0.2618,0.0028,0.0046,0.0535,0.0083,0.0536
...,...,...,...,...,...,...,...,...,...
Divine Crown Academy of Cosmetology,0.1724,0.4483,0.1724,0.0345,0.0000,0.0000,0.0000,0.1724,0.0000
Computer Training Academy,0.2935,0.2174,0.1630,0.2935,0.0000,0.0000,0.0109,0.0000,0.0217
University of Phoenix-Florida,0.1816,0.2627,0.1603,0.0065,0.0016,0.0029,0.0510,0.0330,0.3003
University of Phoenix-Nevada,0.2506,0.1622,0.1587,0.0181,0.0029,0.0199,0.0773,0.0082,0.3021


In [69]:
# 0.15가 넘는 가장 많은 학교 확인
college_ugds.loc[diversity_metric.ge(diversity_metric.max())]

Unnamed: 0_level_0,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,UGDS_AIAN,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Central Texas Beauty College-Temple,0.1616,0.2323,0.2626,0.0202,0.0,0.0,0.1717,0.0,0.1515
Regency Beauty Institute-Austin,0.1867,0.2133,0.16,0.0,0.0,0.0,0.1733,0.0,0.2667
