# 다대다 관계 수정하기
- 데이터 : 클리블랜드 미술관 소장품(작가 파일과 매체인용 파일)
    - 작가 데이터 : 소장품에 대한 작가 정보
    - 인용 데이터 : 소장품 별 인용정보 (소장품에 대해 여러 개의 인용행)
---
- 👉 한 개의 데이터셋으로부터 소장품, 인용, 작가 데이터셋을 만든다. 소장품은 인용, 작가와 각각 일대다 관계를 갖는다.

In [2]:
import pandas as pd
# Load the collections CSV (path is relative to the working directory) into a DataFrame.
cma = pd.read_csv('data/cmacollections.csv')

In [3]:
# Dimensions (rows, columns) of the raw frame.
cma.shape

(12326, 9)

In [5]:
# Preview the first two rows, transposed so that each column is shown as a row.
cma.head(2).transpose()

Unnamed: 0,0,1
id,92937,92937
citation,"Milliken, William","Glasier, Jessie C."
creator,George Bellows (Am,George Bellows (Am
title,Stag at Sharkey's,Stag at Sharkey's
birth_year,1882,1882
death_year,1925,1925
collection,American - Painting,American - Painting
type,Painting,Painting
creation_date,1909,1909


### 중복된 인용 및 작가가 있는 소장품 확인

In [6]:
# Use the item id as the row index so that all rows of one item can be
# selected with cma.loc[<id>].
cma = cma.set_index('id')

In [7]:
# Inspect item 124733 (up to 14 rows): the same title repeats once per
# citation/creator combination.
cma.loc[124733, ['title', 'citation', 'creator', 'birth_year']].head(14)

Unnamed: 0_level_0,title,citation,creator,birth_year
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
124733,Dead Blue Roller,"Weigel, J. A. G. <",Albrecht Dürer (Ge,1471
124733,Dead Blue Roller,"Weigel, J. A. G. <",Hans Hoffmann (Ger,1545/50
124733,Dead Blue Roller,"Winkler, Friedrich",Albrecht Dürer (Ge,1471
124733,Dead Blue Roller,"Winkler, Friedrich",Hans Hoffmann (Ger,1545/50
124733,Dead Blue Roller,"Francis, Henry S.",Albrecht Dürer (Ge,1471
124733,Dead Blue Roller,"Francis, Henry S.",Hans Hoffmann (Ger,1545/50
124733,Dead Blue Roller,"Kurz, Otto. <em>Fa",Albrecht Dürer (Ge,1471
124733,Dead Blue Roller,"Kurz, Otto. <em>Fa",Hans Hoffmann (Ger,1545/50
124733,Dead Blue Roller,Minneapolis Instit,Albrecht Dürer (Ge,1471
124733,Dead Blue Roller,Minneapolis Instit,Hans Hoffmann (Ger,1545/50


### 소장품 데이터프레임 생성

In [8]:
# Columns kept for the item-level (collections) frame.
collectionsvar = [
    'title',
    'collection',
    'type',
]

In [13]:
# Build the collections frame with one row per item id:
# move id out of the index so it can be used as the de-duplication key,
# keep the first row seen for each id, then put id back as the index.
cmacollections = (
    cma.loc[:, collectionsvar]
    .reset_index()
    .drop_duplicates(subset='id', keep='first')
    .set_index('id')
)

In [14]:
# Dimensions (rows, columns) of the de-duplicated collections frame.
cmacollections.shape

(972, 3)

In [15]:
# Preview the first rows of the collections frame.
cmacollections.head()

Unnamed: 0_level_0,title,collection,type
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
92937,Stag at Sharkey's,American - Painting,Painting
94979,Nathaniel Hurd,American - Painting,Painting
137259,Mme L... (Laure Borreau),Mod Euro - Painting,Painting
141639,Twilight in the Wilderness,American - Painting,Painting
93014,"View of Schroon Mountain, Esse",American - Painting,Painting


### 인용 데이터프레임 생성
- 주의: id가 인덱스이므로 `reset_index()`로 인덱스를 해제한 후에 중복을 제거해야 함

In [16]:
# Build the citations frame with one row per (id, citation) pair.
# id lives in the index, so it is reset to a column first; otherwise it could
# not be named in the de-duplication subset.
cmacitations = (
    cma.loc[:, ['citation']]
    .reset_index()
    .drop_duplicates(subset=['id', 'citation'], keep='first')
    .set_index('id')
)

In [42]:
# Dimensions (rows, columns) of the citations frame.
cmacitations.shape

(9758, 1)

In [17]:
# All citation rows recorded for item 124733.
cmacitations.loc[124733]

Unnamed: 0_level_0,citation
id,Unnamed: 1_level_1
124733,"Weigel, J. A. G. <"
124733,"Winkler, Friedrich"
124733,"Francis, Henry S."
124733,"Kurz, Otto. <em>Fa"
124733,Minneapolis Instit
124733,"Pilz, Kurt. ""Hans"
124733,"Koschatzky, Walter"
124733,"Johnson, Mark M<em"
124733,"Kaufmann, Thomas D"
124733,"Koreny, Fritz. <em"


### 작가 데이터프레임 생성

In [18]:
# Columns kept for the creator-level frame.
creatorsvars = [
    'creator',
    'birth_year',
    'death_year',
]

In [19]:
# Build the creators frame with one row per (id, creator) pair:
# reset id to a column, drop repeated pairs (first occurrence wins),
# then restore id as the index.
cmacreators = (
    cma.loc[:, creatorsvars]
    .reset_index()
    .drop_duplicates(subset=['id', 'creator'], keep='first')
    .set_index('id')
)

In [43]:
# Dimensions (rows, columns) of the creators frame.
cmacreators.shape

(1055, 3)

In [20]:
# All creator rows recorded for item 124733.
cmacreators.loc[124733]

Unnamed: 0_level_0,creator,birth_year,death_year
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
124733,Albrecht Dürer (Ge,1471,1528
124733,Hans Hoffmann (Ger,1545/50,1591/92


### 1950년 이후 출생한 작가의 작품 수

In [22]:
# Check the storage type of birth_year before attempting numeric comparisons.
cmacreators['birth_year'].dtype

dtype('O')

In [26]:
# Regular expression: find runs of one or more consecutive digits and keep the
# first run of each value (e.g. '1545/50' -> '1545'), then convert to float.
# The pattern is a raw string so that '\d' reaches the regex engine as written;
# in a normal string literal '\d' is an invalid escape sequence that newer
# Python versions warn about.
cmacreators['birth_year'] = cmacreators['birth_year'].str.findall(r'\d+').str[0].astype(float)

In [32]:
# Select the creator rows whose birth_year is strictly greater than 1950,
# keep only the creator column, and tag every selected row with a constant
# 'Y' flag in a new creatorbornafter1950 column.
youngartists = (
    cmacreators.loc[cmacreators['birth_year'].gt(1950), ['creator']]
    .assign(creatorbornafter1950='Y')
)

In [33]:
# True when every row of youngartists has a distinct id, i.e. the row count
# equals the number of distinct index values.
len(youngartists) == youngartists.index.nunique()

True

In [34]:
# Display the flagged creator rows.
youngartists

Unnamed: 0_level_0,creator,creatorbornafter1950
id,Unnamed: 1_level_1,Unnamed: 2_level_1
371392,Belkis Ayón (Cuban,Y
162624,Robert Gober (Amer,Y
172588,Rachel Harrison (A,Y
169335,Pae White (America,Y
169862,Fred Wilson (Ameri,Y
312739,"Liu Jing (Chinese,",Y
293323,Zeng Xiaojun (Chin,Y
172539,Fidencio Fifield-P,Y


In [35]:
# Left-join the creator name and the 'Y' flag onto the collections frame by id
# (id is the index level name in both frames, so it can be named in `on`).
# validate='many_to_one' makes pandas raise MergeError if youngartists ever
# contains more than one row for the same id; without it such duplicates would
# silently multiply the matching collection rows.
cmacollections = pd.merge(
    cmacollections,
    youngartists,
    on='id',
    how='left',
    validate='many_to_one',
)

In [37]:
# Items that found no match in the left merge have a missing flag; mark them 'N'.
# The filled Series is assigned back to the column rather than calling
# fillna(inplace=True) on the attribute-accessed Series: that chained in-place
# form acts on an intermediate object, emits a FutureWarning in pandas 2.x and
# does not update the DataFrame when Copy-on-Write is enabled.
cmacollections['creatorbornafter1950'] = cmacollections['creatorbornafter1950'].fillna('N')

In [39]:
# Dimensions (rows, columns) of the collections frame after the merge.
cmacollections.shape

(972, 5)

In [41]:
# Count collection rows for each value of the creatorbornafter1950 flag.
cmacollections['creatorbornafter1950'].value_counts()

N    964
Y      8
Name: creatorbornafter1950, dtype: int64