In [None]:
# Helper for showing several DataFrames side by side in one output cell
from IPython.display import display, HTML
def df_display(dfs:list, captions:list):
    """Render several DataFrames side by side in a single notebook output.

    Each frame is turned into HTML through its Styler (inline display style
    plus a caption); the pieces are concatenated, each followed by a run of
    non-breaking spaces, and shown with ``display(HTML(...))``.

    dfs: list of DataFrames to show, in left-to-right order.
    captions: caption for each DataFrame, in the same order as ``dfs``.
        Captions may repeat. If the two lists differ in length, pairing
        stops at the shorter one (``zip`` semantics).
    """
    # Horizontal gap emitted after every table
    spacer = "\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"
    pieces = []
    # Pair positionally rather than through a caption-keyed dict, so frames
    # that happen to share a caption are all rendered instead of collapsing
    # into one entry.
    for caption, df in zip(captions, dfs):
        styler = df.style.set_table_attributes("style='display:inline'").set_caption(caption)
        pieces.append(styler._repr_html_() + spacer)
    display(HTML("".join(pieces)))

In [None]:
import numpy as np
import pandas as pd
import random 

pd.set_option("display.precision", 1)

# Fix the random seeds so the synthetic data set is reproducible
random.seed(20)
np.random.seed(21)

N1=1000      # total number of rows; every column must end up with this length
N2=600       # number of samples in the first mode of col_D
N3=N1 - N2   # samples in the second mode, derived so that N2 + N3 == N1

mu1, sigma1=60, 5
mu2, sigma2=80, 5

beta_positive = 10   # scale parameter of the exponential tail

# Split used by the skewed columns: int(N1*0.2) rows form the exponential tail
# and the remainder are normal. The normal part is derived from the tail size
# so the two pieces always add up to N1 (int(N1*0.8) + int(N1*0.2) need not).
n_tail = int(N1*0.2)
n_body = N1 - n_tail

# col_A: Normal(mu1, sigma1), N1 samples
col_A = np.random.normal(mu1, sigma1, N1)

# col_B: mixture (concatenation, not a sum) of Normal(mu1, sigma1) samples
# and Exponential(beta_positive) samples shifted by mu1
X1 = np.random.normal(mu1, sigma1, n_body)
X2 = np.random.exponential(beta_positive, n_tail) + mu1
col_B = np.concatenate([X1, X2])

# col_C: same recipe as col_B but built from fresh draws, then mirrored
# around its own mean and shifted by mu1
X3 = np.random.normal(mu1, sigma1, n_body)
X4 = np.random.exponential(beta_positive, n_tail) + mu1
col_C = np.concatenate([X3, X4])
col_C = (np.mean(col_C) - col_C) + mu1

# col_D: bimodal - N2 samples around mu1 followed by N3 samples around mu2
X5 = np.random.normal(mu1, sigma1, N2)
X6 = np.random.normal(mu2, sigma2, N3)
col_D = np.concatenate([X5, X6])

# ID: random integers; randint's upper bound is exclusive, so values are 1000..1998
user_id = np.random.randint(1000, 1999, N1)

# astype('int64') truncates the float samples toward zero
df_mt = pd.DataFrame({'ID' : user_id,
                    'col_A': col_A.astype('int64'),
                    'col_B': col_B.astype('int64'),
                    'col_C': col_C.astype('int64'),
                    'col_D': col_D.astype('int64')
                    })

In [None]:
# Print the frame's (rows, columns) shape, then its per-column dtype / non-null summary
print('데이터 사이즈', df_mt.shape)
df_mt.info()

데이터 사이즈 (1000, 5)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   ID      1000 non-null   int64
 1   col_A   1000 non-null   int64
 2   col_B   1000 non-null   int64
 3   col_C   1000 non-null   int64
 4   col_D   1000 non-null   int64
dtypes: int64(5)
memory usage: 39.2 KB
데이터 사이즈 (1000, 5)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   ID      1000 non-null   int64
 1   col_A   1000 non-null   int64
 2   col_B   1000 non-null   int64
 3   col_C   1000 non-null   int64
 4   col_D   1000 non-null   int64
dtypes: int64(5)
memory usage: 39.2 KB


In [None]:
# Show the first five and last five rows next to each other
df_display(
    dfs=[df_mt.head(), df_mt.tail()],
    captions=['처음 5행', '마지막 5행'],
)

Unnamed: 0,ID,col_A,col_B,col_C,col_D
0,1320,59,67,63,57
1,1339,59,60,67,56
2,1803,65,63,49,64
3,1287,53,53,64,52
4,1637,63,63,57,50

Unnamed: 0,ID,col_A,col_B,col_C,col_D
995,1560,65,64,60,77
996,1402,57,68,48,86
997,1647,51,61,56,77
998,1851,56,65,57,97
999,1838,60,62,39,80


Unnamed: 0,ID,col_A,col_B,col_C,col_D
0,1320,59,67,63,57
1,1339,59,60,67,56
2,1803,65,63,49,64
3,1287,53,53,64,52
4,1637,63,63,57,50

Unnamed: 0,ID,col_A,col_B,col_C,col_D
995,1560,65,64,60,77
996,1402,57,68,48,86
997,1647,51,61,56,77
998,1851,56,65,57,97
999,1838,60,62,39,80


In [None]:
# Columns to include in the summary statistics
list_cols = ['col_A', 'col_B', 'col_C', 'col_D']

# Per-column mean, presented as a one-column DataFrame
df_mt_mean = df_mt[list_cols].mean().to_frame(name='평균값')
df_mt_mean

Unnamed: 0,평균값
col_A,59.3
col_B,61.2
col_C,59.5
col_D,67.7


Unnamed: 0,평균값
col_A,59.3
col_B,61.2
col_C,59.5
col_D,67.7


In [None]:
# Per-column median, presented as a one-column DataFrame
df_mt_median = df_mt[list_cols].median().to_frame(name='중앙값')
df_mt_median

Unnamed: 0,중앙값
col_A,59.0
col_B,61.0
col_C,60.0
col_D,64.0


Unnamed: 0,중앙값
col_A,59.0
col_B,61.0
col_C,60.0
col_D,64.0


In [None]:
# Inspect the mode (most frequent value) of each selected column
df_mt.loc[:, list_cols].mode()

Unnamed: 0,col_A,col_B,col_C,col_D
0,62,62,60,58


Unnamed: 0,col_A,col_B,col_C,col_D
0,62,62,60,58


In [None]:
# Build one single-column table per measure of central tendency
df_mt_mean = df_mt[list_cols].mean().to_frame(name='평균값')
df_mt_median = df_mt[list_cols].median().to_frame(name='중앙값')
# mode() returns a frame with one row per mode; relabel row 0 as '최빈값' and
# transpose so the source columns become the index, like the other two tables
df_mt_mode = df_mt[list_cols].mode().rename(index={0: '최빈값'}).T

# Show the three tables side by side
df_display(
    dfs=[df_mt_mean, df_mt_median, df_mt_mode],
    captions=['평균', '중앙값', '최빈값'],
)

Unnamed: 0,평균값
col_A,59.3
col_B,61.2
col_C,59.5
col_D,67.7

Unnamed: 0,중앙값
col_A,59.0
col_B,61.0
col_C,60.0
col_D,64.0

Unnamed: 0,최빈값
col_A,62
col_B,62
col_C,60
col_D,58


Unnamed: 0,평균값
col_A,59.3
col_B,61.2
col_C,59.5
col_D,67.7

Unnamed: 0,중앙값
col_A,59.0
col_B,61.0
col_C,60.0
col_D,64.0

Unnamed: 0,최빈값
col_A,62
col_B,62
col_C,60
col_D,58


In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for selected columns.
# Use the column list defined in this cell; previously `cols` was assigned but
# unused and the cell silently depended on `list_cols` from an earlier cell.
cols = ['col_A', 'col_B', 'col_C', 'col_D']
df_mt[cols].describe()

Unnamed: 0,col_A,col_B,col_C,col_D
count,1000.0,1000.0,1000.0,1000.0
mean,59.3,61.2,59.5,67.7
std,5.0,6.8,7.7,11.3
min,40.0,46.0,-2.0,41.0
25%,56.0,57.0,57.0,58.0
50%,59.0,61.0,60.0,64.0
75%,63.0,64.0,64.0,78.0
max,75.0,118.0,77.0,97.0


Unnamed: 0,col_A,col_B,col_C,col_D
count,1000.0,1000.0,1000.0,1000.0
mean,59.3,61.2,59.5,67.7
std,5.0,6.8,7.7,11.3
min,40.0,46.0,-2.0,41.0
25%,56.0,57.0,57.0,58.0
50%,59.0,61.0,60.0,64.0
75%,63.0,64.0,64.0,78.0
max,75.0,118.0,77.0,97.0
