# 요약통계와 빈도를 표시하는 함수

### 요약통계 표시 함수

In [1]:
def gettots(df):
    """Build a table of summary statistics for ``df``.

    Computes the minimum, 15th percentile, first quartile, median, third
    quartile, 85th percentile, maximum, non-null count, mean and
    interquartile range, and assembles them into a new DataFrame whose
    columns are named ``min``, ``per15``, ``qr1``, ``med``, ``qr3``,
    ``per85``, ``max``, ``count``, ``mean`` and ``iqr`` (in that order).

    Assumes ``df`` is a DataFrame of numeric columns; each of those columns
    then becomes one row of the result.
    """
    # The quartiles are needed twice (on their own and for the IQR).
    lower_quartile = df.quantile(0.25)
    upper_quartile = df.quantile(0.75)
    stats = {
        'min': df.min(),
        'per15': df.quantile(0.15),
        'qr1': lower_quartile,
        'med': df.median(),
        'qr3': upper_quartile,
        'per85': df.quantile(0.85),
        'max': df.max(),
        'count': df.count(),
        'mean': df.mean(),
        'iqr': upper_quartile - lower_quartile,
    }
    return pd.DataFrame(stats)

In [2]:
import pandas as pd
import os
import sys

# Load the NLS97 CSV and use the 'personid' column as the row index.
nls97 = pd.read_csv('data/nls97f.csv').set_index('personid')

In [3]:
sys.path.append(os.getcwd() + "/helperfunctions")

In [4]:
# Import the module
import basicdescriptives as bd

In [5]:
# Use the function written above; .T transposes the result so the
# statistics become rows and the variables become columns.
bd.gettots(nls97[['satverbal', 'satmath']]).T

Unnamed: 0,satverbal,satmath
min,14.0,7.0
per15,390.0,390.0
qr1,430.0,430.0
med,500.0,500.0
qr3,570.0,580.0
per85,620.0,621.0
max,800.0,800.0
count,1406.0,1407.0
mean,499.72404,500.590618
iqr,140.0,150.0


In [6]:
bd.gettots(nls97.filter(like='weeksworked'))

Unnamed: 0,min,per15,qr1,med,qr3,per85,max,count,mean,iqr
weeksworked00,0.0,0.0,5.0,26.0,50.0,53.0,53.0,8603,26.417761,45.0
weeksworked01,0.0,0.0,10.0,33.0,51.0,52.0,52.0,8564,29.784096,41.0
weeksworked02,0.0,0.0,13.0,38.0,52.0,52.0,52.0,8556,31.8054,39.0
weeksworked03,0.0,0.0,14.0,43.0,52.0,52.0,52.0,8490,33.469611,38.0
weeksworked04,0.0,1.0,18.0,46.0,52.0,52.0,52.0,8458,35.104635,34.0
weeksworked05,0.0,5.0,22.0,50.0,53.0,53.0,53.0,8403,37.316435,31.0
weeksworked06,0.0,9.0,27.0,51.0,52.0,52.0,52.0,8340,38.429976,25.0
weeksworked07,0.0,10.0,30.0,52.0,52.0,52.0,52.0,8272,39.241296,22.0
weeksworked08,0.0,9.0,30.0,52.0,52.0,52.0,52.0,8186,39.287564,22.0
weeksworked09,0.0,0.0,22.0,52.0,52.0,52.0,52.0,8146,37.419961,30.0


### 행별, 열별로 누락값을 세는 함수 작성
- 매개변수 : 데이터프레임, 비율/개수를 표시하는 변수
- 열 누락값과 행별 누락값 반환
- byrowperc=True 전달시, 행별 누락값 빈도를 전체 행에 대한 비율로 표시

In [7]:
def getmissings(df, byrowperc=False):
    """Count missing values by column and summarize them by row.

    Args:
        df: DataFrame to inspect.
        byrowperc: Passed to ``value_counts(normalize=...)``. When True the
            per-row summary holds proportions of rows instead of row counts.

    Returns:
        A 2-tuple ``(by_column, by_row)``. ``by_column`` is the number of
        null values in each column. ``by_row`` maps "number of nulls in a
        row" to how many rows (or what share of rows) have that many,
        sorted by the number of nulls.
    """
    null_mask = df.isnull()
    by_column = null_mask.sum()
    nulls_per_row = null_mask.sum(axis=1)
    by_row = nulls_per_row.value_counts(normalize=byrowperc).sort_index()
    return by_column, by_row

In [23]:
import importlib #(needed when the module has been modified)
# Re-execute the already-imported module so that edits to it take effect.
importlib.reload(bd)

<module 'basicdescriptives' from '/Users/angela/Pandas_Data_Cleaning/helperfunctions/basicdescriptives.py'>

In [9]:
missingbycols, missingbyrows = bd.getmissings(nls97[['weeksworked16', 'weeksworked17']], True)

In [10]:
missingbycols

weeksworked16    1916
weeksworked17    2314
dtype: int64

In [11]:
missingbyrows

0    0.739203
1    0.050757
2    0.210040
dtype: float64

In [12]:
missingbycols, missingbyrows = bd.getmissings(nls97[['weeksworked16', 'weeksworked17']])

In [13]:
missingbyrows

0    6641
1     456
2    1887
dtype: int64

### 범주형 변수 전체의 빈도를 계산하는 함수

In [14]:
def makefreqs(df, outfile):
    """Write frequencies and percentages of every category column to a file.

    For each column of ``df`` with the ``category`` dtype, the column name,
    a separator line, its value counts and its normalized value counts
    (both sorted by index) are written to ``outfile``, separated by blank
    lines. Columns of any other dtype are skipped.

    Args:
        df: DataFrame to summarize.
        outfile: Path of the text file to create; an existing file is
            overwritten.
    """
    # The context manager closes the file even if printing a column raises.
    with open(outfile, 'w') as freqout:
        for col in df.select_dtypes(include=['category']):
            print(col, "---------------------",
                  "frequencies", df[col].value_counts().sort_index(),
                  "percentages", df[col].value_counts(normalize=True).sort_index(),
                  sep="\n\n", end="\n\n\n", file=freqout)

In [15]:
# Convert every object-dtype column to the category dtype.
# Assigning through ``nls97.loc[:, mask] = ...`` writes the new values into
# the existing object columns on pandas >= 2.0, so the dtype stays object;
# replacing the columns by name makes the dtype change stick.
objcols = nls97.select_dtypes(['object']).columns
nls97[objcols] = nls97[objcols].astype('category')

In [16]:
bd.makefreqs(nls97, "views/nlsfreqs.txt")

### 그룹별 개수를 구하는 함수
- 열 이름의 리스트 : cats
- 열 값의 각 조합에 대한 행 수를 세고, 마지막 열을 제외한 조합의 행 수에 대한 비율을 계산
- rowsel(선택) : 결과 행을 걸러낼 조건식 문자열

In [17]:
def getcnts(df, cats, rowsel=None):
    """Count rows for each combination of the ``cats`` columns.

    Args:
        df: DataFrame containing the columns named in ``cats``.
        cats: List of column names. Rows are counted for every combination
            of all the columns (``catcnt``) and for every combination of all
            but the last column (``totcnt``).
        rowsel: Optional string. It is appended to ``percs.`` and evaluated
            as a Python expression against the result, e.g.
            ``"colenroct00.str[0:1]=='1'"``; the outcome is used to select
            rows with ``.loc``.

    Returns:
        DataFrame with the ``cats`` columns plus ``catcnt``, ``totcnt`` and
        ``percent`` (``catcnt / totcnt``), filtered by ``rowsel`` if given.
    """
    tots = cats[:-1]
    catcnt = df.groupby(cats).size().reset_index(name='catcnt')
    if tots:
        totcnt = df.groupby(tots).size().reset_index(name='totcnt')
        percs = pd.merge(catcnt, totcnt, on=tots, how='left')
    else:
        # With a single column there is no parent grouping (groupby([])
        # raises), so the whole frame serves as the total.
        percs = catcnt.assign(totcnt=len(df))
    percs['percent'] = percs.catcnt / percs.totcnt
    if rowsel is not None:
        # NOTE(security): rowsel is executed with eval(); only pass trusted,
        # hard-coded expressions, never text that comes from outside.
        percs = percs.loc[eval("percs." + rowsel)]
    return percs

In [21]:
bd.getcnts(nls97, ['maritalstatus', 'gender', 'colenroct00'])

Unnamed: 0,maritalstatus,gender,colenroct00,catcnt,totcnt,percent
0,Divorced,Female,1. Not enrolled,317,393,0.806616
1,Divorced,Female,2. 2-year college,35,393,0.089059
2,Divorced,Female,3. 4-year college,41,393,0.104326
3,Divorced,Male,1. Not enrolled,238,270,0.881481
4,Divorced,Male,2. 2-year college,15,270,0.055556
5,Divorced,Male,3. 4-year college,17,270,0.062963
6,Married,Female,1. Not enrolled,1168,1636,0.713936
7,Married,Female,2. 2-year college,143,1636,0.087408
8,Married,Female,3. 4-year college,325,1636,0.198655
9,Married,Male,1. Not enrolled,1094,1430,0.765035


In [24]:
bd.getcnts(nls97, ['maritalstatus', 'gender', 'colenroct00'],
          "colenroct00.str[0:1]=='1'")

Unnamed: 0,maritalstatus,gender,colenroct00,catcnt,totcnt,percent
0,Divorced,Female,1. Not enrolled,317,393,0.806616
3,Divorced,Male,1. Not enrolled,238,270,0.881481
6,Married,Female,1. Not enrolled,1168,1636,0.713936
9,Married,Male,1. Not enrolled,1094,1430,0.765035
12,Never-married,Female,1. Not enrolled,1094,1307,0.837031
15,Never-married,Male,1. Not enrolled,1268,1459,0.869088
18,Separated,Female,1. Not enrolled,66,79,0.835443
21,Separated,Male,1. Not enrolled,67,75,0.893333
24,Widowed,Female,1. Not enrolled,16,19,0.842105
27,Widowed,Male,1. Not enrolled,3,4,0.75
