### 통계 관련 패키지 불러오기
import numpy as np\
from scipy import stats\
import pandas as pd\
import matplotlib.pyplot as plt\
import seaborn as sns\
from statsmodels.stats.proportion import proportion_ztest

### 환경 설정
- 데이터 시각화를 위한 Matplotlib 라이브러리를 불러오기\
`import matplotlib`

- 한글 폰트 설정\
`matplotlib.rc('font', family = 'Malgun Gothic')`

- 음수 표시\
`matplotlib.rc('axes', unicode_minus = False)`

- 그래프 출력\
`%matplotlib inline`

- 실행 결과 경고 메시지 출력 제외\
`import warnings`\
`warnings.filterwarnings('ignore')`

## 연속형
### 자동차 연비 Data set에서 기술 통계치 구하기

In [16]:
# Load the car data set from mycars.csv in the working directory
mycars = pd.read_csv(filepath_or_buffer="mycars.csv")

In [17]:
# Summary statistics, rounded to zero decimal places
mycars.describe().round(decimals=0)

Unnamed: 0,displacement,year,cylinder,mpg,highway_mileage
count,234.0,234.0,234.0,234.0,234.0
mean,3.0,2004.0,6.0,17.0,23.0
std,1.0,5.0,2.0,4.0,6.0
min,2.0,1999.0,4.0,9.0,12.0
25%,2.0,1999.0,4.0,14.0,18.0
50%,3.0,2004.0,6.0,17.0,24.0
75%,5.0,2008.0,8.0,19.0,27.0
max,7.0,2008.0,8.0,35.0,44.0


In [19]:
# Keep only the 'model' and 'mpg' columns of the car data
df = mycars.loc[:, ['model', 'mpg']]
df

Unnamed: 0,model,mpg
0,a4,18
1,a4,21
2,a4,20
3,a4,21
4,a4,16
...,...,...
229,passat,19
230,passat,21
231,passat,16
232,passat,18


In [20]:
# Descriptive statistics (count, mean, std, min, quartiles, max) per model
df.groupby(by='model').describe()

Unnamed: 0_level_0,mpg,mpg,mpg,mpg,mpg,mpg,mpg,mpg
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
4runner 4wd,6.0,15.166667,0.752773,14.0,15.0,15.0,15.75,16.0
a4,7.0,18.857143,1.864454,16.0,18.0,18.0,20.5,21.0
a4 quattro,8.0,17.125,1.807722,15.0,15.75,17.0,18.25,20.0
a6 quattro,3.0,16.0,1.0,15.0,15.5,16.0,16.5,17.0
altima,6.0,20.666667,1.966384,19.0,19.0,20.0,22.5,23.0
c1500 suburban 2wd,5.0,12.8,1.30384,11.0,12.0,13.0,14.0,14.0
camry,7.0,19.857143,1.46385,18.0,18.5,21.0,21.0,21.0
camry solara,7.0,19.857143,1.772811,18.0,18.0,21.0,21.0,22.0
caravan 2wd,11.0,15.818182,1.834022,11.0,15.5,16.0,17.0,18.0
civic,9.0,24.444444,1.943651,21.0,24.0,24.0,25.0,28.0


## 범주형
### 제품의 품질을 조사하여, cabbage 결함과 결함이 발생한 기간을 조사한 table에서 범주형 데이터에 대해 counts, percents, cumulative counts, cumulative percents 계산하기

In [24]:
# Load the quality-control table and keep the 'Flaws' and 'Period' columns
QC1 = pd.read_csv(filepath_or_buffer="EXH_QC1.csv")
df = QC1.loc[:, ['Flaws', 'Period']]

In [25]:
# Frequency of each value in the 'Flaws' column, ordered by flaw label
count = df['Flaws'].value_counts()
count = count.sort_index()
count

Flaws
Other       6
Peel       15
Scratch    13
Smudge      6
Name: count, dtype: int64

In [28]:
import numpy as np

# Cumulative count: running total of the flaw counts in index order
cumcnt = count.cumsum()
cumcnt

Flaws
Other       6
Peel       21
Scratch    34
Smudge     40
Name: count, dtype: int64

In [29]:
# Share of each flaw label as a percentage of all counted flaws
total_flaws = count.sum()
percent = count / total_flaws * 100
percent

Flaws
Other      15.0
Peel       37.5
Scratch    32.5
Smudge     15.0
Name: count, dtype: float64

In [31]:
# Cumulative percent: running total of the per-label percentages
cumpct = percent.cumsum()
cumpct

Flaws
Other       15.0
Peel        52.5
Scratch     85.0
Smudge     100.0
Name: count, dtype: float64

In [37]:
# Combine the four series into one summary table indexed by flaw label.
# The first key was previously 'Count ' (with a trailing space), which made
# the column unreachable as count_data['Count']; the stray space is removed.
count_data = pd.DataFrame({
    'Count': count,
    'cumcnt': cumcnt,
    'percent': percent,
    'cumpct': cumpct,
})

# Name the column axis so the header row is labelled 'Flaws'
count_data.columns.name = 'Flaws'

count_data

Flaws,Count,cumcnt,percent,cumpct
Flaws,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Other,6,6,15.0,15.0
Peel,15,21,37.5,52.5
Scratch,13,34,32.5,85.0
Smudge,6,40,15.0,100.0
