In [1]:
import pandas as pd

In [2]:
# Load the HR dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('./data/HR.csv')

In [3]:
# Column glossary:
# satisfaction_level     employee satisfaction score
# last_evaluation        most recent performance evaluation
# number_project         number of projects
# average_monthly_hours  hours worked per month
# time_spend_company     years spent at the company (tenure)
# Work_accident          workplace accident: 0 = no, 1 = yes
# left                   left the company: 1 = left, 0 = stayed
# promotion_last_5years  promoted within the last 5 years: 0 = no, 1 = yes
# department             the employee's department
# salary                 salary

df

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_monthly_hours,time_spend_company,Work_accident,left,promotion_last_5years,department,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.80,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low
...,...,...,...,...,...,...,...,...,...,...
14997,0.11,0.96,6,280,4,0,1,0,support,low
14998,0.37,0.52,2,158,3,0,1,0,support,low
14999,,0.52,2,158,3,0,1,0,support,low
15000,,999999.00,2,158,3,0,1,0,sale,low


## 理论知识

In [4]:
# 集中趋势：数据聚拢位置的一种衡量
# 均值：常用来分析连续值、分布比较均匀的值的趋势
# 中位数：用来分析存在异常值的数据，例如某些值特别大或特别小
# 众数：用来衡量离散值的集中趋势
# 分位数：与其他几个值共同作用。含义：将数据从小到大排列，切分成等分的数据点。常用到的是四分位数

# 四分位数的计算方法
# Q1的位置 = (n+1)*0.25
# Q2的位置 = (n+1)*0.5
# Q3的位置 = (n+1)*0.75

In [5]:
# 离中趋势：数据离散程度的衡量
# 标准差：值越大表示数据越离散，反之聚拢
# 方差：标准差的平方，同样用来衡量数据的离散程度
# 正态分布：数据落在 -1倍标准差到+1倍标准差的概率约是68%，-1.96倍到+1.96倍标准差的概率是95%，-2.58倍到+2.58倍标准差的概率是99%

In [1]:
# 数据分布：偏态与峰度
# 偏态系数与峰态系数
# 偏态系数：数据平均值偏离状态的衡量。值为正，是正偏（均值比较大）。值为负，是负偏（均值比较小）。相对于中位数或平均数
# 峰态系数：数据分布集中强度的衡量。值越大，顶部越尖。值越小，分布越平缓。正态分布的峰态系数一般为3（pandas、scipy 默认给出的是减去3之后的超额峰度，此时正态分布为0）。⚠️可以用来直接拒绝正态分布的假设。

# 正态分布与三大分布
# 卡方分布：若 n 个相互独立的随机变量都服从标准正态分布（均值为0，方差为1），则它们的平方和服从自由度为 n 的卡方分布
# t分布：一个服从标准正态分布的随机变量，除以一个与之独立的卡方分布随机变量与其自由度之比的平方根，所得变量服从t分布。常用来根据小样本来估计呈正态分布且方差未知的总体的均值。
# f分布：由两个相互独立、服从卡方分布的随机变量各自除以其自由度后的比构成。

In [2]:
# 抽样理论 （数据量大，全量计算的成本大）
# 抽样误差与精度
# 抽样方法：完全随机抽样、等距抽样（系统抽样）、分类分层抽样
# 重复抽样：有放回的抽样
# 非重复抽样：无放回的抽样

In [3]:
# 代码实现
import pandas as pd

In [4]:
# Read the HR dataset for the hands-on section; the path is relative to the working directory.
df = pd.read_csv('./data/HR.csv')

In [5]:
# Display the frame; pandas elides the middle rows of a long frame in the rendered output.
df

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_monthly_hours,time_spend_company,Work_accident,left,promotion_last_5years,department,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.80,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low
...,...,...,...,...,...,...,...,...,...,...
14997,0.11,0.96,6,280,4,0,1,0,support,low
14998,0.37,0.52,2,158,3,0,1,0,support,low
14999,,0.52,2,158,3,0,1,0,support,low
15000,,999999.00,2,158,3,0,1,0,sale,low


In [6]:
# Mean of the satisfaction_level column (pandas skips NaN entries by default).
df['satisfaction_level'].mean()

0.6128393333333343

In [7]:
# Median of the satisfaction_level column.
df['satisfaction_level'].median()

0.64

In [8]:
# Quantiles: q=0.25 returns the first quartile (Q1) of every numeric column.
# numeric_only=True is passed explicitly because `department` and `salary`
# hold strings: pandas >= 2.0 no longer drops such columns by default and
# raises TypeError on them instead.
df.quantile(q=0.25, numeric_only=True)

satisfaction_level         0.44
last_evaluation            0.56
number_project             3.00
average_monthly_hours    156.00
time_spend_company         3.00
Work_accident              0.00
left                       0.00
promotion_last_5years      0.00
Name: 0.25, dtype: float64

In [9]:
# Mode of every column. A column with tied modes produces extra rows, and the
# remaining columns are padded with NaN in those rows (see the second output row).
df.mode()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_monthly_hours,time_spend_company,Work_accident,left,promotion_last_5years,department,salary
0,0.1,0.55,4.0,135,3.0,0.0,0.0,0.0,sales,low
1,,,,156,,,,,,


In [10]:
# Mode of a single column; the result is a Series because ties are possible.
df['satisfaction_level'].mode()

0    0.1
dtype: float64

In [11]:
# Standard deviation of every numeric column.
# numeric_only=True keeps the string columns (`department`, `salary`) out of the
# reduction; pandas >= 2.0 raises TypeError on them otherwise.
df.std(numeric_only=True)

satisfaction_level          0.248623
last_evaluation          8164.407524
number_project              1.232733
average_monthly_hours      49.941815
time_spend_company          1.460053
Work_accident               0.351689
left                        0.426018
promotion_last_5years       0.144267
dtype: float64

In [12]:
# Variance of every numeric column.
# numeric_only=True keeps the string columns (`department`, `salary`) out of the
# reduction; pandas >= 2.0 raises TypeError on them otherwise.
df.var(numeric_only=True)

satisfaction_level       6.181359e-02
last_evaluation          6.665755e+07
number_project           1.519630e+00
average_monthly_hours    2.494185e+03
time_spend_company       2.131754e+00
Work_accident            1.236854e-01
left                     1.814911e-01
promotion_last_5years    2.081307e-02
dtype: float64

In [13]:
# Column-wise sum.
# NOTE(review): for the string columns (`department`, `salary`) this concatenates
# every value into one long string, as the output shows; pass numeric_only=True
# if only numeric totals are wanted.
df.sum()

satisfaction_level                                                 9192.59
last_evaluation                                                1.01074e+06
number_project                                                       57048
average_monthly_hours                                              3016028
time_spend_company                                                   52478
Work_accident                                                         2169
left                                                                  3574
promotion_last_5years                                                  319
department               salessalessalessalessalessalessalessalessaless...
salary                   lowmediummediumlowlowlowlowlowlowlowlowlowlowl...
dtype: object

In [14]:
# Skewness of every numeric column (positive = right-skewed, negative = left-skewed).
# numeric_only=True keeps the string columns (`department`, `salary`) out of the
# reduction; pandas >= 2.0 raises TypeError on them otherwise.
df.skew(numeric_only=True)

satisfaction_level        -0.476438
last_evaluation          122.482652
number_project             0.337774
average_monthly_hours      0.053225
time_spend_company         1.853530
Work_accident              2.021481
left                       1.229057
promotion_last_5years      6.637677
dtype: float64

In [15]:
# Kurtosis of every numeric column. pandas reports excess (Fisher) kurtosis,
# so a normal distribution scores 0 rather than 3.
# numeric_only=True keeps the string columns (`department`, `salary`) out of the
# reduction; pandas >= 2.0 raises TypeError on them otherwise.
df.kurt(numeric_only=True)

satisfaction_level          -0.670696
last_evaluation          15001.999987
number_project              -0.495810
average_monthly_hours       -1.135016
time_spend_company           4.774353
Work_accident                2.086664
left                        -0.489485
promotion_last_5years       42.064357
dtype: float64

## 正态分布

In [27]:
# Bring in SciPy's statistical distributions; `ss.norm` is the normal
# distribution object used in the cells below.
import scipy.stats as ss
ss.norm

<scipy.stats._continuous_distns.norm_gen at 0x182821d590>

In [28]:
# moments='mvsk' selects which moments are returned, in this order:
# m  mean
# v  variance
# s  skewness
# k  kurtosis (excess kurtosis: 0 for the normal distribution, as the output shows)
ss.norm.stats(moments='mvsk')

(array(0.), array(1.), array(0.), array(0.))

In [29]:
# pdf: probability density function; given an x value it returns the height of the curve there.
ss.norm.pdf(0.0)

0.3989422804014327

In [30]:
# ppf: inverse of cdf. The argument must lie between 0 and 1 and is a cumulative
# probability accumulated from negative infinity; the result is the x value at
# which that cumulative probability is reached.
ss.norm.ppf(0.9)

1.2815515655446004

In [31]:
# cdf: probability accumulated from negative infinity up to the given value.
ss.norm.cdf(2)

0.9772498680518208

In [32]:
# Probability of falling between -2 and +2 standard deviations of the mean.
ss.norm.cdf(2)-ss.norm.cdf(-2)

0.9544997361036416

In [33]:
# rvs: draw random numbers that follow the normal distribution; `size` is how many to draw.
# NOTE(review): no seed/random_state is visible here, so the numbers presumably
# change on every run — pass random_state=<int> if reproducible output is wanted.
ss.norm.rvs(size=10)

array([ 0.15718357,  0.21345018,  0.2416509 ,  0.39265044,  0.40924551,
       -0.84083195,  0.78625561,  0.24627867, -1.01507525,  0.56252693])

## 卡方分布

In [34]:
# Chi-squared distribution object; its methods take the degrees of freedom as the `df` shape argument.
ss.chi2

<scipy.stats._continuous_distns.chi2_gen at 0x1828230290>

## t分布

In [35]:
# Student's t distribution object; its methods take the degrees of freedom as the `df` shape argument.
ss.t

<scipy.stats._continuous_distns.t_gen at 0x18283086d0>

## f分布

In [36]:
# F distribution object; its methods take numerator and denominator degrees of freedom (`dfn`, `dfd`).
ss.f

<scipy.stats._continuous_distns.f_gen at 0x182824f110>

## 抽样

In [37]:
# Draw 10 rows at random (pandas samples without replacement by default).
# NOTE(review): no random_state is given, so the selected rows differ between runs.
df.sample(n = 10)

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_monthly_hours,time_spend_company,Work_accident,left,promotion_last_5years,department,salary
6586,0.15,0.74,6,178,3,0,0,0,sales,low
14951,0.39,0.54,2,154,3,0,1,0,marketing,low
3015,0.91,0.76,3,116,5,0,0,0,technical,low
7883,0.95,0.83,4,252,2,0,0,0,sales,high
4706,0.83,0.87,3,156,3,0,0,0,support,medium
7160,0.46,0.53,3,135,2,0,0,0,management,low
7005,0.82,0.66,4,150,3,0,0,0,technical,low
7010,0.91,0.85,5,214,2,0,0,0,IT,low
783,0.36,0.46,2,132,3,0,1,0,sales,low
3701,0.61,0.59,3,237,2,0,0,0,technical,high


In [38]:
# frac: sample a fraction of the rows instead of a fixed count (0.001 = 0.1 % of the rows).
# NOTE(review): no random_state is given, so the selected rows differ between runs.
df.sample(frac=0.001)

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_monthly_hours,time_spend_company,Work_accident,left,promotion_last_5years,department,salary
14764,0.79,1.0,5,257,6,0,1,0,sales,low
11304,0.57,0.9,5,145,3,0,0,0,accounting,low
10785,0.94,0.86,3,221,3,1,0,0,sales,medium
2120,0.67,0.51,5,182,3,0,0,0,management,low
10464,0.24,0.7,6,153,5,1,0,0,marketing,low
13490,0.22,0.91,6,222,8,0,0,0,technical,low
9822,0.92,0.66,4,133,3,0,0,0,support,medium
14035,0.75,0.79,4,263,3,0,0,1,marketing,medium
11022,0.96,0.89,3,142,4,0,0,0,sales,medium
3925,0.14,0.68,4,273,5,1,0,0,hr,medium
