In [1]:
import math
import statistics as st 
import numpy as np
import scipy.stats
import pandas as pd

# Mean

In [2]:
x = [8.0, 1, 2.5, 4, 28.0]
x_with_nan = [8.0, 1, 2.5, math.nan, 4, 28.0]
print(x)
print(x_with_nan)

[8.0, 1, 2.5, 4, 28.0]
[8.0, 1, 2.5, nan, 4, 28.0]


In [3]:
y, y_with_nan = np.array(x), np.array(x_with_nan)
z, z_with_nan = pd.Series(x), pd.Series(x_with_nan)
print(y)
print(y_with_nan)
print(z_with_nan)

[ 8.   1.   2.5  4.  28. ]
[ 8.   1.   2.5  nan  4.  28. ]
0     8.0
1     1.0
2     2.5
3     NaN
4     4.0
5    28.0
dtype: float64


In [4]:
mean = sum(x)/len(x)
mean

8.7

In [5]:
mean_=st.mean(x)
mean_

8.7

In [6]:
mean_ = np.mean(y)
mean_

8.7

In [7]:
#jika terdapat nilai nan maka hasilnya akan muncul nan
print(np.mean(y_with_nan))
print(y_with_nan.mean())

nan
nan


In [8]:
#mengabaikan nilai nan
np.nanmean(y_with_nan)

8.7

In [9]:
z.mean()

8.7

In [10]:
z_with_nan.mean()

8.7

## Weighted Mean
mencari bobot angka pada data

In [11]:
.2 * 2 + .5 * 4 + .3 * 8

4.8

In [12]:
x = [8.0, 1, 2.5, 4, 28.0]
w = [.1, .2, .3, .25, .15]
wmean1 = sum(w[i] * x[i] for i in range(len(x))) / sum(w)
wmean1

6.95

In [13]:
wmean2 = sum(w_ * x_ for w_, x_ in zip(w, x)) / sum(w)
wmean2

6.95

In [14]:
y, z, w = np.array(x), pd.Series(x), np.array(w)
wmean_arr = np.average(y, weights=w)
wmean_arr

6.95

In [15]:
wmean_ser = np.average(x, weights=w)
wmean_ser

6.95

In [16]:
(w * y).sum() / w.sum()

6.95

## Harmonic Mean

In [17]:
hmean = len(x) / sum(1/value for value in x)
hmean

2.7613412228796843

In [18]:
hmean = st.harmonic_mean(x)
hmean

2.7613412228796843

In [19]:
scipy.stats.hmean(y)

2.7613412228796843

In [20]:
scipy.stats.hmean(z)

2.7613412228796843

## Geometric Mean

In [21]:
gmean = 1 
for item in x:
    gmean *= item
    
gmean **= 1/len(x)
gmean

4.677885674856041

In [22]:
scipy.stats.gmean(y)

4.67788567485604

In [23]:
scipy.stats.gmean(z)

4.67788567485604

# Median

In [24]:
#ganjil
n = len(x)

if n%2 :
    median = sorted(x)[round(0.5*(n-1))]
    
median

4

In [25]:
#genap
n = len(x)

if n%2 :
    median = sorted(x)[round(0.5*(n-1))]
else :
    x_ord, index = sorted(x), round(0.5*n)
    median = 0.5 * (x_ord[index-1] + x_ord[index])
median

4

In [26]:
x

[8.0, 1, 2.5, 4, 28.0]

In [27]:
sorted(x)

[1, 2.5, 4, 8.0, 28.0]

In [28]:
#menghilangkan outlier
sorted(x[:-1])

[1, 2.5, 4, 8.0]

In [29]:
st.median_low(x[:-1])

2.5

In [30]:
st.median_high(x[:-1])

4

In [31]:
median = np.median(y)
median

4.0

In [32]:
y

array([ 8. ,  1. ,  2.5,  4. , 28. ])

In [33]:
median = np.median(y[:-1])
median

3.25

# Mode/Modus

In [34]:
u = [2, 3, 2, 8, 12]
v = [12, 15, 12, 15, 21, 15, 12]

In [35]:
mode = max((u.count(item), item) for item in set(u))[1]
mode

2

In [36]:
mode = st.mode(u)
mode

2

In [37]:
u_arr, v_arr = np.array(u), np.array(v)

In [38]:
mode = scipy.stats.mode(u)
mode

ModeResult(mode=2, count=2)

In [39]:
#untuk mengetahui banyak data yang keluar
mode = scipy.stats.mode(v)
mode

ModeResult(mode=12, count=3)

In [40]:
mode.mode

12

In [41]:
mode.count

3

In [42]:
u, v, w = pd.Series(u), pd.Series(v), pd.Series([2, 2, math.nan])

In [43]:
print(u.mode())
print(v.mode())
print(w.mode())

0    2
dtype: int64
0    12
1    15
dtype: int64
0    2.0
dtype: float64


# Measure of Variability
## Variance

In [44]:
n = len(x)
mean = sum(x) / n
var = sum((item - mean)**2 for item in x)/(n - 1)
var

123.19999999999999

In [45]:
var = st.variance(x)
var

123.2

In [46]:
#ddof untuk menyetel degrees of freedom ke 1 untuk memungkinakan kalkulasi yang tepat dari s^2
var = np.var(y, ddof=1)
var

123.19999999999999

In [47]:
y.var(ddof=1)

123.19999999999999

In [48]:
z.var(ddof=1)

123.19999999999999

In [49]:
z.var()

123.19999999999999

## Standard Deviation
akar dari variance

In [50]:
std = var ** .5
std

11.099549540409285

In [51]:
st.stdev(x)

11.099549540409287

In [52]:
#fungsi
np.std(y, ddof=1)

11.099549540409285

In [53]:
#method
y.std(ddof=1)

11.099549540409285

In [54]:
y_with_nan.std(ddof=1)

nan

In [55]:
#untuk mengabaikan nan
np.nanstd(y_with_nan, ddof=1)

11.099549540409285

In [56]:
z.std()

11.099549540409285

In [57]:
#untuk populasi
st.pstdev(x)

9.927738916792686

In [58]:
#numpy
print(np.std(y, ddof=0))
print(y.std(ddof=0))

9.927738916792684
9.927738916792684


In [59]:
z.std(ddof=0)

9.927738916792684

## Skewness

In [60]:
x = [8.0, 1, 2.5, 4, 28.0]
n = len(x)
mean = sum(x) / n
var = sum((item - mean)**2 for item in x) / (n-1)
std = var**0.5
skew = (sum((item - mean)**3 for item in x)* n/ ((n-1)*(n-2)* std**3))
skew

1.9470432273905929

In [61]:
scipy.stats.skew(y, bias=False)

1.9470432273905927

In [62]:
scipy.stats.skew(y_with_nan, bias=False)

nan

In [63]:
z.skew()

1.9470432273905924

In [64]:
print(scipy.stats.skew.__doc__)

    


Compute the sample skewness of a data set.

For normally distributed data, the skewness should be about zero. For
unimodal continuous distributions, a skewness value greater than zero means
that there is more weight in the right tail of the distribution. The
function `skewtest` can be used to determine if the skewness value
is close enough to zero, statistically speaking.

Parameters
----------
a : ndarray
    Input array.
axis : int or None, default: 0
    If an int, the axis of the input along which to compute the statistic.
    The statistic of each axis-slice (e.g. row) of the input will appear in a
    corresponding element of the output.
    If ``None``, the input will be raveled before computing the statistic.
bias : bool, optional
    If False, then the calculations are corrected for statistical bias.
nan_policy : {'propagate', 'omit', 'raise'}
    Defines how to handle input NaNs.
    
    - ``propagate``: if a NaN is present in the axis slice (e.g. row) along
      whi

In [65]:
z_with_nan.skew()

1.9470432273905924

## Percentile

In [66]:
x = [-5.0, -1.1, 0.1, 2.0, 8.0, 12.8, 21.0, 25.8, 41.0]

In [67]:
st.quantiles(x, n=2)

[8.0]

In [68]:
y = np.array(x)
np.percentile(y, 5)

-3.44

In [69]:
np.percentile(y, 95)

34.919999999999995

In [70]:
np.percentile(y, [25, 50, 75])

array([ 0.1,  8. , 21. ])

In [71]:
y_with_nan = np.insert(y, 2, np.nan)

In [72]:
y_with_nan

array([-5. , -1.1,  nan,  0.1,  2. ,  8. , 12.8, 21. , 25.8, 41. ])

In [73]:
np.nanpercentile(y_with_nan, [25, 50, 75])

array([ 0.1,  8. , 21. ])

In [74]:
z, z_with_nan = pd.Series(y), pd.Series(y_with_nan)
z.quantile(0.05)

-3.44

In [75]:
z.quantile([.25, .50, .75])

0.25     0.1
0.50     8.0
0.75    21.0
dtype: float64

In [76]:
z_with_nan.quantile([.25, .50, .75])

0.25     0.1
0.50     8.0
0.75    21.0
dtype: float64

## Ranges

In [77]:
np.ptp(y)

46.0

In [78]:
np.ptp(z)

46.0

In [79]:
np.ptp(y_with_nan)

nan

In [80]:
np.ptp(z_with_nan)

nan

In [81]:
np.amax(y) - np.amin(y)

46.0

In [83]:
np.nanmax(y_with_nan) - np.nanmin(y_with_nan)

46.0

In [84]:
y.max() - y.min()

46.0

In [87]:
z_with_nan.max() - z_with_nan.min

TypeError: unsupported operand type(s) for -: 'float' and 'method'