# Interpretations

In [2]:
import math
import statistics
import numpy as np
import scipy.stats
import pandas as pd

In [3]:
# Sample data reused by the following cells: one clean list and one that
# also contains a nan, to compare how each approach copes with it.
x = [8.0, 1, 2.5, 4.0, 28.0]
x_with_nan = [8.0, 1, 2.5, math.nan, 4, 28.0]

print(x, x_with_nan, sep="\n")

[8.0, 1, 2.5, 4.0, 28.0]
[8.0, 1, 2.5, nan, 4, 28.0]


In [4]:
y, y_with_nan = np.array(x), np.array(x_with_nan)
z, z_with_nan = pd.Series(x), pd.Series(x_with_nan)

print(y)
print(y_with_nan)

[ 8.   1.   2.5  4.  28. ]
[ 8.   1.   2.5  nan  4.  28. ]


In [5]:
print(z)
print(z_with_nan)

0     8.0
1     1.0
2     2.5
3     4.0
4    28.0
dtype: float64
0     8.0
1     1.0
2     2.5
3     NaN
4     4.0
5    28.0
dtype: float64


# Mean

In [6]:
x

[8.0, 1, 2.5, 4.0, 28.0]

In [9]:
sum(x)

43.5

In [10]:
len(x)

5

In [11]:
mean_ = sum(x) / len(x)
mean_

8.7

In [13]:
mean_ = statistics.mean(x)
mean_

8.7

In [14]:
sum(x_with_nan)

nan

The result is `nan` because the list contains a `nan` (null / missing) value, and any sum that includes `nan` is itself `nan`.

In [16]:
np.mean(y_with_nan)

nan

In [18]:
y_with_nan.mean()

nan

In [19]:
np.nanmean(y_with_nan)

8.7

In [20]:
z

0     8.0
1     1.0
2     2.5
3     4.0
4    28.0
dtype: float64

In [21]:
z.mean()

8.7

In [22]:
z_with_nan

0     8.0
1     1.0
2     2.5
3     NaN
4     4.0
5    28.0
dtype: float64

In [23]:
z_with_nan.mean()

8.7

# Weighted Mean

In [25]:
var = [2, 2, 4, 4, 4, 4, 4, 8, 8, 8]
len(var)

10

In [26]:
(0.2 * 2) + (0.5 * 4) + (0.3 * 8)

4.8

In [31]:
x = [8.0, 1, 2.5, 4, 28.0]
w = [0.1, 0.2, 0.3, 0.25, 0.15]


In [32]:
sum(w)

1.0

In [33]:
wmean = sum(w[i] * x[i] for i in range(len(x))) / sum(w)
wmean


6.95

In [35]:
wmean = sum(x_ * w_ for (x_, w_) in zip(x, w)) / sum(w)
print(wmean)

6.95


In [37]:
y, z, w = np.array(x), pd.Series(x), np.array(w)

wmean = np.average(y, weights=w)
wmean

6.95

# Harmonic Mean

In [38]:
(60 + 20) / 2

40.0

In [39]:
(2+4+6+100) / 4

28.0

In [41]:
4 / (1/2 + 1/4 + 1/6 + 1/100) 

4.316546762589928

In [42]:
statistics.harmonic_mean(x)

2.7613412228796843

In [43]:
np.mean(x)

8.7

In [44]:
[8.0, 1, 2.5, 4, 28.0]

[8.0, 1, 2.5, 4, 28.0]

In [45]:
scipy.stats.hmean(y)

2.7613412228796843

In [46]:
scipy.stats.hmean(z)

2.7613412228796843

# Geometric Mean

In [47]:
2, 18

(2, 18)

In [48]:
2 * 18

36

In [49]:
# Step 1 of the geometric mean: the product of all observations.
# The n-th root (n = len(x)) is applied in the next cell.
gmean = math.prod(x)
gmean

2240.0

In [50]:
gmean **= 1 / len (x)
gmean

4.677885674856041

In [51]:
scipy.stats.gmean(y)

4.67788567485604

In [52]:
scipy.stats.gmean(z)

4.67788567485604

# Median

In [59]:
# Median by hand: sort the data, then take the middle element (odd n)
# or the average of the two central elements (even n).
n = len(x)

if n % 2:
    # Odd length: 0.5 * (n - 1) is a whole number, the index of the middle item.
    median_ = sorted(x)[round(0.5 * (n - 1))]
else:
    # Even length: average the two values on either side of the midpoint.
    x_ord, index = sorted(x), round(0.5 * n)
    median_ = 0.5 * (x_ord[index - 1] + x_ord[index])

In [61]:
# Manual median of x: the single middle value when the count is odd,
# otherwise the mean of the two central values of the sorted data.
n = len(x)
if n % 2 == 0:
    # Even count: n // 2 is the upper of the two central positions.
    x_ord, index = sorted(x), n // 2
    median_ = 0.5 * (x_ord[index - 1] + x_ord[index])
else:
    # Odd count: n // 2 is exactly the middle position.
    median_ = sorted(x)[n // 2]


In [57]:
n%2

1

In [62]:
median_

4

In [72]:
sorted(x[:-1])

[1, 2.5, 4, 8.0]

In [63]:
x

[8.0, 1, 2.5, 4, 28.0]

In [64]:
statistics.median_low(x[:-1])

2.5

In [65]:
statistics.median_high(x[:-1])

4

In [68]:
# NOTE: statistics.median does not skip nan. Comparisons with nan are always
# False, so sorting data that contains nan leaves it mis-ordered and the value
# returned here is not the median of the valid numbers (which is 4).
# Filter out nan values first if a meaningful median is needed.
statistics.median(x_with_nan)

6.0

In [69]:
np.median(y)

4.0

In [70]:
np.median(y[:-1])

3.25

# Mode

In [84]:
u = [2, 3, 2, 8, 12]
v = [12, 15, 12, 15, 21, 15, 12]


In [85]:
max((u.count(item), item) for item in set(u))[1]

2

In [76]:
max((u.count(item), item) for item in set(u))[1]

2

In [77]:
statistics.mode(u)

2

In [78]:
u, v = np.array(u), np.array(v)
scipy.stats.mode(u)


ModeResult(mode=array([2]), count=array([2]))

In [79]:
scipy.stats.mode(v)

ModeResult(mode=array([12]), count=array([3]))

In [86]:
u, v, w = pd.Series(u), pd.Series(v), pd.Series(w)

In [87]:
u, v, w = pd.Series(u), pd.Series(v), pd.Series([2, 2, math.nan])

In [88]:
u.mode()

0    2
dtype: int64

In [89]:
v.mode()

0    12
1    15
dtype: int64

In [90]:
w.mode()

0    2.0
dtype: float64

# Measure of Variability

In [92]:
n = len(x)

sum(x) / n

8.7

In [99]:
var_ = sum((item - mean_)**2 for item in x) / (n-1)

In [95]:
statistics.variance(x)

123.2

In [96]:
np.var(y, ddof=1)

123.19999999999999

In [97]:
z

0     8.0
1     1.0
2     2.5
3     4.0
4    28.0
dtype: float64

In [98]:
z.var(ddof=0)

98.55999999999999

In [None]:
1,2,3,4,5,6,7,8,9,10

In [101]:
var_ ** 0.5

11.099549540409285

# Standard Deviation

In [123]:
std_ = var_ ** 0.5
std_

11.099549540409285

In [102]:
statistics.stdev(x)

11.099549540409287

In [103]:
np.std(y, ddof=1)

11.099549540409285

In [104]:
z.std(ddof=1)

11.099549540409285

# Skewness

In [105]:
x

[8.0, 1, 2.5, 4, 28.0]

In [117]:
# Adjusted sample skewness computed by hand:
#   n * sum((x_i - mean)^3) / ((n - 1) * (n - 2) * s^3)
# where s is the sample standard deviation (n - 1 in the denominator).
mean_ = sum(x) / n
var_ = sum((value - mean_) ** 2 for value in x) / (n - 1)
std_ = var_ ** 0.5

skew_ = (
    sum((value - mean_) ** 3 for value in x) * n
    / ((n - 1) * (n - 2) * std_ ** 3)
)
skew_

1.9470432273905929

In [110]:
mean_ = sum(x) / n
var_ = sum((item - mean_)**2 for item in x) / (n-1)
std_ = var_ ** 0.5

skew_ = (sum((item - mean_)**3 for item in x) * n / ((n - 1) * (n - 2) * std_**3))
skew_

1.9470432273905929

In [111]:
scipy.stats.skew(y, bias=False)

1.9470432273905927

In [112]:
scipy.stats.skew(y_with_nan, bias=False)

nan

In [114]:
z.skew()

1.9470432273905924

In [118]:
z, z_with_nan = pd.Series(x), pd.Series(x_with_nan)

z.skew()

1.9470432273905924

In [120]:
z_with_nan.skew()

1.9470432273905924

# Percentiles

In [129]:
x = [-5.0, -1.1, 0.1, 2.0, 8.0, 12.8, 21.0, 25.8, 41.0]

In [130]:
x

[-5.0, -1.1, 0.1, 2.0, 8.0, 12.8, 21.0, 25.8, 41.0]

In [131]:
statistics.quantiles(x, n=2)

[8.0]

In [132]:
!python --version

Python 3.9.12


In [134]:
y = np.array(x)
np.percentile(y, 5)

-3.44

In [135]:
np.percentile(y, 95)

34.919999999999995

In [136]:
np.percentile(y, 50)

8.0

In [137]:
np.percentile(y_with_nan, 50)

nan

In [138]:
np.nanpercentile(y_with_nan, 50)

4.0

In [139]:
np.quantile(y, 0.05)

-3.44

In [140]:
np.quantile(y, 0.95)

34.919999999999995

# Range

In [141]:
np.ptp(y)

46.0

In [142]:
sorted(y)

[-5.0, -1.1, 0.1, 2.0, 8.0, 12.8, 21.0, 25.8, 41.0]

In [143]:
41 - (-5)

46

In [144]:
np.amax(y)

41.0

In [145]:
np.amax(x)

41.0

In [147]:
np.amax(y) - np.amin(y)

46.0

In [148]:
np.nanmax(y_with_nan) - np.nanmin(y_with_nan)

27.0

In [146]:
z_with_nan.max() - z_with_nan.min()

27.0

# Summary of Descriptive Statistics

In [150]:
result = scipy.stats.describe(y, ddof=1, bias=False)
result

DescribeResult(nobs=9, minmax=(-5.0, 41.0), mean=11.622222222222222, variance=228.75194444444446, skewness=0.9249043136685094, kurtosis=0.14770623629658886)

In [151]:
result.nobs

9

In [153]:
result.minmax[0]  #min

-5.0

In [154]:
result.minmax[1] #max

41.0

In [155]:
result.mean

11.622222222222222

In [156]:
result.skewness

0.9249043136685094

In [158]:
result = z.describe()
result

count     5.00000
mean      8.70000
std      11.09955
min       1.00000
25%       2.50000
50%       4.00000
75%       8.00000
max      28.00000
dtype: float64

In [159]:
result['25%']

2.5

In [160]:
result['mean']

8.7

# Correlation

In [161]:
x = list(range(-10, 11))
y = [0, 2, 2, 2, 2, 3, 3, 6, 7, 4, 7, 6, 6, 9, 4, 5, 5, 10, 11, 12, 14]

In [162]:
x_, y_ = np.array(x), np.array(y)
x__, y__ = pd.Series(x_), pd.Series(y_)

In [163]:
# Sample covariance by hand: sum of the products of paired deviations
# from the respective means, divided by n - 1.
n = len(x)
mean_x = sum(x) / n
mean_y = sum(y) / n

cov_xy = sum((xi - mean_x) * (yi - mean_y) for xi, yi in zip(x, y)) / (n - 1)
cov_xy

19.95

In [164]:
np.cov(x_, y_)

array([[38.5       , 19.95      ],
       [19.95      , 13.91428571]])

In [165]:
x_.var(ddof=1)

38.5

In [166]:
y_.var(ddof=1)

13.914285714285711

In [168]:
np.cov(x_, y_)[1, 0]

19.95

In [169]:
np.cov(x_, y_)[0, 1]

19.95

# Correlation Coefficient

In [170]:
# Pearson correlation coefficient by hand:
#   r = cov(x, y) / (s_x * s_y), with sample (n - 1) variances.
var_x = sum((xi - mean_x) ** 2 for xi in x) / (n - 1)
var_y = sum((yi - mean_y) ** 2 for yi in y) / (n - 1)

std_x = var_x ** 0.5
std_y = var_y ** 0.5

r = cov_xy / (std_x * std_y)
r

0.861950005631606

In [171]:
scipy.stats.pearsonr(x_, y_)

(0.8619500056316058, 5.122760847201207e-07)

In [173]:
np.corrcoef(x_, y_)

array([[1.        , 0.86195001],
       [0.86195001, 1.        ]])

In [174]:
np.corrcoef(x_, y_)[0,1]

0.8619500056316061

In [177]:
scipy.stats.linregress(x_ , y_).rvalue

0.861950005631606

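`linregress` fits the straight line y = mx + c; its `rvalue` attribute is the Pearson correlation coefficient r between x and y.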

In [178]:
x__.corr(y__)

0.8619500056316061

In [179]:
y__.corr(x__)

0.861950005631606

# 2D Data

In [181]:
# 5 x 3 integer array used by the following cells for whole-array and
# per-axis (axis=0 / axis=1) statistics.
a = np.array(
    [
        (1, 1, 1),
        (2, 3, 1),
        (4, 9, 2),
        (8, 27, 4),
        (16, 1, 1),
    ]
)



In [182]:
np.mean(a)

5.4

In [183]:
a.mean()

5.4

In [184]:
np.median(a)

2.0

In [185]:
np.median(a, axis=0)

array([4., 3., 1.])

In [186]:
np.median(a, axis=1)

array([1., 2., 4., 8., 1.])

In [187]:
scipy.stats.gmean(a)

array([4.        , 3.73719282, 1.51571657])

In [188]:
scipy.stats.gmean(a, axis=1)

array([1.        , 1.81712059, 4.16016765, 9.52440631, 2.5198421 ])

In [189]:
scipy.stats.gmean(a, axis=None)

2.829705017016332

In [190]:
scipy.stats.describe(a, axis=1, ddof=1, bias=False)

DescribeResult(nobs=3, minmax=(array([1, 1, 2, 4, 1]), array([ 1,  3,  9, 27, 16])), mean=array([ 1.,  2.,  5., 13.,  6.]), variance=array([  0.,   1.,  13., 151.,  75.]), skewness=array([0.        , 0.        , 1.15206964, 1.52787436, 1.73205081]), kurtosis=array([-3. , -1.5, -1.5, -1.5, -1.5]))

# Dataframe

In [192]:
# Wrap the 2D array `a` in a labelled DataFrame: one label per row and
# one label per column.
row_names = ['first', 'second', 'third', 'fourth', 'fifth']
col_names = ['A', 'B', 'C']

# The keyword for column labels is `columns` (not `colums`).
df = pd.DataFrame(a, index=row_names, columns=col_names)
df

In [193]:
row_names = ['first', 'second', 'third', 'fourth', 'fifth']
col_names = ['A', 'B', 'C']

df = pd.DataFrame(a, index=row_names, columns=col_names)
df

Unnamed: 0,A,B,C
first,1,1,1
second,2,3,1
third,4,9,2
fourth,8,27,4
fifth,16,1,1


In [194]:
df.mean()

A    6.2
B    8.2
C    1.8
dtype: float64

In [195]:
df.var()

A     37.2
B    121.2
C      1.7
dtype: float64

In [196]:
df.mean(axis=1)

first      1.0
second     2.0
third      5.0
fourth    13.0
fifth      6.0
dtype: float64

In [197]:
df.var(axis=1)

first       0.0
second      1.0
third      13.0
fourth    151.0
fifth      75.0
dtype: float64

In [199]:
df["A"].mean()

6.2

In [200]:
df["A"].var()

37.20000000000001

In [201]:
df.values

array([[ 1,  1,  1],
       [ 2,  3,  1],
       [ 4,  9,  2],
       [ 8, 27,  4],
       [16,  1,  1]])

In [202]:
df.to_numpy()

array([[ 1,  1,  1],
       [ 2,  3,  1],
       [ 4,  9,  2],
       [ 8, 27,  4],
       [16,  1,  1]])

In [203]:
df.describe()

Unnamed: 0,A,B,C
count,5.0,5.0,5.0
mean,6.2,8.2,1.8
std,6.09918,11.009087,1.30384
min,1.0,1.0,1.0
25%,2.0,1.0,1.0
50%,4.0,3.0,1.0
75%,8.0,9.0,2.0
max,16.0,27.0,4.0
