# Chapter 5 - Getting started with pandas | Summarizing and Computing Descriptive Statistics

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

def format_pecahan(x):
    """Return ``x`` rendered as a fixed-point string with two decimal places."""
    two_decimals = '.2f'
    return format(x, two_decimals)

In [2]:
# Small demo frame with missing values: row 'c' is entirely NaN and
# column 'Dua' has no value for row 'a'.
frame = pd.DataFrame(
    {
        'Satu': [1.4, 7.1, np.nan, 0.75],
        'Dua': [np.nan, -4.5, np.nan, -1.3],
    },
    index=list('abcd'),
)
frame

Unnamed: 0,Satu,Dua
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [3]:
# sum() defaults to axis=0 ('index'): values are added down the rows, giving one total per column
frame.sum()

Satu    9.25
Dua    -5.80
dtype: float64

In [4]:
# sum(axis=1) adds across the columns, giving one total per row
frame.sum(axis=1)

a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64

When an entire row or column contains all NA values, the sum is 0, whereas if any
value is not NA, the NA values are skipped and the remaining values are summed. This
can be disabled with the skipna option, in which case any NA value in a row or column
makes the corresponding result NA:

In [5]:
frame.sum(axis=0, skipna=False)

Satu   NaN
Dua    NaN
dtype: float64

In [6]:
frame.sum(axis=1, skipna=False)

a     NaN
b    2.60
c     NaN
d   -0.55
dtype: float64

Some aggregations, like mean, require at least one non-NA value to yield a valid
result, so here we have:

In [7]:
frame.mean()

Satu    3.083333
Dua    -2.900000
dtype: float64

In [8]:
frame.mean(axis=1)

a    1.400
b    1.300
c      NaN
d   -0.275
dtype: float64

Some methods, like `idxmin` and `idxmax`, return indirect statistics, like the index value
where the minimum or maximum values are attained:

In [9]:
frame.idxmin()

Satu    d
Dua     b
dtype: object

In [10]:
frame.idxmax()

Satu    b
Dua     d
dtype: object

Other methods, like `cumsum`, are accumulations:

In [11]:
frame.cumsum()

Unnamed: 0,Satu,Dua
a,1.4,
b,8.5,-4.5
c,,
d,9.25,-5.8


Step by step computation of `cumsum`; contoh di bawah ini mengambil dari kolom **Satu** (nilai NaN dilewati, sehingga akumulasi berlanjut dari nilai valid terakhir)

| Satu | Accumulation | cumsum |
| :--: | :----------: | :----: |
| 1.4 | 1.4 | *1.4* |
| 7.10 | *1.4* + 7.10 | *8.50* |
| NaN | *8.50* + NaN | NaN |
| 0.75 | *8.50* + 0.75 | *9.25* |


Some methods are neither reductions nor accumulations. `describe` is one such
example, producing multiple summary statistics in one shot:

In [12]:
# Format every cell of the summary as a 2-decimal string. Column-wise Series.map is used
# because DataFrame.applymap is deprecated in recent pandas releases; the result is the same.
frame.describe().apply(lambda column: column.map(format_pecahan))

Unnamed: 0,Satu,Dua
count,3.0,2.0
mean,3.08,-2.9
std,3.49,2.26
min,0.75,-4.5
25%,1.07,-3.7
50%,1.4,-2.9
75%,4.25,-2.1
max,7.1,-1.3


On nonnumeric data, `describe` produces alternative summary statistics:

In [13]:
frame_non_numeric = pd.DataFrame(['a','b','c','d']*4)
frame_non_numeric.describe()

Unnamed: 0,0
count,16
unique,4
top,a
freq,4


## Correlation and Covariance

Some summary statistics, like correlation and covariance, are computed from pairs
of arguments

In [12]:
# Uncomment for list of example datasets
# !ls source/pydata-book/examples/

In [9]:
# NOTE(review): read_pickle can execute arbitrary code while unpickling - only load this file from a trusted source.
price = pd.read_pickle('source/pydata-book/examples/yahoo_price.pkl')

In [10]:
price.tail()

Unnamed: 0_level_0,AAPL,GOOG,IBM,MSFT
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2016-10-17,117.550003,779.960022,154.770004,57.220001
2016-10-18,117.470001,795.26001,150.720001,57.66
2016-10-19,117.120003,801.5,151.259995,57.529999
2016-10-20,117.059998,796.969971,151.520004,57.25
2016-10-21,116.599998,799.369995,149.630005,59.66


In [13]:
# NOTE(review): read_pickle can execute arbitrary code while unpickling - only load this file from a trusted source.
volume = pd.read_pickle('source/pydata-book/examples/yahoo_volume.pkl')

In [14]:
volume.tail()

Unnamed: 0_level_0,AAPL,GOOG,IBM,MSFT
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2016-10-17,23624900,1089500,5890400,23830000
2016-10-18,24553500,1995600,12770600,19149500
2016-10-19,20034600,116600,4632900,22878400
2016-10-20,24125800,1734200,4023100,49455600
2016-10-21,22384800,1260500,4401900,79974200


Rumus untuk fungsi pct_change -> `(Current - Previous) / Previous` (hasilnya berupa pecahan; kalikan dengan 100 untuk mendapatkan persen)

Di bawah ini saya menghitung persentase perubahan harga dari waktu ke waktu

In [15]:
returns = price.pct_change()
returns.tail()

Unnamed: 0_level_0,AAPL,GOOG,IBM,MSFT
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2016-10-17,-0.00068,0.001837,0.002072,-0.003483
2016-10-18,-0.000681,0.019616,-0.026168,0.00769
2016-10-19,-0.002979,0.007846,0.003583,-0.002255
2016-10-20,-0.000512,-0.005652,0.001719,-0.004867
2016-10-21,-0.00393,0.003011,-0.012474,0.042096


Menghitung *correlation* dan *covariance* antara perubahan harga dari **Microsoft** dan **IBM**

In [16]:
returns.MSFT.corr(returns.IBM, method='pearson')

0.4997636114415114

In [17]:
returns.MSFT.cov(returns.IBM)

8.870655479703546e-05

### Penjelasan Fungsi Covariance dan Correlation

In [22]:
datasets = pd.DataFrame({'a':[1,25,8,31,-2],'b':[2,64,21,-2,-10]})
datasets

Unnamed: 0,a,b
0,1,2
1,25,64
2,8,21
3,31,-2
4,-2,-10


#### Correlation menggunakan fungsi Pearson

In [23]:
def pearson_correlation_coefficient(x):
    """Compute the Pearson correlation of columns ``a`` and ``b``, printing each step.

    Uses the computational formula

        r = (n*Σab - Σa*Σb) / sqrt((n*Σa^2 - (Σa)^2) * (n*Σb^2 - (Σb)^2))

    where ``n`` is the number of rows in ``x``.

    Parameters
    ----------
    x : pandas.DataFrame
        Frame with numeric columns ``'a'`` and ``'b'``. The frame is not
        modified; the helper columns are added to a copy.

    Returns
    -------
    The correlation coefficient computed by the formula above.
    """
    count = len(x)

    # Work on a copy so the caller's frame does not gain the helper columns.
    table = x.copy()
    table['a*b'] = table['a'].mul(table['b'])
    table['a^2'] = table['a'].pow(2)
    table['b^2'] = table['b'].pow(2)
    data = table.sum()

    print("Diketahui : ")
    print(table,'________________',sep='\r\n')
    print("")
    print ("Summary : ")
    print(data)
    print ("")
    print("Kalkulasi : ")
    # Every "n * ..." term uses the actual row count rather than a fixed
    # sample size, so the result is correct for frames of any length.
    a = count * data["a*b"]
    b = data["a"]*data["b"]
    c = (count * data['a^2'])
    d = (data['a']**2)
    e = (count * data['b^2'])
    f = (data['b']**2)
    result = (a-b)/(np.sqrt((c-d)*(e-f)))
    print (f'n(Σa.b) \t= {a}')
    print (f'(Σa).(Σb) \t= {b}')
    print (f"(n.Σa^2) \t= {c}")
    print (f"(Σa)^2 \t\t= {d}")
    print (f"(n.Σb^2) \t= {e}")
    print (f"(Σb)^2 \t\t= {f}")

    print("-"*30)
    print(f'n(Σa.b) - (Σa).(Σb)/')
    print('  ______________________________________')
    print(f'√ ((n.Σa^2)-(Σa)^2) * ((n.Σb^2)-(Σb)^2)')

    print("-"*30)

    print('Result : ')
    print(f"{result}")

    return result

In [24]:
# Run the step-by-step Pearson calculation on the sample data
pearson_correlation_coefficient(datasets)

Diketahui : 
    a   b   a*b  a^2   b^2
0   1   2     2    1     4
1  25  64  1600  625  4096
2   8  21   168   64   441
3  31  -2   -62  961     4
4  -2 -10    20    4   100
________________

Summary : 
a        63
b        75
a*b    1728
a^2    1655
b^2    4645
dtype: int64

Kalkulasi : 
n(Σa.b) 	= 8640
(Σa).(Σb) 	= 4725
(n.Σa^2) 	= 8275
(Σa)^2 		= 3969
(n.Σb^2) 	= 23225
(Σb)^2 		= 5625
------------------------------
n(Σa.b) - (Σa).(Σb)/
  ______________________________________
√ ((n.Σa^2)-(Σa)^2) * ((n.Σb^2)-(Σb)^2)
------------------------------
Result : 
0.4497160834281532


0.4497160834281532

In [25]:
datasets.a.corr(datasets.b)

0.4497160834281533

#### Covariance

In [32]:
datasets = pd.DataFrame({'a':[1,25,8,31,-2],'b':[2,64,21,-2,-10]})
datasets

Unnamed: 0,a,b
0,1,2
1,25,64
2,8,21
3,31,-2
4,-2,-10


In [54]:
def display_table_of_work_for_convariance(x):
    """Build the worksheet used to compute the covariance of columns ``a`` and ``b``.

    Parameters
    ----------
    x : pandas.DataFrame
        Frame with numeric columns ``'a'`` and ``'b'``. The frame is not
        modified; the worksheet columns are added to a copy.

    Returns
    -------
    pandas.DataFrame
        A copy of ``x`` with three extra columns: ``'ai-a.mean'`` and
        ``'bi-b.mean'`` (each value minus its column mean) and ``'var'``
        (the product of those two deviations).
    """
    # Copy first so the caller's frame keeps its original columns.
    table = x.copy()
    table['ai-a.mean'] = table['a'] - table['a'].mean()
    table['bi-b.mean'] = table['b'] - table['b'].mean()
    # Product of the paired deviations for each row.
    table['var'] = table['ai-a.mean']*table['bi-b.mean']
    return table

In [60]:
ds = display_table_of_work_for_convariance(datasets)
ds

Unnamed: 0,a,b,ai-a.mean,bi-b.mean,var
0,1,2,-11.6,-13.0,150.8
1,25,64,12.4,49.0,607.6
2,8,21,-4.6,6.0,-27.6
3,31,-2,18.4,-17.0,-312.8
4,-2,-10,-14.6,-25.0,365.0


Hasil di atas memiliki summary di bawah ini, yang berfungsi untuk menghitung covariance

In [57]:
ds.sum()

a             63.0
b             75.0
ai-a.mean      0.0
bi-b.mean      0.0
var          783.0
dtype: float64

In [71]:
# Sample covariance: sum of the deviation products divided by (n - 1)
ds['var'].sum() / (len(ds)-1)

195.75

In [73]:
# Cross-check against pandas' built-in Series.cov
datasets.a.cov(datasets.b)

195.75

Hasil dari covariance positif, sehingga dapat dikatakan hubungan antara variabel `a` dan variabel `b` memiliki relasi yang positif, artinya kenaikan pada variabel `a` cenderung diikuti kenaikan pada variabel `b`

## Unique Values, Values Counts and Memberships

Another class of related methods extracts information about the values contained in a
one-dimensional Series. To illustrate these, consider this example:

### Unique dan Sort

In [84]:
series = pd.Series(list('bddmaeneasf'))
series

0     b
1     d
2     d
3     m
4     a
5     e
6     n
7     e
8     a
9     s
10    f
dtype: object

Untuk mengambil nilai-nilai unik, Anda dapat menggunakan fungsi `unique`, yang mengembalikan array NumPy baru (bukan Series)

In [82]:
unique = series.unique()
unique

array(['a', 'b', 'd', 'm', 'e', 'n', 's', 'f'], dtype=object)

Hasil dari fungsi `unique` tidak selalu terurut. Gunakan method `sort` pada array tersebut untuk mengurutkannya (pengurutan dilakukan langsung pada array itu sendiri / in place)

In [86]:
unique.sort()
unique

array(['a', 'b', 'd', 'e', 'f', 'm', 'n', 's'], dtype=object)

### Fungsi `value_counts`
Untuk menghitung jumlah data pada suatu objek

In [87]:
series.value_counts()

d    2
a    2
e    2
b    1
m    1
n    1
s    1
f    1
dtype: int64

Method `value_counts` juga dapat diakses melalui top level pandas method

Hasil dikembalikan dalam urutan descending, namun Anda dapat menonaktifkannya dengan argumen *sort* yang diisi dengan nilai boolean `False`

In [90]:
# NOTE(review): the top-level pd.value_counts is deprecated in recent pandas releases;
# series.value_counts(sort=False) is the equivalent method call.
pd.value_counts(series, sort=False)

b    1
d    2
m    1
a    2
e    2
n    1
s    1
f    1
dtype: int64

### `isin` method
Digunakan untuk mengecek apakah setiap nilai pada sebuah objek termasuk anggota dari sekumpulan nilai (misalnya sebuah list)

In [103]:
series.isin(['a','b'])

0      True
1     False
2     False
3     False
4      True
5     False
6     False
7     False
8      True
9     False
10    False
dtype: bool

In [105]:
# Use the boolean mask returned by isin to keep only the matching values
series[series.isin(['a','b'])]

0    b
4    a
8    a
dtype: object