# Chapter 5 - Getting started with panda | Introduction to pandas data structure

In [1]:
import numpy as np
import pandas as pd

## Series

In [2]:
obj = pd.Series([1,2,4,5])
obj

0    1
1    2
2    4
3    5
dtype: int64

In [3]:
# Array dari panda data structure
obj.array

<PandasArray>
[1, 2, 4, 5]
Length: 4, dtype: int64

In [4]:
# Mengambil index dari seri via array
obj.index

RangeIndex(start=0, stop=4, step=1)

In [5]:
# Membuat seris dengan index costume
obj = pd.Series([1,2,4,5], index=["a", "b", "c", "d"])
obj

a    1
b    2
c    4
d    5
dtype: int64

In [6]:
# Mengambil nilai dari series menggunakan index(label)
obj["d"]

5

In [7]:
# List of label digunakan untuk mengambil list of element
obj[["a","c","b","d"]]

a    1
c    4
b    2
d    5
dtype: int64

> Numpy operasi dan fungsi di pandas

In [8]:
rng = np.random.default_rng(seed=42)
np_array = rng.standard_normal(10)

# 
series = pd.Series(np_array)
series

0    0.304717
1   -1.039984
2    0.750451
3    0.940565
4   -1.951035
5   -1.302180
6    0.127840
7   -0.316243
8   -0.016801
9   -0.853044
dtype: float64

In [9]:
# Filtering using boolean
series[series>0]

0    0.304717
2    0.750451
3    0.940565
6    0.127840
dtype: float64

In [10]:
# Multiplie scalar dengan series
4 * series

0    1.218868
1   -4.159936
2    3.001805
3    3.762259
4   -7.804141
5   -5.208718
6    0.511362
7   -1.264970
8   -0.067205
9   -3.412176
dtype: float64

In [11]:
# Fungsi matematika (exponen) numpy diterapkan pada Pandas series
np.exp(series), type(np.exp(series))

(0    1.356241
 1    0.353460
 2    2.117955
 3    2.561427
 4    0.142127
 5    0.271938
 6    1.136372
 7    0.728883
 8    0.983339
 9    0.426116
 dtype: float64,
 pandas.core.series.Series)

In [12]:
"a" in series

False

**Padas Series and Python's Ditionary**

In [13]:
# Dict berisikan incoming/year
sdata = {"Ohio": 35000, "Texas": 71000, "Oregon": 16000, "Utah": 5000}

# Membuat series dari dictionary
series_inc = pd.Series(sdata)
series_inc

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64

In [14]:
# Operasi in pada dict dapat diterapkan
"Texas" in series_inc, "Toronto" in series_inc

(True, False)

In [15]:
# Konversi series kembali ke dictionary python
konversi_dict_inc = series_inc.to_dict()
type(konversi_dict_inc)

dict

In [16]:
sdata = {"Ohio": 35000, "Texas": 71000, "Oregon": 16000, "Utah": 5000}
city = ["Oregon","Jakarta","Ohio","Texas"]
costume_series_inc = pd.Series(sdata,index=city)
costume_series_inc

Oregon     16000.0
Jakarta        NaN
Ohio       35000.0
Texas      71000.0
dtype: float64

In [17]:
# Check missing value
costume_series_inc[pd.isna(costume_series_inc)]

Jakarta   NaN
dtype: float64

In [18]:
# Check eksis value
costume_series_inc[pd.notna(costume_series_inc)]

Oregon    16000.0
Ohio      35000.0
Texas     71000.0
dtype: float64

In [19]:
# Fungsi isna dan notna juga bisa diakses menggunakan method pada instance
costume_series_inc.isna()

Oregon     False
Jakarta     True
Ohio       False
Texas      False
dtype: bool

**Data alignment**

Pandas akan mensejajarkan secara otomatis setiap index ketika dua series dikenakan operator aritmatika

In [20]:
series_inc

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64

In [21]:
costume_series_inc

Oregon     16000.0
Jakarta        NaN
Ohio       35000.0
Texas      71000.0
dtype: float64

In [22]:
# Substraction of tow serries
series_inc + costume_series_inc

Jakarta         NaN
Ohio        70000.0
Oregon      32000.0
Texas      142000.0
Utah            NaN
dtype: float64

In [23]:
# nama dari index dan objek series itu sendiri
print(costume_series_inc.name)
print(costume_series_inc.index.name)

None
None


In [24]:
# Menyetela nama dari series dan index series
costume_series_inc.name = 'Pendapatan pertahun'
costume_series_inc.index.name = 'Nama kota'

costume_series_inc

Nama kota
Oregon     16000.0
Jakarta        NaN
Ohio       35000.0
Texas      71000.0
Name: Pendapatan pertahun, dtype: float64

**Replace index by assignment**

In [25]:
city_new = ["Bandung", "Jakarta", "Karawang", "Bogor"]

costume_series_inc.index = city_new
costume_series_inc

Bandung     16000.0
Jakarta         NaN
Karawang    35000.0
Bogor       71000.0
Name: Pendapatan pertahun, dtype: float64

## Data Frame

In [26]:
# Membuat df menggunakan dict python
data = {'jml_crh_hjn':[1_268,2_042,2_405,1_247,1_789,3_129],
        'kota':['Jakarta','Bandung','Semarang','Jogjakarta','Bekasi','Surabaya'],
        'jml_hr_hjn':[150,252,211,209,217,213]}

frame = pd.DataFrame(data)
frame

Unnamed: 0,jml_crh_hjn,kota,jml_hr_hjn
0,1268,Jakarta,150
1,2042,Bandung,252
2,2405,Semarang,211
3,1247,Jogjakarta,209
4,1789,Bekasi,217
5,3129,Surabaya,213


**Fungsi `head` & `tail`**

In [27]:
frame.head()

Unnamed: 0,jml_crh_hjn,kota,jml_hr_hjn
0,1268,Jakarta,150
1,2042,Bandung,252
2,2405,Semarang,211
3,1247,Jogjakarta,209
4,1789,Bekasi,217


In [28]:
frame.tail()

Unnamed: 0,jml_crh_hjn,kota,jml_hr_hjn
1,2042,Bandung,252
2,2405,Semarang,211
3,1247,Jogjakarta,209
4,1789,Bekasi,217
5,3129,Surabaya,213


**Fill columns argumen**

In [152]:
# Set value pada argumen columns
frame2 = pd.DataFrame(data,columns=["kota","jml_crh_hjn","jml_hr_hjn"])
frame2

Unnamed: 0,kota,jml_crh_hjn,jml_hr_hjn
2001,Jakarta,1218,150
2002,Jakarta,1042,252
2003,Jakarta,1405,211
2004,Jakarta,1647,209
2005,Jakarta,1789,217
2006,Jakarta,1129,213


In [30]:
# Elemen pada columns yang tidak ada di key dari dictionary
false_frame = pd.DataFrame(data,columns=["kota","jml_crh_hjn","jml_hr_hjn","latitude","longitude"])
false_frame

Unnamed: 0,kota,jml_crh_hjn,jml_hr_hjn,latitude,longitude
0,Jakarta,1268,150,,
1,Bandung,2042,252,,
2,Semarang,2405,211,,
3,Jogjakarta,1247,209,,
4,Bekasi,1789,217,,
5,Surabaya,3129,213,,


**Mengambil data dari kolom-kolom pada df**

In [31]:
# Menggunakan pengambilan seperti dictionary
frame2["kota"]

0       Jakarta
1       Bandung
2      Semarang
3    Jogjakarta
4        Bekasi
5      Surabaya
Name: kota, dtype: object

In [32]:
# Pengambilan menggunakan dot
frame2.jml_crh_hjn

0    1268
1    2042
2    2405
3    1247
4    1789
5    3129
Name: jml_crh_hjn, dtype: int64

In [33]:
# Column yang diambil dari df akan menjadi Series (one dimentional array)
type(frame2["kota"]), type(frame2.jml_crh_hjn)

(pandas.core.series.Series, pandas.core.series.Series)

In [34]:
# Mengambil banyak kolom
frame2[["kota","jml_crh_hjn"]]

Unnamed: 0,kota,jml_crh_hjn
0,Jakarta,1268
1,Bandung,2042
2,Semarang,2405
3,Jogjakarta,1247
4,Bekasi,1789
5,Surabaya,3129


In [35]:
# Series akan mengambil index yang sama dengan df aslinya
frame_test = pd.DataFrame(data,index=["Satu","Dua","Tiga","Empat","Lima","Enam"])
frame_test.kota

Satu        Jakarta
Dua         Bandung
Tiga       Semarang
Empat    Jogjakarta
Lima         Bekasi
Enam       Surabaya
Name: kota, dtype: object

Fungsi `loc` & `iloc`

- `loc` , *Access a group of rows and columns by label(s) or a boolean array.*
- `iloc` , *Purely integer-location based indexing for selection by position.*

In [36]:
frame_test.loc['Empat']

jml_crh_hjn          1247
kota           Jogjakarta
jml_hr_hjn            209
Name: Empat, dtype: object

In [37]:
frame_test.iloc[3]

jml_crh_hjn          1247
kota           Jogjakarta
jml_hr_hjn            209
Name: Empat, dtype: object

Mengambil beberapa baris

In [38]:
# Mengambil banyak baris berdasarkan nomor posisi menggunakan slice
frame_test.iloc[0:2]

Unnamed: 0,jml_crh_hjn,kota,jml_hr_hjn
Satu,1268,Jakarta,150
Dua,2042,Bandung,252


In [39]:
# Mengambil banyak baris berdasarkan nomor posisi menggunakan list
frame_test.iloc[[0,3,5,4]]

Unnamed: 0,jml_crh_hjn,kota,jml_hr_hjn
Satu,1268,Jakarta,150
Empat,1247,Jogjakarta,209
Enam,3129,Surabaya,213
Lima,1789,Bekasi,217


In [40]:
# Mengambil banyak baris berdasarkan label menggunakan list
frame_test.loc[["Dua","Lima","Satu","Enam"]]

Unnamed: 0,jml_crh_hjn,kota,jml_hr_hjn
Dua,2042,Bandung,252
Lima,1789,Bekasi,217
Satu,1268,Jakarta,150
Enam,3129,Surabaya,213


**Re-assignment** column

In [157]:
# Create new frame
# Set value pada argumen columns
frame3 = pd.DataFrame(data,columns=["kota","jml_crh_hjn","jml_hr_hjn"])
frame3

Unnamed: 0,kota,jml_crh_hjn,jml_hr_hjn
2001,Jakarta,1218,150
2002,Jakarta,1042,252
2003,Jakarta,1405,211
2004,Jakarta,1647,209
2005,Jakarta,1789,217
2006,Jakarta,1129,213


In [158]:
# Re-Assign value to column using asignement
frame3["jml_hr_hjn"] = 200
frame3

Unnamed: 0,kota,jml_crh_hjn,jml_hr_hjn
2001,Jakarta,1218,200
2002,Jakarta,1042,200
2003,Jakarta,1405,200
2004,Jakarta,1647,200
2005,Jakarta,1789,200
2006,Jakarta,1129,200


In [164]:
# Re-assign value with data aseries and index
value = pd.Series([1111,5555,2222,4444,3333,6666],index=[2001,2005,2002,2004,2003,2006])
frame3['jml_crh_hjn'] = value
frame3

Unnamed: 0,kota,jml_crh_hjn,jml_hr_hjn
2001,Jakarta,1111,150
2002,Jakarta,2222,252
2003,Jakarta,3333,211
2004,Jakarta,4444,209
2005,Jakarta,5555,217
2006,Jakarta,6666,213


In [155]:
# Re-Assign value using asignment
frame3["jml_crh_hjn"] = np.arange(6.)
frame3

# Jika panjang adari Series array atau panda series tidak sama dengan panjang dari baris DataFrame
# nilai menjadi missing
values = pd.Series([2.,3.,4.], index=["One", "Two","Three"])
frame3["jml_crh_hjn"] = values
frame3

Unnamed: 0,kota,jml_crh_hjn,jml_hr_hjn
2001,Jakarta,,200
2002,Jakarta,,200
2003,Jakarta,,200
2004,Jakarta,,200
2005,Jakarta,,200
2006,Jakarta,,200


In [150]:
frame3

Unnamed: 0,kota,jml_crh_hjn,jml_hr_hjn
0,Jakarta,,200
1,Bandung,,200
2,Semarang,,200
3,Jogjakarta,,200
4,Bekasi,,200
5,Surabaya,,200


In [171]:
# Create new dataframe
data = {'kota':['Jakarta','Bandung','Semarang','Jogjakarta','Bekasi','Surabaya'],
        'jml_crh_hjn':[1_268,2_042,2_405,1_247,1_789,3_129],
        'jml_hr_hjn':[150,252,211,209,217,213]}

frame4 = pd.DataFrame(data)
frame4

Unnamed: 0,kota,jml_crh_hjn,jml_hr_hjn
0,Jakarta,1268,150
1,Bandung,2042,252
2,Semarang,2405,211
3,Jogjakarta,1247,209
4,Bekasi,1789,217
5,Surabaya,3129,213


In [172]:
# Meng-assign sebuah kolom yang tidak ada akan membuat kolom baru
# Membuat df menggunakan dict python
potensi_banjir = (frame4['jml_crh_hjn']/366>5) & (frame4['jml_hr_hjn']>211)
frame4["ptns_bnjr"] = potensi_banjir
frame4

Unnamed: 0,kota,jml_crh_hjn,jml_hr_hjn,ptns_bnjr
0,Jakarta,1268,150,False
1,Bandung,2042,252,True
2,Semarang,2405,211,False
3,Jogjakarta,1247,209,False
4,Bekasi,1789,217,False
5,Surabaya,3129,213,True


Fungsi **del** digunakna untuk menghapus kolom

In [173]:
try:
    del frame4["ptns_bnjr"]
    print("Kolom berhasil dihapus")
except Exception as ex:
    print("Kolom telah terhapus")
    

Kolom berhasil dihapus


In [174]:
# Kolom potensi banjir telah dihapus
frame4

Unnamed: 0,kota,jml_crh_hjn,jml_hr_hjn
0,Jakarta,1268,150
1,Bandung,2042,252
2,Semarang,2405,211
3,Jogjakarta,1247,209
4,Bekasi,1789,217
5,Surabaya,3129,213


**Nested Dictionary**

Inner index dimension akan menjadi nama index

Outer index akan menjadi nama kolom

In [212]:
data5 = {'kota':{2001:'Jakarta',2002:'Jakarta',2003:'Jakarta',2004:'Jakarta',2005:'Jakarta',2006:'Jakarta'},
        'jml_crh_hjn':{2001:1_218,2002:1_042,2003:1_405,2004:1_647,2005:1_789,2006:1_129},
        'jml_hr_hjn':{2001:150,2002:252,2003:211,2004:209,2005:217,2006:213}}

In [213]:
frame5 = pd.DataFrame(data5)
frame5

Unnamed: 0,kota,jml_crh_hjn,jml_hr_hjn
2001,Jakarta,1218,150
2002,Jakarta,1042,252
2003,Jakarta,1405,211
2004,Jakarta,1647,209
2005,Jakarta,1789,217
2006,Jakarta,1129,213


In [214]:
# Transpose DataFrame
frame5_transpose = frame5.T
frame5_transpose

Unnamed: 0,2001,2002,2003,2004,2005,2006
kota,Jakarta,Jakarta,Jakarta,Jakarta,Jakarta,Jakarta
jml_crh_hjn,1218,1042,1405,1647,1789,1129
jml_hr_hjn,150,252,211,209,217,213


In [215]:
# check tipe data
frame5_transpose[2001].dtype, frame5_transpose[2005].dtype

(dtype('O'), dtype('O'))

In [216]:
# Transposing transposed frame
frame5_tranpose_2times = frame5_transpose.T
frame5_tranpose_2times['jml_hr_hjn'].dtype, frame5['jml_hr_hjn'].dtype

(dtype('O'), dtype('int64'))

Mismatch antara *inner key* (akan dianggap sebagai index pada nested dictionary) dengan argumen index

In [218]:
frame5_mismatch_index = pd.DataFrame(data5, index=[2001,2022,2023,2024,2005,2006])
frame5_mismatch_index

Unnamed: 0,kota,jml_crh_hjn,jml_hr_hjn
2001,Jakarta,1218.0,150.0
2022,,,
2023,,,
2024,,,
2005,Jakarta,1789.0,217.0
2006,Jakarta,1129.0,213.0


Dictonary of the serries

In [228]:
ser_crh_hjn = {"kota":frame5[frame5['jml_crh_hjn'] > 1200]['kota'],"crh_hjn":[123,456,789,1011]}
frame_series = pd.DataFrame(ser_crh_hjn)
frame_series

Unnamed: 0,kota,crh_hjn
2001,Jakarta,123
2003,Jakarta,456
2004,Jakarta,789
2005,Jakarta,1011


In [237]:
frame5.index.name = "Tahun"
frame5.columns.name = "Details"
frame5

Details,kota,jml_crh_hjn,jml_hr_hjn
Tahun,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2001,Jakarta,1218,150
2002,Jakarta,1042,252
2003,Jakarta,1405,211
2004,Jakarta,1647,209
2005,Jakarta,1789,217
2006,Jakarta,1129,213


Fungsi **to_numpy**, akan mengkonvers dataframe kedalam dua dimensi array (numpy)

In [246]:
# Dua dimensi array
frame5.to_numpy()

array([['Jakarta', 1218, 150],
       ['Jakarta', 1042, 252],
       ['Jakarta', 1405, 211],
       ['Jakarta', 1647, 209],
       ['Jakarta', 1789, 217],
       ['Jakarta', 1129, 213]], dtype=object)

In [248]:
frame5.T.to_numpy()

array([['Jakarta', 'Jakarta', 'Jakarta', 'Jakarta', 'Jakarta', 'Jakarta'],
       [1218, 1042, 1405, 1647, 1789, 1129],
       [150, 252, 211, 209, 217, 213]], dtype=object)

## Index Object

In [263]:
# Index pada series
obj = pd.Series(np.arange(3), index=[1,2,3])
idx_series = obj.index
idx_series

Int64Index([1, 2, 3], dtype='int64')

In [271]:
# Index pada dataframe
df = pd.DataFrame({"Satu":np.arange(3),"Dua":np.arange(3,6)}, index=[1,2,3])
idx_df = df.index
idx_df[0:1]

Int64Index([1], dtype='int64')

In [269]:
# Index are immutable, tidak bisa diedit
try:
    idx_df[2] = 4
except Exception as ex:
    print(ex)

Index does not support mutable operations


In [273]:
# Create index
our_index = pd.Index(("Satu","Dua","Tiga"))
our_index

Index(['Satu', 'Dua', 'Tiga'], dtype='object')

In [292]:
# Menggunakn our_index pada argumen index
df_1 = pd.DataFrame({"col_1":np.arange(3),"col_2":np.arange(3,6)}, index=our_index)
df_1.index

Index(['Satu', 'Dua', 'Tiga'], dtype='object')

In [293]:
# DF ini menggunakna index yang sama
df_2 = pd.DataFrame({"othr_col_1":np.arange(3,6),"othr_col_2":np.arange(9,12)}, index=our_index)
df_2.index

Index(['Satu', 'Dua', 'Tiga'], dtype='object')

In [288]:
# Bukti kesamaan
df_2.index is our_index

True

In [295]:
# Prilaku index seperti fixed-size set
"Tiga" in df_2.index, "Empat" in df_2.index

(True, False)

In [302]:
# Index objek pada panda dapat menampung duplicate value
index_duplicate = pd.Index(("foo","foo","boo","hoo"))
index_duplicate

Index(['foo', 'foo', 'boo', 'hoo'], dtype='object')

In [311]:
# Df dengan duplicate index
df_duplct_ind = pd.DataFrame(np.arange(4),index_duplicate)
df_duplct_ind.loc["foo"]

Unnamed: 0_level_0,0
This_Index,Unnamed: 1_level_1
foo,0
foo,1
