# Pandas Cheatsheet

## Import Library

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## Basic

### Series

In [3]:
s = pd.Series([10, 20, 30, 40, 50])
s

0    10
1    20
2    30
3    40
4    50
dtype: int64

### Series From List

In [4]:
data_list = [1,2,3,4,5]
s = pd.Series(data_list)
s

0    1
1    2
2    3
3    4
4    5
dtype: int64

### Series with Custom Index/Label

In [5]:
index_custom = ['a','b','c']
data_list = [1,2,3]
s = pd.Series(data=data_list, index=index_custom)
s

a    1
b    2
c    3
dtype: int64

### Series From Dictionary

In [6]:
data_dict = {
    'a': 10,
    'b': 20,
    'c': 30
}
s = pd.Series(data=data_dict)
s

a    10
b    20
c    30
dtype: int64

### Series with Different Data Type

In [7]:
mixed_data = [1,'dua',3.0,4.5,'lima']
s = pd.Series(data=mixed_data)
s

0       1
1     dua
2     3.0
3     4.5
4    lima
dtype: object

### DataFrame

In [8]:
df = pd.DataFrame({
    'nama': ['Orang 1', 'Orang 2', 'Orang 3'],
    'usia': [23, 50, 35],
    'kota': ['Jakarta','Bandung','Depok']
})
df

Unnamed: 0,nama,usia,kota
0,Orang 1,23,Jakarta
1,Orang 2,50,Bandung
2,Orang 3,35,Depok


### DataFrame from List

In [9]:
list_of_list = [
    [1,'Orang 1', 20],
    [2, 'Orang 2', 30],
    [3, 'Orang 3', 40]
]
columns = ['id','nama','usia']

df = pd.DataFrame(data=list_of_list, columns=columns)
df

Unnamed: 0,id,nama,usia
0,1,Orang 1,20
1,2,Orang 2,30
2,3,Orang 3,40


### DataFrame from Dictionary

In [10]:
data_dict = {
    'id': [1,2,3],
    'nama': ['Orang 1', 'Orang 2', 'Orang 3'],
    'usia': [15,20,30]
}

df = pd.DataFrame(data = data_dict)
df

Unnamed: 0,id,nama,usia
0,1,Orang 1,15
1,2,Orang 2,20
2,3,Orang 3,30


### Merge Series into DataFrame

In [11]:
s1 = pd.Series([1,2,3], name='id')
s2 = pd.Series(['Orang 1', 'Orang 2', 'Orang 3'], name='nama')
s3 = pd.Series([15,20,30], name='usia')

## .T (Transponse) dibutuhkan untuk mengubah baris menjadi kolom
df = pd.DataFrame([s1,s2,s3]).T
df

Unnamed: 0,id,nama,usia
0,1,Orang 1,15
1,2,Orang 2,20
2,3,Orang 3,30


### Getting Basic Information from DataFrame

In [13]:
data = {
    'A': np.random.randint(1, 100, 20),
    'B': np.random.randint(1, 100, 20),
    'C': np.random.randint(1, 100, 20),
    'D': np.random.randint(1, 100, 20)
}
df = pd.DataFrame(data=data)
df.head()

Unnamed: 0,A,B,C,D
0,2,28,95,85
1,7,1,17,65
2,57,98,18,12
3,7,42,32,94
4,8,91,50,49


In [14]:
df.tail()

Unnamed: 0,A,B,C,D
15,23,14,35,4
16,21,20,75,90
17,7,28,60,67
18,7,37,47,34
19,43,52,49,8


In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   A       20 non-null     int64
 1   B       20 non-null     int64
 2   C       20 non-null     int64
 3   D       20 non-null     int64
dtypes: int64(4)
memory usage: 768.0 bytes


In [16]:
df.describe()

Unnamed: 0,A,B,C,D
count,20.0,20.0,20.0,20.0
mean,36.95,38.4,51.1,45.9
std,28.036583,28.665769,24.27019,32.738035
min,2.0,1.0,12.0,1.0
25%,7.0,17.75,34.25,21.75
50%,40.0,32.5,48.0,38.5
75%,58.25,54.5,72.0,71.5
max,83.0,98.0,95.0,95.0


### Identifyng Missing Data

In [17]:
data_missing = {
    'A': [1,2, np.nan, 4, 5],
    'B': [np.nan, 2, 3, 4, 5],
    'C': [1,2,3,4, np.nan]
}
df = pd.DataFrame(data=data_missing)
df

Unnamed: 0,A,B,C
0,1.0,,1.0
1,2.0,2.0,2.0
2,,3.0,3.0
3,4.0,4.0,4.0
4,5.0,5.0,


In [18]:
missing_data_count = df.isna().sum()
missing_data_count

A    1
B    1
C    1
dtype: int64

### Checking Data Distribution

In [23]:
ser_categorical = pd.Series(['apple','banana','apple','orange','banana','apple',
                            'apple','orange'])
value_counts = ser_categorical.value_counts()
type(value_counts) ## Series
value_counts

apple     4
banana    2
orange    2
Name: count, dtype: int64

### Checking Unique Values

In [24]:
unique_count = ser_categorical.nunique()
unique_count

3

In [28]:
unique_values = ser_categorical.unique().tolist()
unique_values

['apple', 'banana', 'orange']

### Checking Data Types

In [32]:
data_sample = {
    'A': np.random.randint(1, 100, 3),
    'B': np.random.random(3),
    'C': ['apple','banana','cherry'],
    'D': pd.date_range("20230101", periods=3)
}
df = pd.DataFrame(data=data_sample)
data_types = df.dtypes
data_types

A             int64
B           float64
C            object
D    datetime64[ns]
dtype: object

### Checking DataFrame Size

In [33]:
df_shape = df.shape
df_shape ## (3,4) 3 baris 4 kolom

(3, 4)

### Checking Columns and Index

In [34]:
df_columns = df.columns
df_index = df.index

df_columns, df_index

(Index(['A', 'B', 'C', 'D'], dtype='object'),
 RangeIndex(start=0, stop=3, step=1))

### Checking Memory Usage

In [35]:
memory_usage = df.memory_usage()
memory_usage

Index    128
A         24
B         24
C         24
D         24
dtype: int64

## Accessing Data

### Accessing Data in Series

In [39]:
s = pd.Series([10,20,30,40,50], index = ['a','b','c','d','e'])

# Mengakses element dengan index possisi
ele_pos = s[4]

# Mengakses element dengan index label
ele_label = s['c']

ele_pos, ele_label

  ele_pos = s[4]


(50, 30)

### Accessing Data in DataFrame