### 10 minutes to pandas

In [177]:
import numpy as np
import pandas as pd

### Basic data structures in pandas
#### pandas provides two types of classes for handling data:

- Series: a one-dimensional labeled array holding data of any type
such as integers, strings, Python objects etc.

- DataFrame: a two-dimensional data structure that holds data like a two-dimension array or a table with rows and columns.

### Object creation

In [179]:
s = pd.Series([1,3,5, np.nan,6,8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [82]:
dates = pd.date_range("20130101", "20130113")
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06', '2013-01-07', '2013-01-08',
               '2013-01-09', '2013-01-10', '2013-01-11', '2013-01-12',
               '2013-01-13'],
              dtype='datetime64[ns]', freq='D')

In [13]:
dates = pd.date_range("20130101", periods=6)
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [19]:
df = pd.DataFrame(np.random.randn(6,4), index=dates, columns=list("ABCD"))
df

Unnamed: 0,A,B,C,D
2013-01-01,-0.546659,-0.563374,0.14694,1.509357
2013-01-02,-1.128934,0.018123,1.552942,1.07642
2013-01-03,0.403447,-1.236189,-0.462589,1.161285
2013-01-04,-0.683641,0.516042,1.012592,1.010063
2013-01-05,-0.414748,0.175071,0.31528,0.091154
2013-01-06,0.105973,-0.395834,-0.819956,0.686445


In [23]:
# randn은 '표준(평균 0, 편차 1)'으로 딱 고정된 녀석이지만, 우리가 원하는 **평균(mu)**과 **표준편차(sigma)**를 더해서 변형할 수 있어요.
# 방법: (원하는 표준편차 * np.random.randn(행, 열)) + 원하는 평균
# 예시: 만약 **"평균이 50이고, 위아래로 10 정도 차이 나는(표준편차) 6행 4열"**을 만들고 싶다면?
df1_2 = pd.DataFrame(10 * np.random.randn(6, 4) + 50, index=dates, columns=list("ABCD"))
df1_2

Unnamed: 0,A,B,C,D
2013-01-01,53.960551,48.591169,61.070734,40.590202
2013-01-02,44.161924,64.827572,46.200059,48.172263
2013-01-03,51.418684,49.012599,34.724318,49.640571
2013-01-04,56.550955,43.345415,31.756169,54.23978
2013-01-05,37.367893,63.350041,39.634506,55.256126
2013-01-06,46.231864,56.636633,42.421228,45.145852


In [36]:
df2 = pd.DataFrame(
    {
        "A": 1.0,
        "B": pd.Timestamp("20130102"),
        "C": pd.Series(1, index=list(range(4)), dtype="float32"),
        "D": np.array([3] * 4, dtype="int32"),
        "E": pd.Categorical(["test", "train", "test", "train"]),
        "F": "foo",
    }
)
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [38]:
df2.dtypes

A          float64
B    datetime64[s]
C          float32
D            int32
E         category
F           object
dtype: object

### Viewing data

In [64]:
df.head(), df.tail(3), df.index, df.columns, df.to_numpy()

(                   A         B         C         D
 2013-01-01 -0.546659 -0.563374  0.146940  1.509357
 2013-01-02 -1.128934  0.018123  1.552942  1.076420
 2013-01-03  0.403447 -1.236189 -0.462589  1.161285
 2013-01-04 -0.683641  0.516042  1.012592  1.010063
 2013-01-05 -0.414748  0.175071  0.315280  0.091154,
                    A         B         C         D
 2013-01-04 -0.683641  0.516042  1.012592  1.010063
 2013-01-05 -0.414748  0.175071  0.315280  0.091154
 2013-01-06  0.105973 -0.395834 -0.819956  0.686445,
 DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
                '2013-01-05', '2013-01-06'],
               dtype='datetime64[ns]', freq='D'),
 Index(['A', 'B', 'C', 'D'], dtype='object'),
 array([[-0.54665873, -0.56337386,  0.14694044,  1.50935673],
        [-1.12893408,  0.01812311,  1.55294208,  1.07641996],
        [ 0.40344665, -1.23618879, -0.46258942,  1.16128536],
        [-0.68364067,  0.51604151,  1.01259184,  1.01006324],
        [-0.41474

In [66]:
# 여러가지 출력할때 사용하자!!!!

print(df.head()); print("-" * 30)
print(df.tail(3)); print("-" * 30)
print(df.index); print("-" * 30)
print(df.columns); print("-" * 30)
print(df.to_numpy())

                   A         B         C         D
2013-01-01 -0.546659 -0.563374  0.146940  1.509357
2013-01-02 -1.128934  0.018123  1.552942  1.076420
2013-01-03  0.403447 -1.236189 -0.462589  1.161285
2013-01-04 -0.683641  0.516042  1.012592  1.010063
2013-01-05 -0.414748  0.175071  0.315280  0.091154
------------------------------
                   A         B         C         D
2013-01-04 -0.683641  0.516042  1.012592  1.010063
2013-01-05 -0.414748  0.175071  0.315280  0.091154
2013-01-06  0.105973 -0.395834 -0.819956  0.686445
------------------------------
DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')
------------------------------
Index(['A', 'B', 'C', 'D'], dtype='object')
------------------------------
[[-0.54665873 -0.56337386  0.14694044  1.50935673]
 [-1.12893408  0.01812311  1.55294208  1.07641996]
 [ 0.40344665 -1.23618879 -0.46258942  1.16128536]
 [-0.6

In [108]:
from IPython.display import display

display(df.head())
print("\n" + "="*50 + "\n") # 시원하게 구분선!
display(df.tail(3))
print("\n" + "="*50 + "\n")
df.index
print("\n" + "="*50 + "\n")
df.columns
print("\n" + "="*50 + "\n")
df.to_numpy()
# 출력이 안되는 문제가 있어서 사용하기 거시기함

Unnamed: 0,A,B,C,D
2013-01-01,-0.546659,-0.563374,0.14694,1.509357
2013-01-02,-1.128934,0.018123,1.552942,1.07642
2013-01-03,0.403447,-1.236189,-0.462589,1.161285
2013-01-04,-0.683641,0.516042,1.012592,1.010063
2013-01-05,-0.414748,0.175071,0.31528,0.091154






Unnamed: 0,A,B,C,D
2013-01-04,-0.683641,0.516042,1.012592,1.010063
2013-01-05,-0.414748,0.175071,0.31528,0.091154
2013-01-06,0.105973,-0.395834,-0.819956,0.686445










Index(['A', 'B', 'C', 'D'], dtype='object')

In [70]:
def check(df):
    print("--- [ Head ] ---")
    display(df.head(2))
    print("\n--- [ Index & Columns ] ---")
    print(f"Index: {df.index}")
    print(f"Columns: {df.columns}")
    print("\n--- [ Numpy Array ] ---")
    print(df.to_numpy()[:2]) # 너무 기니까 2개만 살짝 구경

# 이제부터는 딱 한 줄만 쓰면 끝!
check(df)
# 내눈에는 가독성도 떨어지고 별로임. 더 잘하면 이쁘게 나올려나?

--- [ Head ] ---


Unnamed: 0,A,B,C,D
2013-01-01,-0.546659,-0.563374,0.14694,1.509357
2013-01-02,-1.128934,0.018123,1.552942,1.07642



--- [ Index & Columns ] ---
Index: DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')
Columns: Index(['A', 'B', 'C', 'D'], dtype='object')

--- [ Numpy Array ] ---
[[-0.54665873 -0.56337386  0.14694044  1.50935673]
 [-1.12893408  0.01812311  1.55294208  1.07641996]]


In [120]:
print(df.describe()); print("-" * 30) # 통계요약
print(df.T); print("-" * 30) # 전치
print(df.sort_index(axis=1, ascending=False)); print("-" * 30) # 축정렬 - 열 내림차순
print(df.sort_values(by="B")); print("-" * 30) # 값정렬 - B열 오름차순

              A         B         C         D
count  6.000000  6.000000  6.000000  6.000000
mean  -0.377427 -0.247694  0.290868  0.922454
std    0.553534  0.621515  0.887317  0.485723
min   -1.128934 -1.236189 -0.819956  0.091154
25%   -0.649395 -0.521489 -0.310207  0.767349
50%   -0.480704 -0.188855  0.231110  1.043242
75%   -0.024207  0.135834  0.838264  1.140069
max    0.403447  0.516042  1.552942  1.509357
------------------------------
   2013-01-01  2013-01-02  2013-01-03  2013-01-04  2013-01-05  2013-01-06
A   -0.546659   -1.128934    0.403447   -0.683641   -0.414748    0.105973
B   -0.563374    0.018123   -1.236189    0.516042    0.175071   -0.395834
C    0.146940    1.552942   -0.462589    1.012592    0.315280   -0.819956
D    1.509357    1.076420    1.161285    1.010063    0.091154    0.686445
------------------------------
                   D         C         B         A
2013-01-01  1.509357  0.146940 -0.563374 -0.546659
2013-01-02  1.076420  1.552942  0.018123 -1.128934
2

### Selection

In [129]:
print(df["A"]); print("-" * 30)
print(df.A); print("-" * 30)
print(df[["B", "A"]]); print("-" * 30)
print(df[0:3]); print("-" * 30)
print(df["20130102":"20130104"]); print("-" * 30)

2013-01-01   -0.546659
2013-01-02   -1.128934
2013-01-03    0.403447
2013-01-04   -0.683641
2013-01-05   -0.414748
2013-01-06    0.105973
Freq: D, Name: A, dtype: float64
------------------------------
2013-01-01   -0.546659
2013-01-02   -1.128934
2013-01-03    0.403447
2013-01-04   -0.683641
2013-01-05   -0.414748
2013-01-06    0.105973
Freq: D, Name: A, dtype: float64
------------------------------
                   B         A
2013-01-01 -0.563374 -0.546659
2013-01-02  0.018123 -1.128934
2013-01-03 -1.236189  0.403447
2013-01-04  0.516042 -0.683641
2013-01-05  0.175071 -0.414748
2013-01-06 -0.395834  0.105973
------------------------------
                   A         B         C         D
2013-01-01 -0.546659 -0.563374  0.146940  1.509357
2013-01-02 -1.128934  0.018123  1.552942  1.076420
2013-01-03  0.403447 -1.236189 -0.462589  1.161285
------------------------------
                   A         B         C         D
2013-01-02 -1.128934  0.018123  1.552942  1.076420
2013-01-03 

In [131]:
df.loc[dates[0]]

A   -0.546659
B   -0.563374
C    0.146940
D    1.509357
Name: 2013-01-01 00:00:00, dtype: float64

In [133]:
df.loc[:, ["A", "B"]]

Unnamed: 0,A,B
2013-01-01,-0.546659,-0.563374
2013-01-02,-1.128934,0.018123
2013-01-03,0.403447,-1.236189
2013-01-04,-0.683641,0.516042
2013-01-05,-0.414748,0.175071
2013-01-06,0.105973,-0.395834


In [135]:
df.loc[dates[0], "A"]

-0.5466587330866084

In [137]:
df.at[dates[0], "A"]

-0.5466587330866084

In [155]:
df.iloc[3]

A   -0.683641
B    0.516042
C    1.012592
D    1.010063
Name: 2013-01-04 00:00:00, dtype: float64

In [157]:
df

Unnamed: 0,A,B,C,D
2013-01-01,-0.546659,-0.563374,0.14694,1.509357
2013-01-02,-1.128934,0.018123,1.552942,1.07642
2013-01-03,0.403447,-1.236189,-0.462589,1.161285
2013-01-04,-0.683641,0.516042,1.012592,1.010063
2013-01-05,-0.414748,0.175071,0.31528,0.091154
2013-01-06,0.105973,-0.395834,-0.819956,0.686445


In [187]:
s = pd.Series([1, 3, 5, np.nan, 6, 8], index=dates).shift(2)
s

ValueError: Length of values (6) does not match length of index (13)

In [189]:
df.agg(lambda x: np.mean(x) * 5.6)

A   -2.113591
B   -1.387084
C    1.628862
D    5.165742
dtype: float64