In [1]:
import numpy as np

In [2]:
import pandas as pd

## Intro to data structures
### Series

In [3]:
# Seed NumPy's legacy global RNG so the random draws are reproducible on
# "Restart Kernel -> Run All". Because the state is global, later
# np.random.randn(...) calls are deterministic too when cells run in order.
# NOTE: the outputs recorded in this notebook predate the seed and will
# change on the next run.
np.random.seed(42)

# Five standard-normal draws labelled with an explicit string index.
s = pd.Series(np.random.randn(5), index=["a", "b", "c", "d", "e"])

In [4]:
# Bare trailing expression: the notebook displays the Series (labels, values, dtype).
s

a    0.138713
b    0.598098
c   -1.100923
d    0.429065
e   -0.830120
dtype: float64

In [5]:
# The index holds the labels passed at construction time.
s.index

Index(['a', 'b', 'c', 'd', 'e'], dtype='object')

In [23]:
# When no index is passed, pandas creates a default integer index (0, 1, 2, ...).
pd.Series(np.random.randn(5))

0   -0.205657
1   -0.549800
2    1.232197
3    0.285916
4    0.862899
dtype: float64

## Object Creation

In [7]:
# np.nan ("Not a Number") marks the missing entry at position 3.
s = pd.Series(
    data=[1, 3, 5, np.nan, 6, 8],
)

In [8]:
# Display the Series; the missing entry is rendered as NaN.
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [9]:
# Six consecutive calendar days starting 2021-08-01 (daily frequency).
dates = pd.date_range(
    start="20210801",
    periods=6,
    freq="D",
)

In [10]:
# Display the DatetimeIndex built above.
dates

DatetimeIndex(['2021-08-01', '2021-08-02', '2021-08-03', '2021-08-04',
               '2021-08-05', '2021-08-06'],
              dtype='datetime64[ns]', freq='D')

In [11]:
# 6x4 frame of standard-normal draws: one row per entry of `dates`,
# columns labelled A-D.
df = pd.DataFrame(
    data=np.random.randn(6, 4),
    index=dates,
    columns=["A", "B", "C", "D"],
)

In [12]:
# Display the full frame (only 6 rows, so no need for head()).
df

Unnamed: 0,A,B,C,D
2021-08-01,-1.363362,-1.618411,0.900184,-0.63869
2021-08-02,-0.962103,1.014477,-0.212021,0.504209
2021-08-03,-0.647128,-0.032876,-0.634831,-1.624977
2021-08-04,-0.295441,-2.847912,-0.072864,-0.106676
2021-08-05,0.961947,-1.784362,1.28945,0.066986
2021-08-06,-0.869421,1.091763,1.082725,-0.459594


In [17]:
# Build a frame whose columns come from different kinds of input, giving one
# dtype per column. The scalars (A, B, F) are broadcast to the frame's 4 rows.
df2 = pd.DataFrame(
    dict(
        A=1.0,
        B=pd.Timestamp("20210801"),
        C=pd.Series(1, index=list(range(4)), dtype="float32"),
        D=np.array([3, 3, 3, 3], dtype="int32"),
        E=pd.Categorical(["test", "train", "test", "train"]),
        F="foo",
    )
)

In [18]:
# Display the mixed-dtype frame.
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2021-08-01,1.0,3,test,foo
1,1.0,2021-08-01,1.0,3,train,foo
2,1.0,2021-08-01,1.0,3,test,foo
3,1.0,2021-08-01,1.0,3,train,foo


In [20]:
# One dtype per column, returned as a Series indexed by column name.
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

In [21]:
# Attribute-style access to column "A"; the column comes back as a Series.
df2.A

0    1.0
1    1.0
2    1.0
3    1.0
Name: A, dtype: float64

In [22]:
# The categorical column: the repr also lists the distinct categories.
df2.E

0     test
1    train
2     test
3    train
Name: E, dtype: category
Categories (2, object): ['test', 'train']

## Viewing data

In [24]:
# First rows of the frame; head() defaults to 5 rows.
df.head()

Unnamed: 0,A,B,C,D
2021-08-01,-1.363362,-1.618411,0.900184,-0.63869
2021-08-02,-0.962103,1.014477,-0.212021,0.504209
2021-08-03,-0.647128,-0.032876,-0.634831,-1.624977
2021-08-04,-0.295441,-2.847912,-0.072864,-0.106676
2021-08-05,0.961947,-1.784362,1.28945,0.066986


In [25]:
# Last rows of the frame; tail() defaults to 5 rows.
df.tail()

Unnamed: 0,A,B,C,D
2021-08-02,-0.962103,1.014477,-0.212021,0.504209
2021-08-03,-0.647128,-0.032876,-0.634831,-1.624977
2021-08-04,-0.295441,-2.847912,-0.072864,-0.106676
2021-08-05,0.961947,-1.784362,1.28945,0.066986
2021-08-06,-0.869421,1.091763,1.082725,-0.459594


In [26]:
# Only the first 2 rows.
df.head(2)

Unnamed: 0,A,B,C,D
2021-08-01,-1.363362,-1.618411,0.900184,-0.63869
2021-08-02,-0.962103,1.014477,-0.212021,0.504209


In [27]:
# Only the last 3 rows.
df.tail(3)

Unnamed: 0,A,B,C,D
2021-08-04,-0.295441,-2.847912,-0.072864,-0.106676
2021-08-05,0.961947,-1.784362,1.28945,0.066986
2021-08-06,-0.869421,1.091763,1.082725,-0.459594


In [28]:
# Row labels: the DatetimeIndex passed as `index=dates`.
df.index

DatetimeIndex(['2021-08-01', '2021-08-02', '2021-08-03', '2021-08-04',
               '2021-08-05', '2021-08-06'],
              dtype='datetime64[ns]', freq='D')

In [29]:
# Column labels.
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

* NumPy arrays have one dtype for the entire array,
* while pandas DataFrames have one dtype per column.

* When you call `DataFrame.to_numpy()`, pandas will find the NumPy dtype that can hold all of the dtypes in the DataFrame.
* This may end up being `object`, which requires casting every value to a Python object.

In [30]:
# Every column of df holds floats, so the result is a plain 2-D float array.
df.to_numpy()

array([[-1.36336222, -1.61841105,  0.90018428, -0.63868964],
       [-0.96210269,  1.01447676, -0.21202149,  0.50420917],
       [-0.64712759, -0.03287638, -0.63483125, -1.62497742],
       [-0.29544071, -2.84791218, -0.07286399, -0.10667598],
       [ 0.96194725, -1.78436197,  1.28944955,  0.06698626],
       [-0.86942146,  1.09176252,  1.08272499, -0.45959392]])

In [31]:
# df2 mixes several dtypes, so to_numpy() has to fall back to a single
# object-dtype array, which is a relatively expensive conversion.
df2.to_numpy()

array([[1.0, Timestamp('2021-08-01 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2021-08-01 00:00:00'), 1.0, 3, 'train', 'foo'],
       [1.0, Timestamp('2021-08-01 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2021-08-01 00:00:00'), 1.0, 3, 'train', 'foo']],
      dtype=object)

In [32]:
# Quick statistical summary per column: count, mean, std, min, quartiles, max.
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,-0.529251,-0.69622,0.392107,-0.376457
std,0.811302,1.626509,0.797006,0.732049
min,-1.363362,-2.847912,-0.634831,-1.624977
25%,-0.938932,-1.742874,-0.177232,-0.593916
50%,-0.758275,-0.825644,0.41366,-0.283135
75%,-0.383362,0.752638,1.03709,0.023571
max,0.961947,1.091763,1.28945,0.504209


In [33]:
# Transpose: swap rows and columns.
df.T

Unnamed: 0,2021-08-01,2021-08-02,2021-08-03,2021-08-04,2021-08-05,2021-08-06
A,-1.363362,-0.962103,-0.647128,-0.295441,0.961947,-0.869421
B,-1.618411,1.014477,-0.032876,-2.847912,-1.784362,1.091763
C,0.900184,-0.212021,-0.634831,-0.072864,1.28945,1.082725
D,-0.63869,0.504209,-1.624977,-0.106676,0.066986,-0.459594


In [34]:
# Sort by column labels (axis=1) in descending order: D, C, B, A.
df.sort_index(axis=1, ascending=False)

Unnamed: 0,D,C,B,A
2021-08-01,-0.63869,0.900184,-1.618411,-1.363362
2021-08-02,0.504209,-0.212021,1.014477,-0.962103
2021-08-03,-1.624977,-0.634831,-0.032876,-0.647128
2021-08-04,-0.106676,-0.072864,-2.847912,-0.295441
2021-08-05,0.066986,1.28945,-1.784362,0.961947
2021-08-06,-0.459594,1.082725,1.091763,-0.869421


In [35]:
# Sort by row labels (axis=0); the dates are already ascending, so the order is unchanged.
df.sort_index(axis=0)

Unnamed: 0,A,B,C,D
2021-08-01,-1.363362,-1.618411,0.900184,-0.63869
2021-08-02,-0.962103,1.014477,-0.212021,0.504209
2021-08-03,-0.647128,-0.032876,-0.634831,-1.624977
2021-08-04,-0.295441,-2.847912,-0.072864,-0.106676
2021-08-05,0.961947,-1.784362,1.28945,0.066986
2021-08-06,-0.869421,1.091763,1.082725,-0.459594


In [36]:
# Sort by column labels (axis=1), ascending; the columns are already A-D, so the order is unchanged.
df.sort_index(axis=1)

Unnamed: 0,A,B,C,D
2021-08-01,-1.363362,-1.618411,0.900184,-0.63869
2021-08-02,-0.962103,1.014477,-0.212021,0.504209
2021-08-03,-0.647128,-0.032876,-0.634831,-1.624977
2021-08-04,-0.295441,-2.847912,-0.072864,-0.106676
2021-08-05,0.961947,-1.784362,1.28945,0.066986
2021-08-06,-0.869421,1.091763,1.082725,-0.459594


In [37]:
# Sort rows by the values in column B (ascending). In the recorded output the
# 2021-08-04 row comes first because it has the smallest B value.
df.sort_values(by="B")

Unnamed: 0,A,B,C,D
2021-08-04,-0.295441,-2.847912,-0.072864,-0.106676
2021-08-05,0.961947,-1.784362,1.28945,0.066986
2021-08-01,-1.363362,-1.618411,0.900184,-0.63869
2021-08-03,-0.647128,-0.032876,-0.634831,-1.624977
2021-08-02,-0.962103,1.014477,-0.212021,0.504209
2021-08-06,-0.869421,1.091763,1.082725,-0.459594


## Selection

In [38]:
# Selecting a single column by name returns a Series.
df["A"]

2021-08-01   -1.363362
2021-08-02   -0.962103
2021-08-03   -0.647128
2021-08-04   -0.295441
2021-08-05    0.961947
2021-08-06   -0.869421
Freq: D, Name: A, dtype: float64

In [39]:
# Attribute-style access; returns the same column as df["A"].
df.A

2021-08-01   -1.363362
2021-08-02   -0.962103
2021-08-03   -0.647128
2021-08-04   -0.295441
2021-08-05    0.961947
2021-08-06   -0.869421
Freq: D, Name: A, dtype: float64

In [40]:
df[0:3]  # row slice: positions 0, 1 and 2 (end is excluded)

Unnamed: 0,A,B,C,D
2021-08-01,-1.363362,-1.618411,0.900184,-0.63869
2021-08-02,-0.962103,1.014477,-0.212021,0.504209
2021-08-03,-0.647128,-0.032876,-0.634831,-1.624977


In [41]:
# Slicing with date strings selects rows by label; both endpoints are included.
df["20210802":"20210805"]

Unnamed: 0,A,B,C,D
2021-08-02,-0.962103,1.014477,-0.212021,0.504209
2021-08-03,-0.647128,-0.032876,-0.634831,-1.624977
2021-08-04,-0.295441,-2.847912,-0.072864,-0.106676
2021-08-05,0.961947,-1.784362,1.28945,0.066986


## selection by label
* `loc[:, 'x2':'x4']`: select all columns between x2 and x4 (inclusive).
* `iloc[:, [1, 2, 5]]`: select columns in positions 1, 2 and 5 (position-based, listed here for contrast with `loc`).
* `loc[df['a'] > 10, ['a', 'c']]`: select rows meeting a logical condition, and only the specified columns.

In [42]:
# Look up one row by its label (the first date); the row comes back as a
# Series indexed by column name.
df.loc[dates[0]]

A   -1.363362
B   -1.618411
C    0.900184
D   -0.638690
Name: 2021-08-01 00:00:00, dtype: float64

In [43]:
# All rows (:), with columns A and B selected by label.
df.loc[:,["A","B"]]

Unnamed: 0,A,B
2021-08-01,-1.363362,-1.618411
2021-08-02,-0.962103,1.014477
2021-08-03,-0.647128,-0.032876
2021-08-04,-0.295441,-2.847912
2021-08-05,0.961947,-1.784362
2021-08-06,-0.869421,1.091763


In [45]:
# .iloc is strictly position-based, so a list of column labels is rejected.
# The failure is the point of this cell, so catch it and show the message
# instead of leaving a cell that aborts "Run All". The label-based spelling is
# df.loc[:, ["A", "B"]]; the positional one is df.iloc[:, [0, 1]].
try:
    df.iloc[:, ["A", "B"]]
except IndexError as err:
    print(f"IndexError: {err}")

IndexError: .iloc requires numeric indexers, got ['A' 'B']

In [46]:
# Label slice on the rows (both endpoints included), restricted to columns A and C.
df.loc["20210802":"20210805", ["A","C"]]

Unnamed: 0,A,C
2021-08-02,-0.962103,-0.212021
2021-08-03,-0.647128,-0.634831
2021-08-04,-0.295441,-0.072864
2021-08-05,0.961947,1.28945


In [47]:
df.loc[dates[0], "A"]  # one row label + one column label -> a single scalar value

-1.3633622169381208

In [48]:
# Several row labels with a single column label returns a Series.
df.loc[dates[0:3],"A"]

2021-08-01   -1.363362
2021-08-02   -0.962103
2021-08-03   -0.647128
Freq: D, Name: A, dtype: float64

## selection by position

In [49]:
# Row at integer position 3 (the fourth row), returned as a Series.
df.iloc[3]

A   -0.295441
B   -2.847912
C   -0.072864
D   -0.106676
Name: 2021-08-04 00:00:00, dtype: float64

In [50]:
df.iloc[3:5, 0:2]  # rows 3-4 and columns 0-1 (the end of each slice is excluded)

Unnamed: 0,A,B
2021-08-04,-0.295441,-2.847912
2021-08-05,0.961947,-1.784362
