In [2]:
"""Pandas samples
"""
import numpy as np
import pandas as pd

# One-dimensional labelled array; with no explicit index the labels
# default to 0, 1, 2, ...
series = pd.Series(data=[1, 2, 3])
series

0    1
1    2
2    3
dtype: int64

In [5]:
# Sum of all values in the Series.
series.sum()

6

In [13]:
# A DataFrame is a two-dimensional table: each key becomes a column label
# and its list supplies that column's values.
df_simple = pd.DataFrame(dict(A=[1, 2], B=[3, 4]))
df_simple

Unnamed: 0,A,B
0,1,3
1,2,4


In [12]:
# 5x3 frame of standard-normal samples with default integer row and
# column labels.
# A dedicated seeded generator is used instead of the unseeded global
# np.random state so the cell shows the same table on every run.
df_random = pd.DataFrame(
    np.random.default_rng(seed=0).standard_normal((5, 3))
)
df_random

Unnamed: 0,0,1,2
0,1.162582,0.386717,0.676285
1,-0.387935,1.07792,0.12101
2,0.912256,-1.51664,1.113488
3,-0.311908,-1.203586,-0.534489
4,-1.188391,-0.059747,2.779948


In [21]:
# 5x3 frame of standard-normal samples, indexed by five consecutive days
# starting 2022-12-01 and with columns labelled A, B and C.
# A dedicated seeded generator is used instead of the unseeded global
# np.random state so that this frame, and every cell that reads it, gives
# the same result on every run.
df_any_period = pd.DataFrame(
    np.random.default_rng(seed=1).standard_normal((5, 3)),
    index=pd.date_range('20221201', periods=5),
    columns=['A', 'B', 'C'],
)
df_any_period

Unnamed: 0,A,B,C
2022-12-01,-0.332921,-0.71134,1.148911
2022-12-02,-1.009581,-0.257115,0.010697
2022-12-03,1.073745,0.203765,1.208458
2022-12-04,-0.772181,0.434377,-0.536755
2022-12-05,-0.678222,-0.668636,0.167815


In [22]:
# Transpose: the dates become the columns and A/B/C become the row labels.
df_any_period.T

Unnamed: 0,2022-12-01,2022-12-02,2022-12-03,2022-12-04,2022-12-05
A,-0.332921,-1.009581,1.073745,-0.772181,-0.678222
B,-0.71134,-0.257115,0.203765,0.434377,-0.668636
C,1.148911,0.010697,1.208458,-0.536755,0.167815


In [23]:
# First row only.
df_any_period.head(n=1)

Unnamed: 0,A,B,C
2022-12-01,-0.332921,-0.71134,1.148911


In [24]:
# Last two rows.
df_any_period.tail(n=2)

Unnamed: 0,A,B,C
2022-12-04,-0.772181,0.434377,-0.536755
2022-12-05,-0.678222,-0.668636,0.167815


In [26]:
# Row labels of the frame (here a DatetimeIndex with daily frequency).
df_any_period.index

DatetimeIndex(['2022-12-01', '2022-12-02', '2022-12-03', '2022-12-04',
               '2022-12-05'],
              dtype='datetime64[ns]', freq='D')

In [27]:
# Column labels of the frame.
df_any_period.columns

Index(['A', 'B', 'C'], dtype='object')

In [28]:
# Underlying data as a NumPy array. to_numpy() is the accessor the pandas
# documentation recommends over the older .values attribute; for this
# all-float frame both return the same 5x3 float64 array.
df_any_period.to_numpy()

array([[-0.3329208 , -0.71133961,  1.14891143],
       [-1.00958125, -0.25711458,  0.01069703],
       [ 1.07374527,  0.2037649 ,  1.20845838],
       [-0.77218081,  0.43437719, -0.53675472],
       [-0.67822181, -0.66863602,  0.16781499]])

In [29]:
# Summary statistics for each column: count, mean, std (standard deviation),
# min, the 25% / 50% / 75% quantiles (50% is the median) and max.
df_any_period.describe()

Unnamed: 0,A,B,C
count,5.0,5.0,5.0
mean,-0.343832,-0.19979,0.399825
std,0.82888,0.512301,0.757867
min,-1.009581,-0.71134,-0.536755
25%,-0.772181,-0.668636,0.010697
50%,-0.678222,-0.257115,0.167815
75%,-0.332921,0.203765,1.148911
max,1.073745,0.434377,1.208458


In [31]:
# Rows ordered by the values in column C, smallest first.
df_any_period.sort_values(by='C', ascending=True)

Unnamed: 0,A,B,C
2022-12-04,-0.772181,0.434377,-0.536755
2022-12-02,-1.009581,-0.257115,0.010697
2022-12-05,-0.678222,-0.668636,0.167815
2022-12-01,-0.332921,-0.71134,1.148911
2022-12-03,1.073745,0.203765,1.208458


In [32]:
# An integer slice inside [] selects rows by position: rows 1, 2 and 3
# (the stop position is excluded).
df_any_period[1:4]

Unnamed: 0,A,B,C
2022-12-02,-1.009581,-0.257115,0.010697
2022-12-03,1.073745,0.203765,1.208458
2022-12-04,-0.772181,0.434377,-0.536755


In [33]:
# A string slice inside [] selects rows by index label; unlike positional
# slicing, the end label ('2022-12-05') is included in the result.
df_any_period['2022-12-03' : '2022-12-05']

Unnamed: 0,A,B,C
2022-12-03,1.073745,0.203765,1.208458
2022-12-04,-0.772181,0.434377,-0.536755
2022-12-05,-0.678222,-0.668636,0.167815


In [34]:
# .loc selects by label: a single index label returns that row as a Series
# whose index is the column names.
df_any_period.loc['2022-12-03']

A    1.073745
B    0.203765
C    1.208458
Name: 2022-12-03 00:00:00, dtype: float64

In [35]:
# One row label plus a list of column labels -> a Series holding just A and B.
df_any_period.loc['2022-12-03', ['A', 'B']]

A    1.073745
B    0.203765
Name: 2022-12-03 00:00:00, dtype: float64

In [36]:
# Label slice of rows (both end labels included), restricted to columns A and B.
df_any_period.loc['2022-12-03':'2022-12-04', ['A', 'B']]

Unnamed: 0,A,B
2022-12-03,1.073745,0.203765
2022-12-04,-0.772181,0.434377


In [37]:
# ':' keeps every row; only columns A and B are selected.
df_any_period.loc[:, ['A', 'B']]

Unnamed: 0,A,B
2022-12-01,-0.332921,-0.71134
2022-12-02,-1.009581,-0.257115
2022-12-03,1.073745,0.203765
2022-12-04,-0.772181,0.434377
2022-12-05,-0.678222,-0.668636


In [38]:
# .iloc selects by integer position: (row 0, column 0) returns a single scalar.
df_any_period.iloc[0, 0]

-0.33292080191381745

In [42]:
# Positional slices: rows 1-2 and columns 0-1 (stop positions are excluded).
df_any_period.iloc[1:3, 0:2]

Unnamed: 0,A,B
2022-12-02,-1.009581,-0.257115
2022-12-03,1.073745,0.203765


In [44]:
# Boolean indexing: keep only the rows where the value in column B is
# greater than 0.
df_any_period[df_any_period['B'] > 0]

Unnamed: 0,A,B,C
2022-12-03,1.073745,0.203765,1.208458
2022-12-04,-0.772181,0.434377,-0.536755


In [45]:
# Element-wise mask: a boolean frame inside [] keeps the frame's shape,
# leaving values greater than 0 as they are and replacing all others with
# NaN. No rows are dropped, and the condition is applied to every column,
# not just column B.
df_any_period[df_any_period > 0]

Unnamed: 0,A,B,C
2022-12-01,,,1.148911
2022-12-02,,,0.010697
2022-12-03,1.073745,0.203765,1.208458
2022-12-04,,0.434377,
2022-12-05,,,0.167815


In [47]:
# Add a column: assign() builds a new frame with the extra column D,
# so df_any_period itself is left unchanged.
df_any_period_copied = df_any_period.assign(D=[1.0, 1.0, 2.0, 3.0, 4.0])
df_any_period_copied

Unnamed: 0,A,B,C,D
2022-12-01,-0.332921,-0.71134,1.148911,1.0
2022-12-02,-1.009581,-0.257115,0.010697,1.0
2022-12-03,1.073745,0.203765,1.208458,2.0
2022-12-04,-0.772181,0.434377,-0.536755,3.0
2022-12-05,-0.678222,-0.668636,0.167815,4.0


In [48]:
# Keep the rows whose D value is one of the listed values (1.0 or 4.0).
df_any_period_copied[df_any_period_copied['D'].isin([1.0, 4.0])]

Unnamed: 0,A,B,C,D
2022-12-01,-0.332921,-0.71134,1.148911,1.0
2022-12-02,-1.009581,-0.257115,0.010697,1.0
2022-12-05,-0.678222,-0.668636,0.167815,4.0


In [49]:
# Move every value down by one row; the index stays in place, so the first
# row is filled with NaN and the last row's values drop out.
df_any_period_copied.shift(periods=1)

Unnamed: 0,A,B,C,D
2022-12-01,,,,
2022-12-02,-0.332921,-0.71134,1.148911,1.0
2022-12-03,-1.009581,-0.257115,0.010697,1.0
2022-12-04,1.073745,0.203765,1.208458,2.0
2022-12-05,-0.772181,0.434377,-0.536755,3.0


In [51]:
# 2x2 frame of standard-normal samples with default integer labels.
# A dedicated seeded generator is used instead of the unseeded global
# np.random state so the cell shows the same table on every run.
df_new_random = pd.DataFrame(
    np.random.default_rng(seed=2).standard_normal((2, 2))
)
df_new_random

Unnamed: 0,0,1
0,-0.971226,0.016182
1,1.758324,-1.422179


In [52]:
# Stack the frame on top of itself; the original index labels are kept,
# so 0 and 1 each appear twice in the result.
pd.concat(objs=[df_new_random, df_new_random], axis=0)

Unnamed: 0,0,1
0,-0.971226,0.016182
1,1.758324,-1.422179
0,-0.971226,0.016182
1,1.758324,-1.422179


In [54]:
# Small frame with a repeating string key in column A and standard-normal
# samples in column B.
# A dedicated seeded generator is used instead of the unseeded global
# np.random state so the cell shows the same table on every run.
df_new_sample = pd.DataFrame(
    {
        'A': ['hoge', 'fuga', 'hoge', 'fuga'],
        'B': np.random.default_rng(seed=3).standard_normal(4),
    }
)
df_new_sample

Unnamed: 0,A,B
0,hoge,-0.754946
1,fuga,-2.859355
2,hoge,-0.276131
3,fuga,0.110085


In [55]:
# Group by column A and find the sum of column B for each group.
# Group the rows by the key in column A and total column B within each group.
df_new_sample.groupby(by='A').sum()

Unnamed: 0_level_0,B
A,Unnamed: 1_level_1
fuga,-2.74927
hoge,-1.031077
