In [1]:
import sqlite3
import uuid
import random
import typing

In [8]:
# def echo_round() -> typing.Generator[int, float, str]:
#     sent = yield 0
#     while sent >= 0:
#         sent = yield round(sent)
#     return 'Done'

def counter(maximum: int) -> typing.Generator[int, int, None]:
    i = 0
    while i < maximum:
        val = yield i

        if val is not None:
            i = val
        else:
            i += 1

it = counter(10)
print(next(it))
print(next(it))
print(it.send(8))
print(next(it))



0
1
8
9


In [32]:
def counter_next(it: typing.Generator[int, int, None]) -> None:
    print("next")
    print("-" * 10)
    value = next(it)
    print(f"value: {value}")
    print("-" * 10)
    print()

def counter_send(it: typing.Generator[int, int, None], v: int) -> None:
    print("send")
    print("-" * 10)
    value = it.send(v)
    print(f"value: {value}")
    print("-" * 10)
    print()

def counter_test(maximum: int) -> typing.Generator[int, int, None]:
    i = 0
    while i < maximum:
        print(f"before count: {i}")
        val = yield i
        if val is not None:
            i = val
        else:
            i += 1

        print(f"after count: {i}")


it = counter_test(10)
counter_next(it)
counter_next(it)
counter_send(it, 8)
counter_next(it)



next
----------
before count: 0
value: 0
----------

next
----------
after count: 1
before count: 1
value: 1
----------

send
----------
after count: 8
before count: 8
value: 8
----------

next
----------
after count: 9
before count: 9
value: 9
----------



In [25]:
def init_users_table(n: int) -> None:
    conn = sqlite3.connect("sample_users.db")
    cursor = conn.cursor()

    cursor.execute("CREATE TABLE users(user_id, age)")

    for _ in range(n):
        user_id = str(uuid.uuid4())
        age = random.randint(0, 100)
        cursor.execute("INSERT INTO users VALUE (?, ?)", (user_id, age))
    
    conn.commit()
    conn.close()

def get_user_ids(age: int) -> typing.List[str]:
    conn = sqlite3.connect("sample_users.db")
    cursor = conn.cursor()

    cursor.execute("SELECT user_id FROM users WHERE age <= ?", (age,))

    return cursor.fetchall()

def get_user_ids_generator(age: int) -> typing.Generator[str, None, None]:
    conn = sqlite3.connect("sample_users.db")
    cursor = conn.cursor()

    cursor.execute("SELECT user_id FROM users WHERE age <= ?", (age, ))

    for user_id in cursor.fetchall():
        yield user_id





In [7]:
import pandas as pd
import numpy as np

df = pd.DataFrame(
    {
        'A': ('Alfa', 'Bravo', 'Charlie', 'Delta', 'Echo'),
        'B': [1, 10, 100, 1000, 10000],
        'C': np.linspace(0, 2, 5),
    },
    index=('a', 'b', 'c', 'd', 'e')
    # {
    #     'A': ('Alfa', 'Bravo', 'Charlie', 'Delta', 'Echo'),
    #     'B': [1, 10, 100, 1000, 10000],
    #     'C': np.linspace(0, 2, 5),
    # },
    # index=('a', 'b', 'c', 'd', 'e')
)

df

Unnamed: 0,A,B,C
a,Alfa,1,0.0
b,Bravo,10,0.5
c,Charlie,100,1.0
d,Delta,1000,1.5
e,Echo,10000,2.0


In [8]:
df['B']

a        1
b       10
c      100
d     1000
e    10000
Name: B, dtype: int64

In [9]:
df.loc['c']

A    Charlie
B        100
C        1.0
Name: c, dtype: object

In [10]:
df.index

Index(['a', 'b', 'c', 'd', 'e'], dtype='object')

In [11]:
df.values

array([['Alfa', 1, 0.0],
       ['Bravo', 10, 0.5],
       ['Charlie', 100, 1.0],
       ['Delta', 1000, 1.5],
       ['Echo', 10000, 2.0]], dtype=object)

In [None]:
# Section 1-2

In [19]:
import pandas as pd
import numpy as np

N = 5

df = pd.DataFrame({
    'list_int': list(range(N)),
    'list_float': [i*0.1 for i in range(N)],
    'np_float16': np.linspace(0, 1., num=N, dtype='float16'),
    'np_float32': np.linspace(0, 1., num=N, dtype='float32'),
    'category_animal': ('cat', 'dog', 'dog', 'cat', 'cat'),
    'category_size': ['MIDDLE', 'LARGE', 'SMALL', 'EXTRA-SMALL', 'MIDDLE'],
    'date': ['1981-03-05', '1993-04-10', '2005-07-15', '2017-10-20', '2029-12-25']
})

df

  has_large_values = (abs_vals > 1e6).any()


Unnamed: 0,list_int,list_float,np_float16,np_float32,category_animal,category_size,date
0,0,0.0,0.0,0.0,cat,MIDDLE,1981-03-05
1,1,0.1,0.25,0.25,dog,LARGE,1993-04-10
2,2,0.2,0.5,0.5,dog,SMALL,2005-07-15
3,3,0.3,0.75,0.75,cat,EXTRA-SMALL,2017-10-20
4,4,0.4,1.0,1.0,cat,MIDDLE,2029-12-25


In [17]:
df.dtypes

list_int             int64
list_float         float64
np_float16         float16
np_float32         float32
category_animal     object
category_size       object
date                object
dtype: object

In [20]:
df_change = df.astype({
    'list_int': 'int32',
    'np_float16': 'float64'
})

df_change.dtypes

list_int             int32
list_float         float64
np_float16         float64
np_float32         float32
category_animal     object
category_size       object
date                object
dtype: object

In [21]:
df_change

Unnamed: 0,list_int,list_float,np_float16,np_float32,category_animal,category_size,date
0,0,0.0,0.0,0.0,cat,MIDDLE,1981-03-05
1,1,0.1,0.25,0.25,dog,LARGE,1993-04-10
2,2,0.2,0.5,0.5,dog,SMALL,2005-07-15
3,3,0.3,0.75,0.75,cat,EXTRA-SMALL,2017-10-20
4,4,0.4,1.0,1.0,cat,MIDDLE,2029-12-25


In [24]:
df_change = df_change.astype({
    'category_animal': 'string',
    'category_size': 'string'
})

df_change.dtypes

list_int                    int32
list_float                float64
np_float16                float64
np_float32                float32
category_animal    string[python]
category_size      string[python]
date                       object
dtype: object

In [26]:
df = df.astype({
    'category_animal': 'category',
    'category_size': 'category'
})

df.dtypes

list_int              int64
list_float          float64
np_float16          float16
np_float32          float32
category_animal    category
category_size      category
date                 object
dtype: object

In [27]:
df['category_animal'].unique()

['cat', 'dog']
Categories (2, object): ['cat', 'dog']

In [28]:
df['category_animal'].values

['cat', 'dog', 'dog', 'cat', 'cat']
Categories (2, object): ['cat', 'dog']

In [29]:
df['category_animal'] = df['category_animal'].cat.set_categories(
    ['cat', 'dog'],
    ordered=False
)

In [30]:
df['category_animal'].values

['cat', 'dog', 'dog', 'cat', 'cat']
Categories (2, object): ['cat', 'dog']

In [31]:
df['category_size'].values

['MIDDLE', 'LARGE', 'SMALL', 'EXTRA-SMALL', 'MIDDLE']
Categories (4, object): ['EXTRA-SMALL', 'LARGE', 'MIDDLE', 'SMALL']

In [32]:
df['category_size'] = df['category_size'].cat.set_categories(
    ['EXTRA-SMALL', 'SMALL', 'MIDDLE', 'LARGE', 'EXTRA-LARGE'],
    ordered=True
)
df['category_size'].values

['MIDDLE', 'LARGE', 'SMALL', 'EXTRA-SMALL', 'MIDDLE']
Categories (5, object): ['EXTRA-SMALL' < 'SMALL' < 'MIDDLE' < 'LARGE' < 'EXTRA-LARGE']

In [33]:
df['category_size'].values.codes

array([2, 3, 1, 0, 2], dtype=int8)

In [34]:
df['date'] = pd.to_datetime(df['date'])

df.dtypes

list_int                    int64
list_float                float64
np_float16                float16
np_float32                float32
category_animal          category
category_size            category
date               datetime64[ns]
dtype: object

In [36]:
categories_animal = ['cat', 'dog']
categories_size = ['EXTRA-SMALL', 'SMALL', 'MIDDLE', 'LARGE', 'EXTRA-LARGE']

df = pd.DataFrame({
    'category_animal': pd.Categorical(
        ('cat', 'dog', 'dog', 'cat', 'cat'),
        ordered=False,
        categories=categories_animal),
    'category_size': pd.Categorical(
        ['MIDDLE', 'LARGE', 'SMALL', 'EXTRA-SMALL', 'MIDDLE'],
        ordered=True,
        categories=categories_size),
    'date': pd.to_datetime(['1981-03-05', '1993-04-10', '2005-07-15', '2017-10-20', '2029-12-25'])
})

df.dtypes

category_animal          category
category_size            category
date               datetime64[ns]
dtype: object

In [None]:
# Section 1-3

In [40]:
import pandas as pd
import numpy as np

N = 16

df = pd.DataFrame({
    'A': (
        'Alfa', 'Bravo', 'Charlie', 'Alfa',
        'Bravo', 'Charlie', 'Alfa', 'Bravo',
        'Charlie', 'Alfa', 'Bravo', 'Charlie',
        'Alfa', 'Bravo', 'Charlie', 'Alfa'
    ),
    'B': list(range(N)),
    'C': np.arange(0, 2*N, step=2, dtype='int32'),
    'D': np.linspace(0, 1., num=N, dtype='float32'),
})

df

Unnamed: 0,A,B,C,D
0,Alfa,0,0,0.0
1,Bravo,1,2,0.066667
2,Charlie,2,4,0.133333
3,Alfa,3,6,0.2
4,Bravo,4,8,0.266667
5,Charlie,5,10,0.333333
6,Alfa,6,12,0.4
7,Bravo,7,14,0.466667
8,Charlie,8,16,0.533333
9,Alfa,9,18,0.6


In [61]:
print('------ df.dtypes 型 ------')
print('')
print(df.dtypes)
print('')
print('------ df.shape 行列 ------')
print('')
print(df.shape)
print('')
print('------ df.size 全要素数 ------')
print('')
print(df.size)
print('')
print('------ df.axes 行・列インデックス ------')
print('')
print(df.axes)
print('')
print('------ df.axes 行・列インデックス ------')
print('')
print(df.axes)
print('')
print('------ df.index 行インデックス ------')
print('')
print(df.index)
print('')
print('------ df.columns 列インデックス ------')
print('')
print(df.columns)
print('')
print('------ df.info() インスタンス概要 ------')
print('')
print(df.info())
print('')
print('------ df.describe() ------')
print('')
display(df.describe())
print('')
print("------ df.describe(exclude='number') 質的データ------")
print('')
display(df.describe(exclude='number'))
print('')
print("------ df.describe(include='all') ------")
print('')
display(df.describe(include='all'))
print('')

------ df.dtypes 型 ------

A     object
B      int64
C      int32
D    float32
dtype: object

------ df.shape 行列 ------

(16, 4)

------ df.size 全要素数 ------

64

------ df.axes 行・列インデックス ------

[RangeIndex(start=0, stop=16, step=1), Index(['A', 'B', 'C', 'D'], dtype='object')]

------ df.axes 行・列インデックス ------

[RangeIndex(start=0, stop=16, step=1), Index(['A', 'B', 'C', 'D'], dtype='object')]

------ df.index 行インデックス ------

RangeIndex(start=0, stop=16, step=1)

------ df.columns 列インデックス ------

Index(['A', 'B', 'C', 'D'], dtype='object')

------ df.info() インスタンス概要 ------

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16 entries, 0 to 15
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   A       16 non-null     object 
 1   B       16 non-null     int64  
 2   C       16 non-null     int32  
 3   D       16 non-null     float32
dtypes: float32(1), int32(1), int64(1), object(1)
memory usage: 512.0+ bytes
None

------ df.de

Unnamed: 0,B,C,D
count,16.0,16.0,16.0
mean,7.5,15.0,0.5
std,4.760952,9.521905,0.317397
min,0.0,0.0,0.0
25%,3.75,7.5,0.25
50%,7.5,15.0,0.5
75%,11.25,22.5,0.75
max,15.0,30.0,1.0



------ df.describe(exclude='number') 質的データ------



Unnamed: 0,A
count,16
unique,3
top,Alfa
freq,6



------ df.describe(include='all') ------



Unnamed: 0,A,B,C,D
count,16,16.0,16.0,16.0
unique,3,,,
top,Alfa,,,
freq,6,,,
mean,,7.5,15.0,0.5
std,,4.760952,9.521905,0.317397
min,,0.0,0.0,0.0
25%,,3.75,7.5,0.25
50%,,7.5,15.0,0.5
75%,,11.25,22.5,0.75





In [62]:
# Section 1-4

In [63]:
import pandas as pd
import numpy as np

data = np.arange(12).reshape(3, 4)

df = pd.DataFrame(
    data,
    columns=['A', 'B', 'C', 'D'],
    index=['a', 'b', 'c'],
    dtype='int8'
)

df

Unnamed: 0,A,B,C,D
a,0,1,2,3
b,4,5,6,7
c,8,9,10,11


In [64]:
df.dtypes

A    int8
B    int8
C    int8
D    int8
dtype: object

In [65]:
df.to_numpy()

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]], dtype=int8)

In [66]:
data = [
    [ 0,  1,  2,  3],
    [ 4,  5,  6,  7],
    [ 8,  9, 10, 11]
]

df = pd.DataFrame(
    data,
    columns=['A', 'B', 'C', 'D'],
    index=['a', 'b', 'c']
)

df

Unnamed: 0,A,B,C,D
a,0,1,2,3
b,4,5,6,7
c,8,9,10,11


In [67]:
df.to_numpy().tolist()

[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11]]

In [76]:
df.to_numpy().shape

(3, 4)

In [79]:
type(df.to_numpy().tolist())

list

In [81]:
len(df.to_numpy())

3

In [82]:
df = pd.DataFrame(
    {
        'A': {'a': 0, 'b': 4, 'c': 8},
        'B': {'a': 1, 'b': 5, 'c': 9}, 
        'C': {'a': 2, 'b': 6, 'c': 10}, 
        'D': {'a': 3, 'b': 7, 'c': 11}, 
    }
)

df

Unnamed: 0,A,B,C,D
a,0,1,2,3
b,4,5,6,7
c,8,9,10,11


In [83]:
df.to_dict()

{'A': {'a': 0, 'b': 4, 'c': 8},
 'B': {'a': 1, 'b': 5, 'c': 9},
 'C': {'a': 2, 'b': 6, 'c': 10},
 'D': {'a': 3, 'b': 7, 'c': 11}}

In [84]:
df.to_dict(orient='list')

{'A': [0, 4, 8], 'B': [1, 5, 9], 'C': [2, 6, 10], 'D': [3, 7, 11]}

In [85]:
df.to_dict(orient='series')

{'A': a    0
 b    4
 c    8
 Name: A, dtype: int64,
 'B': a    1
 b    5
 c    9
 Name: B, dtype: int64,
 'C': a     2
 b     6
 c    10
 Name: C, dtype: int64,
 'D': a     3
 b     7
 c    11
 Name: D, dtype: int64}

In [88]:
df.loc['a', :]

A    0
B    1
C    2
D    3
Name: a, dtype: int64

In [89]:
df.to_dict(orient='split')

{'index': ['a', 'b', 'c'],
 'columns': ['A', 'B', 'C', 'D'],
 'data': [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11]]}

In [90]:
df.to_dict(orient='records')

[{'A': 0, 'B': 1, 'C': 2, 'D': 3},
 {'A': 4, 'B': 5, 'C': 6, 'D': 7},
 {'A': 8, 'B': 9, 'C': 10, 'D': 11}]

In [91]:
# Section 1-4

In [92]:
import pandas as pd
import numpy as np

df = pd.read_csv('sample_1_5.csv')

df.head()

Unnamed: 0,col_int8,col_uint8,col_int16,col_uint16,col_int32,col_uint32,col_int64,col_uint64,col_float16,col_float32,col_float64,col_bool,col_string,col_datetime,col_ordered,col_unordered
0,-58,32,-5999,33733,-443321446,1684055143,8957714763128186512,13377746478414994568,0.8325,0.235277,0.600021,True,SCByzDL,1700-01-01 13:48:28,SMALL,Rabbit
1,108,11,-3541,4638,-1033826067,1396190299,-4638881529941010573,17552204462794163122,0.845,0.741379,0.526817,True,a5qX,1700-01-02 03:35:14,MIDDLE,Dog
2,92,206,28962,39774,-2059205335,1287593496,-2767984207359509796,8444497977544042195,0.6675,0.025393,0.948699,True,nO1,1700-01-03 17:56:08,EXTRA-LARGE,Rabbit
3,-13,11,-20849,17724,722125942,1777461955,-4014536580365825640,11826527118608492920,0.1276,0.093628,0.72293,True,5E8WXw,1700-01-04 00:08:29,SMALL,Hamster
4,12,212,13470,57662,812726532,1340894438,-1791361854263043924,2901203935713848353,0.62,0.325553,0.450991,False,v0TZ8qlw,1700-01-05 16:37:16,EXTRA-SMALL,Mouse


In [93]:
df.tail()

Unnamed: 0,col_int8,col_uint8,col_int16,col_uint16,col_int32,col_uint32,col_int64,col_uint64,col_float16,col_float32,col_float64,col_bool,col_string,col_datetime,col_ordered,col_unordered
199995,66,27,-30929,28387,-430623839,4118004906,-5611253302571762236,7378766867233303579,0.7812,0.44647,0.222961,False,tVSnZ,2247-07-28 18:36:19,SMALL,Dog
199996,-89,168,-30225,33137,-834523890,4029728482,-1497397032562367152,426161160883600185,0.2013,0.580881,0.796438,True,lop62Ql,2247-07-29 04:39:44,MIDDLE,Rabbit
199997,67,139,11322,11438,-781825655,1627514738,-2462494486884866133,5838011223012641606,0.3394,0.771871,0.092576,False,RBcp4,2247-07-30 00:04:20,EXTRA-LARGE,Cat
199998,-128,139,-19318,19901,15579203,105928120,-8892563666988164059,14420138694162308271,0.3157,0.740483,0.434867,True,9fwSUP1x,2247-07-31 19:43:42,LARGE,Ferret
199999,97,195,-995,43415,512988475,2416444269,-3183461793452287960,4007191972534931082,0.784,0.247407,0.953553,False,EJEc9prG,2247-08-01 13:58:21,MIDDLE,Dog


In [97]:
def display_df_info(df):
    print('------ df.dtypes 型 ------')
    print('')
    print(df.dtypes)
    print('')
    print('------ df.shape 行列 ------')
    print('')
    print(df.shape)
    print('')
    print('------ df.size 全要素数 ------')
    print('')
    print(df.size)
    print('')
    print('------ df.axes 行・列インデックス ------')
    print('')
    print(df.axes)
    print('')
    print('------ df.axes 行・列インデックス ------')
    print('')
    print(df.axes)
    print('')
    print('------ df.index 行インデックス ------')
    print('')
    print(df.index)
    print('')
    print('------ df.columns 列インデックス ------')
    print('')
    print(df.columns)
    print('')
    print('------ df.info() インスタンス概要 ------')
    print('')
    print(df.info())
    print('')
    print('------ df.describe() ------')
    print('')
    display(df.describe())
    print('')
    print("------ df.describe(exclude='number') 質的データ------")
    print('')
    display(df.describe(exclude='number'))
    print('')
    print("------ df.describe(include='all') ------")
    print('')
    display(df.describe(include='all'))
    print('')

In [98]:
display_df_info(df)

------ df.dtypes 型 ------

col_int8                   int8
col_uint8                 int64
col_int16                 int16
col_uint16                int64
col_int32                 int32
col_uint32                int64
col_int64                 int64
col_uint64               uint64
col_float16             float16
col_float32             float32
col_float64             float64
col_bool                   bool
col_string       string[python]
col_datetime     datetime64[ns]
col_ordered            category
col_unordered          category
dtype: object

------ df.shape 行列 ------

(200000, 16)

------ df.size 全要素数 ------

3200000

------ df.axes 行・列インデックス ------

[RangeIndex(start=0, stop=200000, step=1), Index(['col_int8', 'col_uint8', 'col_int16', 'col_uint16', 'col_int32',
       'col_uint32', 'col_int64', 'col_uint64', 'col_float16', 'col_float32',
       'col_float64', 'col_bool', 'col_string', 'col_datetime', 'col_ordered',
       'col_unordered'],
      dtype='object')]

------ df.axes

  return dtype.type(n)
  return umr_sum(a, axis, dtype, out, keepdims, initial, where)
  the_mean = the_sum / count if count > 0 else np.nan


Unnamed: 0,col_int8,col_uint8,col_int16,col_uint16,col_int32,col_uint32,col_int64,col_uint64,col_float16,col_float32,col_float64,col_datetime
count,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000
mean,-0.81983,127.2375,3.76616,32757.689815,-1093583.0,2147732000.0,-3423150000000000.0,9.224742e+18,,0.499516,0.499562,1973-10-16 23:58:19.229404832
min,-128.0,0.0,-32768.0,0.0,-2147472000.0,30885.0,-9.223297e+18,22730810000000.0,1e-06,1.2e-05,1.8e-05,1700-01-01 13:48:28
25%,-65.0,63.0,-16315.25,16342.0,-1076584000.0,1069189000.0,-4.628095e+18,4.600351e+18,0.249146,0.249236,0.248078,1836-11-24 10:42:05.750000128
50%,-1.0,127.0,-45.0,32781.5,-5828318.0,2148725000.0,-2.601359e+16,9.24357e+18,0.5,0.498952,0.499372,1973-10-17 04:07:38.500000
75%,63.0,191.0,16372.0,49133.0,1073612000.0,3227945000.0,4.624729e+18,1.382228e+19,0.749512,0.74962,0.75019,2110-09-08 20:02:38.249999872
max,127.0,255.0,32767.0,65535.0,2147443000.0,4294913000.0,9.223351e+18,1.844656e+19,1.0,0.999997,0.999992,2247-08-01 13:58:21
std,73.925213,73.745211,18906.273278,18921.442771,1242073000.0,1242069000.0,5.333178e+18,5.323004e+18,0.0,0.288506,0.288981,



------ df.describe(exclude='number') 質的データ------



Unnamed: 0,col_bool,col_string,col_datetime,col_ordered,col_unordered
count,200000,200000,200000,200000,200000
unique,2,197767,,5,6
top,True,xqD,,EXTRA-LARGE,Rabbit
freq,100172,4,,40183,33602
mean,,,1973-10-16 23:58:19.229404832,,
min,,,1700-01-01 13:48:28,,
25%,,,1836-11-24 10:42:05.750000128,,
50%,,,1973-10-17 04:07:38.500000,,
75%,,,2110-09-08 20:02:38.249999872,,
max,,,2247-08-01 13:58:21,,



------ df.describe(include='all') ------



  return dtype.type(n)
  return umr_sum(a, axis, dtype, out, keepdims, initial, where)
  the_mean = the_sum / count if count > 0 else np.nan


Unnamed: 0,col_int8,col_uint8,col_int16,col_uint16,col_int32,col_uint32,col_int64,col_uint64,col_float16,col_float32,col_float64,col_bool,col_string,col_datetime,col_ordered,col_unordered
count,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000,200000,200000,200000,200000
unique,,,,,,,,,,,,2,197767,,5,6
top,,,,,,,,,,,,True,xqD,,EXTRA-LARGE,Rabbit
freq,,,,,,,,,,,,100172,4,,40183,33602
mean,-0.81983,127.2375,3.76616,32757.689815,-1093583.0,2147732000.0,-3423150000000000.0,9.224742e+18,,0.499516,0.499562,,,1973-10-16 23:58:19.229404832,,
min,-128.0,0.0,-32768.0,0.0,-2147472000.0,30885.0,-9.223297e+18,22730810000000.0,1e-06,1.2e-05,1.8e-05,,,1700-01-01 13:48:28,,
25%,-65.0,63.0,-16315.25,16342.0,-1076584000.0,1069189000.0,-4.628095e+18,4.600351e+18,0.249146,0.249236,0.248078,,,1836-11-24 10:42:05.750000128,,
50%,-1.0,127.0,-45.0,32781.5,-5828318.0,2148725000.0,-2.601359e+16,9.24357e+18,0.5,0.498952,0.499372,,,1973-10-17 04:07:38.500000,,
75%,63.0,191.0,16372.0,49133.0,1073612000.0,3227945000.0,4.624729e+18,1.382228e+19,0.749512,0.74962,0.75019,,,2110-09-08 20:02:38.249999872,,
max,127.0,255.0,32767.0,65535.0,2147443000.0,4294913000.0,9.223351e+18,1.844656e+19,1.0,0.999997,0.999992,,,2247-08-01 13:58:21,,





In [96]:
df = pd.read_csv(
    'sample_1_5.csv',
    dtype={
        'col_int8': 'int8',
        'col_unit8': 'uint8',
        'col_int16': 'int16',
        'col_unit16': 'uint16',
        'col_int32': 'int32',
        'col_unit32': 'uint32',
        'col_int64': 'int64',
        'col_unit64': 'uint64',
        'col_float16': 'float16',
        'col_float32': 'float32',
        'col_float64': 'float64',
        'col_bool': 'bool',
        'col_string': 'string',
        'col_ordered': 'category',
        'col_unordered': 'category',
    },
    parse_dates=[13]
)

ordered_categories = ['EXTRA-SMALL', 'SMALL', 'MIDDLE', 'LARGE', 'EXTRA-LARGE']

df['col_ordered'] = df['col_ordered'].cat.set_categories(
    ordered_categories,
    ordered=True
)

unordered_categories = [
    'Mouse',
    'Cat',
    'Dog',
    'Hamster',
    'Rabbit',
    'Ferret'
]

df['col_unordered'] = df['col_unordered'].cat.set_categories(
    unordered_categories,
    ordered=False
)

df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 16 columns):
 #   Column         Non-Null Count   Dtype         
---  ------         --------------   -----         
 0   col_int8       200000 non-null  int8          
 1   col_uint8      200000 non-null  int64         
 2   col_int16      200000 non-null  int16         
 3   col_uint16     200000 non-null  int64         
 4   col_int32      200000 non-null  int32         
 5   col_uint32     200000 non-null  int64         
 6   col_int64      200000 non-null  int64         
 7   col_uint64     200000 non-null  uint64        
 8   col_float16    200000 non-null  float16       
 9   col_float32    200000 non-null  float32       
 10  col_float64    200000 non-null  float64       
 11  col_bool       200000 non-null  bool          
 12  col_string     200000 non-null  string        
 13  col_datetime   200000 non-null  datetime64[ns]
 14  col_ordered    200000 non-null  category      
 15  

In [99]:
display_df_info(df)

------ df.dtypes 型 ------

col_int8                   int8
col_uint8                 int64
col_int16                 int16
col_uint16                int64
col_int32                 int32
col_uint32                int64
col_int64                 int64
col_uint64               uint64
col_float16             float16
col_float32             float32
col_float64             float64
col_bool                   bool
col_string       string[python]
col_datetime     datetime64[ns]
col_ordered            category
col_unordered          category
dtype: object

------ df.shape 行列 ------

(200000, 16)

------ df.size 全要素数 ------

3200000

------ df.axes 行・列インデックス ------

[RangeIndex(start=0, stop=200000, step=1), Index(['col_int8', 'col_uint8', 'col_int16', 'col_uint16', 'col_int32',
       'col_uint32', 'col_int64', 'col_uint64', 'col_float16', 'col_float32',
       'col_float64', 'col_bool', 'col_string', 'col_datetime', 'col_ordered',
       'col_unordered'],
      dtype='object')]

------ df.axes

  return dtype.type(n)
  return umr_sum(a, axis, dtype, out, keepdims, initial, where)
  the_mean = the_sum / count if count > 0 else np.nan


Unnamed: 0,col_int8,col_uint8,col_int16,col_uint16,col_int32,col_uint32,col_int64,col_uint64,col_float16,col_float32,col_float64,col_datetime
count,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000
mean,-0.81983,127.2375,3.76616,32757.689815,-1093583.0,2147732000.0,-3423150000000000.0,9.224742e+18,,0.499516,0.499562,1973-10-16 23:58:19.229404832
min,-128.0,0.0,-32768.0,0.0,-2147472000.0,30885.0,-9.223297e+18,22730810000000.0,1e-06,1.2e-05,1.8e-05,1700-01-01 13:48:28
25%,-65.0,63.0,-16315.25,16342.0,-1076584000.0,1069189000.0,-4.628095e+18,4.600351e+18,0.249146,0.249236,0.248078,1836-11-24 10:42:05.750000128
50%,-1.0,127.0,-45.0,32781.5,-5828318.0,2148725000.0,-2.601359e+16,9.24357e+18,0.5,0.498952,0.499372,1973-10-17 04:07:38.500000
75%,63.0,191.0,16372.0,49133.0,1073612000.0,3227945000.0,4.624729e+18,1.382228e+19,0.749512,0.74962,0.75019,2110-09-08 20:02:38.249999872
max,127.0,255.0,32767.0,65535.0,2147443000.0,4294913000.0,9.223351e+18,1.844656e+19,1.0,0.999997,0.999992,2247-08-01 13:58:21
std,73.925213,73.745211,18906.273278,18921.442771,1242073000.0,1242069000.0,5.333178e+18,5.323004e+18,0.0,0.288506,0.288981,



------ df.describe(exclude='number') 質的データ------



Unnamed: 0,col_bool,col_string,col_datetime,col_ordered,col_unordered
count,200000,200000,200000,200000,200000
unique,2,197767,,5,6
top,True,xqD,,EXTRA-LARGE,Rabbit
freq,100172,4,,40183,33602
mean,,,1973-10-16 23:58:19.229404832,,
min,,,1700-01-01 13:48:28,,
25%,,,1836-11-24 10:42:05.750000128,,
50%,,,1973-10-17 04:07:38.500000,,
75%,,,2110-09-08 20:02:38.249999872,,
max,,,2247-08-01 13:58:21,,



------ df.describe(include='all') ------



  return dtype.type(n)
  return umr_sum(a, axis, dtype, out, keepdims, initial, where)
  the_mean = the_sum / count if count > 0 else np.nan


Unnamed: 0,col_int8,col_uint8,col_int16,col_uint16,col_int32,col_uint32,col_int64,col_uint64,col_float16,col_float32,col_float64,col_bool,col_string,col_datetime,col_ordered,col_unordered
count,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000,200000,200000,200000,200000
unique,,,,,,,,,,,,2,197767,,5,6
top,,,,,,,,,,,,True,xqD,,EXTRA-LARGE,Rabbit
freq,,,,,,,,,,,,100172,4,,40183,33602
mean,-0.81983,127.2375,3.76616,32757.689815,-1093583.0,2147732000.0,-3423150000000000.0,9.224742e+18,,0.499516,0.499562,,,1973-10-16 23:58:19.229404832,,
min,-128.0,0.0,-32768.0,0.0,-2147472000.0,30885.0,-9.223297e+18,22730810000000.0,1e-06,1.2e-05,1.8e-05,,,1700-01-01 13:48:28,,
25%,-65.0,63.0,-16315.25,16342.0,-1076584000.0,1069189000.0,-4.628095e+18,4.600351e+18,0.249146,0.249236,0.248078,,,1836-11-24 10:42:05.750000128,,
50%,-1.0,127.0,-45.0,32781.5,-5828318.0,2148725000.0,-2.601359e+16,9.24357e+18,0.5,0.498952,0.499372,,,1973-10-17 04:07:38.500000,,
75%,63.0,191.0,16372.0,49133.0,1073612000.0,3227945000.0,4.624729e+18,1.382228e+19,0.749512,0.74962,0.75019,,,2110-09-08 20:02:38.249999872,,
max,127.0,255.0,32767.0,65535.0,2147443000.0,4294913000.0,9.223351e+18,1.844656e+19,1.0,0.999997,0.999992,,,2247-08-01 13:58:21,,





In [100]:
df_usecols = pd.read_csv(
    'sample_1_5.csv',
    usecols=[0, 2, 4, 6, 8],
    dtype={
        'col_int8': 'int8',
        'col_int16': 'int16',
        'col_int32': 'int32',
        'col_int64': 'int64',
        'col_float64': 'float16',
    }
)

df_usecols.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 5 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   col_int8     200000 non-null  int8   
 1   col_int16    200000 non-null  int16  
 2   col_int32    200000 non-null  int32  
 3   col_int64    200000 non-null  int64  
 4   col_float16  200000 non-null  float64
dtypes: float64(1), int16(1), int32(1), int64(1), int8(1)
memory usage: 4.4 MB


In [101]:
display_df_info(df)

------ df.dtypes 型 ------

col_int8                   int8
col_uint8                 int64
col_int16                 int16
col_uint16                int64
col_int32                 int32
col_uint32                int64
col_int64                 int64
col_uint64               uint64
col_float16             float16
col_float32             float32
col_float64             float64
col_bool                   bool
col_string       string[python]
col_datetime     datetime64[ns]
col_ordered            category
col_unordered          category
dtype: object

------ df.shape 行列 ------

(200000, 16)

------ df.size 全要素数 ------

3200000

------ df.axes 行・列インデックス ------

[RangeIndex(start=0, stop=200000, step=1), Index(['col_int8', 'col_uint8', 'col_int16', 'col_uint16', 'col_int32',
       'col_uint32', 'col_int64', 'col_uint64', 'col_float16', 'col_float32',
       'col_float64', 'col_bool', 'col_string', 'col_datetime', 'col_ordered',
       'col_unordered'],
      dtype='object')]

------ df.axes

  return dtype.type(n)
  return umr_sum(a, axis, dtype, out, keepdims, initial, where)
  the_mean = the_sum / count if count > 0 else np.nan


Unnamed: 0,col_int8,col_uint8,col_int16,col_uint16,col_int32,col_uint32,col_int64,col_uint64,col_float16,col_float32,col_float64,col_datetime
count,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000
mean,-0.81983,127.2375,3.76616,32757.689815,-1093583.0,2147732000.0,-3423150000000000.0,9.224742e+18,,0.499516,0.499562,1973-10-16 23:58:19.229404832
min,-128.0,0.0,-32768.0,0.0,-2147472000.0,30885.0,-9.223297e+18,22730810000000.0,1e-06,1.2e-05,1.8e-05,1700-01-01 13:48:28
25%,-65.0,63.0,-16315.25,16342.0,-1076584000.0,1069189000.0,-4.628095e+18,4.600351e+18,0.249146,0.249236,0.248078,1836-11-24 10:42:05.750000128
50%,-1.0,127.0,-45.0,32781.5,-5828318.0,2148725000.0,-2.601359e+16,9.24357e+18,0.5,0.498952,0.499372,1973-10-17 04:07:38.500000
75%,63.0,191.0,16372.0,49133.0,1073612000.0,3227945000.0,4.624729e+18,1.382228e+19,0.749512,0.74962,0.75019,2110-09-08 20:02:38.249999872
max,127.0,255.0,32767.0,65535.0,2147443000.0,4294913000.0,9.223351e+18,1.844656e+19,1.0,0.999997,0.999992,2247-08-01 13:58:21
std,73.925213,73.745211,18906.273278,18921.442771,1242073000.0,1242069000.0,5.333178e+18,5.323004e+18,0.0,0.288506,0.288981,



------ df.describe(exclude='number') 質的データ------



Unnamed: 0,col_bool,col_string,col_datetime,col_ordered,col_unordered
count,200000,200000,200000,200000,200000
unique,2,197767,,5,6
top,True,xqD,,EXTRA-LARGE,Rabbit
freq,100172,4,,40183,33602
mean,,,1973-10-16 23:58:19.229404832,,
min,,,1700-01-01 13:48:28,,
25%,,,1836-11-24 10:42:05.750000128,,
50%,,,1973-10-17 04:07:38.500000,,
75%,,,2110-09-08 20:02:38.249999872,,
max,,,2247-08-01 13:58:21,,



------ df.describe(include='all') ------



  return dtype.type(n)
  return umr_sum(a, axis, dtype, out, keepdims, initial, where)
  the_mean = the_sum / count if count > 0 else np.nan


Unnamed: 0,col_int8,col_uint8,col_int16,col_uint16,col_int32,col_uint32,col_int64,col_uint64,col_float16,col_float32,col_float64,col_bool,col_string,col_datetime,col_ordered,col_unordered
count,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000,200000,200000,200000,200000
unique,,,,,,,,,,,,2,197767,,5,6
top,,,,,,,,,,,,True,xqD,,EXTRA-LARGE,Rabbit
freq,,,,,,,,,,,,100172,4,,40183,33602
mean,-0.81983,127.2375,3.76616,32757.689815,-1093583.0,2147732000.0,-3423150000000000.0,9.224742e+18,,0.499516,0.499562,,,1973-10-16 23:58:19.229404832,,
min,-128.0,0.0,-32768.0,0.0,-2147472000.0,30885.0,-9.223297e+18,22730810000000.0,1e-06,1.2e-05,1.8e-05,,,1700-01-01 13:48:28,,
25%,-65.0,63.0,-16315.25,16342.0,-1076584000.0,1069189000.0,-4.628095e+18,4.600351e+18,0.249146,0.249236,0.248078,,,1836-11-24 10:42:05.750000128,,
50%,-1.0,127.0,-45.0,32781.5,-5828318.0,2148725000.0,-2.601359e+16,9.24357e+18,0.5,0.498952,0.499372,,,1973-10-17 04:07:38.500000,,
75%,63.0,191.0,16372.0,49133.0,1073612000.0,3227945000.0,4.624729e+18,1.382228e+19,0.749512,0.74962,0.75019,,,2110-09-08 20:02:38.249999872,,
max,127.0,255.0,32767.0,65535.0,2147443000.0,4294913000.0,9.223351e+18,1.844656e+19,1.0,0.999997,0.999992,,,2247-08-01 13:58:21,,





In [None]:
# Section 2-1

In [102]:
import pandas as pd
import numpy as np

df = pd.DataFrame(
    data=np.arange(20).reshape(5, 4),
    index=['a', 'b', 'c', 'd', 'e'],
    columns=['Alpha', 'Bravo', 'Charlie', 'Delta']
)

df

Unnamed: 0,Alpha,Bravo,Charlie,Delta
a,0,1,2,3
b,4,5,6,7
c,8,9,10,11
d,12,13,14,15
e,16,17,18,19


In [103]:
df.at['d', 'Bravo']

np.int64(13)

In [104]:
df.iat[3, 1]

np.int64(13)

In [105]:
df.loc['d', :]

Alpha      12
Bravo      13
Charlie    14
Delta      15
Name: d, dtype: int64

In [106]:
df.loc[['d'], :]

Unnamed: 0,Alpha,Bravo,Charlie,Delta
d,12,13,14,15


In [107]:
df.loc[:, 'Bravo']

a     1
b     5
c     9
d    13
e    17
Name: Bravo, dtype: int64

In [108]:
df.loc[:, ['Bravo']]

Unnamed: 0,Bravo
a,1
b,5
c,9
d,13
e,17


In [109]:
df.loc[['a', 'c', 'e'], :]

Unnamed: 0,Alpha,Bravo,Charlie,Delta
a,0,1,2,3
c,8,9,10,11
e,16,17,18,19


In [110]:
df.loc[:, ['Bravo', 'Delta']]

Unnamed: 0,Bravo,Delta
a,1,3
b,5,7
c,9,11
d,13,15
e,17,19


In [112]:
df.loc[['a', 'c', 'e'], ['Bravo', 'Delta']]

Unnamed: 0,Bravo,Delta
a,1,3
c,9,11
e,17,19


In [113]:
df.iloc[[0, 2, 4], [1, 3]]

Unnamed: 0,Bravo,Delta
a,1,3
c,9,11
e,17,19


In [114]:
df.iloc[1:4, 1:4]

Unnamed: 0,Bravo,Charlie,Delta
b,5,6,7
c,9,10,11
d,13,14,15


In [115]:
row_indices = np.array([0, 2, 3])
col_indices = np.arange(1, 4)

df.iloc[row_indices, col_indices]

Unnamed: 0,Bravo,Charlie,Delta
a,1,2,3
c,9,10,11
d,13,14,15


In [117]:
N = 100000
row_indices = np.random.randint(0, df.shape[0], N)
col_indices = np.random.randint(0, df.shape[1], N)

In [120]:
%%timeit

for row_index, col_index in zip(row_indices, col_indices):
    df.iloc[row_index, col_index]

1.98 s ± 295 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [121]:
%%timeit

for row_index, col_index in zip(row_indices, col_indices):
    df.iat[row_index, col_index]

1.61 s ± 262 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [122]:
df.filter(['Alpha', 'Charlie'])

Unnamed: 0,Alpha,Charlie
a,0,2
b,4,6
c,8,10
d,12,14
e,16,18


In [123]:
df.filter(like='r')

Unnamed: 0,Bravo,Charlie
a,1,2
b,5,6
c,9,10
d,13,14
e,17,18


In [124]:
df.filter(regex='.*l.*a')

Unnamed: 0,Alpha,Delta
a,0,3
b,4,7
c,8,11
d,12,15
e,16,19


In [125]:
df.filter(['a', 'e'], axis=0)

Unnamed: 0,Alpha,Bravo,Charlie,Delta
a,0,1,2,3
e,16,17,18,19


In [126]:
row_mask = [True, False, False, True, True]

df.loc[row_mask]

Unnamed: 0,Alpha,Bravo,Charlie,Delta
a,0,1,2,3
d,12,13,14,15
e,16,17,18,19


In [127]:
col_mask = [False, True, False, True]

df.loc[row_mask, col_mask]

Unnamed: 0,Bravo,Delta
a,1,3
d,13,15
e,17,19


In [128]:
row_mask = df.loc[:, 'Alpha'] > 6

row_mask

a    False
b    False
c     True
d     True
e     True
Name: Alpha, dtype: bool

In [129]:
col_mask = df.loc['d', :] % 2 == 0

col_mask

Alpha       True
Bravo      False
Charlie     True
Delta      False
Name: d, dtype: bool

In [130]:
df.loc[['d'], :] % 2 == 0


Unnamed: 0,Alpha,Bravo,Charlie,Delta
d,True,False,True,False


In [131]:
df.loc[row_mask, col_mask]

Unnamed: 0,Alpha,Charlie
c,8,10
d,12,14
e,16,18


In [132]:
df.loc[~row_mask, ~col_mask]

Unnamed: 0,Bravo,Delta
a,1,3
b,5,7


In [133]:
row_mask2 = df.loc[:, 'Alpha'].isin([8, 16])

col_mask2 = df.loc['b', :] == 7

df.loc[row_mask & row_mask2, col_mask | col_mask2]

Unnamed: 0,Alpha,Charlie,Delta
c,8,10,11
e,16,18,19


In [139]:
df.isin([0, 5, 10, 15])

Unnamed: 0,Alpha,Bravo,Charlie,Delta
a,True,False,False,False
b,False,True,False,False
c,False,False,True,False
d,False,False,False,True
e,False,False,False,False


In [141]:
row_mask = df['Alpha'].isin([0, 8, 16])

df.loc[row_mask]

Unnamed: 0,Alpha,Bravo,Charlie,Delta
a,0,1,2,3
c,8,9,10,11
e,16,17,18,19


In [142]:
df.query('Alpha > 6')

Unnamed: 0,Alpha,Bravo,Charlie,Delta
c,8,9,10,11
d,12,13,14,15
e,16,17,18,19


In [143]:
%%timeit

df.loc[df.loc[:, 'Alpha'] > 6]
df.loc[df.loc[:, 'Bravo'] == 9]
df.loc[df.loc[:, 'Charlie'] < 12]

740 µs ± 110 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [144]:
%%timeit

df.query('Alpha > 6')
df.query('Bravo == 9')
df.query('Charlie < 12')

4.88 ms ± 275 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [147]:
# Section 2-2

In [148]:
df = pd.read_csv(
    'sample_1_5.csv',
    dtype={
        'col_int8': 'int8',
        'col_uint8': 'uint8',
        'col_int16': 'int16',
        'col_uint16': 'uint16',
        'col_int32': 'int32',
        'col_uint32': 'uint32',
        'col_int64': 'int64',
        'col_uint64': 'uint64',
        'col_float16': 'float16',
        'col_float32': 'float32',
        'col_float64': 'float64',
        'col_bool': 'bool',
        'col_string': 'string',
        'col_ordered': 'category',
        'col_unordered': 'category',
    },
    parse_dates=[13]
)

ordered_categories = ['EXTRA-SMALL', 'SMALL', 'MIDDLE', 'LARGE', 'EXTRA-LARGE']

df['col_ordered'] = df['col_ordered'].cat.set_categories(
    ordered_categories,
    ordered=True)

unordered_categories = [
    'Mouse',
    'Cat',
    'Dog',
    'Hamster',
    'Rabbit',
    'Ferret']

df['col_unordered'] = df['col_unordered'].cat.set_categories(
    unordered_categories,
    ordered=False)

df.shape

(200000, 16)

In [149]:
display_df_info(df)

------ df.dtypes 型 ------

col_int8                   int8
col_uint8                 uint8
col_int16                 int16
col_uint16               uint16
col_int32                 int32
col_uint32               uint32
col_int64                 int64
col_uint64               uint64
col_float16             float16
col_float32             float32
col_float64             float64
col_bool                   bool
col_string       string[python]
col_datetime     datetime64[ns]
col_ordered            category
col_unordered          category
dtype: object

------ df.shape 行列 ------

(200000, 16)

------ df.size 全要素数 ------

3200000

------ df.axes 行・列インデックス ------

[RangeIndex(start=0, stop=200000, step=1), Index(['col_int8', 'col_uint8', 'col_int16', 'col_uint16', 'col_int32',
       'col_uint32', 'col_int64', 'col_uint64', 'col_float16', 'col_float32',
       'col_float64', 'col_bool', 'col_string', 'col_datetime', 'col_ordered',
       'col_unordered'],
      dtype='object')]

------ df.axes

  return dtype.type(n)
  return umr_sum(a, axis, dtype, out, keepdims, initial, where)
  the_mean = the_sum / count if count > 0 else np.nan


Unnamed: 0,col_int8,col_uint8,col_int16,col_uint16,col_int32,col_uint32,col_int64,col_uint64,col_float16,col_float32,col_float64,col_datetime
count,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000
mean,-0.81983,127.2375,3.76616,32757.689815,-1093583.0,2147732000.0,-3423150000000000.0,9.224742e+18,,0.499516,0.499562,1973-10-16 23:58:19.229404832
min,-128.0,0.0,-32768.0,0.0,-2147472000.0,30885.0,-9.223297e+18,22730810000000.0,1e-06,1.2e-05,1.8e-05,1700-01-01 13:48:28
25%,-65.0,63.0,-16315.25,16342.0,-1076584000.0,1069189000.0,-4.628095e+18,4.600351e+18,0.249146,0.249236,0.248078,1836-11-24 10:42:05.750000128
50%,-1.0,127.0,-45.0,32781.5,-5828318.0,2148725000.0,-2.601359e+16,9.24357e+18,0.5,0.498952,0.499372,1973-10-17 04:07:38.500000
75%,63.0,191.0,16372.0,49133.0,1073612000.0,3227945000.0,4.624729e+18,1.382228e+19,0.749512,0.74962,0.75019,2110-09-08 20:02:38.249999872
max,127.0,255.0,32767.0,65535.0,2147443000.0,4294913000.0,9.223351e+18,1.844656e+19,1.0,0.999997,0.999992,2247-08-01 13:58:21
std,73.925213,73.745211,18906.273278,18921.442771,1242073000.0,1242069000.0,5.333178e+18,5.323004e+18,0.0,0.288506,0.288981,



------ df.describe(exclude='number') 質的データ------



Unnamed: 0,col_bool,col_string,col_datetime,col_ordered,col_unordered
count,200000,200000,200000,200000,200000
unique,2,197767,,5,6
top,True,xqD,,EXTRA-LARGE,Rabbit
freq,100172,4,,40183,33602
mean,,,1973-10-16 23:58:19.229404832,,
min,,,1700-01-01 13:48:28,,
25%,,,1836-11-24 10:42:05.750000128,,
50%,,,1973-10-17 04:07:38.500000,,
75%,,,2110-09-08 20:02:38.249999872,,
max,,,2247-08-01 13:58:21,,



------ df.describe(include='all') ------



  return dtype.type(n)
  return umr_sum(a, axis, dtype, out, keepdims, initial, where)
  the_mean = the_sum / count if count > 0 else np.nan


Unnamed: 0,col_int8,col_uint8,col_int16,col_uint16,col_int32,col_uint32,col_int64,col_uint64,col_float16,col_float32,col_float64,col_bool,col_string,col_datetime,col_ordered,col_unordered
count,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000,200000,200000,200000,200000
unique,,,,,,,,,,,,2,197767,,5,6
top,,,,,,,,,,,,True,xqD,,EXTRA-LARGE,Rabbit
freq,,,,,,,,,,,,100172,4,,40183,33602
mean,-0.81983,127.2375,3.76616,32757.689815,-1093583.0,2147732000.0,-3423150000000000.0,9.224742e+18,,0.499516,0.499562,,,1973-10-16 23:58:19.229404832,,
min,-128.0,0.0,-32768.0,0.0,-2147472000.0,30885.0,-9.223297e+18,22730810000000.0,1e-06,1.2e-05,1.8e-05,,,1700-01-01 13:48:28,,
25%,-65.0,63.0,-16315.25,16342.0,-1076584000.0,1069189000.0,-4.628095e+18,4.600351e+18,0.249146,0.249236,0.248078,,,1836-11-24 10:42:05.750000128,,
50%,-1.0,127.0,-45.0,32781.5,-5828318.0,2148725000.0,-2.601359e+16,9.24357e+18,0.5,0.498952,0.499372,,,1973-10-17 04:07:38.500000,,
75%,63.0,191.0,16372.0,49133.0,1073612000.0,3227945000.0,4.624729e+18,1.382228e+19,0.749512,0.74962,0.75019,,,2110-09-08 20:02:38.249999872,,
max,127.0,255.0,32767.0,65535.0,2147443000.0,4294913000.0,9.223351e+18,1.844656e+19,1.0,0.999997,0.999992,,,2247-08-01 13:58:21,,





In [150]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 16 columns):
 #   Column         Non-Null Count   Dtype         
---  ------         --------------   -----         
 0   col_int8       200000 non-null  int8          
 1   col_uint8      200000 non-null  uint8         
 2   col_int16      200000 non-null  int16         
 3   col_uint16     200000 non-null  uint16        
 4   col_int32      200000 non-null  int32         
 5   col_uint32     200000 non-null  uint32        
 6   col_int64      200000 non-null  int64         
 7   col_uint64     200000 non-null  uint64        
 8   col_float16    200000 non-null  float16       
 9   col_float32    200000 non-null  float32       
 10  col_float64    200000 non-null  float64       
 11  col_bool       200000 non-null  bool          
 12  col_string     200000 non-null  string        
 13  col_datetime   200000 non-null  datetime64[ns]
 14  col_ordered    200000 non-null  category      
 15  

In [153]:
df1 = df.iloc[:150000]
df1.shape

(150000, 16)

In [154]:
df2 = df.iloc[150000:]
df2.shape

(50000, 16)

In [155]:
df3 = df.iloc[:, :12]
df3.shape

(200000, 12)

In [156]:
df4 = df.iloc[:, 12:]
df4.shape

(200000, 4)

In [157]:
gb = df.groupby('col_unordered')
type(gb)

  gb = df.groupby('col_unordered')


pandas.core.groupby.generic.DataFrameGroupBy

In [158]:
gb.size()

col_unordered
Mouse      33383
Cat        33273
Dog        33384
Hamster    33287
Rabbit     33602
Ferret     33071
dtype: int64

In [159]:
df_mouse = gb.get_group('Mouse')
df_mouse.head()

  has_large_values = (abs_vals > 1e6).any()


Unnamed: 0,col_int8,col_uint8,col_int16,col_uint16,col_int32,col_uint32,col_int64,col_uint64,col_float16,col_float32,col_float64,col_bool,col_string,col_datetime,col_ordered,col_unordered
4,12,212,13470,57662,812726532,1340894438,-1791361854263043924,2901203935713848353,0.620117,0.325553,0.450991,False,v0TZ8qlw,1700-01-05 16:37:16,EXTRA-SMALL,Mouse
17,-118,72,-14622,57604,1582005113,513547424,1401451633856761704,15366429918558710399,0.737305,0.61086,0.922958,True,BvHzVsau,1700-01-18 13:00:45,MIDDLE,Mouse
23,18,98,-30154,12532,67134227,514303156,1365196833970852386,15568901983481002003,0.760742,0.494002,0.607546,True,JRXRXE,1700-01-24 08:47:02,EXTRA-LARGE,Mouse
27,-44,205,25230,54757,570723259,2591380613,6262849499533420187,1727506916833625085,0.67041,0.066153,0.933833,True,cECY49,1700-01-28 20:33:08,LARGE,Mouse
41,-51,180,4625,26306,1367833466,2546003471,-6956886051460496162,13868885902914802716,0.833984,0.28215,0.223907,True,SfbgF,1700-02-11 21:55:07,MIDDLE,Mouse


In [162]:
df_cat = gb.get_group('Cat')
df_cat.head()

  has_large_values = (abs_vals > 1e6).any()


Unnamed: 0,col_int8,col_uint8,col_int16,col_uint16,col_int32,col_uint32,col_int64,col_uint64,col_float16,col_float32,col_float64,col_bool,col_string,col_datetime,col_ordered,col_unordered
5,-61,225,-18121,9830,-1446151776,3013488260,-2701692826071598425,2241484607082862008,0.073425,0.971069,0.018226,True,srbjJLP,1700-01-06 13:51:19,LARGE,Cat
8,-23,54,-7424,65385,-1324881698,953993731,6128203876391204943,14000964572561896882,0.140381,0.604176,0.108115,True,mXjeG6,1700-01-09 21:10:42,EXTRA-SMALL,Cat
16,-98,22,-16646,58094,-658607969,655607873,3404345805479656430,5096669087158830124,0.07605,0.832736,0.199611,True,TIic,1700-01-17 04:50:04,SMALL,Cat
31,-111,187,-16968,42828,-1960399259,822734557,-4765912514916795068,18406417208120733228,0.828125,0.309161,0.219022,True,yR6pM,1700-02-01 04:58:36,LARGE,Cat
34,66,152,12013,4952,-820819230,3512962893,-8713382684225017206,2055208308306467596,0.740234,0.258134,0.437826,False,15C4mC2,1700-02-04 09:05:33,EXTRA-LARGE,Cat


In [163]:
for group, df_group in gb:
    print(group, df_group.shape)

Mouse (33383, 16)
Cat (33273, 16)
Dog (33384, 16)
Hamster (33287, 16)
Rabbit (33602, 16)
Ferret (33071, 16)


In [164]:
from sklearn.model_selection import train_test_split

df_train, df_test = train_test_split(df, test_size=0.2, random_state=0)

df_train.shape, df_test.shape

((160000, 16), (40000, 16))

In [165]:
df_train.head()

  has_large_values = (abs_vals > 1e6).any()


Unnamed: 0,col_int8,col_uint8,col_int16,col_uint16,col_int32,col_uint32,col_int64,col_uint64,col_float16,col_float32,col_float64,col_bool,col_string,col_datetime,col_ordered,col_unordered
127478,-72,27,3853,57491,-1112717343,3009994274,-2993066590894730154,4856804777963727627,0.265625,0.767206,0.196115,True,tWM,2049-01-09 18:47:58,MIDDLE,Cat
155552,94,36,5166,28348,1370340258,2042364264,-3777001664252634560,6705984686542068610,0.45459,0.142779,0.72772,False,462l7q8,2125-11-21 10:48:47,MIDDLE,Mouse
75475,10,29,3365,29491,-1965935571,947581500,7570720735943845926,15179047376142702404,0.654297,0.65585,0.840668,False,0se2ZxuD,1906-08-25 14:16:24,EXTRA-SMALL,Hamster
186114,29,93,5840,27722,575550766,2230778676,3868248375348797193,10046415197673935811,0.540527,0.407988,0.555073,True,hTh3fTs,2209-07-26 05:46:58,SMALL,Rabbit
93717,-65,165,-2511,994,61908793,2286780953,-5675846870268755869,6211416484931316168,0.745117,0.731461,0.311473,False,kcOWM,1956-08-04 01:10:50,EXTRA-SMALL,Ferret


In [166]:
df_train = df_train.reset_index(drop=True)
df_train.head()

  has_large_values = (abs_vals > 1e6).any()


Unnamed: 0,col_int8,col_uint8,col_int16,col_uint16,col_int32,col_uint32,col_int64,col_uint64,col_float16,col_float32,col_float64,col_bool,col_string,col_datetime,col_ordered,col_unordered
0,-72,27,3853,57491,-1112717343,3009994274,-2993066590894730154,4856804777963727627,0.265625,0.767206,0.196115,True,tWM,2049-01-09 18:47:58,MIDDLE,Cat
1,94,36,5166,28348,1370340258,2042364264,-3777001664252634560,6705984686542068610,0.45459,0.142779,0.72772,False,462l7q8,2125-11-21 10:48:47,MIDDLE,Mouse
2,10,29,3365,29491,-1965935571,947581500,7570720735943845926,15179047376142702404,0.654297,0.65585,0.840668,False,0se2ZxuD,1906-08-25 14:16:24,EXTRA-SMALL,Hamster
3,29,93,5840,27722,575550766,2230778676,3868248375348797193,10046415197673935811,0.540527,0.407988,0.555073,True,hTh3fTs,2209-07-26 05:46:58,SMALL,Rabbit
4,-65,165,-2511,994,61908793,2286780953,-5675846870268755869,6211416484931316168,0.745117,0.731461,0.311473,False,kcOWM,1956-08-04 01:10:50,EXTRA-SMALL,Ferret


In [168]:
df_test.reset_index(drop=True, inplace=True)
df_test.head()

  has_large_values = (abs_vals > 1e6).any()


Unnamed: 0,col_int8,col_uint8,col_int16,col_uint16,col_int32,col_uint32,col_int64,col_uint64,col_float16,col_float32,col_float64,col_bool,col_string,col_datetime,col_ordered,col_unordered
0,-18,60,10529,21271,197061330,1639714456,1257327177626768709,1723013210922501670,0.886719,0.991829,0.792836,True,RCr4k5I,1849-02-07 00:11:20,MIDDLE,Ferret
1,-69,89,7214,29271,1818787157,1302698397,1204508283466257026,10474355017457038888,0.731445,0.611232,0.162229,False,9XRF,2024-11-04 00:13:00,LARGE,Cat
2,-49,101,-25236,63527,-201377119,492804185,-3153268850932708637,4950203708192017949,0.710449,0.595511,0.062612,True,n8TbGiO,1857-02-28 15:29:48,MIDDLE,Ferret
3,15,204,32555,62806,1288807388,1557849753,2720881038481201732,3210356497066780303,0.191162,0.477569,0.13378,False,BBx57b,2015-10-01 16:29:55,MIDDLE,Mouse
4,38,35,-15893,60334,1084023158,1797398521,1908318620199937126,2261059718305035407,0.777344,0.565411,0.517109,True,eL8HBU,1896-04-07 20:00:40,MIDDLE,Dog


In [169]:
# Section 2-3

In [170]:
import pandas as pd

df1 = pd.DataFrame(
    {
        'Name': ['Alpha', 'Bravo', 'Charlie'],
        'Value': [100, 200, 300],
    },
    index=['a', 'b', 'c']
)

df1

Unnamed: 0,Name,Value
a,Alpha,100
b,Bravo,200
c,Charlie,300


In [171]:
df2 = pd.DataFrame(
    {
        'Name': ['Delta', 'Echo', 'Foxtrot'],
        'Value': [400, 500, 600]
    },
    index=['d', 'e', 'f']
)

df2

Unnamed: 0,Name,Value
d,Delta,400
e,Echo,500
f,Foxtrot,600


In [172]:
df = pd.concat([df1, df2], axis=0)

df

Unnamed: 0,Name,Value
a,Alpha,100
b,Bravo,200
c,Charlie,300
d,Delta,400
e,Echo,500
f,Foxtrot,600


In [173]:
df3 = pd.DataFrame(
    {
        'Name': ['Delta', 'Echo', 'Foxtrot'],
        'ID': [111, 222, 333]
    }
)

df3

Unnamed: 0,Name,ID
0,Delta,111
1,Echo,222
2,Foxtrot,333


In [176]:
df = pd.concat([df1, df3], axis=0)

df

Unnamed: 0,Name,Value,ID
a,Alpha,100.0,
b,Bravo,200.0,
c,Charlie,300.0,
0,Delta,,111.0
1,Echo,,222.0
2,Foxtrot,,333.0


In [177]:
df = pd.concat([df1, df3], axis=0, join='inner')

df

Unnamed: 0,Name
a,Alpha
b,Bravo
c,Charlie
0,Delta
1,Echo
2,Foxtrot


In [178]:
df4 = pd.DataFrame(
    {
        'ID': [11, 22, 44]
    },
    index=['a', 'b', 'd']
)

df4

Unnamed: 0,ID
a,11
b,22
d,44


In [179]:
df = pd.concat([df1, df4], axis=1)

df

Unnamed: 0,Name,Value,ID
a,Alpha,100.0,11.0
b,Bravo,200.0,22.0
c,Charlie,300.0,
d,,,44.0


In [180]:
df = df1.join(df4, how='inner')

df

Unnamed: 0,Name,Value,ID
a,Alpha,100,11
b,Bravo,200,22


In [181]:
df = df1.join(df4, how='left')

df

Unnamed: 0,Name,Value,ID
a,Alpha,100,11.0
b,Bravo,200,22.0
c,Charlie,300,


In [182]:
df = df1.join(df4, how='right')

df

Unnamed: 0,Name,Value,ID
a,Alpha,100.0,11
b,Bravo,200.0,22
d,,,44


In [183]:
df5 = pd.DataFrame(
    {
        'Value5': [1000, 2000, 3000]
    },
    index=['Charlie', 'Delta', 'Echo']
)

df5

Unnamed: 0,Value5
Charlie,1000
Delta,2000
Echo,3000


In [184]:
df = df1.join(df5, on='Name')

df

Unnamed: 0,Name,Value,Value5
a,Alpha,100,
b,Bravo,200,
c,Charlie,300,1000.0


In [186]:
df1 = pd.DataFrame(
    {
        'Name': ['Alpha', 'Bravo', 'Charlie', 'Delta'],
        'ID': [11, 22, 33, 44],
        'Value1': [100, 200, 100, 400]
    },
    index=['a', 'b', 'c', 'd']
)

df1

Unnamed: 0,Name,ID,Value1
a,Alpha,11,100
b,Bravo,22,200
c,Charlie,33,100
d,Delta,44,400


In [187]:
df2 = pd.DataFrame(
    {
        'Name': ['Echo', 'Delta', 'Charlie', 'Bravo'],
        'Number': [11, 22, 33, 44],
        'Value2': [200, 100, 400, 200]
    },
    index=['e', 'd', 'c', 'b']
)

df2

Unnamed: 0,Name,Number,Value2
e,Echo,11,200
d,Delta,22,100
c,Charlie,33,400
b,Bravo,44,200


In [188]:
df = pd.merge(df1, df2, how='inner', on='Name')

df

Unnamed: 0,Name,ID,Value1,Number,Value2
0,Bravo,22,200,44,200
1,Charlie,33,100,33,400
2,Delta,44,400,22,100


In [191]:
df = pd.merge(df1, df2, how='left', on=['Name'])

df

Unnamed: 0,Name,ID,Value1,Number,Value2
0,Alpha,11,100,,
1,Bravo,22,200,44.0,200.0
2,Charlie,33,100,33.0,400.0
3,Delta,44,400,22.0,100.0


In [193]:
df = pd.merge(df1, df2, how='right', on='Name')

df

Unnamed: 0,Name,ID,Value1,Number,Value2
0,Echo,,,11,200
1,Delta,44.0,400.0,22,100
2,Charlie,33.0,100.0,33,400
3,Bravo,22.0,200.0,44,200


In [195]:
df = pd.merge(df1, df2, how='outer', on='Name')

df

Unnamed: 0,Name,ID,Value1,Number,Value2
0,Alpha,11.0,100.0,,
1,Bravo,22.0,200.0,44.0,200.0
2,Charlie,33.0,100.0,33.0,400.0
3,Delta,44.0,400.0,22.0,100.0
4,Echo,,,11.0,200.0


In [196]:
display(df1)
display(df2)

Unnamed: 0,Name,ID,Value1
a,Alpha,11,100
b,Bravo,22,200
c,Charlie,33,100
d,Delta,44,400


Unnamed: 0,Name,Number,Value2
e,Echo,11,200
d,Delta,22,100
c,Charlie,33,400
b,Bravo,44,200


In [197]:
df = pd.merge(df1, df2, left_on='Value1', right_on='Value2')

df

Unnamed: 0,Name_x,ID,Value1,Name_y,Number,Value2
0,Alpha,11,100,Delta,22,100
1,Bravo,22,200,Echo,11,200
2,Bravo,22,200,Bravo,44,200
3,Charlie,33,100,Delta,22,100
4,Delta,44,400,Charlie,33,400


In [198]:
df = pd.merge(df1, df2, left_on='Value1', right_on='Value2', suffixes=['-1', '-2'])

df

Unnamed: 0,Name-1,ID,Value1,Name-2,Number,Value2
0,Alpha,11,100,Delta,22,100
1,Bravo,22,200,Echo,11,200
2,Bravo,22,200,Bravo,44,200
3,Charlie,33,100,Delta,22,100
4,Delta,44,400,Charlie,33,400


In [199]:
display(df1)
display(df2)

Unnamed: 0,Name,ID,Value1
a,Alpha,11,100
b,Bravo,22,200
c,Charlie,33,100
d,Delta,44,400


Unnamed: 0,Name,Number,Value2
e,Echo,11,200
d,Delta,22,100
c,Charlie,33,400
b,Bravo,44,200


In [201]:
df = pd.merge(df1, df2, left_on=['Name', 'ID'], right_on=['Name', 'Number'], how='left')

df

Unnamed: 0,Name,ID,Value1,Number,Value2
0,Alpha,11,100,,
1,Bravo,22,200,,
2,Charlie,33,100,33.0,400.0
3,Delta,44,400,,


In [202]:
# Section 3-1

In [214]:
def load_3_1():
    df = pd.read_csv(
        'sample_3_1.csv',
        dtype={
            'val1': 'float64',
            'val2': 'float64',
        },
        index_col=0
    )
    return df

df = load_3_1()
df

Unnamed: 0_level_0,val1,val2
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2010-01-01,148.083426,481.382983
2010-01-02,117.490787,466.259767
2010-01-03,147.009196,474.221712
2010-01-04,150.086981,493.239400
2010-01-05,162.703330,491.120699
...,...,...
2020-12-27,1029.031359,3521.016894
2020-12-28,1026.100656,3593.560824
2020-12-29,1002.146012,3424.869380
2020-12-30,1014.642041,3445.015116


In [215]:
df.iat[0, 0] += 2.
df.iat[1, 0] -= 2.
df.iat[2, 0] *= 2.
df.iat[3, 0] /= 2.

df.iat[0, 1] %= 2.
df.iat[1, 1] //= 2.
df.iat[2, 1] **= 2.

df.head()

Unnamed: 0_level_0,val1,val2
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2010-01-01,150.083426,1.382983
2010-01-02,115.490787,233.0
2010-01-03,294.018392,224886.232291
2010-01-04,75.043491,493.2394
2010-01-05,162.70333,491.120699


In [216]:
df['val1']

date
2010-01-01     150.083426
2010-01-02     115.490787
2010-01-03     294.018392
2010-01-04      75.043491
2010-01-05     162.703330
                 ...     
2020-12-27    1029.031359
2020-12-28    1026.100656
2020-12-29    1002.146012
2020-12-30    1014.642041
2020-12-31    1027.480381
Name: val1, Length: 4018, dtype: float64

In [217]:
df['val1'] *= 1000

In [218]:
df['val1']

date
2010-01-01    1.500834e+05
2010-01-02    1.154908e+05
2010-01-03    2.940184e+05
2010-01-04    7.504349e+04
2010-01-05    1.627033e+05
                  ...     
2020-12-27    1.029031e+06
2020-12-28    1.026101e+06
2020-12-29    1.002146e+06
2020-12-30    1.014642e+06
2020-12-31    1.027480e+06
Name: val1, Length: 4018, dtype: float64

In [219]:
df.iloc[1:4, 1]

date
2010-01-02       233.000000
2010-01-03    224886.232291
2010-01-04       493.239400
Name: val2, dtype: float64

In [220]:
df.iloc[1:4, 1] /= 1000
df.iloc[1:4, 1]

date
2010-01-02      0.233000
2010-01-03    224.886232
2010-01-04      0.493239
Name: val2, dtype: float64

In [221]:
df = load_3_1()

In [222]:
%%timeit

for i in range(len(df)):
    df.iloc[i, 0] /= 1000

405 ms ± 66.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [223]:
df = load_3_1()

In [224]:
%%timeit

df.iloc[:, 0] /= 1000

387 µs ± 21.7 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [225]:
df.iat[0, 0] = 0
df.iloc[1:4, 1] = 0

df.head()

Unnamed: 0_level_0,val1,val2
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2010-01-01,0.0,481.382983
2010-01-02,0.0,0.0
2010-01-03,0.0,0.0
2010-01-04,0.0,0.0
2010-01-05,0.0,491.120699


In [227]:
df.iloc[1:4, :] = np.array([
    [1, 2], [3, 4], [5, 6]
])

df.head()

Unnamed: 0_level_0,val1,val2
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2010-01-01,0.0,481.382983
2010-01-02,1.0,2.0
2010-01-03,3.0,4.0
2010-01-04,5.0,6.0
2010-01-05,0.0,491.120699


In [228]:
df = load_3_1()
boolean_mask = df['val1'] < 150

boolean_mask

date
2010-01-01     True
2010-01-02     True
2010-01-03     True
2010-01-04    False
2010-01-05    False
              ...  
2020-12-27    False
2020-12-28    False
2020-12-29    False
2020-12-30    False
2020-12-31    False
Name: val1, Length: 4018, dtype: bool

In [229]:
df_mask = df.mask(boolean_mask, -1)

df_mask

Unnamed: 0_level_0,val1,val2
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2010-01-01,-1.000000,-1.000000
2010-01-02,-1.000000,-1.000000
2010-01-03,-1.000000,-1.000000
2010-01-04,150.086981,493.239400
2010-01-05,162.703330,491.120699
...,...,...
2020-12-27,1029.031359,3521.016894
2020-12-28,1026.100656,3593.560824
2020-12-29,1002.146012,3424.869380
2020-12-30,1014.642041,3445.015116


In [230]:
df.mask(boolean_mask)

Unnamed: 0_level_0,val1,val2
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2010-01-01,,
2010-01-02,,
2010-01-03,,
2010-01-04,150.086981,493.239400
2010-01-05,162.703330,491.120699
...,...,...
2020-12-27,1029.031359,3521.016894
2020-12-28,1026.100656,3593.560824
2020-12-29,1002.146012,3424.869380
2020-12-30,1014.642041,3445.015116


In [231]:
df_where = df.where(boolean_mask, -1)

df_where

Unnamed: 0_level_0,val1,val2
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2010-01-01,148.083426,481.382983
2010-01-02,117.490787,466.259767
2010-01-03,147.009196,474.221712
2010-01-04,-1.000000,-1.000000
2010-01-05,-1.000000,-1.000000
...,...,...
2020-12-27,-1.000000,-1.000000
2020-12-28,-1.000000,-1.000000
2020-12-29,-1.000000,-1.000000
2020-12-30,-1.000000,-1.000000


In [232]:
df = load_3_1()

df['feat1'] = df['val1'] * df['val2']

df.head()

Unnamed: 0_level_0,val1,val2,feat1
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2010-01-01,148.083426,481.382983,71284.841343
2010-01-02,117.490787,466.259767,54781.226971
2010-01-03,147.009196,474.221712,69714.952737
2010-01-04,150.086981,493.2394,74028.812646
2010-01-05,162.70333,491.120699,79906.973278


In [234]:
df['feat2'] = np.log(df['val2'])

df.head()

Unnamed: 0_level_0,val1,val2,feat1,feat2
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2010-01-01,148.083426,481.382983,71284.841343,6.176663
2010-01-02,117.490787,466.259767,54781.226971,6.144743
2010-01-03,147.009196,474.221712,69714.952737,6.161675
2010-01-04,150.086981,493.2394,74028.812646,6.200995
2010-01-05,162.70333,491.120699,79906.973278,6.19669


In [235]:
df['feat3'] = df['val1'].shift(1)
df['feat4'] = df['val1'].shift(2)

df.head()

Unnamed: 0_level_0,val1,val2,feat1,feat2,feat3,feat4
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2010-01-01,148.083426,481.382983,71284.841343,6.176663,,
2010-01-02,117.490787,466.259767,54781.226971,6.144743,148.083426,
2010-01-03,147.009196,474.221712,69714.952737,6.161675,117.490787,148.083426
2010-01-04,150.086981,493.2394,74028.812646,6.200995,147.009196,117.490787
2010-01-05,162.70333,491.120699,79906.973278,6.19669,150.086981,147.009196


In [238]:
df['feat5'] = df['val1'] - df['val1'].shift(1)

df.head()

Unnamed: 0_level_0,val1,val2,feat1,feat2,feat3,feat4,feat5
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2010-01-01,148.083426,481.382983,71284.841343,6.176663,,,
2010-01-02,117.490787,466.259767,54781.226971,6.144743,148.083426,,-30.592639
2010-01-03,147.009196,474.221712,69714.952737,6.161675,117.490787,148.083426,29.518409
2010-01-04,150.086981,493.2394,74028.812646,6.200995,147.009196,117.490787,3.077785
2010-01-05,162.70333,491.120699,79906.973278,6.19669,150.086981,147.009196,12.616349


In [241]:
df['feat6'] = df['val2'].cumsum()

df.head()

Unnamed: 0_level_0,val1,val2,feat1,feat2,feat3,feat4,feat5,feat6
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2010-01-01,148.083426,481.382983,71284.841343,6.176663,,,,481.382983
2010-01-02,117.490787,466.259767,54781.226971,6.144743,148.083426,,-30.592639,947.642751
2010-01-03,147.009196,474.221712,69714.952737,6.161675,117.490787,148.083426,29.518409,1421.864463
2010-01-04,150.086981,493.2394,74028.812646,6.200995,147.009196,117.490787,3.077785,1915.103863
2010-01-05,162.70333,491.120699,79906.973278,6.19669,150.086981,147.009196,12.616349,2406.224562


In [253]:
df = load_3_1()
df['smooth_val1'] = df['val1'].rolling(3).mean()
df['smooth_val2'] = df['val2'].rolling(3, center=True).mean()

df

Unnamed: 0_level_0,val1,val2,smooth_val1,smooth_val2
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2010-01-01,148.083426,481.382983,,
2010-01-02,117.490787,466.259767,,473.954821
2010-01-03,147.009196,474.221712,137.527803,477.906960
2010-01-04,150.086981,493.239400,138.195655,486.193937
2010-01-05,162.703330,491.120699,153.266503,491.101692
...,...,...,...,...
2020-12-27,1029.031359,3521.016894,1011.627288,3483.898095
2020-12-28,1026.100656,3593.560824,1019.780682,3513.149032
2020-12-29,1002.146012,3424.869380,1019.092676,3487.815107
2020-12-30,1014.642041,3445.015116,1014.296236,3454.476891


In [254]:
# Section 3-2

In [256]:
import pandas as pd
import numpy as np
import seaborn as sns

def load_diamonds():
    df = sns.load_dataset('diamonds')

    df = df.astype({
        'cut'    : 'category',
        'color'  : 'category',
        'clarity': 'category'
    })

    cut = ['Fair', 'Good', 'Very Good', 'Premium', 'Ideal']
    df['cut'] = df['cut'].cat.set_categories(cut, ordered=True)

    color = ['J', 'I', 'H', 'G', 'F', 'E', 'D']
    df['color'] = df['color'].cat.set_categories(color, ordered=True)

    clarity = ['I1', 'SI2', 'SI1', 'VS2', 'VS1', 'VVS2', 'VVS1', 'IF']
    df['clarity'] = df['clarity'].cat.set_categories(clarity, ordered=True)

    return df

df = load_diamonds()

df

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.20,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75
...,...,...,...,...,...,...,...,...,...,...
53935,0.72,Ideal,D,SI1,60.8,57.0,2757,5.75,5.76,3.50
53936,0.72,Good,D,SI1,63.1,55.0,2757,5.69,5.75,3.61
53937,0.70,Very Good,D,SI1,62.8,60.0,2757,5.66,5.68,3.56
53938,0.86,Premium,H,SI2,61.0,58.0,2757,6.15,6.12,3.74


In [257]:
display_df_info(df)

------ df.dtypes 型 ------

carat       float64
cut        category
color      category
clarity    category
depth       float64
table       float64
price         int64
x           float64
y           float64
z           float64
dtype: object

------ df.shape 行列 ------

(53940, 10)

------ df.size 全要素数 ------

539400

------ df.axes 行・列インデックス ------

[RangeIndex(start=0, stop=53940, step=1), Index(['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'price', 'x', 'y',
       'z'],
      dtype='object')]

------ df.axes 行・列インデックス ------

[RangeIndex(start=0, stop=53940, step=1), Index(['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'price', 'x', 'y',
       'z'],
      dtype='object')]

------ df.index 行インデックス ------

RangeIndex(start=0, stop=53940, step=1)

------ df.columns 列インデックス ------

Index(['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'price', 'x', 'y',
       'z'],
      dtype='object')

------ df.info() インスタンス概要 ------

<class 'pandas.core.frame.DataFrame'>
Ran

Unnamed: 0,carat,depth,table,price,x,y,z
count,53940.0,53940.0,53940.0,53940.0,53940.0,53940.0,53940.0
mean,0.79794,61.749405,57.457184,3932.799722,5.731157,5.734526,3.538734
std,0.474011,1.432621,2.234491,3989.439738,1.121761,1.142135,0.705699
min,0.2,43.0,43.0,326.0,0.0,0.0,0.0
25%,0.4,61.0,56.0,950.0,4.71,4.72,2.91
50%,0.7,61.8,57.0,2401.0,5.7,5.71,3.53
75%,1.04,62.5,59.0,5324.25,6.54,6.54,4.04
max,5.01,79.0,95.0,18823.0,10.74,58.9,31.8



------ df.describe(exclude='number') 質的データ------



Unnamed: 0,cut,color,clarity
count,53940,53940,53940
unique,5,7,8
top,Ideal,G,SI1
freq,21551,11292,13065



------ df.describe(include='all') ------



Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
count,53940.0,53940,53940,53940,53940.0,53940.0,53940.0,53940.0,53940.0,53940.0
unique,,5,7,8,,,,,,
top,,Ideal,G,SI1,,,,,,
freq,,21551,11292,13065,,,,,,
mean,0.79794,,,,61.749405,57.457184,3932.799722,5.731157,5.734526,3.538734
std,0.474011,,,,1.432621,2.234491,3989.439738,1.121761,1.142135,0.705699
min,0.2,,,,43.0,43.0,326.0,0.0,0.0,0.0
25%,0.4,,,,61.0,56.0,950.0,4.71,4.72,2.91
50%,0.7,,,,61.8,57.0,2401.0,5.7,5.71,3.53
75%,1.04,,,,62.5,59.0,5324.25,6.54,6.54,4.04





In [258]:
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [259]:
def mm_to_cm(size):
    return size / 10.

In [260]:
df.loc[:, ['x', 'y']] = df.loc[:, ['x', 'y']].applymap(mm_to_cm)

df.head()

  df.loc[:, ['x', 'y']] = df.loc[:, ['x', 'y']].applymap(mm_to_cm)


Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,0.395,0.398,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,0.389,0.384,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,0.405,0.407,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,0.42,0.423,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,0.434,0.435,2.75


In [274]:
df.loc[:, ['x', 'y']].apply(mm_to_cm)

Unnamed: 0,x,y
0,0.0395,0.0398
1,0.0389,0.0384
2,0.0405,0.0407
3,0.0420,0.0423
4,0.0434,0.0435
...,...,...
53935,0.0575,0.0576
53936,0.0569,0.0575
53937,0.0566,0.0568
53938,0.0615,0.0612


In [275]:
df.loc[:, 'z'] = df.loc[:, 'z'].map(mm_to_cm)

df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,0.395,0.398,0.243
1,0.21,Premium,E,SI1,59.8,61.0,326,0.389,0.384,0.231
2,0.23,Good,E,VS1,56.9,65.0,327,0.405,0.407,0.231
3,0.29,Premium,I,VS2,62.4,58.0,334,0.42,0.423,0.263
4,0.31,Good,J,SI2,63.3,58.0,335,0.434,0.435,0.275


In [277]:
df.loc[:, 'x'].map(mm_to_cm)

0        0.0395
1        0.0389
2        0.0405
3        0.0420
4        0.0434
          ...  
53935    0.0575
53936    0.0569
53937    0.0566
53938    0.0615
53939    0.0583
Name: x, Length: 53940, dtype: float64

In [282]:
df = load_diamonds()
df[['x', 'y', 'z']] /= 10.

df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,0.395,0.398,0.243
1,0.21,Premium,E,SI1,59.8,61.0,326,0.389,0.384,0.231
2,0.23,Good,E,VS1,56.9,65.0,327,0.405,0.407,0.231
3,0.29,Premium,I,VS2,62.4,58.0,334,0.42,0.423,0.263
4,0.31,Good,J,SI2,63.3,58.0,335,0.434,0.435,0.275


In [285]:
def calc_point(record):
    if record['cut'] in ['Premium', 'Ideal']:
#        point = (record['carat'] + 1.) ** 2. -1.
        point = 1
    elif record['cut'] in ['Good', 'Very Good']:
#        point = 2. * record['carat']
        point = 2
    else:
#        point = np.log(record['carat'] + 1.)
        point = 3
    
    return point

In [286]:
df['point'] = df.apply(calc_point, axis=1)

df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z,point
0,0.23,Ideal,E,SI2,61.5,55.0,326,0.395,0.398,0.243,1
1,0.21,Premium,E,SI1,59.8,61.0,326,0.389,0.384,0.231,1
2,0.23,Good,E,VS1,56.9,65.0,327,0.405,0.407,0.231,2
3,0.29,Premium,I,VS2,62.4,58.0,334,0.42,0.423,0.263,1
4,0.31,Good,J,SI2,63.3,58.0,335,0.434,0.435,0.275,2


In [288]:
# Section 3-3

In [290]:
for index, record in df.iterrows():
    print(f'index {index}: price {record["price"]}')

    print(record[['carat', 'cut', 'color', 'clarity']], '\n')

    if index > 2:
        break

index 0: price 326
carat       0.23
cut        Ideal
color          E
clarity      SI2
Name: 0, dtype: object 

index 1: price 326
carat         0.21
cut        Premium
color            E
clarity        SI1
Name: 1, dtype: object 

index 2: price 327
carat      0.23
cut        Good
color         E
clarity     VS1
Name: 2, dtype: object 

index 3: price 334
carat         0.29
cut        Premium
color            I
clarity        VS2
Name: 3, dtype: object 



In [None]:
# Section 5-1

In [310]:
from plotly import graph_objects as go
import numpy as np

figure = go.Figure()

figure

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [None]:
layout = go.Layout()

layout

Layout()

In [None]:
layout.update({
    'title': 'グラフタイトル'
})

layout

Layout({
    'title': {'text': 'グラフタイトル'}
})

In [None]:
figure.update(layout=layout)

figure

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [None]:
scatter_trace = go.Scatter(
    x = np.arange(50),
    y = np.random.rand(50)
)

scatter_trace

Scatter({
    'x': array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
                18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
                36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]),
    'y': array([0.36654949, 0.11449629, 0.06127286, 0.32850214, 0.59329367, 0.00958528,
                0.46254289, 0.89335218, 0.53279449, 0.62762301, 0.91339661, 0.40626347,
                0.64118051, 0.95047163, 0.65176915, 0.67944017, 0.00150502, 0.88704382,
                0.37765354, 0.62134424, 0.76920819, 0.16223047, 0.83033992, 0.63239526,
                0.09789707, 0.14413574, 0.23240924, 0.09909627, 0.32872822, 0.45670365,
                0.07986554, 0.69544588, 0.70073855, 0.40587609, 0.9021841 , 0.21799312,
                0.71331683, 0.16878389, 0.60150984, 0.44229668, 0.56965495, 0.40799061,
                0.99137319, 0.57916482, 0.23315504, 0.95732531, 0.53424027, 0.24871815,
                0.79397172, 0.193998

In [None]:
figure.update(data=scatter_trace)
figure

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [None]:
bar_trace = go.Bar(
    x = np.arange(50),
    y = np.random.rand(50)
)

bar_trace

Bar({
    'x': array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
                18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
                36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]),
    'y': array([0.04190325, 0.66187605, 0.57983608, 0.97537216, 0.60083301, 0.07996614,
                0.51013383, 0.35478309, 0.98620771, 0.21508768, 0.78712673, 0.52169062,
                0.92688468, 0.17300984, 0.26652738, 0.64615522, 0.98468221, 0.79148165,
                0.00640943, 0.4819536 , 0.76915918, 0.78287422, 0.8275072 , 0.66062773,
                0.19233773, 0.31497907, 0.53167525, 0.65166618, 0.91301206, 0.66919712,
                0.23026501, 0.23812037, 0.96478804, 0.14072566, 0.50018625, 0.36624651,
                0.47062432, 0.96957396, 0.22609593, 0.97784257, 0.42510395, 0.2362029 ,
                0.8072216 , 0.29080752, 0.47544732, 0.05502164, 0.21584614, 0.08008415,
                0.65782167, 0.1918724 ])

In [None]:
figure.add_trace(bar_trace)

figure

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [312]:
# Section 5-2

In [314]:
from plotly import graph_objects as go
from plotly import express as px
import pandas as pd
from sklearn import datasets

df_X, df_y = datasets.load_iris(return_X_y=True, as_frame=True)
df = pd.concat([df_X, df_y], axis=1)

df

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2
146,6.3,2.5,5.0,1.9,2
147,6.5,3.0,5.2,2.0,2
148,6.2,3.4,5.4,2.3,2


In [315]:
# Traceのlistを作成
traces = [
    go.Scatter(
        x=df['sepal length (cm)'],
        y=df['sepal width (cm)'],
        mode='markers',
        marker={
            'symbol': 'circle',
            'opacity': 0.5
        },
        name='sepal'
    ),  # 萼（がく）の長さと幅の散布図
    go.Scatter(
        x=df['petal length (cm)'],
        y=df['petal width (cm)'],
        mode='markers',
        marker={
            'symbol': 'diamond',
            'opacity': 0.5
        },
        name='petal'
    )   # 花弁の長さと幅の散布図
]

# Layoutを作成
layout=go.Layout({
    'title': 'Iris dataset',
    'xaxis': {
        'title': 'length [cm]'
    },
    'yaxis': {
        'title': 'width [cm]'
    }
})

# Graph ObjectsからFigureを作成
figure = go.Figure(data=traces, layout=layout)

figure

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [316]:
figure = px.scatter(
    data_frame=df, 
    x='sepal length (cm)',
    y='sepal width (cm)',
    title='iris-dataset'
)

figure

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [317]:
pd.options.plotting.backend = 'plotly'

figure = df.plot.scatter(
    x = 'petal length (cm)',
    y = 'petal width (cm)'
)

figure

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [318]:
dict_data = {
    'sepal_length': df['sepal length (cm)'].values,
    'petal_length': df['petal length (cm)'].values
}

figure = px.scatter(
    data_frame=dict_data, 
    x = 'sepal_length',
    y = 'petal_length',
    title = 'from dict'
)

figure

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [None]:
from typing import Callable, TypeVar, Any
from dataclasses import dataclass

@dataclass(frozen=True)
class Ok[T]:
    value: T

U = TypeVar('U')


@dataclass(frozen=True)
class Err[E]:
    value: E




In [321]:
import pandas as pd
# df = pd.read_csv('./data.csv')
dict =  {'id': [1, 2, 3, 4, 5], 'name': ['A','B','C','D','E'], 'score': [45, 32, 67, 40, 55]}
df = pd.DataFrame.from_dict(dict)

df_score = df['score']
df_score_std = df_score.std(ddof=0)
df_score_mean = df_score.mean()
# df['DeviationValue'] = df_score.map(lambda x: round((x - df_score_mean) / df_score_std * 10 + 50)).astype(int)
df['DeviationValue'] = df_score.map(lambda x: (x - df_score_mean) / df_score_std * 10 + 50)

df

Unnamed: 0,id,name,score,DeviationValue
0,1,A,45,47.696546
1,2,B,32,37.001937
2,3,C,67,65.795115
3,4,D,40,43.583235
4,5,E,55,55.923168


In [340]:
dict =  {'id': [1, 2, 3, 4, 5], 'name': ['A','B','C','D','E'], 'score': [45, 32, 67, 40, 55]}
df = pd.DataFrame.from_dict(dict)

# curried function
# def calc_deviation_val(df: pd.DataFrame):
#     def calc_deviation_val(col_name: str):
#         ser_std: float = df[col_name].std(ddof=0)
#         ser_mean: float = df[col_name].mean()
#         return lambda ser: (ser - ser_mean) / ser_std * 10 + 50
#     return calc_deviation_val

def calc_deviation_val(col_name: str):
    def calc_deviation_val(df: pd.DataFrame):
        ser_std: float = df[col_name].std(ddof=0)
        ser_mean: float = df[col_name].mean()
        return lambda ser: (ser - ser_mean) / ser_std * 10 + 50
    return calc_deviation_val

calc_deviation_val_for_score = calc_deviation_val('score')

df['DeviationValue'] = df['score'].map(calc_deviation_val_for_score(df))

In [341]:
df

Unnamed: 0,id,name,score
0,1,A,45
1,2,B,32
2,3,C,67
3,4,D,40
4,5,E,55


In [342]:
calc_deviation_val_for_score = calc_deviation_val('score')

df['DeviationValue'] = df['score'].map(calc_deviation_val_for_score(df))
df

Unnamed: 0,id,name,score,DeviationValue
0,1,A,45,47.696546
1,2,B,32,37.001937
2,3,C,67,65.795115
3,4,D,40,43.583235
4,5,E,55,55.923168


In [344]:
df = pd.DataFrame(
    {'c_0': ['A', 'A', 'B', 'B', 'B', 'B'],
     'c_1': ['X', 'Y', 'X', 'Y', 'X', 'Y'],
     'c_2': [0, 1, 4, 9, 16, 25],
     'c_3': [125, 64, 27, 16, 1, 0]},
    index=['r_0', 'r_1', 'r_2', 'r_3', 'r_4', 'r_5']
)
df

Unnamed: 0,c_0,c_1,c_2,c_3
r_0,A,X,0,125
r_1,A,Y,1,64
r_2,B,X,4,27
r_3,B,Y,9,16
r_4,B,X,16,1
r_5,B,Y,25,0


In [346]:
grouped = df.groupby('c_0')
print(grouped)
print(type(grouped))

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000002320B2D8B80>
<class 'pandas.core.groupby.generic.DataFrameGroupBy'>


In [347]:
df_mean = grouped.mean(numeric_only=True)
# grouped.mean(numeric_only=False) はエラーになる。c_1があるから。
print(df_mean)

      c_2   c_3
c_0            
A     0.5  94.5
B    13.5  11.0


In [349]:
grouped.mean(numeric_only=False)

Unnamed: 0_level_0,c_2,c_3
c_0,Unnamed: 1_level_1,Unnamed: 2_level_1
A,0.5,94.5
B,13.5,11.0


In [352]:
print(df.groupby('c_0')['c_2'].mean(), '\n')
print(df.groupby('c_0')[['c_2', 'c_3']].mean())

c_0
A     0.5
B    13.5
Name: c_2, dtype: float64 

      c_2   c_3
c_0            
A     0.5  94.5
B    13.5  11.0


In [354]:
print(df.groupby('c_0').sum(numeric_only=True), '\n')
print(df.groupby('c_0').count())

     c_2  c_3
c_0          
A      1  189
B     54   44 

     c_1  c_2  c_3
c_0               
A      2    2    2
B      4    4    4


In [355]:
print(df.groupby(['c_0', 'c_1']).mean())

          c_2    c_3
c_0 c_1             
A   X     0.0  125.0
    Y     1.0   64.0
B   X    10.0   14.0
    Y    17.0    8.0


In [356]:
df

Unnamed: 0,c_0,c_1,c_2,c_3
r_0,A,X,0,125
r_1,A,Y,1,64
r_2,B,X,4,27
r_3,B,Y,9,16
r_4,B,X,16,1
r_5,B,Y,25,0


In [357]:
print(df.groupby('c_0', as_index=False).mean(numeric_only=True),'\n')
print(df.groupby(['c_0', 'c_1'], as_index=False).mean())

  c_0   c_2   c_3
0   A   0.5  94.5
1   B  13.5  11.0 

  c_0 c_1   c_2    c_3
0   A   X   0.0  125.0
1   A   Y   1.0   64.0
2   B   X  10.0   14.0
3   B   Y  17.0    8.0


In [358]:
df

Unnamed: 0,c_0,c_1,c_2,c_3
r_0,A,X,0,125
r_1,A,Y,1,64
r_2,B,X,4,27
r_3,B,Y,9,16
r_4,B,X,16,1
r_5,B,Y,25,0


In [359]:
type(df['c_0'])

pandas.core.series.Series

In [360]:
df['c_0'].index

Index(['r_0', 'r_1', 'r_2', 'r_3', 'r_4', 'r_5'], dtype='object')

In [361]:
df.index

Index(['r_0', 'r_1', 'r_2', 'r_3', 'r_4', 'r_5'], dtype='object')

In [None]:
colnames_rating_industry_ = ['c_0', 'c_1']

In [362]:
class A:
    count = 0
    def __init__(self):
        A.count += 1
    def exclaim(self):
        print("I'm an A!")
    @classmethod
    def kids(cls):
        print("A has", cls.count, "little objects.")
    @staticmethod
    def commercial():
        print('This CoyoteWeapon has been brought to you by Acme')

In [366]:
def rects_collide(rect1, rect2):
    """ 長方形が衝突しているかどうかを検知する """
    return (
        rect1.x1 < rect2.x2 and
        rect1.x2 > rect2.x1 and
        rect1.y1 < rect2.y2 and
        rect1.y2 > rect2.y1
    )

import itertools

def find_collisions(objects):
    return [
        (item1, item2)
        for item1, item2
        in itertools.combinations(objects, 2)
        if rects_collide(
            item1.bounding_box,
            item2.bounding_box
        )
    ]

from dataclasses import dataclass

@dataclass
class Square:
    x: float
    y: float
    size: float

    @property
    def bounding_box(self):
        return Box(
            self.x,
            self.y,
            self.x + self.size,
            self.y + self.size
        )

@dataclass
class Rect:
    x: float
    y: float
    width: float
    height: float

    @property
    def bounding_box(self):
        return Box(
            self.x,
            self.y,
            self.x + self.width,
            self.y + self.height
        )

@dataclass
class Circle:
    x: float
    y: float
    radius: float

    @property
    def bounding_box(self):
        return Box(
            self.x - self.radius,
            self.y - self.radius,
            self.x + self.radius,
            self.y + self.radius
        )

@dataclass
class Box:
    x1: float
    y1: float
    x2: float
    y2: float

@dataclass
class Point:
    x: float


for collision in find_collisions([
    Square(0, 0, 10),
    Rect(5, 5, 20, 20),
    Square(15, 20, 5),
    Circle(1, 1, 2),
]):
    print(collision)

(Square(x=0, y=0, size=10), Rect(x=5, y=5, width=20, height=20))
(Square(x=0, y=0, size=10), Circle(x=1, y=1, radius=2))
(Rect(x=5, y=5, width=20, height=20), Square(x=15, y=20, size=5))


In [367]:
from abc import ABC, abstractmethod

class DummyInterface(ABC):
    @abstractmethod
    def dummy_method(self):
        pass

    @property
    @abstractmethod
    def dummy_property(self):
        pass

In [368]:
from abc import ABC, abstractmethod
from dataclasses import dataclass

class ColliderABC(ABC):
    @property
    @abstractmethod
    def bounding_box(self):
        pass

    @classmethod
    def __subclasshook__(cls, __subclass: type):
        if cls is ColliderABC:
            if any("bounding_box" in B.__dict__ for B in __subclass.__mro__):
                return True
        return NotImplemented

@dataclass
class Square(ColliderABC):
    pass

@dataclass
class Circle(ColliderABC):
    pass



In [369]:
from typing import Protocol, runtime_checkable

@runtime_checkable
class IBox(Protocol):
    x1: float
    y1: float
    x2: float
    y2: float

@runtime_checkable
class ICollider(Protocol):
    @property
    def bounding_box(self) -> IBox:
        pass



In [371]:
import itertools
from dataclasses import dataclass
from typing import Iterable, Protocol, runtime_checkable

@runtime_checkable
class IBox(Protocol):
    x1: float
    y1: float
    x2: float
    y2: float

@runtime_checkable
class ICollider(Protocol):
    @property
    def bounding_box(self) -> IBox:
        pass

def rects_collide(rect1: IBox, rect2: IBox):
    """ 長方形が衝突しているかどうかを検知する """
    return (
        rect1.x1 < rect2.x2 and
        rect1.x2 > rect2.x1 and
        rect1.y1 < rect2.y2 and
        rect1.y1 > rect2.y1
    )

def find_collisions(objects: Iterable[ICollider]):
    for item in objects:
        if not isinstance(item, ICollider):
            raise TypeError(f"{item}は衝突検知対象外です")
    return [
        (item1, item2)
        for item1, item2
        in itertools.combinations(object, 2)
        if rects_collide(
            item1.bounding_box,
            item2.bounding_box
        )
    ]

In [372]:
from typing import Protocol

class Animal(Protocol):
    def sound(self) -> str:
        ...

class Dog():
    def sound(self) -> str:
        return "Bow-wow"

class Book():
    def read(self) -> str:
        return "hogeeee"

class Cat():
    def sound(self) -> None:
        print("Meow")

def func(animal: Animal):
    animal.sound()

func(Dog())
func(Book())
func(Cat())


AttributeError: 'Book' object has no attribute 'sound'

In [373]:
df = pd.DataFrame(
    {'c_0': ['A', 'A', 'B', 'B', 'B', 'B'],
     'c_1': ['X', 'Y', 'X', 'Y', 'X', 'Y'],
     'c_2': [0, 1, 4, 9, 16, 25],
     'c_3': [125, 64, 27, 16, 1, 0]},
    index=['r_0', 'r_1', 'r_2', 'r_3', 'r_4', 'r_5']
)
df

Unnamed: 0,c_0,c_1,c_2,c_3
r_0,A,X,0,125
r_1,A,Y,1,64
r_2,B,X,4,27
r_3,B,Y,9,16
r_4,B,X,16,1
r_5,B,Y,25,0


In [374]:
print(df.groupby(['c_0', 'c_1'], as_index=False).mean())
print(df.groupby(['c_0', 'c_1'], as_index=True).mean())

  c_0 c_1   c_2    c_3
0   A   X   0.0  125.0
1   A   Y   1.0   64.0
2   B   X  10.0   14.0
3   B   Y  17.0    8.0
          c_2    c_3
c_0 c_1             
A   X     0.0  125.0
    Y     1.0   64.0
B   X    10.0   14.0
    Y    17.0    8.0


In [383]:
print(df.groupby(['c_0'], as_index=False).mean(numeric_only=True), '\n')
print(df.groupby(['c_0'], as_index=True).mean(numeric_only=True), '\n')
print(df.groupby(['c_0']).mean(numeric_only=True), '\n')

  c_0   c_2   c_3
0   A   0.5  94.5
1   B  13.5  11.0 

      c_2   c_3
c_0            
A     0.5  94.5
B    13.5  11.0 

      c_2   c_3
c_0            
A     0.5  94.5
B    13.5  11.0 



In [386]:
print(df[['c_0', 'c_2']].groupby(['c_0'], as_index=False).agg(['count', 'calc_deviation_val_for_score']))

AttributeError: 'SeriesGroupBy' object has no attribute 'calc_deviation_val_for_score'

In [385]:
calc_deviation_val_for_score

<function __main__.calc_deviation_val.<locals>.calc_deviation_val(df: pandas.core.frame.DataFrame)>

In [395]:
df = pd.DataFrame({
    'city': ['osaka', 'osaka', 'osaka', 'osaka', 'tokyo', 'tokyo', 'tokyo'],
    'food': ['apple', 'orange', 'banana', 'banana', 'apple', 'apple', 'banana'],
    'price': [100, 200, 250, 300, 150, 200, 400],
    'quantity': [1, 2, 3, 4, 5, 6, 7]
})
df

Unnamed: 0,city,food,price,quantity
0,osaka,apple,100,1
1,osaka,orange,200,2
2,osaka,banana,250,3
3,osaka,banana,300,4
4,tokyo,apple,150,5
5,tokyo,apple,200,6
6,tokyo,banana,400,7


In [397]:
def transformation_sample(s):
    return (s / s.sum() * 100).astype(str) + '%'

df.groupby('city').transform(transformation_sample)

TypeError: unsupported operand type(s) for /: 'str' and 'str'

In [406]:
def square_sum(vs: list[int]) -> int:
    return sum(map(lambda x: x**2, vs))

def deviation_val(vs: list[float]) -> float:
    df_score_mean = sum(vs) / len(vs)
    df_score_dev = sum(map(lambda x: (x - df_score_mean)**2))
    # return map(lambda x: round((x - df_score_mean) / df_score_std * 10 + 50))
    return df_score_mean

df.groupby(['city', 'food']).agg({"price": square_sum})

Unnamed: 0_level_0,Unnamed: 1_level_0,price
city,food,Unnamed: 2_level_1
osaka,apple,10000
osaka,banana,152500
osaka,orange,40000
tokyo,apple,62500
tokyo,banana,160000


In [408]:
dict =  {'id': [1, 2, 3, 4, 5], 'name': ['A','B','C','D','E'], 'score': [45, 32, 67, 40, 55]}
df = pd.DataFrame.from_dict(dict)

# curried function
def calc_deviation_val(col_name: str):
    def calc_deviation_val(df: pd.DataFrame):
        ser_std: float = df[col_name].std(ddof=0)
        ser_mean: float = df[col_name].mean()
        return lambda ser: (ser - ser_mean) / ser_std * 10 + 50
    return calc_deviation_val

calc_deviation_val_for_score = calc_deviation_val('score')

df['DeviationValue'] = df['score'].map(calc_deviation_val_for_score(df))
df

Unnamed: 0,id,name,score,DeviationValue
0,1,A,45,47.696546
1,2,B,32,37.001937
2,3,C,67,65.795115
3,4,D,40,43.583235
4,5,E,55,55.923168


In [409]:
df

Unnamed: 0,id,name,score,DeviationValue
0,1,A,45,47.696546
1,2,B,32,37.001937
2,3,C,67,65.795115
3,4,D,40,43.583235
4,5,E,55,55.923168


In [410]:
cities: list[str] = ['osaka', 'osaka', 'osaka', 'osaka', 'tokyo', 'tokyo', 'tokyo']


df = pd.DataFrame({
    'city': ['osaka', 'osaka', 'osaka', 'osaka', 'tokyo', 'tokyo', 'tokyo'],
    'food': ['apple', 'orange', 'banana', 'banana', 'apple', 'apple', 'banana'],
    'price': [100, 200, 250, 300, 150, 200, 400],
    'quantity': [1, 2, 3, 4, 5, 6, 7]
})
df

Unnamed: 0,city,food,price,quantity
0,osaka,apple,100,1
1,osaka,orange,200,2
2,osaka,banana,250,3
3,osaka,banana,300,4
4,tokyo,apple,150,5
5,tokyo,apple,200,6
6,tokyo,banana,400,7


In [411]:

df_ = df[df['city'] == 'tokyo']
df_


Unnamed: 0,city,food,price,quantity
4,tokyo,apple,150,5
5,tokyo,apple,200,6
6,tokyo,banana,400,7


In [None]:
figure = make_subplots(
    rows=1,
    cols=2,
    horizontal_spacing=0.2,
    subplot_titles=['Age - Rooms', 'Population - Occupancy']
)

figure.add_trace(
    go.Scatter(
        x=df['HouseAge'],
        y=df['AveRooms'],
        mode='markers',
        name='rooms'
    ),
    row=1,
    col=1
)
figure.add_trace(
    go.Scatter(
        x=df['HouseAge'],
        y=df['AveBedrms'],
        mode='markers',
        name='bedrooms'
    ),
    row=1,
    col=1
)
figure.add_trace(
    go.Scatter(
        x=df['Population'],
        y=df['AveOccup'],
        mode='markers',
        name='occupancy'
    ),
    row=1,
    col=2
)
figure