In [21]:
import pandas as pd
import numpy as np

## Select dtypes

In [16]:
# Docs: https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.select_dtypes.html

# Small demo frame with one column per kind of data: integers, booleans,
# floats and strings.
df = pd.DataFrame(
    {
        "col1": [1, 2] * 3,
        "col2": [True, False] * 3,
        "col3": [1.0, 2.0] * 3,
        "col4": ["a", "b", "c"] * 2,
    }
)

In [17]:
# Inspect the dtype pandas inferred for each column of the demo frame.
df.dtypes

col1      int64
col2       bool
col3    float64
col4     object
dtype: object

In [18]:
# Keep only the numeric columns (integer and float); the boolean column is not selected.
df.select_dtypes(include="number")

Unnamed: 0,col1,col3
0,1,1.0
1,2,2.0
2,1,1.0
3,2,2.0
4,1,1.0
5,2,2.0


In [19]:
# Keep only the object-dtype columns (here the string column col4).
df.select_dtypes(include="object")

Unnamed: 0,col4
0,a
1,b
2,c
3,a
4,b
5,c


In [20]:
# Column-wise mean of everything except object columns; the boolean
# column is averaged as 0/1.
(
    df
    .select_dtypes(exclude="object")
    .agg("mean")
)

col1    1.5
col2    0.5
col3    1.5
dtype: float64

## pd.cut

https://pandas.pydata.org/docs/reference/api/pandas.cut.html#pandas.cut
https://pandas.pydata.org/docs/reference/api/pandas.qcut.html

In [41]:
# Sample values used by the binning examples.
dane = np.array([1, 7, 5, 4, 6, 3])
print(dane)

# NOTE: this rebinds `df`, replacing the frame built in the select_dtypes section.
df = pd.DataFrame({"dane": dane})


[1 7 5 4 6 3]


In [None]:
# Split the value range into 3 equal-width bins.
pd.cut(dane, bins=3)


[(0.994, 3.0], (5.0, 7.0], (3.0, 5.0], (3.0, 5.0], (5.0, 7.0], (0.994, 3.0]]
Categories (3, interval[float64, right]): [(0.994, 3.0] < (3.0, 5.0] < (5.0, 7.0]]

In [43]:
# Store each value's bin next to the raw data, then display the frame.
df["dane_cut"] = pd.cut(dane, bins=3)
df

Unnamed: 0,dane,dane_cut
0,1,"(0.994, 3.0]"
1,7,"(5.0, 7.0]"
2,5,"(3.0, 5.0]"
3,4,"(3.0, 5.0]"
4,6,"(5.0, 7.0]"
5,3,"(0.994, 3.0]"


In [None]:
# Explicit cut points can be passed instead of a bin count.
# Note that by default every interval is open on the left (right-closed).

pd.cut(dane, bins=[0, 3, 5, 7])

[(0, 3], (5, 7], (3, 5], (3, 5], (5, 7], (0, 3]]
Categories (3, interval[int64, right]): [(0, 3] < (3, 5] < (5, 7]]

In [None]:
# Intervals are left-open by default: when the first edge is 1, the value 1
# falls outside every bin and comes back as NaN.
print(pd.cut(dane, bins=[1, 3, 5, 7]))

# include_lowest=True pulls the lowest edge into the first bin,
# but the interval labels change as a result.
print(pd.cut(dane, bins=[1, 3, 5, 7], include_lowest=True))

[NaN, (5.0, 7.0], (3.0, 5.0], (3.0, 5.0], (5.0, 7.0], (1.0, 3.0]]
Categories (3, interval[int64, right]): [(1, 3] < (3, 5] < (5, 7]]
[(0.999, 3.0], (5.0, 7.0], (3.0, 5.0], (3.0, 5.0], (5.0, 7.0], (0.999, 3.0]]
Categories (3, interval[float64, right]): [(0.999, 3.0] < (3.0, 5.0] < (5.0, 7.0]]


In [35]:
# Custom labels can replace the auto-generated interval labels (one label per bin).
print(
    pd.cut(
        dane,
        bins=[1, 3, 5, 7],
        include_lowest=True,
        labels=["[1, 3]", "(3, 5]", "(5, 7]"],
    )
)

['[1, 3]', '(5, 7]', '(3, 5]', '(3, 5]', '(5, 7]', '[1, 3]']
Categories (3, object): ['[1, 3]' < '(3, 5]' < '(5, 7]']


In [None]:
# qcut splits by quantiles instead of by fixed value ranges.

print(pd.qcut(dane, q=4))  # quartiles
print("\n")
print(pd.qcut(dane, q=10))  # deciles
print("\n")

print(pd.qcut(dane, q=[0, 0.1, 0.5, 0.9, 1.0]))  # arbitrary list of quantiles


[(0.999, 3.25], (5.75, 7.0], (4.5, 5.75], (3.25, 4.5], (5.75, 7.0], (0.999, 3.25]]
Categories (4, interval[float64, right]): [(0.999, 3.25] < (3.25, 4.5] < (4.5, 5.75] < (5.75, 7.0]]


[(0.999, 2.0], (6.5, 7.0], (4.5, 5.0], (3.5, 4.0], (5.5, 6.0], (2.0, 3.0]]
Categories (10, interval[float64, right]): [(0.999, 2.0] < (2.0, 3.0] < (3.0, 3.5] < (3.5, 4.0] ... (5.0, 5.5] < (5.5, 6.0] < (6.0, 6.5] < (6.5, 7.0]]


[1 7 5 4 6 3]
[(0.999, 2.0], (6.5, 7.0], (4.5, 6.5], (2.0, 4.5], (4.5, 6.5], (2.0, 4.5]]
Categories (4, interval[float64, right]): [(0.999, 2.0] < (2.0, 4.5] < (4.5, 6.5] < (6.5, 7.0]]


In [48]:
# Set the dane_cut column to quantile-based bins that carry hand-written labels,
# then display the frame.
df["dane_cut"] = pd.qcut(
    dane,
    q=[0, 0.1, 0.5, 0.9, 1.0],
    labels=["(0, 0.1)", "(0.1, 0.5)", "(0.5, 0.9)", "(0.9, 1)"],
)
df

Unnamed: 0,dane,dane_cut
0,1,"(0, 0.1)"
1,7,"(0.9, 1)"
2,5,"(0.5, 0.9)"
3,4,"(0.1, 0.5)"
4,6,"(0.5, 0.9)"
5,3,"(0.1, 0.5)"


In [49]:
# Check the column dtypes: the labelled qcut result is stored as a categorical column.
df.dtypes

dane           int64
dane_cut    category
dtype: object

## where 

1. pandas: Replace values where the condition is False.

* https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.where.html#pandas.DataFrame.where
* https://pandas.pydata.org/docs/reference/api/pandas.Series.where.html#pandas.Series.where


2. numpy:
* https://numpy.org/doc/stable/reference/generated/numpy.where.html
* argwhere: https://numpy.org/doc/stable/reference/generated/numpy.argwhere.html 

NOTE: The signature for DataFrame.where() differs from numpy.where(). Roughly df1.where(m, df2) is equivalent to np.where(m, df1, df2).


In [63]:
# Keep rows where dane > 2; in rows where the condition is False
# every column is replaced with NaN.
df.where(df["dane"] > 2)

Unnamed: 0,dane,dane_cut
0,,
1,7.0,"(0.9, 1)"
2,5.0,"(0.5, 0.9)"
3,4.0,"(0.1, 0.5)"
4,6.0,"(0.5, 0.9)"
5,3.0,"(0.1, 0.5)"


In [68]:
# The mask is shorter than the data: positions 3 and 4 have no entry in the
# mask, so they are treated as False and replaced with 99 as well.
s = pd.Series(range(5))
t = pd.Series([True, False, True])
s.where(cond=t, other=99)

0     0
1    99
2     2
3    99
4    99
dtype: int64

In [None]:


# Sample strings in different cases, plus a missing value.
ser = pd.Series(['DUZE LITERY', 'male litery', 'Mieszane Litery', '3-45', np.nan])

# str.isupper() can yield NaN (rather than False) for the missing entry.
# Comparing with True gives a strictly boolean mask, so `where` never has to
# coerce NaN in its condition. The mask is bound to a name so it is computed
# once and can be inspected, instead of being evaluated and discarded mid-cell.
upper_mask = ser.str.isupper().eq(True)

# Keep only the all-uppercase strings; every other entry becomes NaN.
ser.where(upper_mask)






np.int64(1)

## Sequence of numbers:
* range([start], stop, [step])

    range zwraca obiekt typu range, który reprezentuje sekwencję liczb całkowitych; np. range(5) to liczby od 0 do 4 (czyli 0, 1, 2, 3, 4).

    To nie jest lista ani iterator, tylko leniwa sekwencja (obiekt iterowalny) - wartości są wyliczane na żądanie, więc obiekt zajmuje mało pamięci. Aby zobaczyć wartości, możesz:

    Przekonwertować do listy: list(range(5)) → [0, 1, 2, 3, 4]
    Iterować w pętli: for i in range(5): print(i)
    Użyć w funkcjach akceptujących obiekty iterowalne

* np.arange([start], stop, [step]) : https://numpy.org/doc/2.3/reference/generated/numpy.arange.html

    zwraca np.ndarray z wartościami od start (włącznie) do stop (wyłącznie) z krokiem step. Domyślnie start=0 i step=1

* np.linspace(start, stop, num=50, ...): https://numpy.org/doc/2.3/reference/generated/numpy.linspace.html

    zwraca num wartości równomiernie rozmieszczonych między start a stop (domyślnie włącznie ze stop; endpoint=False pomija stop). Domyślnie num=50

In [109]:
# A bare range object displays as range(start, stop) rather than listing its values.
range(5)


range(0, 5)

In [118]:
# NOTE(review): same expression as the preceding cell; looks like a duplicate demonstration.
range(5)


range(0, 5)

In [113]:
# Materialize the range into a list to see the actual values.
list(range(5))

[0, 1, 2, 3, 4]

In [None]:
# Iterate over the range directly; end=' ' keeps all values on one output line.
for i in range(5):
    print(i, end = ' ')

0 1 2 3 4 

In [110]:
# NumPy counterpart of range(5): returns an array holding 0..4.
np.arange(5)

array([0, 1, 2, 3, 4])

In [111]:
# 5 evenly spaced values from 0 to 5, with both endpoints included.
np.linspace(0, 5, num=5)

array([0.  , 1.25, 2.5 , 3.75, 5.  ])

In [112]:
# Same grid request, but with the stop value excluded: 5 values starting at 0 with step 1.
np.linspace(0, 5, num=5, endpoint=False)

array([0., 1., 2., 3., 4.])