In [24]:
import pandas as pd
import numpy as np

In [25]:
mylist = list('abcedfghijklmnopqrstuvwxyz')
myarr = np.arange(26)
mydict = dict(zip(mylist, myarr))
ser = pd.Series(mydict)


## How to convert the index of a series into a column of a dataframe?

In [26]:
df = ser.to_frame().reset_index()
print(df.head())

  index  0
0     a  0
1     b  1
2     c  2
3     e  3
4     d  4


## How to combine many series to form a dataframe?

In [27]:
ser1 = pd.Series(list('abcedfghijklmnopqrstuvwxyz'))
ser2 = pd.Series(np.arange(26))

In [28]:
df = pd.concat([ser1, ser2], axis=1)
print(df.head())

   0  1
0  a  0
1  b  1
2  c  2
3  e  3
4  d  4


## How to assign name to the series’ index?

In [29]:
ser = pd.Series(list('abcedfghijklmnopqrstuvwxyz'))
ser.name = 'alphabets'
ser.head()

0    a
1    b
2    c
3    e
4    d
Name: alphabets, dtype: object

## How to get the items of series A not present in series B?

In [30]:
ser1 = pd.Series([1, 2, 3, 4, 5])
ser2 = pd.Series([4, 5, 6, 7, 8])
ser1.name = 'ser1'
# isin() valida se o valor está presente na série, e o ~ inverte o resultado
ser1[~ser1.isin(ser2)]

0    1
1    2
2    3
Name: ser1, dtype: int64

## How to get the items not common to both series A and series B?

In [31]:
ser1 = pd.Series([1, 2, 3, 4, 5])
ser2 = pd.Series([4, 5, 6, 7, 8])

In [32]:
ser_u = pd.Series(np.union1d(ser1, ser2))  # union / unique no repeat
ser_i = pd.Series(np.intersect1d(ser1, ser2))  # intersect / common
print(ser_i)

0    4
1    5
dtype: int64


In [33]:
ser_u[~ser_u.isin(ser_i)]

0    1
1    2
2    3
5    6
6    7
7    8
dtype: int64

In [34]:
~ser_u.isin(ser_i)

0     True
1     True
2     True
3    False
4    False
5     True
6     True
7     True
dtype: bool

In [35]:
ser_u.isin(ser_i)

0    False
1    False
2    False
3     True
4     True
5    False
6    False
7    False
dtype: bool

## How to get the minimum, 25th percentile, median, 75th, and max of a numeric series?

In [36]:
ser = pd.Series(np.random.normal(10, 5, 25))

In [37]:
print(ser)

0      7.297011
1      8.530715
2      6.466971
3      9.066038
4      8.741166
5      9.621535
6     13.170319
7     11.700705
8      6.460049
9      7.993254
10     1.964238
11    17.779239
12    13.535066
13     3.843639
14    23.118758
15     5.523505
16     5.065392
17     7.541536
18     6.057875
19     6.553236
20    10.706587
21     8.177666
22    13.571916
23     5.164086
24     8.352853
dtype: float64


In [38]:
np.percentile(ser, q=[0, 25, 50, 75, 100])

array([ 1.96423848,  6.46004877,  8.17766611, 10.7065872 , 23.11875793])

In [39]:
ser.describe()

count    25.000000
mean      9.040134
std       4.560412
min       1.964238
25%       6.460049
50%       8.177666
75%      10.706587
max      23.118758
dtype: float64

## How to get frequency counts of unique items of a series?

In [40]:
ser = pd.Series(np.take(list('abcdefgh'), np.random.randint(8, size=30)))

In [43]:
ser.value_counts()

g    7
d    6
b    5
a    4
f    3
h    3
e    2
Name: count, dtype: int64

# How to keep only top 2 most frequent values as it is and replace everything else as Other?

In [45]:
np.random.RandomState(100)
#cria uma Series chamada ser que contém 12 números inteiros aleatórios entre 1 e 4.
ser = pd.Series(np.random.randint(1, 5, [12]))

In [55]:
# ser.value_counts().index[:2] retorna os dois índices com as maiores contagens.
# ~ser.isin(ser.value_counts().index[:2]) retorna uma Series booleana que é True para os valores em ser que não estão nos dois índices com as maiores contagens
# ser[~ser.isin(ser.value_counts().index[:2])] = 'Other' substitui todos os valores em ser que não estão nos dois índices com as maiores contagens por 'Other'.

In [57]:
print("Top 2 freq:", ser.value_counts())
ser[~ser.isin(ser.value_counts().index[:2])] = 'Other'
ser

Top 2 freq: Other    7
2        5
Name: count, dtype: int64


0     Other
1         2
2     Other
3     Other
4     Other
5         2
6     Other
7     Other
8         2
9         2
10    Other
11        2
dtype: object

# How to bin a numeric series to 10 groups of equal size?

In [60]:
ser = pd.Series(np.random.random(20))
print(ser.head())

0    0.052151
1    0.618334
2    0.324701
3    0.601158
4    0.655394
dtype: float64


In [62]:
pd.qcut(ser, q=[0, .10, .20, .3, .4, .5, .6, .7, .8, .9, 1], labels=['1st', '2nd', '3rd', '4th', '5th', '6th', '7th', '8th', '9th', '10th'])

0      1st
1      8th
2      4th
3      7th
4      8th
5      5th
6      6th
7      1st
8      2nd
9      3rd
10     4th
11     6th
12     3rd
13    10th
14     9th
15     5th
16    10th
17     9th
18     2nd
19     7th
dtype: category
Categories (10, object): ['1st' < '2nd' < '3rd' < '4th' ... '7th' < '8th' < '9th' < '10th']