In [2]:
import pandas as pd
import numpy as np

In [3]:
mylist = list('abcedfghijklmnopqrstuvwxyz')
myarr = np.arange(26)
mydict = dict(zip(mylist, myarr))
ser = pd.Series(mydict)


## How to convert the index of a series into a column of a dataframe?

In [4]:
df = ser.to_frame().reset_index()
print(df.head())

  index  0
0     a  0
1     b  1
2     c  2
3     e  3
4     d  4


## How to combine many series to form a dataframe?

In [5]:
ser1 = pd.Series(list('abcedfghijklmnopqrstuvwxyz'))
ser2 = pd.Series(np.arange(26))

In [6]:
df = pd.concat([ser1, ser2], axis=1)
print(df.head())

   0  1
0  a  0
1  b  1
2  c  2
3  e  3
4  d  4


## How to assign name to the series’ index?

In [7]:
ser = pd.Series(list('abcedfghijklmnopqrstuvwxyz'))
ser.name = 'alphabets'
ser.head()

0    a
1    b
2    c
3    e
4    d
Name: alphabets, dtype: object

## How to get the items of series A not present in series B?

In [8]:
ser1 = pd.Series([1, 2, 3, 4, 5])
ser2 = pd.Series([4, 5, 6, 7, 8])
ser1.name = 'ser1'
# isin() valida se o valor está presente na série, e o ~ inverte o resultado
ser1[~ser1.isin(ser2)]

0    1
1    2
2    3
Name: ser1, dtype: int64

## How to get the items not common to both series A and series B?

In [9]:
ser1 = pd.Series([1, 2, 3, 4, 5])
ser2 = pd.Series([4, 5, 6, 7, 8])

In [10]:
ser_u = pd.Series(np.union1d(ser1, ser2))  # union / unique no repeat
ser_i = pd.Series(np.intersect1d(ser1, ser2))  # intersect / common
print(ser_i)

0    4
1    5
dtype: int64


In [11]:
ser_u[~ser_u.isin(ser_i)]

0    1
1    2
2    3
5    6
6    7
7    8
dtype: int64

In [12]:
~ser_u.isin(ser_i)

0     True
1     True
2     True
3    False
4    False
5     True
6     True
7     True
dtype: bool

In [13]:
ser_u.isin(ser_i)

0    False
1    False
2    False
3     True
4     True
5    False
6    False
7    False
dtype: bool

## How to get the minimum, 25th percentile, median, 75th, and max of a numeric series?

In [14]:
ser = pd.Series(np.random.normal(10, 5, 25))

In [15]:
print(ser)

0      8.110420
1     12.107878
2     15.782444
3      9.963149
4     14.486342
5     12.802871
6     19.120164
7     -1.814310
8      2.293028
9     10.724034
10     2.092178
11     9.677246
12    14.617441
13    11.961504
14    15.179017
15    11.568372
16    15.967326
17     3.173345
18     2.948116
19     7.996335
20    14.065631
21     9.829216
22     7.610574
23    11.891953
24    -1.365075
dtype: float64


In [16]:
np.percentile(ser, q=[0, 25, 50, 75, 100])

array([-1.81431036,  7.61057427, 10.72403446, 14.06563064, 19.12016418])

In [17]:
ser.describe()

count    25.000000
mean      9.631568
std       5.615361
min      -1.814310
25%       7.610574
50%      10.724034
75%      14.065631
max      19.120164
dtype: float64

## How to get frequency counts of unique items of a series?

In [18]:
ser = pd.Series(np.take(list('abcdefgh'), np.random.randint(8, size=30)))

In [19]:
ser.value_counts()

f    5
g    5
b    5
h    4
c    4
d    4
e    2
a    1
Name: count, dtype: int64

# How to keep only top 2 most frequent values as it is and replace everything else as Other?

In [20]:
np.random.RandomState(100)
#cria uma Series chamada ser que contém 12 números inteiros aleatórios entre 1 e 4.
ser = pd.Series(np.random.randint(1, 5, [12]))

In [21]:
# ser.value_counts().index[:2] retorna os dois índices com as maiores contagens.
# ~ser.isin(ser.value_counts().index[:2]) retorna uma Series booleana que é True para os valores em ser que não estão nos dois índices com as maiores contagens
# ser[~ser.isin(ser.value_counts().index[:2])] = 'Other' substitui todos os valores em ser que não estão nos dois índices com as maiores contagens por 'Other'.

In [22]:
print("Top 2 freq:", ser.value_counts())
ser[~ser.isin(ser.value_counts().index[:2])] = 'Other'
ser

Top 2 freq: 4    5
3    3
1    3
2    1
Name: count, dtype: int64


  ser[~ser.isin(ser.value_counts().index[:2])] = 'Other'


0         4
1         3
2         3
3     Other
4     Other
5     Other
6     Other
7         4
8         4
9         3
10        4
11        4
dtype: object

# How to bin a numeric series to 10 groups of equal size?

In [23]:
ser = pd.Series(np.random.random(20))
print(ser.head())

0    0.186864
1    0.697595
2    0.829726
3    0.421755
4    0.020442
dtype: float64


In [24]:
pd.qcut(ser, q=[0, .10, .20, .3, .4, .5, .6, .7, .8, .9, 1], labels=['1st', '2nd', '3rd', '4th', '5th', '6th', '7th', '8th', '9th', '10th'])

0      2nd
1      6th
2      9th
3      4th
4      1st
5      2nd
6      4th
7      3rd
8      5th
9      7th
10     3rd
11     8th
12     5th
13    10th
14     1st
15     7th
16     8th
17    10th
18     6th
19     9th
dtype: category
Categories (10, object): ['1st' < '2nd' < '3rd' < '4th' ... '7th' < '8th' < '9th' < '10th']

# How to convert a numpy array to a dataframe of given shape?

In [25]:
ser = pd.Series(np.random.randint(1, 10, 35))
print(ser.head())

0    7
1    9
2    6
3    3
4    8
dtype: int64


In [26]:
teste = ser.values.reshape(7, 5)
print(teste)

[[7 9 6 3 8]
 [4 3 1 4 9]
 [1 3 3 4 1]
 [4 9 9 1 1]
 [1 3 3 4 2]
 [4 5 3 2 3]
 [5 6 8 7 1]]


In [27]:
print(teste[1])

[4 3 1 4 9]


In [28]:
df = pd.DataFrame(ser.values.reshape(7, 5))

In [29]:
df

Unnamed: 0,0,1,2,3,4
0,7,9,6,3,8
1,4,3,1,4,9
2,1,3,3,4,1
3,4,9,9,1,1
4,1,3,3,4,2
5,4,5,3,2,3
6,5,6,8,7,1


# How to find the positions of numbers that are multiples of 3 from a series?

In [30]:
# cria uma Series chamada ser com 7 valores aleatórios entre 1 e 10.
ser = pd.Series(np.random.randint(1, 10, 7))
ser.head(7)

0    3
1    6
2    5
3    9
4    4
5    9
6    7
dtype: int64

In [31]:
np.argwhere(ser % 3==0) #retorna os índices dos valores em ser que são múltiplos de 3.

array([[0],
       [1],
       [3],
       [5]])

# How to extract items at given positions from a series

In [32]:
# # ser = pd.Series(list('abcdefghijklmnopqrstuvwxyz'))
# # ser = pd.Series(np.array(list('abcdefghijklmnopqrstuvwxyz')))
# ser = pd.Series(np.array(list('abcdefghijklmnopqrstuvwxyz')))
# pos = [0, 4, 8, 14, 20]
# # pos

In [33]:
# ser = pd.Series(np.array(list('abcdefghijklmnopqrstuvwxyz')))
# pos = np.array(pos)
# 
# print(ser[pos])


# How to stack two series vertically and horizontally ?

In [34]:
ser1 = pd.Series(range(5))
ser2 = pd.Series(['a', 'b', 'c', 'd', 'e'])

In [35]:
pd.concat([ser1, ser2], axis=1)

Unnamed: 0,0,1
0,0,a
1,1,b
2,2,c
3,3,d
4,4,e


# How to get the positions of items of series A in another series B?

In [36]:
ser1 = pd.Series([10, 9, 6, 5, 3, 1, 12, 8, 13])
ser2 = pd.Series([1, 3, 10, 13])

In [37]:
[np.where(i == ser1)[0].tolist()[0] for i in ser2]

[5, 4, 0, 8]

In [38]:
[pd.Index(ser1).get_loc(i) for i in ser2]

[5, 4, 0, 8]

In [39]:
for i in ser2:
    print(i)

1
3
10
13


In [40]:
pd.Index(ser1).get_loc(9)

1

# How to compute the mean squared error on a truth and predicted series?

In [41]:
truth = pd.Series(range(10))
pred = pd.Series(range(10)) + np.random.random(10) # cria uma Series com 10 valores, começando em 0 e terminando em 9. A cada valor é adicionado um valor aleatório entre 0 e 1.

In [42]:
np.mean((truth-pred)**2)

0.30452815062405236

In [43]:
truth - pred

0   -0.584388
1   -0.561688
2   -0.174896
3   -0.746742
4   -0.223972
5   -0.961787
6   -0.372991
7   -0.515433
8   -0.378505
9   -0.526126
dtype: float64

In [44]:
(truth - pred)**2

0    0.341509
1    0.315494
2    0.030589
3    0.557624
4    0.050164
5    0.925033
6    0.139122
7    0.265672
8    0.143266
9    0.276809
dtype: float64

# How to convert the first character of each element in a series to uppercase?

In [45]:
ser = pd.Series(['how', 'to', 'kick', 'ass?'])

In [46]:
ser.map(lambda x: x.title())

0     How
1      To
2    Kick
3    Ass?
dtype: object

In [47]:
ser.map(lambda x: x[0].upper() + x[1:])

0     How
1      To
2    Kick
3    Ass?
dtype: object

In [48]:
pd.Series([i.title() for i in ser])

0     How
1      To
2    Kick
3    Ass?
dtype: object

# How to calculate the number of character in each word in a series

In [49]:
ser = pd.Series(['how', 'to', 'kick', 'ass?'])

In [50]:
ser.map(lambda x: len(x))

0    3
1    2
2    4
3    4
dtype: int64

# How to compute difference of differences between consequtive numbers of a series?

In [51]:
ser = pd.Series([1, 3, 6, 10, 15, 21, 27, 35])

In [52]:
print(ser.diff().tolist())

[nan, 2.0, 3.0, 4.0, 5.0, 6.0, 6.0, 8.0]


In [53]:
print(ser.diff().diff().tolist())

[nan, nan, 1.0, 1.0, 1.0, 1.0, 0.0, 2.0]
