In [1]:
import pandas as pd
import numpy as np

In [2]:
mylist = list('abcedfghijklmnopqrstuvwxyz')
myarr = np.arange(26)
mydict = dict(zip(mylist, myarr))
ser = pd.Series(mydict)


## How to convert the index of a series into a column of a dataframe?

In [3]:
df = ser.to_frame().reset_index()
print(df.head())

  index  0
0     a  0
1     b  1
2     c  2
3     e  3
4     d  4


## How to combine many series to form a dataframe?

In [4]:
ser1 = pd.Series(list('abcedfghijklmnopqrstuvwxyz'))
ser2 = pd.Series(np.arange(26))

In [5]:
df = pd.concat([ser1, ser2], axis=1)
print(df.head())

   0  1
0  a  0
1  b  1
2  c  2
3  e  3
4  d  4


## How to assign name to the series’ index?

In [6]:
ser = pd.Series(list('abcedfghijklmnopqrstuvwxyz'))
ser.name = 'alphabets'
ser.head()

0    a
1    b
2    c
3    e
4    d
Name: alphabets, dtype: object

## How to get the items of series A not present in series B?

In [7]:
ser1 = pd.Series([1, 2, 3, 4, 5])
ser2 = pd.Series([4, 5, 6, 7, 8])
ser1.name = 'ser1'
# isin() valida se o valor está presente na série, e o ~ inverte o resultado
ser1[~ser1.isin(ser2)]

0    1
1    2
2    3
Name: ser1, dtype: int64

## How to get the items not common to both series A and series B?

In [8]:
ser1 = pd.Series([1, 2, 3, 4, 5])
ser2 = pd.Series([4, 5, 6, 7, 8])

In [9]:
ser_u = pd.Series(np.union1d(ser1, ser2))  # union / unique no repeat
ser_i = pd.Series(np.intersect1d(ser1, ser2))  # intersect / common
print(ser_i)

0    4
1    5
dtype: int64


In [10]:
ser_u[~ser_u.isin(ser_i)]

0    1
1    2
2    3
5    6
6    7
7    8
dtype: int64

In [11]:
~ser_u.isin(ser_i)

0     True
1     True
2     True
3    False
4    False
5     True
6     True
7     True
dtype: bool

In [12]:
ser_u.isin(ser_i)

0    False
1    False
2    False
3     True
4     True
5    False
6    False
7    False
dtype: bool

## How to get the minimum, 25th percentile, median, 75th, and max of a numeric series?

In [13]:
ser = pd.Series(np.random.normal(10, 5, 25))

In [14]:
print(ser)

0      9.164891
1      4.621890
2      4.261217
3     15.964626
4      1.204652
5     12.030346
6      4.722124
7      5.856227
8     20.578387
9      8.718009
10    14.912550
11    -0.235066
12    -2.732618
13    -0.310633
14     2.930576
15    16.271006
16    12.420679
17    14.192814
18     8.620373
19    13.406398
20    11.580480
21     5.462625
22     6.477577
23    17.648936
24    21.167628
dtype: float64


In [15]:
np.percentile(ser, q=[0, 25, 50, 75, 100])

array([-2.73261767,  4.62189028,  8.71800926, 14.19281434, 21.16762767])

In [16]:
ser.describe()

count    25.000000
mean      9.157428
std       6.654319
min      -2.732618
25%       4.621890
50%       8.718009
75%      14.192814
max      21.167628
dtype: float64

## How to get frequency counts of unique items of a series?

In [17]:
ser = pd.Series(np.take(list('abcdefgh'), np.random.randint(8, size=30)))

In [18]:
ser.value_counts()

h    6
f    5
a    5
e    4
c    3
d    3
b    2
g    2
Name: count, dtype: int64

# How to keep only top 2 most frequent values as it is and replace everything else as Other?

In [19]:
np.random.RandomState(100)
#cria uma Series chamada ser que contém 12 números inteiros aleatórios entre 1 e 4.
ser = pd.Series(np.random.randint(1, 5, [12]))

In [20]:
# ser.value_counts().index[:2] retorna os dois índices com as maiores contagens.
# ~ser.isin(ser.value_counts().index[:2]) retorna uma Series booleana que é True para os valores em ser que não estão nos dois índices com as maiores contagens
# ser[~ser.isin(ser.value_counts().index[:2])] = 'Other' substitui todos os valores em ser que não estão nos dois índices com as maiores contagens por 'Other'.

In [21]:
print("Top 2 freq:", ser.value_counts())
ser[~ser.isin(ser.value_counts().index[:2])] = 'Other'
ser

Top 2 freq: 2    7
4    2
1    2
3    1
Name: count, dtype: int64


  ser[~ser.isin(ser.value_counts().index[:2])] = 'Other'


0         2
1         2
2         2
3         2
4         2
5         2
6         4
7         4
8     Other
9     Other
10    Other
11        2
dtype: object

# How to bin a numeric series to 10 groups of equal size?

In [22]:
ser = pd.Series(np.random.random(20))
print(ser.head())

0    0.126737
1    0.119945
2    0.541650
3    0.668392
4    0.749375
dtype: float64


In [23]:
pd.qcut(ser, q=[0, .10, .20, .3, .4, .5, .6, .7, .8, .9, 1], labels=['1st', '2nd', '3rd', '4th', '5th', '6th', '7th', '8th', '9th', '10th'])

0      2nd
1      2nd
2      6th
3      7th
4      8th
5      1st
6     10th
7      5th
8      8th
9      4th
10    10th
11     9th
12     7th
13     3rd
14     5th
15     9th
16     4th
17     1st
18     6th
19     3rd
dtype: category
Categories (10, object): ['1st' < '2nd' < '3rd' < '4th' ... '7th' < '8th' < '9th' < '10th']

# How to convert a numpy array to a dataframe of given shape?

In [24]:
ser = pd.Series(np.random.randint(1, 10, 35))
print(ser.head())

0    8
1    5
2    7
3    8
4    2
dtype: int64


In [25]:
teste = ser.values.reshape(7, 5)
print(teste)

[[8 5 7 8 2]
 [7 7 3 2 5]
 [1 5 9 5 4]
 [5 7 5 4 8]
 [4 8 3 6 3]
 [6 4 3 8 4]
 [6 7 1 1 7]]


In [26]:
print(teste[1])

[7 7 3 2 5]


In [27]:
df = pd.DataFrame(ser.values.reshape(7, 5))

In [28]:
df

Unnamed: 0,0,1,2,3,4
0,8,5,7,8,2
1,7,7,3,2,5
2,1,5,9,5,4
3,5,7,5,4,8
4,4,8,3,6,3
5,6,4,3,8,4
6,6,7,1,1,7


# How to find the positions of numbers that are multiples of 3 from a series?

In [29]:
# cria uma Series chamada ser com 7 valores aleatórios entre 1 e 10.
ser = pd.Series(np.random.randint(1, 10, 7))
ser.head(7)

0    6
1    2
2    1
3    5
4    8
5    3
6    6
dtype: int64

In [30]:
np.argwhere(ser % 3==0) #retorna os índices dos valores em ser que são múltiplos de 3.

array([[0],
       [5],
       [6]])

# How to extract items at given positions from a series

In [31]:
# # ser = pd.Series(list('abcdefghijklmnopqrstuvwxyz'))
# # ser = pd.Series(np.array(list('abcdefghijklmnopqrstuvwxyz')))
# ser = pd.Series(np.array(list('abcdefghijklmnopqrstuvwxyz')))
# pos = [0, 4, 8, 14, 20]
# # pos

In [32]:
# ser = pd.Series(np.array(list('abcdefghijklmnopqrstuvwxyz')))
# pos = np.array(pos)
# 
# print(ser[pos])


# How to stack two series vertically and horizontally ?

In [33]:
ser1 = pd.Series(range(5))
ser2 = pd.Series(['a', 'b', 'c', 'd', 'e'])

In [34]:
pd.concat([ser1, ser2], axis=1)

Unnamed: 0,0,1
0,0,a
1,1,b
2,2,c
3,3,d
4,4,e


# How to get the positions of items of series A in another series B?

In [35]:
ser1 = pd.Series([10, 9, 6, 5, 3, 1, 12, 8, 13])
ser2 = pd.Series([1, 3, 10, 13])

In [36]:
[np.where(i == ser1)[0].tolist()[0] for i in ser2]

[5, 4, 0, 8]

In [37]:
[pd.Index(ser1).get_loc(i) for i in ser2]

[5, 4, 0, 8]

In [38]:
for i in ser2:
    print(i)

1
3
10
13


In [39]:
pd.Index(ser1).get_loc(9)

1

# How to compute the mean squared error on a truth and predicted series?

In [40]:
truth = pd.Series(range(10))
pred = pd.Series(range(10)) + np.random.random(10) # cria uma Series com 10 valores, começando em 0 e terminando em 9. A cada valor é adicionado um valor aleatório entre 0 e 1.

In [41]:
np.mean((truth-pred)**2)

0.26738835270582173

In [42]:
truth - pred

0   -0.561336
1   -0.202666
2   -0.216564
3   -0.887173
4   -0.485689
5   -0.480932
6   -0.290807
7   -0.407182
8   -0.289117
9   -0.826192
dtype: float64

In [43]:
(truth - pred)**2

0    0.315098
1    0.041073
2    0.046900
3    0.787076
4    0.235893
5    0.231295
6    0.084569
7    0.165797
8    0.083589
9    0.682593
dtype: float64

# How to convert the first character of each element in a series to uppercase?

In [44]:
ser = pd.Series(['how', 'to', 'kick', 'ass?'])

In [45]:
ser.map(lambda x: x.title())

0     How
1      To
2    Kick
3    Ass?
dtype: object

In [46]:
ser.map(lambda x: x[0].upper() + x[1:])

0     How
1      To
2    Kick
3    Ass?
dtype: object

In [47]:
pd.Series([i.title() for i in ser])

0     How
1      To
2    Kick
3    Ass?
dtype: object

# How to calculate the number of character in each word in a series

In [48]:
ser = pd.Series(['how', 'to', 'kick', 'ass?'])

In [49]:
ser.map(lambda x: len(x))

0    3
1    2
2    4
3    4
dtype: int64

# How to compute difference of differences between consequtive numbers of a series?

In [50]:
ser = pd.Series([1, 3, 6, 10, 15, 21, 27, 35])

In [51]:
print(ser.diff().tolist())

[nan, 2.0, 3.0, 4.0, 5.0, 6.0, 6.0, 8.0]


In [52]:
print(ser.diff().diff().tolist())

[nan, nan, 1.0, 1.0, 1.0, 1.0, 0.0, 2.0]


# How to convert a series of date-string to a timeseries?

In [57]:
ser = pd.Series(['01 Jan 2010', '02-02-2011', '20120303', '2013/04/04', '2014-05-05', '2015-06-06T12:20'])

In [62]:
pd.to_datetime(ser)

ValueError: time data "02-02-2011" doesn't match format "%d %b %Y", at position 1. You might want to try:
    - passing `format` if your strings have a consistent format;
    - passing `format='ISO8601'` if your strings are all ISO8601 but not necessarily in exactly the same format;
    - passing `format='mixed'`, and the format will be inferred for each element individually. You might want to use `dayfirst` alongside this.

In [70]:
from dateutil.parser import parse
ser.map(lambda x: parse(x))

0   2010-01-01 00:00:00
1   2011-02-02 00:00:00
2   2012-03-03 00:00:00
3   2013-04-04 00:00:00
4   2014-05-05 00:00:00
5   2015-06-06 12:20:00
dtype: datetime64[ns]

# How to get the day of month, week number, day of year and day of week from a series of date strings?

In [71]:
ser = pd.Series(['01 Jan 2010', '02-02-2011', '20120303', '2013/04/04', '2014-05-05', '2015-06-06T12:20'])

In [72]:
ser_ts = ser.map(lambda x: parse(x))
ser_ts

0   2010-01-01 00:00:00
1   2011-02-02 00:00:00
2   2012-03-03 00:00:00
3   2013-04-04 00:00:00
4   2014-05-05 00:00:00
5   2015-06-06 12:20:00
dtype: datetime64[ns]

In [80]:
print("Date: ", ser_ts.dt.day.tolist())
print("Week number: ", ser_ts.dt.isocalendar().week.tolist())
print("day number of year: ",ser_ts.dt.dayofyear.tolist())
print("Day of week: ", ser_ts.dt.day_name().tolist())

Date:  [1, 2, 3, 4, 5, 6]
Week number:  [53, 5, 9, 14, 19, 23]
day number of year:  [1, 33, 63, 94, 125, 157]
Day of week:  ['Friday', 'Wednesday', 'Saturday', 'Thursday', 'Monday', 'Saturday']


# How to convert year-month string to dates corresponding to the 4th day of the month?

In [81]:
ser = pd.Series(['Jan 2010', 'Feb 2011', 'Mar 2012'])

In [83]:
ser_ts = ser.map(lambda x: parse('04 ' + x))
ser_ts

0   2010-01-04
1   2011-02-04
2   2012-03-04
dtype: datetime64[ns]

# How to filter words that contain atleast 2 vowels from a series?

In [84]:
ser = pd.Series(['Apple', 'Orange', 'Plan', 'Python', 'Money'])

In [85]:
from collections import Counter
mask = ser.map(lambda x: sum([Counter(x.lower()).get(i, 0) for i in list('aeiou')]) >= 2)
ser[mask]

0     Apple
1    Orange
4     Money
dtype: object

# How to filter valid emails from a series?

In [86]:
emails = pd.Series(['buying books at amazom.com', 'rameses@egypt.com', 'matt@t.co', 'narendra@modi.com'])
pattern = '[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Za-z]{2,4}'

In [90]:
import re
emails.str.findall(pattern, flags=re.IGNORECASE)

0                     []
1    [rameses@egypt.com]
2            [matt@t.co]
3    [narendra@modi.com]
dtype: object

In [91]:
[x[0] for x in [x for x in emails.str.findall(pattern) if len(x) > 0]]

['rameses@egypt.com', 'matt@t.co', 'narendra@modi.com']

# How to get the mean of a series grouped by another series?

In [92]:
fruit = pd.Series(np.random.choice(['apple', 'banana', 'carrot'], 10))
weights = pd.Series(np.linspace(1, 10, 10)) # cria uma Series com 10 valores, começando em 1 e terminando em 10.
print(weights.tolist())
print(fruit.tolist())

[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0]
['apple', 'carrot', 'carrot', 'carrot', 'carrot', 'apple', 'banana', 'banana', 'carrot', 'apple']


In [95]:
weights.groupby(fruit).mean()

apple     5.666667
banana    7.500000
carrot    4.600000
dtype: float64

# How to compute the euclidean distance between two series?

In [96]:
p = pd.Series([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
q = pd.Series([10, 9, 8, 7, 6, 5, 4, 3, 2, 1])

In [97]:
sum((p - q)**2)**.5

18.16590212458495

In [98]:
np.linalg.norm(p-q)

18.16590212458495

In [99]:
p-q

0   -9
1   -7
2   -5
3   -3
4   -1
5    1
6    3
7    5
8    7
9    9
dtype: int64

# How to find all the local maxima (or peaks) in a numeric series?

In [100]:
ser = pd.Series([2, 10, 3, 4, 9, 10, 2, 7, 3])

In [101]:
dd = np.diff(np.sign(np.diff(ser)))
peak_locs = np.where(dd == -2)[0] + 1
peak_locs

array([1, 5, 7])

In [102]:
ser[peak_locs]

1    10
5    10
7     7
dtype: int64

In [103]:
np.diff(ser)

array([ 8, -7,  1,  5,  1, -8,  5, -4])

In [104]:
np.sign(np.diff(ser))

array([ 1, -1,  1,  1,  1, -1,  1, -1])

In [105]:
np.diff(np.sign(np.diff(ser)))

array([-2,  2,  0,  0, -2,  2, -2])

# How to replace missing spaces in a string with the least frequent character?

In [106]:
my_str = 'dbc deb abed gade'

In [108]:
ser = pd.Series(list('dbc deb abed gade'))
freq = ser.value_counts()
freq

d    4
b    3
     3
e    3
a    2
c    1
g    1
Name: count, dtype: int64

In [113]:
least_freq = freq.dropna().index[-1]
"".join(ser.replace(' ', least_freq))

'dbcgdebgabedggade'

In [114]:
print(least_freq)

g


# How to create a TimeSeries starting ‘2000-01-01’ and 10 weekends (saturdays) after that having random numbers as values?

In [116]:
ser = pd.Series(np.random.randint(1, 10, 10), pd.date_range('2000-01-01', periods=10, freq='W-SAT'))
ser

2000-01-01    3
2000-01-08    2
2000-01-15    7
2000-01-22    4
2000-01-29    3
2000-02-05    2
2000-02-12    4
2000-02-19    5
2000-02-26    8
2000-03-04    5
Freq: W-SAT, dtype: int64

In [118]:
np.random.randint(1, 10, 10) # cria uma Series com 10 valores aleatórios entre 1 e 10.
pd.date_range('2000-01-01', periods=10, freq='W-SAT') # cria uma Series com 10 datas, começando em 2000-01-01 e terminando em 2000-03-04, com frequência semanal aos sábados.

DatetimeIndex(['2000-01-01', '2000-01-08', '2000-01-15', '2000-01-22',
               '2000-01-29', '2000-02-05', '2000-02-12', '2000-02-19',
               '2000-02-26', '2000-03-04'],
              dtype='datetime64[ns]', freq='W-SAT')