21. How to convert a series of date-strings to a timeseries

In [1]:
import pandas as pd
import numpy as np



In [2]:
ser = pd.Series(['01 Jan 2010', '02-02-2011', '20120303', '2013/04/04', '2014-05-05', '2015-06-06T12:20'])

In [3]:
ser = pd.to_datetime(ser)

In [4]:
ser

0   2010-01-01 00:00:00
1   2011-02-02 00:00:00
2   2012-03-03 00:00:00
3   2013-04-04 00:00:00
4   2014-05-05 00:00:00
5   2015-06-06 12:20:00
dtype: datetime64[ns]

22. How to get the day of month, week number, day of year and day of week from a series of date strings

In [5]:
ser = pd.Series(['01 Jan 2010', '02-02-2011', '20120303', '2013/04/04', '2014-05-05', '2015-06-06T12:20'])


In [6]:
ser_2 = pd.to_datetime(ser)


In [7]:
ser_2.dt.dayofweek

0    4
1    2
2    5
3    3
4    0
5    5
dtype: int64

In [8]:
ser_2.dt.isocalendar().week


0    53
1     5
2     9
3    14
4    19
5    23
Name: week, dtype: UInt32

23. How to convert year-month string to dates corresponding to the 4th day of the month

In [9]:
ser = pd.Series(['Jan 2010', 'Feb 2011', 'Mar 2012'])


In [10]:
ser_2 = pd.to_datetime(ser)

In [11]:
ser_2

0   2010-01-01
1   2011-02-01
2   2012-03-01
dtype: datetime64[ns]

In [13]:
ser_2.apply(lambda x: x+pd.DateOffset(3))

0   2010-01-04
1   2011-02-04
2   2012-03-04
dtype: datetime64[ns]

s

24. How to filter words that contain atleast 2 vowels from a series

In [82]:
ser = pd.Series(['Apple', 'Orange', 'Plan', 'Python', 'Money'])


In [83]:
# check how many vovels in word
def two_vovels(word):
    num_vovels = 0
    for char in word:
        if char in "aeiouAEIOU":
            num_vovels = num_vovels + 1
    return (num_vovels == 2)


In [84]:
df_ser = ser.to_frame()

In [85]:
df_ser.rename(columns={0:'wrd'}, inplace=True)

In [86]:
df_ser[df_ser['wrd'].apply(two_vovels)]


Unnamed: 0,wrd
0,Apple
4,Money


25. How to filter valid emails from a series

In [118]:
emails = pd.Series(['buying books at amazom.com', 'rameses@egypt.com', 'matt@t.co', 'narendra@modi.com'])
pattern ='[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Za-z]{2,4}'

In [170]:
df_emails = emails.to_frame().reset_index()

In [166]:

def check_email(sentence):
    words= sentence.split()
    if len(words) > 1:
        return False
    else:
        return bool(re.match(pattern,words[0]))

In [174]:
check_email('rameses@egypt.com')

True

In [172]:
df_emails[df_emails[0].apply(check_email)]

Unnamed: 0,index,0
1,1,rameses@egypt.com
2,2,matt@t.co
3,3,narendra@modi.com


In [171]:
df_emails

Unnamed: 0,index,0
0,0,buying books at amazom.com
1,1,rameses@egypt.com
2,2,matt@t.co
3,3,narendra@modi.com


26. How to get the mean of a series grouped by another series
Compute the mean of weights of each fruit.


In [176]:
fruit = pd.Series(np.random.choice(['apple', 'banana', 'carrot'], 10))
weight = pd.Series(np.linspace(1, 10, 10))
print(weight.tolist())
print(fruit.tolist())
#> [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0]
#> ['banana', 'carrot', 'apple', 'carrot', 'carrot', 'apple', 'banana', 'carrot', 'apple', 'carrot']

[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0]
['apple', 'apple', 'carrot', 'banana', 'apple', 'banana', 'carrot', 'apple', 'carrot', 'carrot']


In [181]:
df_frukt = pd.concat([fruit,weight], axis=1)

In [182]:
df_frukt.columns = ['name','weight']

In [185]:
df_frukt.groupby(['name']).agg({'weight':'mean'}).reset_index()

Unnamed: 0,name,weight
0,apple,4.0
1,banana,5.0
2,carrot,7.25


In [186]:
df_frukt

Unnamed: 0,name,weight
0,apple,1.0
1,apple,2.0
2,carrot,3.0
3,banana,4.0
4,apple,5.0
5,banana,6.0
6,carrot,7.0
7,apple,8.0
8,carrot,9.0
9,carrot,10.0


27. How to compute the euclidean distance between two series
Compute the euclidean distance between series (points) p and q, without using a packaged formula

In [187]:
p = pd.Series([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
q = pd.Series([10, 9, 8, 7, 6, 5, 4, 3, 2, 1])

In [188]:
# Solution
sum((p - q)**2)**.5

18.16590212458495

28. How to find all the local maxima (or peaks) in a numeric series
Get the positions of peaks (values surrounded by smaller values on both sides) in ser.

In [190]:
ser = pd.Series([2, 10, 3, 4, 9, 10, 2, 7, 3])


In [197]:
df_ser = ser.to_frame()

In [199]:
df_ser.columns = {'original'}

In [201]:
df_ser['next'] = df_ser['original'].shift(1)

In [203]:
df_ser['prev'] = df_ser['original'].shift(-1)

In [204]:
df_ser.head(3)

Unnamed: 0,original,next,prev
0,2,,10.0
1,10,2.0,3.0
2,3,10.0,4.0


In [208]:
df_ser.query('original>next and original>prev')

Unnamed: 0,original,next,prev
1,10,2.0,3.0
5,10,9.0,2.0
7,7,2.0,3.0


In [209]:
df_ser.query('original>next and original>prev').index.to_list()


[1, 5, 7]

29. How to replace missing spaces in a string with the least frequent character

In [229]:
my_str = 'dbc decb abed ggade'


In [239]:
def lest_freq(my_str):
    arr = list(my_str)
    #build dict with letter encounter
    char_dic = {}
    for c in arr:
        if c in char_dic:
            char_dic[c] =  char_dic[c]+1
        else:
            char_dic[c] = 1

    #find least value
    least_val = len(char_dic)+2
    for letter in char_dic:
        if char_dic[letter] < least_val:
            least_val= char_dic[letter]

    return letter

In [240]:
lest_freq(my_str)

'g'

In [241]:
def replace(my_str):
    rep_char = lest_freq(my_str)
    my_str2 = my_str.replace(' ',rep_char)
    return my_str2

30. How to create a TimeSeries starting ‘2000-01-01’ and 10 weekends (saturdays) after that having random numbers as values

In [244]:
ser = pd.Series(np.random.randint(1,10,10), pd.date_range('2000-01-01', periods=10, freq='W-SAT'))


In [245]:
ser

2000-01-01    6
2000-01-08    9
2000-01-15    8
2000-01-22    4
2000-01-29    9
2000-02-05    3
2000-02-12    6
2000-02-19    6
2000-02-26    8
2000-03-04    5
Freq: W-SAT, dtype: int32

31. How to fill an intermittent time series so all missing dates show up with values of previous non-missing date

In [246]:
ser = pd.Series([1,10,3,np.nan], index=pd.to_datetime(['2000-01-01', '2000-01-03', '2000-01-06', '2000-01-08']))

In [248]:
df_ser = ser.to_frame()

In [249]:
df_ser.columns = ['val']

In [251]:
df_ser['prev'] = df_ser['val'].shift(-1)

In [254]:
ser.resample('D').ffill()

2000-01-01     1.0
2000-01-02     1.0
2000-01-03    10.0
2000-01-04    10.0
2000-01-05    10.0
2000-01-06     3.0
2000-01-07     3.0
2000-01-08     NaN
Freq: D, dtype: float64