---

_You are currently looking at **version 1.0** of this notebook. To download notebooks and datafiles, as well as get help on Jupyter notebooks in the Coursera platform, visit the [Jupyter Notebook FAQ](https://www.coursera.org/learn/python-text-mining/resources/d9pwm) course resource._

---

# Working with Text Data in pandas

In [2]:
import pandas as pd

# Five sample sentences, one per weekday, each mentioning at least one clock time.
time_sentences = [
    "Monday: The doctor's appointment is at 2:45pm.",
    "Tuesday: The dentist's appointment is at 11:30 am.",
    "Wednesday: At 7:00pm, there is a basketball game!",
    "Thursday: Be back home by 11:15 pm at the latest.",
    "Friday: Take the train at 08:10 am, arrive at 09:00am.",
]

# Single-column frame: the sentences live in the 'text' column.
df = pd.DataFrame({'text': time_sentences})
df

Unnamed: 0,text
0,Monday: The doctor's appointment is at 2:45pm.
1,Tuesday: The dentist's appointment is at 11:30...
2,"Wednesday: At 7:00pm, there is a basketball game!"
3,Thursday: Be back home by 11:15 pm at the latest.
4,"Friday: Take the train at 08:10 am, arrive at ..."


In [3]:
# selecting a single column from the DataFrame yields a pandas Series
type(df['text'])

pandas.core.series.Series

In [4]:
# find the number of characters for each string in df['text']
# (.str applies the string method element-wise and returns a new Series)
df['text'].str.len()

0    46
1    50
2    49
3    49
4    54
Name: text, dtype: int64

In [5]:
len(df['text'])  # number of elements (rows) in the 'text' column

5

In [6]:
# find the number of tokens for each string in df['text']
# (split() with no arguments splits on whitespace, so punctuation stays attached to its word)
df['text'].str.split().str.len()

0     7
1     8
2     8
3    10
4    10
Name: text, dtype: int64

In [7]:
# find which entries contain the word 'appointment'
# (this is a substring test, so e.g. 'appointments' would also count)
df['text'].str.contains('appointment')

0     True
1     True
2    False
3    False
4    False
Name: text, dtype: bool

In [8]:
# count how many digit characters (\d) occur in each string
df['text'].str.count(r'\d')

0    3
1    4
2    3
3    4
4    8
Name: text, dtype: int64

In [9]:
# find all occurrences of digits; each digit is returned as its own one-character string
df['text'].str.findall(r'\d')

0                   [2, 4, 5]
1                [1, 1, 3, 0]
2                   [7, 0, 0]
3                [1, 1, 1, 5]
4    [0, 8, 1, 0, 0, 9, 0, 0]
Name: text, dtype: object

In [10]:
# group and find the hours and minutes
# ? makes the preceding element optional (zero or one occurrence), so the hour may be one or two digits;
# because the pattern has two capture groups, each match is returned as an (hour, minute) tuple
df['text'].str.findall(r'(\d?\d):(\d\d)')

0               [(2, 45)]
1              [(11, 30)]
2               [(7, 00)]
3              [(11, 15)]
4    [(08, 10), (09, 00)]
Name: text, dtype: object

In [11]:
# find the weekday name at the start of each string
# (.+ is greedy, so this only works because 'day' appears just once in each sentence)
df['text'].str.findall(r'.+day')

0       [Monday]
1      [Tuesday]
2    [Wednesday]
3     [Thursday]
4       [Friday]
Name: text, dtype: object

In [12]:
# replace weekdays with '???'
# regex=True is passed explicitly: since pandas 2.0, str.replace treats the
# pattern as a literal string by default and would leave the text unchanged.
df['text'].str.replace(r'\w+day\b', '???', regex=True)

0          ???: The doctor's appointment is at 2:45pm.
1       ???: The dentist's appointment is at 11:30 am.
2          ???: At 7:00pm, there is a basketball game!
3         ???: Be back home by 11:15 pm at the latest.
4    ???: Take the train at 08:10 am, arrive at 09:...
Name: text, dtype: object

In [13]:
 # str.replace returned a new Series; the original column is unchanged
 df['text'][:3]

0       Monday: The doctor's appointment is at 2:45pm.
1    Tuesday: The dentist's appointment is at 11:30...
2    Wednesday: At 7:00pm, there is a basketball game!
Name: text, dtype: object

In [14]:
# replace weekdays with 3 letter abbreviations
# The callable receives each regex match object and returns the replacement text.
# regex=True is required: a callable replacement raises ValueError when regex=False,
# which is the default since pandas 2.0.
df['text'].str.replace(r'(\w+day\b)', lambda m: m.group(1)[:3], regex=True)

0          Mon: The doctor's appointment is at 2:45pm.
1       Tue: The dentist's appointment is at 11:30 am.
2          Wed: At 7:00pm, there is a basketball game!
3         Thu: Be back home by 11:15 pm at the latest.
4    Fri: Take the train at 08:10 am, arrive at 09:...
Name: text, dtype: object

In [15]:
# create new columns from first match of extracted groups
# (one column per capture group; later matches in the same string are ignored — see row 4)
df['text'].str.extract(r'(\d?\d):(\d\d)')

  


Unnamed: 0,0,1
0,2,45
1,11,30
2,7,0
3,11,15
4,8,10


In [16]:
# extract the entire time, the hours, the minutes, and the period
# (extractall returns one row per match, indexed by original row and match number;
#  ' ?' allows an optional space before am/pm)
df['text'].str.extractall(r'((\d?\d):(\d\d) ?([ap]m))')

Unnamed: 0_level_0,Unnamed: 1_level_0,0,1,2,3
Unnamed: 0_level_1,match,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0,2:45pm,2,45,pm
1,0,11:30 am,11,30,am
2,0,7:00pm,7,0,pm
3,0,11:15 pm,11,15,pm
4,0,08:10 am,8,10,am
4,1,09:00am,9,0,am


In [17]:
# extract the entire time, the hours, the minutes, and the period with group names
# ((?P<name>...) names a capture group, and the names become the column labels)
df['text'].str.extractall(r'(?P<time>(?P<hour>\d?\d):(?P<minute>\d\d) ?(?P<period>[ap]m))')

Unnamed: 0_level_0,Unnamed: 1_level_0,time,hour,minute,period
Unnamed: 0_level_1,match,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0,2:45pm,2,45,pm
1,0,11:30 am,11,30,am
2,0,7:00pm,7,0,pm
3,0,11:15 pm,11,15,pm
4,0,08:10 am,8,10,am
4,1,09:00am,9,0,am


## More Regex Practice

In [18]:
# practice sentence: many tokens share the 'kh' prefix and differ only slightly
s = "Khadak singh ke khadakne se khadakati hain khidkiya, Khidkiyon ke khadakane se khadakata hain khadak singh"

In [19]:
import re

# ^ anchors the match at the start of the string; matching is case-sensitive, so 'Khadak' is not selected
[t for t in s.split() if re.search('^kh',t)]

['khadakne', 'khadakati', 'khidkiya,', 'khadakane', 'khadakata', 'khadak']

In [20]:
# $ anchors the match at the end of the string
[t for t in s.split() if re.search('n$',t)]

['hain', 'Khidkiyon', 'hain']

In [21]:
# . matches any single character (except a newline), so '.n' needs an 'n' that is not the first character.
# Note: this corresponds to ? in a shell glob, not to * — the regex equivalent of the glob * is '.*'.
[t for t in s.split() if re.search('.n',t)]

['singh', 'khadakne', 'hain', 'Khidkiyon', 'khadakane', 'hain', 'singh']

In [22]:
# 'n.' needs an 'n' followed by at least one more character, so tokens ending in 'n' are excluded
[t for t in s.split() if re.search('n.',t)]

['singh', 'khadakne', 'khadakane', 'singh']

In [23]:
# a comma preceded by any one character; re.search('[,]', t) is similar but would also accept a leading comma
[t for t in s.split() if re.search('.,',t)]

['khidkiya,']

In [24]:
# [...] is a character class: it matches any single character from the set (here an uppercase letter A to N)
[t for t in s.split() if re.search('[A-N]',t)]

['Khadak', 'Khidkiyon']

In [25]:
# [^...] is a negated character class: it matches any single character NOT in the set,
# so this selects tokens containing at least one character other than a lowercase letter
[t for t in s.split() if re.search('[^a-z]',t)]

['Khadak', 'khidkiya,', 'Khidkiyon']

In [26]:
# a|b - alternation: matches if either pattern a or pattern b is found
[t for t in s.split() if re.search('singh|khadak',t)]

['singh', 'khadakne', 'khadakati', 'khadakane', 'khadakata', 'khadak', 'singh']

In [27]:
# * - zero or more repetitions of the preceding element (here '.', i.e. any characters between 'kh' and a later 'k')
[t for t in s.split() if re.search('kh.*k',t)]

['khadakne', 'khadakati', 'khidkiya,', 'khadakane', 'khadakata', 'khadak']

In [28]:
# 'kha', then zero or more 'i' characters, then any one character (searched anywhere in the token)
[t for t in s.split() if re.search('khai*.',t)]

['khadakne', 'khadakati', 'khadakane', 'khadakata', 'khadak']

In [29]:
# + - one or more repetitions of the preceding character (here 'a')
[t for t in s.split() if re.search('kha+',t)]

['khadakne', 'khadakati', 'khadakane', 'khadakata', 'khadak']

In [30]:
# 'kh' followed by at least one 'i'
[t for t in s.split() if re.search('khi+',t)]

['khidkiya,']

In [31]:
# ? - zero or one occurrence of the preceding character; 'i' is optional here, so any token containing 'kh' matches
[t for t in s.split() if re.search('khi?',t)]

['khadakne', 'khadakati', 'khidkiya,', 'khadakane', 'khadakata', 'khadak']

In [32]:
# {n}   - exactly n repetitions
# {n,m} - at least n and at most m repetitions
# {n,}  - at least n repetitions
# {,m}  - at most m repetitions

In [33]:
# {1} asks for exactly one 'h' after 'K', so this is equivalent to searching for 'Kh'
[t for t in s.split() if re.search('Kh{1}',t)]

['Khadak', 'Khidkiyon']

In [34]:
import datetime
# current local date and time; the value (and every output derived from it) changes on each run
date_1 = datetime.datetime.now()

date_1

datetime.datetime(2019, 4, 25, 7, 17, 53, 390686)

In [35]:
# Build 'day-month-year' and 'hour:minute:second' strings from date_1.
# The numeric fields are converted with str(), so they are not zero-padded.
# NOTE: the name `time` would shadow the stdlib `time` module if that were imported.
today = '-'.join(str(part) for part in (date_1.day, date_1.month, date_1.year))
time = ':'.join(str(part) for part in (date_1.hour, date_1.minute, date_1.second))

In [36]:
# day-month-year; the parts are not zero-padded
today

'25-4-2019'

In [37]:
# hour:minute:second; the parts are not zero-padded
time

'7:17:53'

In [38]:
# \d - matches any single digit, 0 to 9.                                  ~[0-9]
# \D - matches any single non-digit character.                            ~[^0-9]
# \s - matches any whitespace character: space, tab (\t), newline (\n), \r, \f or \v.
# \S - matches any non-whitespace character (the opposite of \s, just as \D is the opposite of \d).
# \w - matches any alphanumeric character or underscore.                  ~[a-zA-Z0-9_]
# \W - matches any character that is not alphanumeric or underscore.      ~[^a-zA-Z0-9_]

In [39]:
# test sentence: words containing digits, separated by spaces, a comma and periods
try_test = 'Ever tri9d, E7er fa1led. T2y a9ai7 6ai1 a8a1n.'

In [40]:
# Split the sentence into comma-separated clauses, then split each trimmed
# clause on '.'. The result is a list of lists; splitting on '.' leaves an
# empty string after a trailing period.
t1 = try_test.split(',')
t2 = [clause.strip().split('.') for clause in t1]
t2

[['Ever tri9d'], ['E7er fa1led', ' T2y a9ai7 6ai1 a8a1n', '']]

In [41]:
# Flatten t2 (a list of lists of fragments) into one list of
# whitespace-separated tokens. fragment.split() returns [] for empty
# fragments, so they contribute nothing.
t3 = []
for fragments in t2:
    for fragment in fragments:
        t3.extend(fragment.split())

In [42]:
# custom split function to split a string on (./,/'')
def cust_split(st):
    """Split a string into tokens on commas, periods and whitespace.

    The string is cut at every ',', each piece is cut at every '.', and each
    resulting fragment is split on runs of whitespace. Empty fragments yield
    no tokens, so consecutive or trailing separators are ignored.

    Parameters
    ----------
    st : str
        The text to tokenise.

    Returns
    -------
    list of str
        The tokens in their original order; an empty list if `st` contains
        nothing but separators.
    """
    # fragment.split() already discards surrounding whitespace, so no
    # explicit strip() is needed on the comma-separated clauses.
    return [
        token
        for clause in st.split(',')
        for fragment in clause.split('.')
        for token in fragment.split()
    ]

In [43]:
# should give the same tokens as the t3 list built by hand above
cust_split(try_test)

['Ever', 'tri9d', 'E7er', 'fa1led', 'T2y', 'a9ai7', '6ai1', 'a8a1n']

In [44]:
# add one more digit-free token
# NOTE: this extends t3 in place, so re-running the cell appends another 'Ram'
t3 += ['Ram']

In [45]:
# tokens of try_test plus the extra 'Ram'
t3

['Ever', 'tri9d', 'E7er', 'fa1led', 'T2y', 'a9ai7', '6ai1', 'a8a1n', 'Ram']

In [46]:
# \d matches any single digit, 0 to 9 (~[0-9]); keep the tokens that contain at least one digit.
# Raw string: '\d' in a normal string literal is an invalid escape sequence.
[t for t in t3 if re.search(r'\d', t)]

['tri9d', 'E7er', 'fa1led', 'T2y', 'a9ai7', '6ai1', 'a8a1n']

In [52]:
# [^\d] matches any single non-digit character (same as \D), so a token is kept
# as soon as it contains at least one non-digit — here that is every token.
# Raw string: '\d' in a normal string literal is an invalid escape sequence.
[t for t in t3 if re.search(r'[^\d]', t)]

['Ever', 'tri9d', 'E7er', 'fa1led', 'T2y', 'a9ai7', '6ai1', 'a8a1n', 'Ram']

In [48]:
# \s matches any whitespace character: space, tab, newline, \r, \f or \v.
# The pattern is a raw string so the backslash reaches the regex engine
# untouched ('\s' in a normal string literal is an invalid escape sequence).
t4 = ['\tMohan', '  ', 'Gopal\n', 'Shyam']
[t for t in t4 if re.search(r'[\s]', t)]

['\tMohan', '  ', 'Gopal\n']

In [49]:
# \S matches any non-whitespace character, so only the all-whitespace entry is dropped.
# Raw string: '\S' in a normal string literal is an invalid escape sequence.
[t for t in t4 if re.search(r'[\S]', t)]

['\tMohan', 'Gopal\n', 'Shyam']