## [Working with Text Data in Pandas](https://pandas.pydata.org/pandas-docs/stable/user_guide/text.html)

In [43]:
import pandas as pd
import numpy as np

import re

In [3]:
s = pd.Series(['A','B','C','Asba','Baca',np.nan, 'CABA','cat'])

In [4]:
s

0       A
1       B
2       C
3    Asba
4    Baca
5     NaN
6    CABA
7     cat
dtype: object

In [6]:
s.str.upper()

0       A
1       B
2       C
3    ASBA
4    BACA
5     NaN
6    CABA
7     CAT
dtype: object

In [8]:
s.str.lower()

0       a
1       b
2       c
3    asba
4    baca
5     NaN
6    caba
7     cat
dtype: object

In [10]:
s.str.len()

0    1.0
1    1.0
2    1.0
3    4.0
4    4.0
5    NaN
6    4.0
7    3.0
dtype: float64

In [13]:
idx = pd.Index([' jack','jill ',' jesse ','frank'])

In [15]:
idx.str.strip()

Index(['jack', 'jill', 'jesse', 'frank'], dtype='object')

In [21]:
df = pd.DataFrame(data = np.random.randn(3,2),\
        columns=[' Column A ', ' Column B '], index = range(3))

In [22]:
df.head()

Unnamed: 0,Column A,Column B
0,0.013063,-1.026454
1,1.255923,0.266052
2,1.045978,-1.205609


In [24]:
df.columns.str.strip()

Index(['Column A', 'Column B'], dtype='object')

In [26]:
df.columns.str.lower()

Index([' column a ', ' column b '], dtype='object')

In [27]:
 df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_')

In [29]:
df

Unnamed: 0,column_a,column_b
0,0.013063,-1.026454
1,1.255923,0.266052
2,1.045978,-1.205609


### Splitting and Replacing Strings

In [30]:
s2 = pd.Series(['a_b_c', 'c_d_e', np.nan, 'f_g_h'])

In [31]:
s2

0    a_b_c
1    c_d_e
2      NaN
3    f_g_h
dtype: object

In [33]:
s2.str.split('_')

0    [a, b, c]
1    [c, d, e]
2          NaN
3    [f, g, h]
dtype: object

In [35]:
s2.str.split('_').get(1)

['c', 'd', 'e']

In [36]:
s2.str.split('_',expand = True)

Unnamed: 0,0,1,2
0,a,b,c
1,c,d,e
2,,,
3,f,g,h


In [39]:
s2.str.split('_',expand = True, n = 1)

Unnamed: 0,0,1
0,a,b_c
1,c,d_e
2,,
3,f,g_h


In [54]:
tmp = 'Pragya is an artist'

In [58]:
pattern = re.compile(r'artist')
res = re.finditer(pattern,tmp)
for tomato in res:
    print(tomato)

<_sre.SRE_Match object; span=(13, 19), match='artist'>


In [41]:
s3 = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', '', np.nan, 'CABA', 'dog', 'cat'])

In [42]:
s3

0       A
1       B
2       C
3    Aaba
4    Baca
5        
6     NaN
7    CABA
8     dog
9     cat
dtype: object

In [65]:
s3.str.replace('^.a|dog', 'XX-XX ', case=False)

0           A
1           B
2           C
3    XX-XX ba
4    XX-XX ca
5            
6         NaN
7    XX-XX BA
8      XX-XX 
9     XX-XX t
dtype: object

#### Be careful using $

In [66]:
dollars = pd.Series(['12', '-$10', '$10,000'])

In [67]:
dollars

0         12
1       -$10
2    $10,000
dtype: object

In [69]:
dollars.str.replace('$','')

0        12
1       -10
2    10,000
dtype: object

In [70]:
dollars.str.replace('-\$','')

0         12
1         10
2    $10,000
dtype: object

<span style='color:red'> This creates problem at it searches for $ at end</span>

In [71]:
dollars.str.replace('-$','')

0         12
1       -$10
2    $10,000
dtype: object

### Concatenation

In [73]:
s = pd.Series(['a', 'b', 'c', 'd'])

In [75]:
s.str.cat(sep='_')

'a_b_c_d'

### Extracting Substrings

In [83]:
pd.Series(['a1', 'b2', 'c3']).str.extract(r'([ab])(\d)')

Unnamed: 0,0,1
0,a,1.0
1,b,2.0
2,,


In [84]:
 pd.Series(['a1', 'b2', 'c3']).str.extract(r'(?P<letter>[ab])(?P<digit>\d)')

Unnamed: 0,letter,digit
0,a,1.0
1,b,2.0
2,,


## Method Summary

<table border="1" class="colwidths-given docutils" id="text-summary">
<colgroup>
<col width="20%">
<col width="80%">
</colgroup>
<thead valign="bottom">
<tr class="row-odd"><th class="head">Method</th>
<th class="head">Description</th>
</tr>
</thead>
<tbody valign="top">
<tr class="row-even"><td><a class="reference internal" href="../reference/api/pandas.Series.str.cat.html#pandas.Series.str.cat" title="pandas.Series.str.cat"><code class="xref py py-meth docutils literal notranslate"><span class="pre">cat()</span></code></a></td>
<td>Concatenate strings</td>
</tr>
<tr class="row-odd"><td><a class="reference internal" href="../reference/api/pandas.Series.str.split.html#pandas.Series.str.split" title="pandas.Series.str.split"><code class="xref py py-meth docutils literal notranslate"><span class="pre">split()</span></code></a></td>
<td>Split strings on delimiter</td>
</tr>
<tr class="row-even"><td><a class="reference internal" href="../reference/api/pandas.Series.str.rsplit.html#pandas.Series.str.rsplit" title="pandas.Series.str.rsplit"><code class="xref py py-meth docutils literal notranslate"><span class="pre">rsplit()</span></code></a></td>
<td>Split strings on delimiter working from the end of the string</td>
</tr>
<tr class="row-odd"><td><a class="reference internal" href="../reference/api/pandas.Series.str.get.html#pandas.Series.str.get" title="pandas.Series.str.get"><code class="xref py py-meth docutils literal notranslate"><span class="pre">get()</span></code></a></td>
<td>Index into each element (retrieve i-th element)</td>
</tr>
<tr class="row-even"><td><a class="reference internal" href="../reference/api/pandas.Series.str.join.html#pandas.Series.str.join" title="pandas.Series.str.join"><code class="xref py py-meth docutils literal notranslate"><span class="pre">join()</span></code></a></td>
<td>Join strings in each element of the Series with passed separator</td>
</tr>
<tr class="row-odd"><td><a class="reference internal" href="../reference/api/pandas.Series.str.get_dummies.html#pandas.Series.str.get_dummies" title="pandas.Series.str.get_dummies"><code class="xref py py-meth docutils literal notranslate"><span class="pre">get_dummies()</span></code></a></td>
<td>Split strings on the delimiter returning DataFrame of dummy variables</td>
</tr>
<tr class="row-even"><td><a class="reference internal" href="../reference/api/pandas.Series.str.contains.html#pandas.Series.str.contains" title="pandas.Series.str.contains"><code class="xref py py-meth docutils literal notranslate"><span class="pre">contains()</span></code></a></td>
<td>Return boolean array if each string contains pattern/regex</td>
</tr>
<tr class="row-odd"><td><a class="reference internal" href="../reference/api/pandas.Series.str.replace.html#pandas.Series.str.replace" title="pandas.Series.str.replace"><code class="xref py py-meth docutils literal notranslate"><span class="pre">replace()</span></code></a></td>
<td>Replace occurrences of pattern/regex/string with some other string or the return value of a callable given the occurrence</td>
</tr>
<tr class="row-even"><td><a class="reference internal" href="../reference/api/pandas.Series.str.repeat.html#pandas.Series.str.repeat" title="pandas.Series.str.repeat"><code class="xref py py-meth docutils literal notranslate"><span class="pre">repeat()</span></code></a></td>
<td>Duplicate values (<code class="docutils literal notranslate"><span class="pre">s.str.repeat(3)</span></code> equivalent to <code class="docutils literal notranslate"><span class="pre">x</span> <span class="pre">*</span> <span class="pre">3</span></code>)</td>
</tr>
<tr class="row-odd"><td><a class="reference internal" href="../reference/api/pandas.Series.str.pad.html#pandas.Series.str.pad" title="pandas.Series.str.pad"><code class="xref py py-meth docutils literal notranslate"><span class="pre">pad()</span></code></a></td>
<td>Add whitespace to left, right, or both sides of strings</td>
</tr>
<tr class="row-even"><td><a class="reference internal" href="../reference/api/pandas.Series.str.center.html#pandas.Series.str.center" title="pandas.Series.str.center"><code class="xref py py-meth docutils literal notranslate"><span class="pre">center()</span></code></a></td>
<td>Equivalent to <code class="docutils literal notranslate"><span class="pre">str.center</span></code></td>
</tr>
<tr class="row-odd"><td><a class="reference internal" href="../reference/api/pandas.Series.str.ljust.html#pandas.Series.str.ljust" title="pandas.Series.str.ljust"><code class="xref py py-meth docutils literal notranslate"><span class="pre">ljust()</span></code></a></td>
<td>Equivalent to <code class="docutils literal notranslate"><span class="pre">str.ljust</span></code></td>
</tr>
<tr class="row-even"><td><a class="reference internal" href="../reference/api/pandas.Series.str.rjust.html#pandas.Series.str.rjust" title="pandas.Series.str.rjust"><code class="xref py py-meth docutils literal notranslate"><span class="pre">rjust()</span></code></a></td>
<td>Equivalent to <code class="docutils literal notranslate"><span class="pre">str.rjust</span></code></td>
</tr>
<tr class="row-odd"><td><a class="reference internal" href="../reference/api/pandas.Series.str.zfill.html#pandas.Series.str.zfill" title="pandas.Series.str.zfill"><code class="xref py py-meth docutils literal notranslate"><span class="pre">zfill()</span></code></a></td>
<td>Equivalent to <code class="docutils literal notranslate"><span class="pre">str.zfill</span></code></td>
</tr>
<tr class="row-even"><td><a class="reference internal" href="../reference/api/pandas.Series.str.wrap.html#pandas.Series.str.wrap" title="pandas.Series.str.wrap"><code class="xref py py-meth docutils literal notranslate"><span class="pre">wrap()</span></code></a></td>
<td>Split long strings into lines with length less than a given width</td>
</tr>
<tr class="row-odd"><td><a class="reference internal" href="../reference/api/pandas.Series.str.slice.html#pandas.Series.str.slice" title="pandas.Series.str.slice"><code class="xref py py-meth docutils literal notranslate"><span class="pre">slice()</span></code></a></td>
<td>Slice each string in the Series</td>
</tr>
<tr class="row-even"><td><a class="reference internal" href="../reference/api/pandas.Series.str.slice_replace.html#pandas.Series.str.slice_replace" title="pandas.Series.str.slice_replace"><code class="xref py py-meth docutils literal notranslate"><span class="pre">slice_replace()</span></code></a></td>
<td>Replace slice in each string with passed value</td>
</tr>
<tr class="row-odd"><td><a class="reference internal" href="../reference/api/pandas.Series.str.count.html#pandas.Series.str.count" title="pandas.Series.str.count"><code class="xref py py-meth docutils literal notranslate"><span class="pre">count()</span></code></a></td>
<td>Count occurrences of pattern</td>
</tr>
<tr class="row-even"><td><a class="reference internal" href="../reference/api/pandas.Series.str.startswith.html#pandas.Series.str.startswith" title="pandas.Series.str.startswith"><code class="xref py py-meth docutils literal notranslate"><span class="pre">startswith()</span></code></a></td>
<td>Equivalent to <code class="docutils literal notranslate"><span class="pre">str.startswith(pat)</span></code> for each element</td>
</tr>
<tr class="row-odd"><td><a class="reference internal" href="../reference/api/pandas.Series.str.endswith.html#pandas.Series.str.endswith" title="pandas.Series.str.endswith"><code class="xref py py-meth docutils literal notranslate"><span class="pre">endswith()</span></code></a></td>
<td>Equivalent to <code class="docutils literal notranslate"><span class="pre">str.endswith(pat)</span></code> for each element</td>
</tr>
<tr class="row-even"><td><a class="reference internal" href="../reference/api/pandas.Series.str.findall.html#pandas.Series.str.findall" title="pandas.Series.str.findall"><code class="xref py py-meth docutils literal notranslate"><span class="pre">findall()</span></code></a></td>
<td>Compute list of all occurrences of pattern/regex for each string</td>
</tr>
<tr class="row-odd"><td><a class="reference internal" href="../reference/api/pandas.Series.str.match.html#pandas.Series.str.match" title="pandas.Series.str.match"><code class="xref py py-meth docutils literal notranslate"><span class="pre">match()</span></code></a></td>
<td>Call <code class="docutils literal notranslate"><span class="pre">re.match</span></code> on each element, returning matched groups as list</td>
</tr>
<tr class="row-even"><td><a class="reference internal" href="../reference/api/pandas.Series.str.extract.html#pandas.Series.str.extract" title="pandas.Series.str.extract"><code class="xref py py-meth docutils literal notranslate"><span class="pre">extract()</span></code></a></td>
<td>Call <code class="docutils literal notranslate"><span class="pre">re.search</span></code> on each element, returning DataFrame with one row for each element and one column for each regex capture group</td>
</tr>
<tr class="row-odd"><td><a class="reference internal" href="../reference/api/pandas.Series.str.extractall.html#pandas.Series.str.extractall" title="pandas.Series.str.extractall"><code class="xref py py-meth docutils literal notranslate"><span class="pre">extractall()</span></code></a></td>
<td>Call <code class="docutils literal notranslate"><span class="pre">re.findall</span></code> on each element, returning DataFrame with one row for each match and one column for each regex capture group</td>
</tr>
<tr class="row-even"><td><a class="reference internal" href="../reference/api/pandas.Series.str.len.html#pandas.Series.str.len" title="pandas.Series.str.len"><code class="xref py py-meth docutils literal notranslate"><span class="pre">len()</span></code></a></td>
<td>Compute string lengths</td>
</tr>
<tr class="row-odd"><td><a class="reference internal" href="../reference/api/pandas.Series.str.strip.html#pandas.Series.str.strip" title="pandas.Series.str.strip"><code class="xref py py-meth docutils literal notranslate"><span class="pre">strip()</span></code></a></td>
<td>Equivalent to <code class="docutils literal notranslate"><span class="pre">str.strip</span></code></td>
</tr>
<tr class="row-even"><td><a class="reference internal" href="../reference/api/pandas.Series.str.rstrip.html#pandas.Series.str.rstrip" title="pandas.Series.str.rstrip"><code class="xref py py-meth docutils literal notranslate"><span class="pre">rstrip()</span></code></a></td>
<td>Equivalent to <code class="docutils literal notranslate"><span class="pre">str.rstrip</span></code></td>
</tr>
<tr class="row-odd"><td><a class="reference internal" href="../reference/api/pandas.Series.str.lstrip.html#pandas.Series.str.lstrip" title="pandas.Series.str.lstrip"><code class="xref py py-meth docutils literal notranslate"><span class="pre">lstrip()</span></code></a></td>
<td>Equivalent to <code class="docutils literal notranslate"><span class="pre">str.lstrip</span></code></td>
</tr>
<tr class="row-even"><td><a class="reference internal" href="../reference/api/pandas.Series.str.partition.html#pandas.Series.str.partition" title="pandas.Series.str.partition"><code class="xref py py-meth docutils literal notranslate"><span class="pre">partition()</span></code></a></td>
<td>Equivalent to <code class="docutils literal notranslate"><span class="pre">str.partition</span></code></td>
</tr>
<tr class="row-odd"><td><a class="reference internal" href="../reference/api/pandas.Series.str.rpartition.html#pandas.Series.str.rpartition" title="pandas.Series.str.rpartition"><code class="xref py py-meth docutils literal notranslate"><span class="pre">rpartition()</span></code></a></td>
<td>Equivalent to <code class="docutils literal notranslate"><span class="pre">str.rpartition</span></code></td>
</tr>
<tr class="row-even"><td><a class="reference internal" href="../reference/api/pandas.Series.str.lower.html#pandas.Series.str.lower" title="pandas.Series.str.lower"><code class="xref py py-meth docutils literal notranslate"><span class="pre">lower()</span></code></a></td>
<td>Equivalent to <code class="docutils literal notranslate"><span class="pre">str.lower</span></code></td>
</tr>
<tr class="row-odd"><td><a class="reference internal" href="../reference/api/pandas.Series.str.upper.html#pandas.Series.str.upper" title="pandas.Series.str.upper"><code class="xref py py-meth docutils literal notranslate"><span class="pre">upper()</span></code></a></td>
<td>Equivalent to <code class="docutils literal notranslate"><span class="pre">str.upper</span></code></td>
</tr>
<tr class="row-even"><td><a class="reference internal" href="../reference/api/pandas.Series.str.find.html#pandas.Series.str.find" title="pandas.Series.str.find"><code class="xref py py-meth docutils literal notranslate"><span class="pre">find()</span></code></a></td>
<td>Equivalent to <code class="docutils literal notranslate"><span class="pre">str.find</span></code></td>
</tr>
<tr class="row-odd"><td><a class="reference internal" href="../reference/api/pandas.Series.str.rfind.html#pandas.Series.str.rfind" title="pandas.Series.str.rfind"><code class="xref py py-meth docutils literal notranslate"><span class="pre">rfind()</span></code></a></td>
<td>Equivalent to <code class="docutils literal notranslate"><span class="pre">str.rfind</span></code></td>
</tr>
<tr class="row-even"><td><a class="reference internal" href="../reference/api/pandas.Series.str.index.html#pandas.Series.str.index" title="pandas.Series.str.index"><code class="xref py py-meth docutils literal notranslate"><span class="pre">index()</span></code></a></td>
<td>Equivalent to <code class="docutils literal notranslate"><span class="pre">str.index</span></code></td>
</tr>
<tr class="row-odd"><td><a class="reference internal" href="../reference/api/pandas.Series.str.rindex.html#pandas.Series.str.rindex" title="pandas.Series.str.rindex"><code class="xref py py-meth docutils literal notranslate"><span class="pre">rindex()</span></code></a></td>
<td>Equivalent to <code class="docutils literal notranslate"><span class="pre">str.rindex</span></code></td>
</tr>
<tr class="row-even"><td><a class="reference internal" href="../reference/api/pandas.Series.str.capitalize.html#pandas.Series.str.capitalize" title="pandas.Series.str.capitalize"><code class="xref py py-meth docutils literal notranslate"><span class="pre">capitalize()</span></code></a></td>
<td>Equivalent to <code class="docutils literal notranslate"><span class="pre">str.capitalize</span></code></td>
</tr>
<tr class="row-odd"><td><a class="reference internal" href="../reference/api/pandas.Series.str.swapcase.html#pandas.Series.str.swapcase" title="pandas.Series.str.swapcase"><code class="xref py py-meth docutils literal notranslate"><span class="pre">swapcase()</span></code></a></td>
<td>Equivalent to <code class="docutils literal notranslate"><span class="pre">str.swapcase</span></code></td>
</tr>
<tr class="row-even"><td><a class="reference internal" href="../reference/api/pandas.Series.str.normalize.html#pandas.Series.str.normalize" title="pandas.Series.str.normalize"><code class="xref py py-meth docutils literal notranslate"><span class="pre">normalize()</span></code></a></td>
<td>Return Unicode normal form. Equivalent to <code class="docutils literal notranslate"><span class="pre">unicodedata.normalize</span></code></td>
</tr>
<tr class="row-odd"><td><a class="reference internal" href="../reference/api/pandas.Series.str.translate.html#pandas.Series.str.translate" title="pandas.Series.str.translate"><code class="xref py py-meth docutils literal notranslate"><span class="pre">translate()</span></code></a></td>
<td>Equivalent to <code class="docutils literal notranslate"><span class="pre">str.translate</span></code></td>
</tr>
<tr class="row-even"><td><a class="reference internal" href="../reference/api/pandas.Series.str.isalnum.html#pandas.Series.str.isalnum" title="pandas.Series.str.isalnum"><code class="xref py py-meth docutils literal notranslate"><span class="pre">isalnum()</span></code></a></td>
<td>Equivalent to <code class="docutils literal notranslate"><span class="pre">str.isalnum</span></code></td>
</tr>
<tr class="row-odd"><td><a class="reference internal" href="../reference/api/pandas.Series.str.isalpha.html#pandas.Series.str.isalpha" title="pandas.Series.str.isalpha"><code class="xref py py-meth docutils literal notranslate"><span class="pre">isalpha()</span></code></a></td>
<td>Equivalent to <code class="docutils literal notranslate"><span class="pre">str.isalpha</span></code></td>
</tr>
<tr class="row-even"><td><a class="reference internal" href="../reference/api/pandas.Series.str.isdigit.html#pandas.Series.str.isdigit" title="pandas.Series.str.isdigit"><code class="xref py py-meth docutils literal notranslate"><span class="pre">isdigit()</span></code></a></td>
<td>Equivalent to <code class="docutils literal notranslate"><span class="pre">str.isdigit</span></code></td>
</tr>
<tr class="row-odd"><td><a class="reference internal" href="../reference/api/pandas.Series.str.isspace.html#pandas.Series.str.isspace" title="pandas.Series.str.isspace"><code class="xref py py-meth docutils literal notranslate"><span class="pre">isspace()</span></code></a></td>
<td>Equivalent to <code class="docutils literal notranslate"><span class="pre">str.isspace</span></code></td>
</tr>
<tr class="row-even"><td><a class="reference internal" href="../reference/api/pandas.Series.str.islower.html#pandas.Series.str.islower" title="pandas.Series.str.islower"><code class="xref py py-meth docutils literal notranslate"><span class="pre">islower()</span></code></a></td>
<td>Equivalent to <code class="docutils literal notranslate"><span class="pre">str.islower</span></code></td>
</tr>
<tr class="row-odd"><td><a class="reference internal" href="../reference/api/pandas.Series.str.isupper.html#pandas.Series.str.isupper" title="pandas.Series.str.isupper"><code class="xref py py-meth docutils literal notranslate"><span class="pre">isupper()</span></code></a></td>
<td>Equivalent to <code class="docutils literal notranslate"><span class="pre">str.isupper</span></code></td>
</tr>
<tr class="row-even"><td><a class="reference internal" href="../reference/api/pandas.Series.str.istitle.html#pandas.Series.str.istitle" title="pandas.Series.str.istitle"><code class="xref py py-meth docutils literal notranslate"><span class="pre">istitle()</span></code></a></td>
<td>Equivalent to <code class="docutils literal notranslate"><span class="pre">str.istitle</span></code></td>
</tr>
<tr class="row-odd"><td><a class="reference internal" href="../reference/api/pandas.Series.str.isnumeric.html#pandas.Series.str.isnumeric" title="pandas.Series.str.isnumeric"><code class="xref py py-meth docutils literal notranslate"><span class="pre">isnumeric()</span></code></a></td>
<td>Equivalent to <code class="docutils literal notranslate"><span class="pre">str.isnumeric</span></code></td>
</tr>
<tr class="row-even"><td><a class="reference internal" href="../reference/api/pandas.Series.str.isdecimal.html#pandas.Series.str.isdecimal" title="pandas.Series.str.isdecimal"><code class="xref py py-meth docutils literal notranslate"><span class="pre">isdecimal()</span></code></a></td>
<td>Equivalent to <code class="docutils literal notranslate"><span class="pre">str.isdecimal</span></code></td>
</tr>
</tbody>
</table>