# String Operations

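Working with text data using the pandas `.str` accessor: case conversion, cleaning, pattern matching, regular expressions, feature extraction, and splitting/joining strings.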


In [1]:
import pandas as pd
import numpy as np

# One text column covering the cases used throughout the notebook: padded
# text, mixed separators, an e-mail address, a phone number and a missing value.
df = pd.DataFrame(
    dict(
        text=[
            '  Hello World  ',
            'Data-Science_101',
            'email:test@example.com',
            'Phone: +91-9876543210',
            None,
        ]
    )
)

df

Unnamed: 0,text
0,Hello World
1,Data-Science_101
2,email:test@example.com
3,Phone: +91-9876543210
4,


## The `.str` accessor

In [2]:
# Lower-case every string; the missing value in row 4 stays missing.
df['text'].str.lower()

0             hello world  
1          data-science_101
2    email:test@example.com
3     phone: +91-9876543210
4                      None
Name: text, dtype: object

In [3]:
# Upper-case every string; the missing value in row 4 stays missing.
df['text'].str.upper()

0             HELLO WORLD  
1          DATA-SCIENCE_101
2    EMAIL:TEST@EXAMPLE.COM
3     PHONE: +91-9876543210
4                      None
Name: text, dtype: object

In [4]:
# Character count per string, surrounding whitespace included (row 0 is 15).
# The missing row yields NaN, which is why the result is float64.
df['text'].str.len()

0    15.0
1    16.0
2    22.0
3    21.0
4     NaN
Name: text, dtype: float64

## String cleaning

In [5]:
# Remove leading and trailing whitespace; only row 0 is padded, so only it changes.
df['text'].str.strip()

0               Hello World
1          Data-Science_101
2    email:test@example.com
3     Phone: +91-9876543210
4                      None
Name: text, dtype: object

In [6]:
# Literal replacement: regex=False makes '-' plain text rather than a pattern.
df['text'].str.replace('-', ' ', regex=False)

0             Hello World  
1          Data Science_101
2    email:test@example.com
3     Phone: +91 9876543210
4                      None
Name: text, dtype: object

In [7]:
# Regex replacement: the character class [_:] matches either an underscore
# or a colon, and each match is replaced by a single space.
df['text'].str.replace(r'[_:]', ' ', regex=True)

0             Hello World  
1          Data-Science 101
2    email test@example.com
3     Phone  +91-9876543210
4                      None
Name: text, dtype: object

## Pattern matching

In [8]:
# Substring test for the literal text 'Data'. regex=False states explicitly
# that the needle is plain text (str.contains treats it as a regular
# expression by default), matching the explicit regex= flags used by the
# replace cells. na=False makes the missing row report False.
df['text'].str.contains('Data', regex=False, na=False)

0    False
1     True
2    False
3    False
4    False
Name: text, dtype: bool

In [9]:
# Prefix test; na=False makes the missing row report False, so the result
# is a plain bool Series.
df['text'].str.startswith('email', na=False)

0    False
1    False
2     True
3    False
4    False
Name: text, dtype: bool

In [10]:
# Suffix test; na=False makes the missing row report False, so the result
# is a plain bool Series.
df['text'].str.endswith('.com', na=False)

0    False
1    False
2     True
3    False
4    False
Name: text, dtype: bool

## Regular expressions

In [11]:
# Collect every run of consecutive digits in each string as a list;
# rows without digits give an empty list, and the missing row stays missing.
df['text'].str.findall(r'\d+')

0                  []
1               [101]
2                  []
3    [91, 9876543210]
4                None
Name: text, dtype: object

In [12]:
# Mask digits: \d matches one digit at a time, so each digit becomes a
# single 'X' and string lengths are preserved.
df['text'].str.replace(r'\d', 'X', regex=True)

0             Hello World  
1          Data-Science_XXX
2    email:test@example.com
3     Phone: +XX-XXXXXXXXXX
4                      None
Name: text, dtype: object

## Extracting text features

In [13]:
# Pull the first e-mail-like token (word characters, dots or hyphens on both
# sides of an "@") out of each row; rows without a match get a missing value.
# expand=False returns a Series for the single capture group, so a plain
# column is assigned instead of relying on pandas coercing a one-column
# DataFrame (the default expand=True result) into a column.
df['email'] = df['text'].str.extract(r'([\w.-]+@[\w.-]+)', expand=False)
df

Unnamed: 0,text,email
0,Hello World,
1,Data-Science_101,
2,email:test@example.com,test@example.com
3,Phone: +91-9876543210,
4,,


In [14]:
# Capture only the FIRST run of digits in each row (for the phone row that
# is "91", not the whole number); rows with no digits get a missing value.
# expand=False returns a Series for the single capture group instead of the
# one-column DataFrame produced by the default expand=True.
# The extracted values are strings, not numbers.
df['digits'] = df['text'].str.extract(r'(\d+)', expand=False)
df

Unnamed: 0,text,email,digits
0,Hello World,,
1,Data-Science_101,,101.0
2,email:test@example.com,test@example.com,
3,Phone: +91-9876543210,,91.0
4,,,


## Splitting and joining strings

In [15]:
# Split on a literal single space, giving one list per row. Row 0's leading
# and trailing spaces produce empty strings in its list.
df['text'].str.split(' ')

0      [, , Hello, World, , ]
1          [Data-Science_101]
2    [email:test@example.com]
3    [Phone:, +91-9876543210]
4                        None
Name: text, dtype: object

In [16]:
# Same split, but expand=True spreads the pieces across columns instead of
# returning lists; rows with fewer pieces are padded with missing values.
df['text'].str.split(' ', expand=True)

Unnamed: 0,0,1,2,3,4,5
0,,,Hello,World,,
1,Data-Science_101,,,,,
2,email:test@example.com,,,,,
3,Phone:,+91-9876543210,,,,
4,,,,,,


In [17]:
# Keep the text before the first ':' by splitting on ':' and taking element 0
# of each resulting list via .str[0]; rows without a colon come back whole.
df['text'].str.split(':').str[0]

0       Hello World  
1    Data-Science_101
2               email
3               Phone
4                None
Name: text, dtype: object

In [18]:
# Called without another Series, str.cat joins all elements of this Series
# into one Python string, separated by `sep`.
pd.Series(['DL', 'MH', 'TN']).str.cat(sep='-')

'DL-MH-TN'