<a href="https://colab.research.google.com/github/maushamkumar/Pandas/blob/main/string_operation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd

In [2]:
# What are vectorised operations?
a = np.array([1, 2, 3, 4])
a * 3   # This is also a vectorised operation

array([ 3,  6,  9, 12])

In [4]:
# problem in vectorized operations in vanilla python
s = ['cat', 'mat', None, 'rat']
[i.startswith('c') for i in s]

AttributeError: 'NoneType' object has no attribute 'startswith'

In [5]:
# How pandas solves this issue?
# str is string accessor
s = pd.Series(['cat', 'mat', None, 'rat'])
s.str.startswith('c')

# This is not only working on this data also it's fast and optimized

0     True
1    False
2     None
3    False
dtype: object

In [6]:
df = pd.read_csv('/content/train.csv')

In [7]:
df['Name']

0                                Braund, Mr. Owen Harris
1      Cumings, Mrs. John Bradley (Florence Briggs Th...
2                                 Heikkinen, Miss. Laina
3           Futrelle, Mrs. Jacques Heath (Lily May Peel)
4                               Allen, Mr. William Henry
                             ...                        
886                                Montvila, Rev. Juozas
887                         Graham, Miss. Margaret Edith
888             Johnston, Miss. Catherine Helen "Carrie"
889                                Behr, Mr. Karl Howell
890                                  Dooley, Mr. Patrick
Name: Name, Length: 891, dtype: object

In [8]:
# Common Functions
# lower/upper/capitalize/title
# len
# strip

df['Name'].str.lower()
df['Name'].str.upper()
df['Name'].str.capitalize()
df['Name'].str.title()

0                                Braund, Mr. Owen Harris
1      Cumings, Mrs. John Bradley (Florence Briggs Th...
2                                 Heikkinen, Miss. Laina
3           Futrelle, Mrs. Jacques Heath (Lily May Peel)
4                               Allen, Mr. William Henry
                             ...                        
886                                Montvila, Rev. Juozas
887                         Graham, Miss. Margaret Edith
888             Johnston, Miss. Catherine Helen "Carrie"
889                                Behr, Mr. Karl Howell
890                                  Dooley, Mr. Patrick
Name: Name, Length: 891, dtype: object

In [13]:
# len ->
# Find that passenger whose name is bigger than every one of the other passengers
df['Name'].str.len()
df['Name'].str.len().max()
df['Name'][df['Name'].str.len() == 82][0]


KeyError: 0

In [14]:
# strip
df['Name'].str.strip()

0                                Braund, Mr. Owen Harris
1      Cumings, Mrs. John Bradley (Florence Briggs Th...
2                                 Heikkinen, Miss. Laina
3           Futrelle, Mrs. Jacques Heath (Lily May Peel)
4                               Allen, Mr. William Henry
                             ...                        
886                                Montvila, Rev. Juozas
887                         Graham, Miss. Margaret Edith
888             Johnston, Miss. Catherine Helen "Carrie"
889                                Behr, Mr. Karl Howell
890                                  Dooley, Mr. Patrick
Name: Name, Length: 891, dtype: object

In [19]:
# split -> get
df['last_name'] = df['Name'].str.split(',').str.get(0)

In [20]:
df['Name'].str.split(',').str.get(1)

0                                  Mr. Owen Harris
1       Mrs. John Bradley (Florence Briggs Thayer)
2                                      Miss. Laina
3               Mrs. Jacques Heath (Lily May Peel)
4                                Mr. William Henry
                          ...                     
886                                    Rev. Juozas
887                           Miss. Margaret Edith
888                 Miss. Catherine Helen "Carrie"
889                                Mr. Karl Howell
890                                    Mr. Patrick
Name: Name, Length: 891, dtype: object

In [21]:
df['Name'].str.split(',').str.get(1).str.split(' ')
# This looks awkward because of space

0                                  [, Mr., Owen, Harris]
1      [, Mrs., John, Bradley, (Florence, Briggs, Tha...
2                                       [, Miss., Laina]
3            [, Mrs., Jacques, Heath, (Lily, May, Peel)]
4                                [, Mr., William, Henry]
                             ...                        
886                                     [, Rev., Juozas]
887                           [, Miss., Margaret, Edith]
888                [, Miss., Catherine, Helen, "Carrie"]
889                                [, Mr., Karl, Howell]
890                                     [, Mr., Patrick]
Name: Name, Length: 891, dtype: object

In [28]:
# using this we have removed the space
df['Name'].str.split(',').str.get(1).str.strip()

0                                 Mr. Owen Harris
1      Mrs. John Bradley (Florence Briggs Thayer)
2                                     Miss. Laina
3              Mrs. Jacques Heath (Lily May Peel)
4                               Mr. William Henry
                          ...                    
886                                   Rev. Juozas
887                          Miss. Margaret Edith
888                Miss. Catherine Helen "Carrie"
889                               Mr. Karl Howell
890                                   Mr. Patrick
Name: Name, Length: 891, dtype: object

In [30]:
df['Name'].str.split(',').str.get(1).str.strip().str.split(" ", n=1) # n tell how many split you want.

0                                 [Mr., Owen Harris]
1      [Mrs., John Bradley (Florence Briggs Thayer)]
2                                     [Miss., Laina]
3              [Mrs., Jacques Heath (Lily May Peel)]
4                               [Mr., William Henry]
                           ...                      
886                                   [Rev., Juozas]
887                          [Miss., Margaret Edith]
888                [Miss., Catherine Helen "Carrie"]
889                               [Mr., Karl Howell]
890                                   [Mr., Patrick]
Name: Name, Length: 891, dtype: object

In [31]:
df['Name'].str.split(',').str.get(1).str.strip().str.split(" ", n=1, expand=True) # This expand will convert into a dataframe

Unnamed: 0,0,1
0,Mr.,Owen Harris
1,Mrs.,John Bradley (Florence Briggs Thayer)
2,Miss.,Laina
3,Mrs.,Jacques Heath (Lily May Peel)
4,Mr.,William Henry
...,...,...
886,Rev.,Juozas
887,Miss.,Margaret Edith
888,Miss.,"Catherine Helen ""Carrie"""
889,Mr.,Karl Howell


In [32]:
df[['title', 'Name']] = df['Name'].str.split(',').str.get(1).str.strip().str.split(" ", n=1, expand=True)

In [33]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,last_name,title
0,1,0,3,Owen Harris,male,22.0,1,0,A/5 21171,7.25,,S,Braund,Mr.
1,2,1,1,John Bradley (Florence Briggs Thayer),female,38.0,1,0,PC 17599,71.2833,C85,C,Cumings,Mrs.
2,3,1,3,Laina,female,26.0,0,0,STON/O2. 3101282,7.925,,S,Heikkinen,Miss.
3,4,1,1,Jacques Heath (Lily May Peel),female,35.0,1,0,113803,53.1,C123,S,Futrelle,Mrs.
4,5,0,3,William Henry,male,35.0,0,0,373450,8.05,,S,Allen,Mr.


In [34]:
df['title'].value_counts()

title
Mr.          517
Miss.        182
Mrs.         125
Master.       40
Dr.            7
Rev.           6
Mlle.          2
Major.         2
Col.           2
the            1
Capt.          1
Ms.            1
Sir.           1
Lady.          1
Mme.           1
Don.           1
Jonkheer.      1
Name: count, dtype: int64

In [35]:
# replace
df['title'].str.replace('Ms', 'Miss')
df['title'].str.replace('Mlle', 'Miss')

0        Mr.
1       Mrs.
2      Miss.
3       Mrs.
4        Mr.
       ...  
886     Rev.
887    Miss.
888    Miss.
889      Mr.
890      Mr.
Name: title, Length: 891, dtype: object

In [37]:
# filtering
# Find those passenger whose name start with a
# startswith/endswith
df[df['Name'].str.startswith('A')]
# isdigit/isalpha

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,last_name,title
13,14,0,3,Anders Johan,male,39.0,1,5,347082,31.2750,,S,Andersson,Mr.
22,23,1,3,"Anna ""Annie""",female,15.0,0,0,330923,8.0292,,Q,McGowan,Miss.
35,36,0,1,Alexander Oskar,male,42.0,1,0,113789,52.0000,,S,Holverson,Mr.
38,39,0,3,Augusta Maria,female,18.0,2,0,345764,18.0000,,S,Vander Planke,Miss.
61,62,1,1,Amelie,female,38.0,0,0,113572,80.0000,B28,,Icard,Miss.
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
842,843,1,1,Augusta,female,30.0,0,0,113798,31.0000,,C,Serepeca,Miss.
845,846,0,3,Anthony,male,42.0,0,0,C.A. 5547,7.5500,,S,Abbing,Mr.
866,867,1,2,Asuncion,female,27.0,1,0,SC/PARIS 2149,13.8583,,C,Duran y More,Miss.
875,876,1,3,"Adele Kiamie ""Jane""",female,15.0,0,0,2667,7.2250,,C,Najib,Miss.


In [None]:
# apllying regs
# contains
# search john -> both case
# find last names with start and end char vowel

In [40]:
df[df['Name'].str.contains('john', case=False)]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,last_name,title
1,2,1,1,John Bradley (Florence Briggs Thayer),female,38.0,1,0,PC 17599,71.2833,C85,C,Cumings,Mrs.
41,42,0,2,William John Robert (Dorothy Ann Wonnacott),female,27.0,1,0,11668,21.0,,S,Turpin,Mrs.
45,46,0,3,William John,male,,0,0,S.C./A.4. 23567,8.05,,S,Rogers,Mr.
98,99,1,2,John T (Ada Julia Bone),female,34.0,0,1,231919,23.0,,S,Doling,Mrs.
112,113,0,3,David John,male,22.0,0,0,324669,8.05,,S,Barton,Mr.
117,118,0,2,William John Robert,male,29.0,1,0,11668,21.0,,S,Turpin,Mr.
160,161,0,3,John Hatfield,male,44.0,0,1,371362,16.1,,S,Cribb,Mr.
162,163,0,3,John Viktor,male,26.0,0,0,347068,7.775,,S,Bengtsson,Mr.
165,166,1,3,"Frank John William ""Frankie""",male,9.0,0,2,363291,20.525,,S,Goldsmith,Master.
168,169,0,1,John D,male,,0,0,PC 17318,25.925,,S,Baumann,Mr.


In [43]:
df[df['last_name'].str.contains('^[aeiouAEIOU].+[aeiouAEIOU]$')]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,last_name,title
30,31,0,1,Manuel E,male,40.0,0,0,PC 17601,27.7208,,C,Uruchurtu,Don.
49,50,0,3,Josef (Josefine Franchi),female,18.0,1,0,349237,17.8,,S,Arnold-Franchi,Mrs.
207,208,1,3,Nassef Cassem,male,26.0,0,0,2699,18.7875,,C,Albimona,Mr.
210,211,0,3,Ahmed,male,24.0,0,0,SOTON/O.Q. 3101311,7.05,,S,Ali,Mr.
353,354,0,3,Josef,male,25.0,1,0,349237,17.8,,S,Arnold-Franchi,Mr.
493,494,0,1,Ramon,male,71.0,0,0,PC 17609,49.5042,,C,Artagaveytia,Mr.
518,519,1,2,"William A (Florence ""Mary"" Agnes Hughes)",female,36.0,1,0,226875,26.0,,S,Angle,Mrs.
784,785,0,3,William,male,25.0,0,0,SOTON/O.Q. 3101312,7.05,,S,Ali,Mr.
840,841,0,3,Ilmari Rudolf,male,20.0,0,0,SOTON/O2 3101287,7.925,,S,Alhomaki,Mr.


In [44]:
# slicing
df['Name'].str[:4]

0      Owen
1      John
2      Lain
3      Jacq
4      Will
       ... 
886    Juoz
887    Marg
888    Cath
889    Karl
890    Patr
Name: Name, Length: 891, dtype: object