In [1]:
import pandas as pd 
import numpy as np 

#### What are vectorized operations?
Vectorized operations are operations that are applied to entire arrays or collections of data at once, rather than looping through elements one-by-one. They are a core feature in libraries like NumPy, Pandas, and other data-processing tools in Python, and are often much faster than explicit loops written in Python.

In [5]:
# Build the integer array [1, 2, 3, 4] from a half-open range.
a = np.arange(1, 5)
# Vectorized operation: the scalar multiplies every element at once, with no explicit Python loop.
a * 4

array([ 4,  8, 12, 16])

#### The problem with element-wise string operations in vanilla Python

In [8]:
s = ['cat', 'mat', None, 'sat', 'rat']

# This fails on the third item because None has no attribute named `startswith`.
# The error is caught and printed so the demonstration stays visible without
# stopping a "Restart & Run All" of the notebook at this cell.
try:
    [i.startswith('c') for i in s]
except AttributeError as err:
    print(f"AttributeError: {err}")

AttributeError: 'NoneType' object has no attribute 'startswith'

#### How pandas solves this issue?

`NOTE :` 
- In Pandas, when you're working with Series (or columns of a DataFrame) that contain strings, you need to use .str to access vectorized string functions.

##### Why .str is needed?
The .str accessor tells Pandas:
"Hey, this Series contains strings — I want to apply string methods to each element in a vectorized way."

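Without .str, calling a string method directly on the Series (for example `s.startswith('c')`) raises an AttributeError, because the Series object itself has no such method; the element-wise string methods live under the .str accessor.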

In [11]:
# The same words, now held in a Series: the .str accessor applies startswith
# to every element and does not raise on the missing value.
s = pd.Series(data=['cat', 'mat', None, 'sat', 'rat'])
s.str.startswith(pat='c')

0     True
1    False
2     None
3    False
4    False
dtype: object

In [72]:
# Load the Titanic dataset from the local Datasets folder and preview its first five rows.
titanic_df = pd.read_csv(filepath_or_buffer='./Datasets/Titanic-Dataset.csv')
titanic_df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [73]:
# Summarise the frame: column names, non-null counts, dtypes and memory usage.
titanic_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [74]:
# Select the Name column as a Series; it holds the passenger names as strings.
titanic_df['Name']

0                                Braund, Mr. Owen Harris
1      Cumings, Mrs. John Bradley (Florence Briggs Th...
2                                 Heikkinen, Miss. Laina
3           Futrelle, Mrs. Jacques Heath (Lily May Peel)
4                               Allen, Mr. William Henry
                             ...                        
886                                Montvila, Rev. Juozas
887                         Graham, Miss. Margaret Edith
888             Johnston, Miss. Catherine Helen "Carrie"
889                                Behr, Mr. Karl Howell
890                                  Dooley, Mr. Patrick
Name: Name, Length: 891, dtype: object

#### Common String Functions
- lower
- upper
- capitalize
- title
- len
- strip
- split
- replace

In [75]:
# lower: convert every character of each name to lowercase
titanic_df['Name'].str.lower()

0                                braund, mr. owen harris
1      cumings, mrs. john bradley (florence briggs th...
2                                 heikkinen, miss. laina
3           futrelle, mrs. jacques heath (lily may peel)
4                               allen, mr. william henry
                             ...                        
886                                montvila, rev. juozas
887                         graham, miss. margaret edith
888             johnston, miss. catherine helen "carrie"
889                                behr, mr. karl howell
890                                  dooley, mr. patrick
Name: Name, Length: 891, dtype: object

In [76]:
# upper: convert every character of each name to uppercase
titanic_df['Name'].str.upper()

0                                BRAUND, MR. OWEN HARRIS
1      CUMINGS, MRS. JOHN BRADLEY (FLORENCE BRIGGS TH...
2                                 HEIKKINEN, MISS. LAINA
3           FUTRELLE, MRS. JACQUES HEATH (LILY MAY PEEL)
4                               ALLEN, MR. WILLIAM HENRY
                             ...                        
886                                MONTVILA, REV. JUOZAS
887                         GRAHAM, MISS. MARGARET EDITH
888             JOHNSTON, MISS. CATHERINE HELEN "CARRIE"
889                                BEHR, MR. KARL HOWELL
890                                  DOOLEY, MR. PATRICK
Name: Name, Length: 891, dtype: object

In [77]:
# capitalize: uppercase only the first character of each string and lowercase the rest
titanic_df['Name'].str.capitalize()

0                                Braund, mr. owen harris
1      Cumings, mrs. john bradley (florence briggs th...
2                                 Heikkinen, miss. laina
3           Futrelle, mrs. jacques heath (lily may peel)
4                               Allen, mr. william henry
                             ...                        
886                                Montvila, rev. juozas
887                         Graham, miss. margaret edith
888             Johnston, miss. catherine helen "carrie"
889                                Behr, mr. karl howell
890                                  Dooley, mr. patrick
Name: Name, Length: 891, dtype: object

In [78]:
# title: uppercase the first letter of every word and lowercase the remaining letters
titanic_df['Name'].str.title()

0                                Braund, Mr. Owen Harris
1      Cumings, Mrs. John Bradley (Florence Briggs Th...
2                                 Heikkinen, Miss. Laina
3           Futrelle, Mrs. Jacques Heath (Lily May Peel)
4                               Allen, Mr. William Henry
                             ...                        
886                                Montvila, Rev. Juozas
887                         Graham, Miss. Margaret Edith
888             Johnston, Miss. Catherine Helen "Carrie"
889                                Behr, Mr. Karl Howell
890                                  Dooley, Mr. Patrick
Name: Name, Length: 891, dtype: object

In [79]:
# len: number of characters in each name (returned as an integer Series)
titanic_df['Name'].str.len()

0      23
1      51
2      22
3      44
4      24
       ..
886    21
887    28
888    40
889    21
890    19
Name: Name, Length: 891, dtype: int64

In [80]:
# Question: find the name of the passenger whose name is the longest.
# Compute the per-name lengths once and reuse them for both the maximum and the row mask.
name_lengths = titanic_df['Name'].str.len()
max_length = name_lengths.max()
print(max_length)
# A single .loc call selects the rows and the column together (no chained df[mask]['Name']);
# .iloc[0] returns the first match if several names tie for the maximum length.
titanic_df.loc[name_lengths == max_length, 'Name'].iloc[0]

82


'Penasco y Castellana, Mrs. Victor de Satode (Maria Josefa Perez de Soto y Vallejo)'

In [81]:
# strip removes the leading and trailing whitespace from a string
name = '                           Steve Smith                  '
name.strip()

'Steve Smith'

In [82]:
# Vectorized form of strip: remove leading and trailing whitespace from every name in the column.
titanic_df['Name'].str.strip()

0                                Braund, Mr. Owen Harris
1      Cumings, Mrs. John Bradley (Florence Briggs Th...
2                                 Heikkinen, Miss. Laina
3           Futrelle, Mrs. Jacques Heath (Lily May Peel)
4                               Allen, Mr. William Henry
                             ...                        
886                                Montvila, Rev. Juozas
887                         Graham, Miss. Margaret Edith
888             Johnston, Miss. Catherine Helen "Carrie"
889                                Behr, Mr. Karl Howell
890                                  Dooley, Mr. Patrick
Name: Name, Length: 891, dtype: object

`Question :` We need three more columns in our dataframe titanic_df: lastname, title, and firstname.
- This is done by using the split and get functions.

In [85]:
# split and get functions.

In [86]:
# The last name is the part of Name that precedes the first comma:
# split on ',' and take element 0 of each resulting list.
titanic_df['lastname'] = (
    titanic_df['Name']
    .str.split(',')
    .str.get(0)
)
titanic_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,lastname
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,Braund
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Cumings
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,Heikkinen
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,Futrelle
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,Allen


In [98]:
# Take the text after the comma, trim its surrounding whitespace, then split it
# once on the first space: the first piece becomes `title`, the remainder `firstname`.
# expand=True returns the two pieces as two columns instead of a list per row.
titanic_df[['title', 'firstname']] = (
    titanic_df['Name']
    .str.split(',')
    .str.get(1)
    .str.strip()
    .str.split(' ', n=1, expand=True)
)
titanic_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,lastname,title,firstname
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,Braund,Mr.,Owen Harris
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Cumings,Mrs.,John Bradley (Florence Briggs Thayer)
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,Heikkinen,Miss.,Laina
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,Futrelle,Mrs.,Jacques Heath (Lily May Peel)
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,Allen,Mr.,William Henry


In [99]:
# Count how many passengers carry each extracted title, most frequent first.
titanic_df['title'].value_counts()

title
Mr.          517
Miss.        182
Mrs.         125
Master.       40
Dr.            7
Rev.           6
Col.           2
Mlle.          2
Major.         2
Ms.            1
Mme.           1
Don.           1
Lady.          1
Sir.           1
Capt.          1
the            1
Jonkheer.      1
Name: count, dtype: int64

In [102]:
# Miss., Ms. and Mlle. are treated as the same title here, so we replace Ms. and Mlle. with Miss.
# using the replace function.

In [103]:
# Fold the variants 'Ms.' and 'Mlle.' into 'Miss.'.
# regex=False makes the patterns literal text: the trailing '.' would otherwise be
# a regex wildcard, and the default of `regex` is not the same in every pandas version.
titanic_df['title'] = (
    titanic_df['title']
    .str.replace('Ms.', 'Miss.', regex=False)
    .str.replace('Mlle.', 'Miss.', regex=False)
)
titanic_df['title'].value_counts()

title
Mr.          517
Miss.        185
Mrs.         125
Master.       40
Dr.            7
Rev.           6
Major.         2
Col.           2
Don.           1
Lady.          1
Mme.           1
Sir.           1
Capt.          1
the            1
Jonkheer.      1
Name: count, dtype: int64

#### Filtering 
- startswith/endswith
- isdigit/isalpha
- and many more functions

In [104]:
# startswith and endswith function

In [114]:
# Only the last expression of a cell is rendered automatically, so the first
# filter is passed to display() explicitly; otherwise its result is discarded.
# na=False treats a missing first name as "no match", keeping the mask purely boolean.
display(titanic_df[titanic_df['firstname'].str.startswith('A', na=False)])
titanic_df[titanic_df['firstname'].str.endswith('r', na=False)]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,lastname,title,firstname
27,28,0,1,"Fortune, Mr. Charles Alexander",male,19.0,3,2,19950,263.0000,C23 C25 C27,S,Fortune,Mr.,Charles Alexander
35,36,0,1,"Holverson, Mr. Alexander Oskar",male,42.0,1,0,113789,52.0000,,S,Holverson,Mr.,Alexander Oskar
42,43,0,3,"Kraeff, Mr. Theodor",male,,0,0,349253,7.8958,,C,Kraeff,Mr.,Theodor
51,52,0,3,"Nosworthy, Mr. Richard Cater",male,21.0,0,0,A/4. 39886,7.8000,,S,Nosworthy,Mr.,Richard Cater
57,58,0,3,"Novel, Mr. Mansouer",male,28.5,0,0,2697,7.2292,,C,Novel,Mr.,Mansouer
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
853,854,1,1,"Lines, Miss. Mary Conover",female,16.0,0,1,PC 17592,39.4000,D28,S,Lines,Miss.,Mary Conover
860,861,0,3,"Hansen, Mr. Claus Peter",male,41.0,2,0,350026,14.1083,,S,Hansen,Mr.,Claus Peter
869,870,1,3,"Johnson, Master. Harold Theodor",male,4.0,1,1,347742,11.1333,,S,Johnson,Master.,Harold Theodor
873,874,0,3,"Vander Cruyssen, Mr. Victor",male,47.0,0,0,345765,9.0000,,S,Vander Cruyssen,Mr.,Victor


In [118]:
# isdigit: True for a value made up only of digit characters (every row shown below is False)
titanic_df['lastname'].str.isdigit()

0      False
1      False
2      False
3      False
4      False
       ...  
886    False
887    False
888    False
889    False
890    False
Name: lastname, Length: 891, dtype: bool

#### applying regex
- contains function 

What is a regular expression?

- A regular expression (or regex) is a sequence of characters that defines a search pattern. It's mainly used for string matching, searching, extracting, or replacing patterns in text.

In [120]:
# Search for "john" anywhere in the first name, ignoring case.
# display() is used because only a cell's last expression is rendered automatically;
# na=False treats a missing first name as "no match" so the mask stays boolean.
display(titanic_df[titanic_df['firstname'].str.contains('john', case=False, na=False)])
# Find last names that neither start nor end with a vowel: a leading ^ inside [...]
# negates the character class, so [^aeiouAEIOU] matches any single non-vowel character.
# (To require a vowel at both ends instead, use '^[aeiouAEIOU].+[aeiouAEIOU]$'.)
titanic_df[titanic_df['lastname'].str.contains('^[^aeiouAEIOU].+[^aeiouAEIOU]$')]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,lastname,title,firstname
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S,Braund,Mr.,Owen Harris
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Cumings,Mrs.,John Bradley (Florence Briggs Thayer)
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S,Heikkinen,Miss.,Laina
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q,Moran,Mr.,James
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S,McCarthy,Mr.,Timothy J
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
884,885,0,3,"Sutehall, Mr. Henry Jr",male,25.0,0,0,SOTON/OQ 392076,7.0500,,S,Sutehall,Mr.,Henry Jr
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S,Graham,Miss.,Margaret Edith
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S,Johnston,Miss.,"Catherine Helen ""Carrie"""
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C,Behr,Mr.,Karl Howell


In [124]:
# slicing 

In [125]:
# First four characters of every first name; same result as the bracket form .str[:4].
titanic_df['firstname'].str.slice(stop=4)

0      Owen
1      John
2      Lain
3      Jacq
4      Will
       ... 
886    Juoz
887    Marg
888    Cath
889    Karl
890    Patr
Name: firstname, Length: 891, dtype: object

In [126]:
# Characters at positions 1, 3 and 5 of every first name; same result as .str[1:6:2].
titanic_df['firstname'].str.slice(start=1, stop=6, step=2)

0      wnH
1      onB
2       an
3      aqe
4      ila
      ... 
886    uzs
887    agr
888    ahr
889    alH
890    arc
Name: firstname, Length: 891, dtype: object

In [127]:
# A step of -1 walks each string backwards, i.e. reverses it; same result as .str[::-1].
titanic_df['firstname'].str.slice(step=-1)

0                                sirraH newO
1      )reyahT sggirB ecnerolF( yeldarB nhoJ
2                                      aniaL
3              )leeP yaM yliL( htaeH seuqcaJ
4                              yrneH mailliW
                       ...                  
886                                   sazouJ
887                           htidE teragraM
888                 "eirraC" neleH enirehtaC
889                              llewoH lraK
890                                  kcirtaP
Name: firstname, Length: 891, dtype: object