In [5]:
# import data analysis libraries #

import pandas as pd
import numpy as np

In [2]:
# import the titanic dataset #

df = pd.read_excel("titanic.xls")



Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2,,"St Louis, MO"


In [13]:
# create a smaller dataset with just the variables of interest #

df = df[["name", "cabin", "home.dest"]]
df.head()

Unnamed: 0,name,cabin,home.dest
0,"Allen, Miss. Elisabeth Walton",B5,"St Louis, MO"
1,"Allison, Master. Hudson Trevor",C22 C26,"Montreal, PQ / Chesterville, ON"
2,"Allison, Miss. Helen Loraine",C22 C26,"Montreal, PQ / Chesterville, ON"
3,"Allison, Mr. Hudson Joshua Creighton",C22 C26,"Montreal, PQ / Chesterville, ON"
4,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",C22 C26,"Montreal, PQ / Chesterville, ON"


In [None]:
# explore the characteristics of the data #

df.info()

## STRING FREQS AND UNIQUE VALUES

In [15]:
#### Unique string values

df.nunique()



name         1307
cabin         186
home.dest     369
dtype: int64

In [None]:
#### String Frequencies

df["home.dest"].value_counts()

In [None]:
df["cabin"].value_counts()

In [18]:
df["name"].value_counts()

Connolly, Miss. Kate             2
Kelly, Mr. James                 2
Allen, Miss. Elisabeth Walton    1
Ilmakangas, Miss. Ida Livija     1
Ilieff, Mr. Ylio                 1
                                ..
Hart, Miss. Eva Miriam           1
Harris, Mr. Walter               1
Harris, Mr. George               1
Harper, Rev. John                1
Zimmerman, Mr. Leo               1
Name: name, Length: 1307, dtype: int64

## WORKING WITH TEXT DATA

In [19]:
df["name"].head()

0                      Allen, Miss. Elisabeth Walton
1                     Allison, Master. Hudson Trevor
2                       Allison, Miss. Helen Loraine
3               Allison, Mr. Hudson Joshua Creighton
4    Allison, Mrs. Hudson J C (Bessie Waldo Daniels)
Name: name, dtype: object

### Length of a text string

In [20]:
#### length of a string
# how many characters are in the first passenger name?

len(df["name"][0])

29

### Starting & ending with specific characters or strings

In [22]:
#### does my string start with a specific character?
# how many passengers have the last name "Allison"?
# last name comes first in the name column

df["name"].str.startswith("Allison").sum()

4

In [24]:
df["name"].str.startswith("Allen").sum()

2

In [25]:
# which passengers have the last name "Allison"?

df.loc[df["name"].str.startswith("Allison")]

Unnamed: 0,name,cabin,home.dest
1,"Allison, Master. Hudson Trevor",C22 C26,"Montreal, PQ / Chesterville, ON"
2,"Allison, Miss. Helen Loraine",C22 C26,"Montreal, PQ / Chesterville, ON"
3,"Allison, Mr. Hudson Joshua Creighton",C22 C26,"Montreal, PQ / Chesterville, ON"
4,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",C22 C26,"Montreal, PQ / Chesterville, ON"


In [26]:
df.loc[df['name'].str.startswith('Allen')]

Unnamed: 0,name,cabin,home.dest
0,"Allen, Miss. Elisabeth Walton",B5,"St Louis, MO"
618,"Allen, Mr. William Henry",,"Lower Clapton, Middlesex or Erdington, Birmingham"


In [27]:
#### does my string end with a specific character?
# how many passengers have the first name "Ellen"?

df["name"].str.endswith("Ellen").sum()

2

In [28]:
df['name'].str.endswith('Henry').sum()

21

In [29]:
# which passengers have the first name "Ellen"?

df.loc[df["name"].str.endswith("Ellen")]

Unnamed: 0,name,cabin,home.dest
24,"Bird, Miss. Ellen",C97,
570,"Toomey, Miss. Ellen",,"Indianapolis, IN"


In [33]:
df.loc[df['name'].str.endswith('Henry')]

Unnamed: 0,name,cabin,home.dest
31,"Blank, Mr. Henry",A31,"Glen Ridge, NJ"
157,"Hilliard, Mr. Herbert Henry",E46,"Brighton, MA"
280,"Stengel, Mr. Charles Emil Henry",C116,"Newark, NJ"
367,"Chapman, Mr. Charles Henry",,"Bronx, NY"
368,"Chapman, Mr. John Henry",,"Cornwall / Spokane, WA"
386,"Davies, Mr. Charles Henry",,"Lyndhurst, England"
423,"Gillespie, Mr. William Henry",,"Vancouver, BC"
457,"Hunt, Mr. George Henry",,"Philadelphia, PA"
544,"Renouf, Mr. Peter Henry",,"Elizabeth, NJ"
618,"Allen, Mr. William Henry",,"Lower Clapton, Middlesex or Erdington, Birmingham"


### Locating specific characters or strings

In [34]:
#### locating words or symbols in dataset

df["name"].str.contains("Sara").sum()

4

In [None]:
df.loc[df["name"].str.contains("Sara")]

### Changing text case - uppercase, lowercase, titlecase

In [None]:
#### changing string case
# changing the text to all uppercase

df["UP_name"] = df["name"].str.upper()
df.head(3)

In [None]:
#### changing string case
# changing the text to all lowercase

df["LOW_name"] = df["name"].str.lower()

df.head()

### Remove white space from text

In [39]:
extra = "   _These words are surrounded by too much space_     "
extra

'   _These words are surrounded by too much space_     '

In [None]:
#### remove white space from the right-hand side of the text

extra.rstrip()

In [None]:
#### remove white space from the left-hand side of the text

extra.lstrip()

In [42]:
#### remove white space from both sides of the text

extra.strip()

'_These words are surrounded by too much space_'

In [None]:
a = '  no'
b = 'no'

In [45]:
df["name_s"] = df["name"].str.strip()

df.head(2)

Unnamed: 0,name,cabin,home.dest,UP_name,LOW_name,name_s
0,"Allen, Miss. Elisabeth Walton",B5,"St Louis, MO","ALLEN, MISS. ELISABETH WALTON","allen, miss. elisabeth walton","Allen, Miss. Elisabeth Walton"
1,"Allison, Master. Hudson Trevor",C22 C26,"Montreal, PQ / Chesterville, ON","ALLISON, MASTER. HUDSON TREVOR","allison, master. hudson trevor","Allison, Master. Hudson Trevor"


### Splitting Strings

In [47]:
df = df[["name", "cabin", "home.dest"]]

df.head(2)

Unnamed: 0,name,cabin,home.dest
0,"Allen, Miss. Elisabeth Walton",B5,"St Louis, MO"
1,"Allison, Master. Hudson Trevor",C22 C26,"Montreal, PQ / Chesterville, ON"


In [50]:
### split the string of text after each white space

df["name"].str.split(" ")

0                      [Allen,, Miss., Elisabeth, Walton]
1                     [Allison,, Master., Hudson, Trevor]
2                       [Allison,, Miss., Helen, Loraine]
3              [Allison,, Mr., Hudson, Joshua, Creighton]
4       [Allison,, Mrs., Hudson, J, C, (Bessie, Waldo,...
                              ...                        
1304                             [Zabour,, Miss., Hileni]
1305                            [Zabour,, Miss., Thamine]
1306                        [Zakarian,, Mr., Mapriededer]
1307                              [Zakarian,, Mr., Ortin]
1308                               [Zimmerman,, Mr., Leo]
Name: name, Length: 1309, dtype: object

In [None]:
### split the string of text after only the first white space (n = 1)

df["name"].str.split(" ", n = 1)

In [55]:
a = df

In [None]:
a['name_sp'] = df["name"].str.split(" ", n = 1)
# a['name_sp2'] = df["name"].str.split(" ", n = 1, expand = True)
# a

In [None]:
### split the string after the first white space, expand the seperated text to a new column

df["name"].str.split(" ", n = 1, expand = True)

In [70]:
### before splitting the text, you can replace certain characters with blank spaces
### replace special characters to make splitting easier

df["name_dot"] = df["name"].str.replace(".", "!")
df.head(3)

  df["name_dot"] = df["name"].str.replace(".", "!")


Unnamed: 0,name,cabin,home.dest,name_sp,name_dot,last name only
0,Allen Miss. Elisabeth Walton,B5,"St Louis, MO","[Allen,, Miss. Elisabeth Walton]",Allen Miss! Elisabeth Walton,Allen
1,Allison Master. Hudson Trevor,C22 C26,"Montreal, PQ / Chesterville, ON","[Allison,, Master. Hudson Trevor]",Allison Master! Hudson Trevor,Allison
2,Allison Miss. Helen Loraine,C22 C26,"Montreal, PQ / Chesterville, ON","[Allison,, Miss. Helen Loraine]",Allison Miss! Helen Loraine,Allison


In [73]:
#### adding the split columns onto your original dataset
# create a new dataset to capture the columns you plan to split (df2)

# Parsing by 2nd space into 2 columns


df2 = df['name'].str.split(" ", n = 3, expand = True)

df2.head()

Unnamed: 0,0,1,2,3
0,Allen,,Miss.,Elisabeth Walton
1,Allison,,Master.,Hudson Trevor
2,Allison,,Miss.,Helen Loraine
3,Allison,,Mr.,Hudson Joshua Creighton
4,Allison,,Mrs.,Hudson J C (Bessie Waldo Daniels)


In [None]:
#### create new columns (or overwrite existing columns) to include new columns to original dataset
# when specifying the column you want to include, set it equal to the column index position

df["last name only"] = df2[0]

df.head()