# Demo 4.2 Splitting Columns and More Cleaning  

- **Demonstrates:**  
  - [**Splitting Columns**](#Splitting-Columns)  
  - [**Keeping and Reordering Columns**](#Keeping-and-Reordering-Columns)        
  


In [2]:
import pandas as pd 
import plotly.express as px

In [3]:
# Build a small employee table by hand so the demo needs no external data file.
# Every field except the ID is stored as a raw string; note that the second
# row's e-mail address contains an embedded space.
column_names = [
    'BusinessEntityID',
    'EmployeeName',
    'EmailAddress',
    'PhoneNumber',
    'StartDate',
    'CurrentSalary',
]

employee_rows = [
    [1699, 'Robinett, David', 'david22@adventure-works.com', '(827) 525-0100', '06-05-2010', '$80,950'],
    [1700, 'Robinson, Rebecca', 'rebecca5 @adventure-works.com', '(829) 525-0101', '05-01-2015', '$70,950'],
    [1701, 'Robinson, Dorothy', 'dorothy3@adventure-works.com', '(828) 555-0102', '03-01-2017', '$50,000'],
]

df = pd.DataFrame(employee_rows, columns=column_names)

# Report (rows, columns), then preview the frame as the cell's output.
print(df.shape)
df.head()

(3, 6)


Unnamed: 0,BusinessEntityID,EmployeeName,EmailAddress,PhoneNumber,StartDate,CurrentSalary
0,1699,"Robinett, David",david22@adventure-works.com,(827) 525-0100,06-05-2010,"$80,950"
1,1700,"Robinson, Rebecca",rebecca5 @adventure-works.com,(829) 525-0101,05-01-2015,"$70,950"
2,1701,"Robinson, Dorothy",dorothy3@adventure-works.com,(828) 555-0102,03-01-2017,"$50,000"


In [4]:
# Show the column labels (a pandas Index) so the exact names can be checked
# before they are referenced in the cells below.
df.columns

Index(['BusinessEntityID', 'EmployeeName', 'EmailAddress', 'PhoneNumber',
       'StartDate', 'CurrentSalary'],
      dtype='object')

# Splitting Columns  
- Split the *EmployeeName* column into Last Name and First Name columns

In [5]:
# Labels for the two columns produced by splitting EmployeeName.
two_new_cols = ['LastName', 'FirstName']

# Split "Last, First" on the first comma only. `n` must be passed by keyword:
# pandas 2.0+ rejects it as a positional argument. expand=True returns the
# pieces as separate columns instead of a single column of lists.
df[two_new_cols] = df['EmployeeName'].str.split(',', n=1, expand=True)

# The source text has a space after the comma, so the raw split yields
# ' David' rather than 'David'; strip surrounding whitespace from both parts.
for name_col in two_new_cols:
    df[name_col] = df[name_col].str.strip()

# Report (rows, columns), then display the frame with the two new columns.
print(df.shape)
df

(3, 8)


Unnamed: 0,BusinessEntityID,EmployeeName,EmailAddress,PhoneNumber,StartDate,CurrentSalary,LastName,FirstName
0,1699,"Robinett, David",david22@adventure-works.com,(827) 525-0100,06-05-2010,"$80,950",Robinett,David
1,1700,"Robinson, Rebecca",rebecca5 @adventure-works.com,(829) 525-0101,05-01-2015,"$70,950",Robinson,Rebecca
2,1701,"Robinson, Dorothy",dorothy3@adventure-works.com,(828) 555-0102,03-01-2017,"$50,000",Robinson,Dorothy


# Keeping and Reordering Columns    
- Drop *BusinessEntityID* and reorder the remaining columns so that *LastName* and *FirstName* come first

In [6]:
# List the current column labels (now including LastName and FirstName)
# before choosing which ones to keep and in what order.
df.columns

Index(['BusinessEntityID', 'EmployeeName', 'EmailAddress', 'PhoneNumber',
       'StartDate', 'CurrentSalary', 'LastName', 'FirstName'],
      dtype='object')

In [7]:
# Desired layout: the split name parts lead, contact and pay details follow,
# and the original combined name goes last. BusinessEntityID is not listed,
# so it is dropped.
cols_to_keep = [
    'LastName',
    'FirstName',
    'EmailAddress',
    'PhoneNumber',
    'StartDate',
    'CurrentSalary',
    'EmployeeName',
]

# Selecting by a list of labels both filters and reorders the columns.
df = df.loc[:, cols_to_keep]

# Report (rows, columns), then display the reshaped frame.
print(df.shape)
df

(3, 7)


Unnamed: 0,LastName,FirstName,EmailAddress,PhoneNumber,StartDate,CurrentSalary,EmployeeName
0,Robinett,David,david22@adventure-works.com,(827) 525-0100,06-05-2010,"$80,950","Robinett, David"
1,Robinson,Rebecca,rebecca5 @adventure-works.com,(829) 525-0101,05-01-2015,"$70,950","Robinson, Rebecca"
2,Robinson,Dorothy,dorothy3@adventure-works.com,(828) 555-0102,03-01-2017,"$50,000","Robinson, Dorothy"
