# Creating a DataFrame

In [27]:
import pandas as pd

# Toy dataset: one list per column, five rows each. The None in Age is the
# missing value that the later imputation step fills in.
data = dict(
    Name=["Anna", "Bob", "Charlie", "Diana", "Eric"],
    Age=[20, 34, 23, None, 33],
    Gender=["f", "m", "m", "f", "m"],
    Job=["Programmer", "Writer", "Cook", "Programmer", "Teacher"],
)

# Each key becomes a column; column order follows the mapping's insertion order.
df = pd.DataFrame.from_dict(data)

print(df)

      Name   Age Gender         Job
0     Anna  20.0      f  Programmer
1      Bob  34.0      m      Writer
2  Charlie  23.0      m        Cook
3    Diana   NaN      f  Programmer
4     Eric  33.0      m     Teacher


# Preprocessing Pipeline

* Drop the Name feature
* Impute missing Age values
* Turn Gender into a binary numeric feature
* One-hot encode the Job feature

In [28]:
# Drop the Name column (first step of the preprocessing list).
# `df` is reassigned by this cell, so a second execution would find Name already
# removed; errors="ignore" keeps the cell re-runnable instead of raising KeyError.
df = df.drop(columns="Name", errors="ignore")

In [29]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Mean-impute Age: learn the column mean from the observed values, then
# write the transformed (gap-free) column back into the frame.
imputer = SimpleImputer(strategy="mean")
imputer.fit(df[["Age"]])
df["Age"] = imputer.transform(df[["Age"]])




In [30]:
# Show the frame after Name was dropped and the missing Age was mean-imputed.
df

Unnamed: 0,Age,Gender,Job
0,20.0,f,Programmer
1,34.0,m,Writer
2,23.0,m,Cook
3,27.5,f,Programmer
4,33.0,m,Teacher


In [31]:
# Encode Gender as a binary numeric feature (f -> 0, m -> 1).
# The explicit cast pins an integer dtype: relying on replace() to downcast an
# object column to int is deprecated in recent pandas, and without it the 0/1
# values may stay stored as generic objects. The cast also fails loudly if a
# value outside the mapping (or a missing value) is present.
df["Gender"] = df["Gender"].replace({"f": 0, "m": 1}).astype("int64")

In [32]:
# Show the frame after Gender was recoded from 'f'/'m' to 0/1.
df

Unnamed: 0,Age,Gender,Job
0,20.0,0,Programmer
1,34.0,1,Writer
2,23.0,1,Cook
3,27.5,0,Programmer
4,33.0,1,Teacher


In [33]:
# One-hot encode Job into one Job_<value> indicator column per distinct job.
# dtype=int pins 0/1 integers: pandas 2.0+ returns boolean True/False
# indicators by default, which would not match the 0/1 table recorded below.
df_encoded = pd.get_dummies(df, columns=["Job"], dtype=int)



In [34]:
# Show the encoded frame: Job is replaced by its Job_<value> indicator columns.
df_encoded

Unnamed: 0,Age,Gender,Job_Cook,Job_Programmer,Job_Teacher,Job_Writer
0,20.0,0,0,1,0,0
1,34.0,1,0,0,0,1
2,23.0,1,1,0,0,0
3,27.5,0,0,1,0,0
4,33.0,1,0,0,1,0
