In [1]:
# Let's talk about categorical data
# Categorical data takes a limited number of possible values (e.g. A, B, C, or D)
# Most Python modeling libraries will raise an error if you pass them categorical values that haven't been encoded first

# One-Hot Encoding: Standard Approach for Categorical Data

# One-Hot Encoding creates new columns, indicating the presence of each value from the original data

In [13]:
# Setup the Data
import pandas as pd

# Text columns with fewer distinct values than this are treated as categorical
# (and one-hot encoded later); higher-cardinality text columns are left out.
MAX_CARDINALITY = 10

# NOTE(review): the paths end in '.cvs' (not '.csv') — presumably this matches the
# file names on disk; confirm before renaming anything.
model_data = pd.read_csv('./datasets/withdrawl_model_data.cvs')
test_data = pd.read_csv('./datasets/withdrawl_test_data.cvs')

# Drop students whose withdrawal outcome is missing.
# The result is assigned rather than using inplace=True, which keeps the step
# explicit and avoids mutating the freshly loaded frame behind the reader's back.
model_data = model_data.dropna(axis=0, subset=['Withdrew'])

# What we're trying to predict
y_model = model_data.Withdrew

# For the purpose of this exercise, just drop columns with missing values.
# Only the model data is inspected here, so a column that has gaps solely in the
# test data is not caught by this check.
missing_columns = [col for col in model_data.columns
                   if model_data[col].isnull().any()]

# Remove the identifier, the target and the incomplete columns from the model data,
# and the identifier and the same incomplete columns from the test data
x_model = model_data.drop(['Respondent', 'Withdrew'] + missing_columns, axis=1)
x_test = test_data.drop(['Respondent'] + missing_columns, axis=1)

# Cardinality is the number of unique values in a column; low-cardinality text
# columns are the ones we will turn into categorical (one-hot) variables
cardinality_columns = [cname for cname in x_model.columns
                       if x_model[cname].nunique() < MAX_CARDINALITY and
                       x_model[cname].dtype == "object"]

numeric_columns = [cname for cname in x_model.columns
                   if x_model[cname].dtype in ['int64', 'float64']]

my_columns = cardinality_columns + numeric_columns

# We place the categorical columns at the beginning of our dataset so it's easier
# to manipulate later; the same column selection is applied to the test data
x_model_predictors = x_model[my_columns]
x_test_predictors = x_test[my_columns]

In [20]:
# Now let's take a look at our datatypes:
# 'object' columns hold text responses, 'int64' columns are already numeric
x_model_predictors.dtypes

Took Survey                         object
gender                              object
year in school                      object
live                                object
Program Type                        object
Total IM to know                     int64
Total IM to accomplish               int64
Total IM to stimulate                int64
IM Total                             int64
Total EM to identify                 int64
Total EM to introject                int64
Total EM to regulate                 int64
EM Total                             int64
Amotivation                          int64
follow budget                        int64
money for activities                 int64
afford friend lifestyle              int64
reduced course load                  int64
thought withdraw due to money        int64
thought transfer due to money        int64
neglect coursework due to money      int64
work                                 int64
studying                             int64
formal soci

In [26]:
# As you can see from the above, 'object' indicates columns holding text/string responses
# These 'object' columns are the ones we want to encode
# Luckily, pandas has a convenient function, get_dummies, which does the encoding for us

# Encode your test data the same way you'd encode your model data
x_model_encoded_predictors = pd.get_dummies(x_model_predictors)
x_test_encoded_predictors = pd.get_dummies(x_test_predictors)

# Make sure the data is aligned so columns in both datasets show up in the same order.
# join='left' works like a SQL left join on the column labels: the result keeps exactly
# the model data's columns, so dummy columns that exist only in the test data are
# dropped, and dummy columns that exist only in the model data are added to the test data.
# fill_value=0 fills those added columns with 0 ("category not present") instead of NaN,
# which most models would refuse to accept.
x_model_final, x_test_final = x_model_encoded_predictors.align(
    x_test_encoded_predictors, join='left', axis=1, fill_value=0)

In [46]:
# Preview only the first rows of the encoded model data rather than the whole frame
x_model_final.head(10)

Unnamed: 0,Total IM to know,Total IM to accomplish,Total IM to stimulate,IM Total,Total EM to identify,Total EM to introject,Total EM to regulate,EM Total,Amotivation,follow budget,...,year in school_Nontraditional Freshman,year in school_Other,year in school_Traditional Freshman,year in school_Transfer,live_Off campus (on your own),live_Off campus (w/parent),live_On campus,Program Type_CLA,Program Type_CPA,Program Type_UND
0,26,9,9,44,22,21,26,69,15,4,...,0,0,1,0,0,0,1,1,0,0
1,21,21,13,55,27,20,27,74,4,4,...,0,0,1,0,0,0,1,0,1,0
2,22,18,8,48,25,19,26,70,4,1,...,0,0,0,1,0,0,1,1,0,0
3,22,18,11,51,26,26,23,75,4,4,...,0,0,1,0,0,0,1,1,0,0
4,19,16,8,43,23,21,22,66,6,7,...,0,0,1,0,0,0,1,0,1,0
5,20,18,8,46,25,20,23,68,6,2,...,0,0,1,0,0,0,1,1,0,0
6,28,24,22,74,27,20,23,70,4,6,...,0,0,1,0,0,1,0,1,0,0
7,22,25,20,67,25,27,22,74,5,3,...,0,0,1,0,0,0,1,0,1,0
8,22,24,18,64,27,22,26,75,4,4,...,0,0,1,0,0,0,1,1,0,0
9,21,15,17,53,25,15,22,62,4,3,...,0,0,1,0,0,0,1,0,1,0
