In [1]:
%matplotlib inline

Unlike R, python doesn't have a factor function to convert categorical data to numeric data for machine learning.
So, we have to do it using different approach!
Here is our dataset

In [2]:
import pandas as pd
import numpy as np

df = pd.DataFrame({'factor1':['a','a','a','b','c','c','c'], 'factor2':['d','a','d','b','c','d','c'], 'num1':range(1,8)})
print(df)
print(df.dtypes)

  factor1 factor2  num1
0       a       d     1
1       a       a     2
2       a       d     3
3       b       b     4
4       c       c     5
5       c       d     6
6       c       c     7
factor1    object
factor2    object
num1        int32
dtype: object


First approach: using LabelEncoder() from sklearn.preprocessing!
However, such integer representation can not be used directly with scikit-learn estimators, as these expect continuous input, and would interpret the categories as being ordered, which is often not desired (i.e. the set of browsers was ordered arbitrarily)

In [3]:
from sklearn import preprocessing

le = preprocessing.LabelEncoder()
df1 = df.copy()

cat_columns = df1.select_dtypes(['object']).columns

# convert categorical data to numeric
for cat in df1.columns.values:
    if df1[cat].dtype == 'object':
        df1[cat] = le.fit_transform(df1[cat])
        
#df1.factor2 = le.fit_transform(df1.factor2)
print(df1)

# convert it from int to float. However, once it is converted to float, you won't be able to convert it back to categorical!
df1_copy = df1.copy()

for idx in range(len(cat_columns)):
    df1_copy[cat_columns[idx]] = df1_copy[cat_columns[idx]].astype(np.float32)
    
#df1_copy.factor2 = df1_copy.factor2.astype(np.float32)
print(df1_copy)

# convert it back to the categorical. ==> to run this, you need to reload the df from the previous cell
df1.factor2 = le.inverse_transform(df1.factor2)
print(df1)

   factor1  factor2  num1
0        0        3     1
1        0        0     2
2        0        3     3
3        1        1     4
4        2        2     5
5        2        3     6
6        2        2     7
   factor1  factor2  num1
0      0.0      3.0     1
1      0.0      0.0     2
2      0.0      3.0     3
3      1.0      1.0     4
4      2.0      2.0     5
5      2.0      3.0     6
6      2.0      2.0     7
   factor1 factor2  num1
0        0       d     1
1        0       a     2
2        0       d     3
3        1       b     4
4        2       c     5
5        2       d     6
6        2       c     7


Second: is using select_dtypes and cat.codes() function in pandas

In [4]:
df4 = df.copy()

# need to convert object type to category type
for cat in df4.columns.values:
    #print(df4[cat].dtype)
    if df4[cat].dtype == 'object':
        df4[cat] = df4[cat].astype('category')
    
cat_columns = df4.select_dtypes(['category']).columns
print(cat_columns)

df5 = df4.copy()
# the following will convert categorical column to int only
df4[cat_columns] = df4[cat_columns].apply(lambda x: x.cat.codes)
print(df4)

# however, to convert to float, you have to do the following:
df5[cat_columns] = df5[cat_columns].apply(lambda x: x.cat.codes.astype(np.float32))
print(df5)



Index(['factor1', 'factor2'], dtype='object')
   factor1  factor2  num1
0        0        3     1
1        0        0     2
2        0        3     3
3        1        1     4
4        2        2     5
5        2        3     6
6        2        2     7
   factor1  factor2  num1
0      0.0      3.0     1
1      0.0      0.0     2
2      0.0      3.0     3
3      1.0      1.0     4
4      2.0      2.0     5
5      2.0      3.0     6
6      2.0      2.0     7


Third, use OneHotEncoder() from sklearn.preprocessing. Need to use DictVectorizer from sklearn.feature_extraction

In [13]:
from sklearn.feature_extraction import DictVectorizer

df2 = df.copy()    # deep copy!
print(df2)
print(df2.to_dict('records'))

dVec = DictVectorizer()
df2 = dVec.fit_transform(df2.to_dict('records')).toarray()
print(df2)
print(dVec.get_feature_names())

  factor1 factor2  num1
0       a       d     1
1       a       a     2
2       a       d     3
3       b       b     4
4       c       c     5
5       c       d     6
6       c       c     7
[{'factor1': 'a', 'num1': 1, 'factor2': 'd'}, {'factor1': 'a', 'num1': 2, 'factor2': 'a'}, {'factor1': 'a', 'num1': 3, 'factor2': 'd'}, {'factor1': 'b', 'num1': 4, 'factor2': 'b'}, {'factor1': 'c', 'num1': 5, 'factor2': 'c'}, {'factor1': 'c', 'num1': 6, 'factor2': 'd'}, {'factor1': 'c', 'num1': 7, 'factor2': 'c'}]
[[ 1.  0.  0.  0.  0.  0.  1.  1.]
 [ 1.  0.  0.  1.  0.  0.  0.  2.]
 [ 1.  0.  0.  0.  0.  0.  1.  3.]
 [ 0.  1.  0.  0.  1.  0.  0.  4.]
 [ 0.  0.  1.  0.  0.  1.  0.  5.]
 [ 0.  0.  1.  0.  0.  0.  1.  6.]
 [ 0.  0.  1.  0.  0.  1.  0.  7.]]
['factor1=a', 'factor1=b', 'factor1=c', 'factor2=a', 'factor2=b', 'factor2=c', 'factor2=d', 'num1']


Third: use get_dummies() from Pandas. However, It only converts a column of categorical variables (0,1,...k) to dummy k columns of (0, 1), that is binary (in float). Therefore, use it carefully!

In [37]:
df3 = df.copy()
print(df3)

#df3.factor2 = pd.get_dummies(df3.factor2)
pd.get_dummies(df3)
print(df3)

  factor1 factor2  num1
0       a       d     1
1       a       a     2
2       a       d     3
3       b       b     4
4       c       c     5
5       c       d     6
6       c       c     7
  factor1 factor2  num1
0       a       d     1
1       a       a     2
2       a       d     3
3       b       b     4
4       c       c     5
5       c       d     6
6       c       c     7


In [60]:
import pandas as pd
dff = pd.DataFrame([['rick','young'],['phil','old'],['john','teenager']],columns=['name','age-group'])
print(dff.describe())
print(dff.dtypes)

pd.get_dummies(dff)

        name age-group
count      3         3
unique     3         3
top     rick       old
freq       1         1
name         object
age-group    object
dtype: object


Unnamed: 0,name_john,name_phil,name_rick,age-group_old,age-group_teenager,age-group_young
0,0.0,0.0,1.0,0.0,0.0,1.0
1,0.0,1.0,0.0,1.0,0.0,0.0
2,1.0,0.0,0.0,0.0,1.0,0.0
