In [1]:
import pandas as pd
import numpy as np

In [12]:
# Small demo frame: one numeric column and four categorical columns.
# NOTE(review): 'agg' looks like a typo for 'age' (the later frame uses 'age');
# kept as-is because the cells below select the column by this name.
df = pd.DataFrame(
    dict(
        agg=[98, 17, 74, 53, 77],
        gender=['male', 'female', 'male', 'female', 'male'],
        review=['Good', 'Average', 'Poor', 'Poor', 'Average'],
        education=['UG', 'PG', 'PG', 'PG', 'HS'],
        purchased=['Yes', 'Yes', 'No', 'No', 'No'],
    )
)
df

Unnamed: 0,agg,gender,review,education,purchased
0,98,male,Good,UG,Yes
1,17,female,Average,PG,Yes
2,74,male,Poor,PG,No
3,53,female,Poor,PG,No
4,77,male,Average,HS,No


# OneHotEncoding using pandas

In [15]:
# One-hot encode the two nominal columns; the remaining columns pass through
# unchanged. dtype=int pins the indicator columns to 0/1 integers, because the
# default dtype differs across pandas versions (uint8 before 2.0, bool from 2.0).
pd.get_dummies(df, columns=['gender', 'purchased'], dtype=int)

Unnamed: 0,agg,review,education,gender_female,gender_male,purchased_No,purchased_Yes
0,98,Good,UG,0,1,0,1
1,17,Average,PG,1,0,0,1
2,74,Poor,PG,0,1,1,0
3,53,Poor,PG,1,0,1,0
4,77,Average,HS,0,1,1,0


In [17]:
# Same encoding, but drop_first=True removes the first level of each encoded
# column, leaving one indicator per binary feature.
# dtype=int keeps the indicators as 0/1 integers on every pandas version
# (the default is uint8 before pandas 2.0 and bool from 2.0).
pd.get_dummies(df, columns=['gender', 'purchased'], drop_first=True, dtype=int)

Unnamed: 0,agg,review,education,gender_male,purchased_Yes
0,98,Good,UG,1,1
1,17,Average,PG,0,1
2,74,Poor,PG,1,0
3,53,Poor,PG,0,0
4,77,Average,HS,1,0


## OneHotEncoding using sklearn

In [19]:
from sklearn.preprocessing import OneHotEncoder

In [24]:
# Learn the categories of the two nominal columns, then encode them.
# The encoder output is converted to a dense ndarray with .toarray().
ohe = OneHotEncoder()
ohe.fit(df[['gender', 'purchased']])
new_df = ohe.transform(df[['gender', 'purchased']]).toarray()
new_df

array([[0., 1., 0., 1.],
       [1., 0., 0., 1.],
       [0., 1., 1., 0.],
       [1., 0., 1., 0.],
       [0., 1., 1., 0.]])

In [25]:
# Put the untouched columns and the encoded indicators side by side
# (column-wise join of two 2-D arrays).
np.concatenate([df[['agg', 'review', 'education']].values, new_df], axis=1)

array([[98, 'Good', 'UG', 0.0, 1.0, 0.0, 1.0],
       [17, 'Average', 'PG', 1.0, 0.0, 0.0, 1.0],
       [74, 'Poor', 'PG', 0.0, 1.0, 1.0, 0.0],
       [53, 'Poor', 'PG', 1.0, 0.0, 1.0, 0.0],
       [77, 'Average', 'HS', 0.0, 1.0, 1.0, 0.0]], dtype=object)

In [26]:
# drop='first' discards the first category of each feature, so every binary
# column is represented by a single indicator.
ohe = OneHotEncoder(drop='first')
new_df = (
    ohe
    .fit(df[['gender', 'purchased']])
    .transform(df[['gender', 'purchased']])
    .toarray()
)
new_df

array([[1., 1.],
       [0., 1.],
       [1., 0.],
       [0., 0.],
       [1., 0.]])

In [27]:
# Column-wise join of the untouched columns with the reduced indicator set.
np.concatenate([df[['agg', 'review', 'education']].values, new_df], axis=1)

array([[98, 'Good', 'UG', 1.0, 1.0],
       [17, 'Average', 'PG', 0.0, 1.0],
       [74, 'Poor', 'PG', 1.0, 0.0],
       [53, 'Poor', 'PG', 0.0, 0.0],
       [77, 'Average', 'HS', 1.0, 0.0]], dtype=object)

In [38]:
# Encode the two binary nominal columns as one integer indicator each
# (drop='first' removes the first category of every feature).
# The dense array is obtained with .toarray() instead of the encoder's
# `sparse=False` flag: that keyword was renamed to `sparse_output` in
# scikit-learn 1.2 and later removed, whereas .toarray() works on every version.
ohe = OneHotEncoder(drop='first', dtype=np.int32)
encoded = ohe.fit_transform(df[['gender', 'purchased']]).toarray()
# Reuse df's index so a later column-wise concat lines up row for row.
new_df = pd.DataFrame(encoded, columns=['gender', 'purchased'], index=df.index)
new_df

Unnamed: 0,gender,purchased
0,1,1
1,0,1
2,1,0
3,0,0
4,1,0


In [48]:
# Re-attach the encoded indicator columns to the columns that were not encoded.
Final_DF = pd.concat(
    [df[['agg', 'review', 'education']], new_df],
    axis='columns',
)
Final_DF

Unnamed: 0,agg,review,education,gender,purchased
0,98,Good,UG,1,1
1,17,Average,PG,0,1
2,74,Poor,PG,1,0
3,53,Poor,PG,0,0
4,77,Average,HS,1,0


### One hot encoding for multiple categories

In [58]:
# Load the car-sales data from a CSV file in the working directory.
car = pd.read_csv(filepath_or_buffer='Car_sales (1).csv')
# Preview five random rows; the sample is unseeded, so the rows differ per run.
car.sample(n=5)

Unnamed: 0,Manufacturer,Model,Sales_in_thousands,__year_resale_value,Vehicle_type,Price_in_thousands,Engine_size,Horsepower,Wheelbase,Width,Length,Curb_weight,Fuel_capacity,Fuel_efficiency,Latest_Launch,Power_perf_factor
31,Chrysler,Cirrus,32.306,12.64,Passenger,16.48,2.0,132.0,108.0,71.0,186.0,2.911,16.0,27.0,10/6/2011,53.5662
21,Chevrolet,Lumina,24.629,10.31,Passenger,18.89,3.1,175.0,107.5,72.5,200.9,3.33,16.6,25.0,5/24/2011,69.991396
4,Audi,A4,20.397,22.255,Passenger,23.99,1.8,150.0,102.6,68.2,178.0,2.998,16.4,27.0,10/8/2011,62.777639
66,Jaguar,S-Type,15.467,,Passenger,42.8,3.0,240.0,114.5,71.6,191.3,3.65,18.4,21.0,11/3/2012,102.178985
137,Toyota,Camry,247.994,13.245,Passenger,17.518,2.2,133.0,105.2,70.1,188.5,2.998,18.5,27.0,2/10/2011,54.37242


In [81]:
# Frequency of each manufacturer, most common first.
car['Manufacturer'].value_counts()

Dodge         11
Ford          11
Toyota         9
Chevrolet      9
Mercedes-B     9
Mitsubishi     7
Nissan         7
Chrysler       7
Volvo          6
Oldsmobile     6
Lexus          6
Mercury        6
Pontiac        6
Volkswagen     6
Saturn         5
Cadillac       5
Honda          5
Plymouth       4
Acura          4
Buick          4
Audi           3
Jeep           3
Porsche        3
Hyundai        3
BMW            3
Lincoln        3
Saab           2
Subaru         2
Jaguar         1
Infiniti       1
Name: Manufacturer, dtype: int64

In [51]:
# Keep the per-manufacturer frequencies for the rare-category filter below.
counts = car['Manufacturer'].value_counts()

In [52]:
# Number of distinct manufacturers, i.e. how many columns a plain one-hot
# encoding of this feature would create.
car['Manufacturer'].nunique()

30

In [53]:
# Manufacturers with at most this many rows are treated as rare.
threshold = 5

In [55]:
# Labels of the manufacturers whose frequency is at or below the threshold.
counts.loc[counts.le(threshold)].index

Index(['Saturn', 'Cadillac', 'Honda', 'Plymouth', 'Acura', 'Buick', 'Audi',
       'Jeep', 'Porsche', 'Hyundai', 'BMW', 'Lincoln', 'Saab', 'Subaru',
       'Jaguar', 'Infiniti'],
      dtype='object')

In [56]:
# Rare manufacturer labels that will be collapsed into a single bucket.
repl = counts.loc[counts.le(threshold)].index

In [57]:
# Collapse every rare manufacturer into 'Uncommon', then one-hot encode the
# reduced set of categories.
# dtype=int keeps the indicators as 0/1 integers on every pandas version
# (the default is uint8 before pandas 2.0 and bool from 2.0).
pd.get_dummies(car['Manufacturer'].replace(repl, 'Uncommon'), dtype=int)

Unnamed: 0,Chevrolet,Chrysler,Dodge,Ford,Lexus,Mercedes-B,Mercury,Mitsubishi,Nissan,Oldsmobile,Pontiac,Toyota,Uncommon,Volkswagen,Volvo
0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
152,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
153,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
154,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
155,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


# Ordinal encoding

In [59]:
# Re-display the demo frame before selecting the columns to encode.
df 

Unnamed: 0,agg,gender,review,education,purchased
0,98,male,Good,UG,Yes
1,17,female,Average,PG,Yes
2,74,male,Poor,PG,No
3,53,female,Poor,PG,No
4,77,male,Average,HS,No


In [62]:
# Keep every column from position 2 onwards
# (review, education and purchased in the frame shown above).
new = df.iloc[:,2:]

In [63]:
# Show the selected categorical columns.
new

Unnamed: 0,review,education,purchased
0,Good,UG,Yes
1,Average,PG,Yes
2,Poor,PG,No
3,Poor,PG,No
4,Average,HS,No


In [141]:
# Split the frame: every column but the last holds the ordinal features,
# the last column is the label.
ord1, leb = new.iloc[:, :-1], new.iloc[:, -1]
leb

0    Yes
1    Yes
2     No
3     No
4     No
Name: purchased, dtype: object

In [92]:
from sklearn.preprocessing import OrdinalEncoder

In [93]:
# Explicit category order per column: the position in each list becomes the
# integer code (Poor < Average < Good, HS < UG < PG).
oe = OrdinalEncoder(
    categories=[
        ['Poor', 'Average', 'Good'],
        ['HS', 'UG', 'PG'],
    ],
    dtype=np.int32,
)
oe.fit(ord1).transform(ord1)

array([[2, 1],
       [1, 2],
       [0, 2],
       [0, 2],
       [1, 0]])

## Label encoding

In [94]:
from sklearn.preprocessing import LabelEncoder

In [96]:
# Encode the target labels as integers.
le = LabelEncoder()
le.fit(leb)
le.transform(leb)

array([1, 1, 0, 0, 0])

## Column transformer

In [169]:
# Larger demo frame for the ColumnTransformer example; the last two rows have
# a missing age so the imputer has something to fill.
df = pd.DataFrame(
    dict(
        age=[98, 17, 74, 53, 77, np.nan, np.nan],
        gender=['male', 'female', 'male', 'female', 'male', 'male', 'female'],
        review=['Good', 'Average', 'Poor', 'Poor', 'Average', 'Good', 'Average'],
        education=['UG', 'PG', 'PG', 'PG', 'HS', 'PG', 'HS'],
        purchased=['Yes', 'Yes', 'No', 'No', 'No', 'Yes', 'No'],
    )
)
df

Unnamed: 0,age,gender,review,education,purchased
0,98.0,male,Good,UG,Yes
1,17.0,female,Average,PG,Yes
2,74.0,male,Poor,PG,No
3,53.0,female,Poor,PG,No
4,77.0,male,Average,HS,No
5,,male,Good,PG,Yes
6,,female,Average,HS,No


In [170]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. random_state pins the shuffle so the
# split, and every result derived from it, is the same on each run.
# NOTE: output saved from an earlier unseeded run may show different rows;
# re-run the notebook to refresh it.
x_train, x_test, y_train, y_test = train_test_split(
    df.drop(columns=['purchased']),  # features
    df['purchased'],                 # target
    test_size=.2,
    random_state=42,
)
x_train

Unnamed: 0,age,gender,review,education
2,74.0,male,Poor,PG
5,,male,Good,PG
1,17.0,female,Average,PG
6,,female,Average,HS
4,77.0,male,Average,HS


In [171]:
from sklearn.impute import SimpleImputer

In [172]:
from sklearn.compose import ColumnTransformer

In [188]:
# Per-column preprocessing:
#   tnf1 - fill missing ages (SimpleImputer default strategy)
#   tnf2 - ordinal codes following the explicit category order given here
#   tnf3 - one indicator for gender (first category dropped)
# The encoder's `sparse=False` flag is not used: it was renamed to
# `sparse_output` in scikit-learn 1.2 and later removed. sparse_threshold=0
# makes the ColumnTransformer always return a dense array, on every version.
transformer = ColumnTransformer(
    transformers=[
        ('tnf1', SimpleImputer(), ['age']),
        ('tnf2', OrdinalEncoder(categories=[['Poor','Average','Good'],['HS','UG','PG']]), ['review','education']),
        ('tnf3', OneHotEncoder(drop='first'), ['gender']),
    ],
    sparse_threshold=0,
)

In [189]:
# Fit every sub-transformer on the training rows and return the encoded matrix
# (column order: age, review, education, gender indicator).
transformer.fit_transform(x_train)

array([[74.,  0.,  2.,  1.],
       [56.,  2.,  2.,  1.],
       [17.,  1.,  2.,  0.],
       [56.,  1.,  0.,  0.],
       [77.,  1.,  0.,  1.]])

In [175]:
# Encode the held-out rows with the already-fitted transformer (no refit).
# NOTE(review): this cell's saved execution count (175) is lower than that of
# the transformer definition above (188), so the stored output may come from
# an earlier definition; re-run to confirm.
transformer.transform(x_test)

array([[53.,  0.,  2.,  0.],
       [98.,  2.,  1.,  1.]])

In [182]:
# Show the full frame again before fitting the transformer on all rows.
df

Unnamed: 0,age,gender,review,education,purchased
0,98.0,male,Good,UG,Yes
1,17.0,female,Average,PG,Yes
2,74.0,male,Poor,PG,No
3,53.0,female,Poor,PG,No
4,77.0,male,Average,HS,No
5,,male,Good,PG,Yes
6,,female,Average,HS,No


In [196]:
# Fresh transformer with the same three steps: impute age, ordinal-encode
# review/education with an explicit order, one-hot encode gender (first
# category dropped).
# The encoder's `sparse=False` flag is not used: it was renamed to
# `sparse_output` in scikit-learn 1.2 and later removed. sparse_threshold=0
# makes the ColumnTransformer always return a dense array, on every version.
transformer = ColumnTransformer(
    transformers=[
        ('tnf1', SimpleImputer(), ['age']),
        ('tnf2', OrdinalEncoder(categories=[['Poor','Average','Good'],['HS','UG','PG']]), ['review','education']),
        ('tnf3', OneHotEncoder(drop='first'), ['gender']),
    ],
    sparse_threshold=0,
)

In [197]:
# Fit and transform the whole frame. Only the columns named in the transformer
# appear in the result, so 'purchased' is absent from the output.
transformer.fit_transform(df)

array([[98. ,  2. ,  1. ,  1. ],
       [17. ,  1. ,  2. ,  0. ],
       [74. ,  0. ,  2. ,  1. ],
       [53. ,  0. ,  2. ,  0. ],
       [77. ,  1. ,  0. ,  1. ],
       [63.8,  2. ,  2. ,  1. ],
       [63.8,  1. ,  0. ,  0. ]])