In [55]:
import numpy as np 
import pandas as pd 

In [56]:
# Load the customer dataset. The path is relative, so customer.csv must be in
# the notebook's working directory.
df = pd.read_csv('customer.csv')

In [57]:
# Preview the first rows to see which columns exist and what values they hold.
df.head()

Unnamed: 0,age,gender,review,education,purchased
0,30,Female,Average,School,No
1,68,Female,Poor,UG,No
2,70,Female,Good,PG,No
3,72,Female,Good,PG,No
4,16,Female,Average,UG,No


The gender column holds 2 unordered (nominal) values, so #one-hot encoding is the right fit for it.

The review and education columns are ordered (ordinal), so they call for #ordinal encoding.

purchased is the categorical target variable, so it gets #label-encoding.

In [58]:
# Keep only the columns used in the rest of this walkthrough. Selecting by
# label (instead of by position with iloc[:, 2:]) makes the cell safe to
# re-run: a second run yields the same three columns rather than slicing
# two more columns off the already-reduced frame.
df = df[['review', 'education', 'purchased']]

In [59]:
# Show 5 random rows; random_state pins the draw so the same rows are shown
# every time the notebook is re-run.
df.sample(5, random_state=42)

Unnamed: 0,review,education,purchased
26,Poor,PG,No
10,Good,UG,Yes
14,Poor,PG,Yes
28,Poor,School,No
42,Good,PG,Yes


In [60]:
# Inspect the distinct education labels: these are the values an ordinal
# ranking of this column has to cover.
df['education'].unique()

array(['School', 'UG', 'PG'], dtype=object)

In [61]:
# Inspect the distinct review labels: these are the values an ordinal
# ranking of this column has to cover.
df['review'].unique()

array(['Average', 'Poor', 'Good'], dtype=object)

In [90]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. random_state pins the shuffle so the
# same rows land in train/test on every run, keeping all downstream outputs
# reproducible.
# Note: df['purchased'] is a pandas Series; use df[['purchased']] instead if the
# target should stay a one-column DataFrame.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop('purchased', axis=1),
    df['purchased'],
    test_size=0.2,
    random_state=42,
)

In [91]:
# (rows, columns) of the training features after the split.
X_train.shape

(40, 2)

In [92]:
from sklearn.preprocessing import OrdinalEncoder

# One category list per feature column, in the same order as the columns of
# X_train. Within each list the labels must run from lowest to highest rank,
# because the position in the list becomes the encoded value.
oe = OrdinalEncoder(
    categories=[
        ['Poor', 'Average', 'Good'],  # review
        ['School', 'UG', 'PG'],       # education
    ]
)
oe.fit(X_train)

In [93]:
# Encode the training features and wrap the resulting array in a DataFrame
# that carries the original column names (the row index is a fresh 0..n-1
# range, not the index of X_train).
X_train_cat = pd.DataFrame(oe.transform(X_train), columns=X_train.columns)
X_train_cat.head()

Unnamed: 0,review,education
0,2.0,1.0
1,1.0,1.0
2,0.0,2.0
3,2.0,2.0
4,2.0,1.0


In [94]:
# Apply the encoder fitted on the training data to the test features; the
# encoder is only transformed here, never re-fitted.
X_test_cat = pd.DataFrame(oe.transform(X_test), columns=X_test.columns)
print(X_test_cat.head())
print(X_test.shape)


   review  education
0     1.0        0.0
1     0.0        2.0
2     2.0        0.0
3     2.0        0.0
4     1.0        0.0
(10, 2)


In [95]:
# Encode the target variable with label encoding: LabelEncoder maps each
# distinct class label in y_train to an integer.

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
le.fit(y_train)

In [98]:
le.classes_ # position in classes_ is the encoded integer: 'No' -> 0, 'Yes' -> 1

array(['No', 'Yes'], dtype=object)

In [99]:
# Encode the training target and wrap it in a one-column DataFrame. A Series
# has no .columns attribute, so its .name supplies the column label.
y_train_cat = pd.DataFrame(le.transform(y_train), columns=[y_train.name])
y_train_cat.head()

Unnamed: 0,purchased
0,1
1,0
2,0
3,1
4,1
