## Synthetic Minority Oversampling Technique (SMOTE)

SMOTE is a technique used to balance the class distribution of a dataset in classification problems. Rather than duplicating minority-class samples, it creates synthetic data points by interpolating between a minority-class sample and its nearest minority-class neighbours, so the new points are similar to the original ones.



In [None]:
import pandas as pd

# Read the car-evaluation table from the working directory and preview its
# first five rows.
data = pd.read_csv(filepath_or_buffer='car_evaluation.csv')
data.head(n=5)

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,outcome
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc


In [None]:
# (number of rows, number of columns) of the loaded frame.
data.shape

(1728, 7)

In [None]:
# Number of rows per target class; shows how unevenly the classes are populated.
data['outcome'].value_counts()

unacc    1210
acc       384
good       69
vgood      65
Name: outcome, dtype: int64

In [None]:
## Splitting into target and independent variables

In [None]:
# Features: every column except the target. Selecting by name (instead of the
# positional slice `data.iloc[:, :6]`) keeps working if columns are reordered,
# and `drop` returns a new frame, so the label encoding applied to `X` later
# neither writes through to `data` nor raises SettingWithCopyWarning.
X = data.drop(columns='outcome')
# Target: the `outcome` class label of each row.
y = data['outcome']
X.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety
0,vhigh,vhigh,2,2,small,low
1,vhigh,vhigh,2,2,small,med
2,vhigh,vhigh,2,2,small,high
3,vhigh,vhigh,2,2,med,low
4,vhigh,vhigh,2,2,med,med


In [None]:
##Label encoding

In [None]:
from sklearn.preprocessing import LabelEncoder

# Text-valued feature columns that must become integer codes before fitting.
categorical_cols = ['buying', 'maint', 'lug_boot', 'safety']

enc = LabelEncoder()
# `apply` refits the encoder on each column in turn, so after this cell `enc`
# only remembers the classes of the last column ('safety').
encoded = X[categorical_cols].apply(enc.fit_transform)

# Build a new frame instead of assigning through `X.loc[:, cols] = ...`:
# that in-place write on a frame sliced from `data` can raise
# SettingWithCopyWarning and, on recent pandas, can leave the columns as
# object dtype. `assign` keeps the column order and stores the integer codes.
X = X.assign(**{col: encoded[col] for col in categorical_cols})

# NOTE(review): LabelEncoder numbers the labels in sorted (alphabetical) order,
# e.g. safety: high=0, low=1, med=2 -- not the natural low < med < high order.
# Consider OrdinalEncoder with explicit categories if the ordering should be
# meaningful to a distance-based model.
X.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety
0,3,3,2,2,2,1
1,3,3,2,2,2,2
2,3,3,2,2,2,0
3,3,3,2,2,1,1
4,3,3,2,2,1,2


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed makes the split
# repeatable.
# NOTE(review): the split is not stratified, so class proportions in train and
# test may drift from the full dataset -- consider `stratify=y`.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=10
)

In [None]:
##Model selection, fitting and testing

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Baseline: k-nearest-neighbours with default settings, trained on the
# original (imbalanced) training split. `fit` returns the estimator itself.
model = KNeighborsClassifier().fit(X_train, y_train)

# Predicted class for every held-out row.
y_predict = model.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score

# Overall accuracy on the test split, followed by a confusion table
# (rows: true class, columns: predicted class).
print(accuracy_score(y_true=y_test, y_pred=y_predict))
pd.crosstab(index=y_test, columns=y_predict)

0.9402697495183044


col_0,acc,good,unacc,vgood
outcome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
acc,93,1,8,0
good,7,14,0,0
unacc,5,0,366,0
vgood,7,2,1,15


In [None]:
### Using SMOTE to correct the class imbalance

In [None]:
pip install imblearn

Note: you may need to restart the kernel to use updated packages.


In [None]:
from imblearn.over_sampling import SMOTE

# SMOTE draws random neighbours and interpolation points; seed it (same seed
# as the train/test split) so the resampled set and the scores that follow
# are reproducible across runs.
smote = SMOTE(random_state=10)

In [None]:
## Resampling the training dataset

In [None]:
# Oversample the training split only (the test split is left untouched).
# Features are cast to float before being handed to SMOTE.
X_train_smote, y_train_smote = smote.fit_resample(
    X_train.astype(float),
    y_train,
)

In [None]:
from collections import Counter

# Class frequencies of the training labels before and after oversampling.
for caption, labels in (
    ("Before SMOTE :", y_train),
    ("After SMOTE :", y_train_smote),
):
    print(caption, Counter(labels))

Before SMOTE : Counter({'unacc': 839, 'acc': 282, 'good': 48, 'vgood': 40})
After SMOTE : Counter({'acc': 839, 'unacc': 839, 'vgood': 839, 'good': 839})


In [None]:
# Refit the same classifier on the SMOTE-balanced training set, then score it
# on the original test split. This overwrites the baseline `model` fit and
# `y_predict` from the earlier cells.
model.fit(X=X_train_smote, y=y_train_smote)
y_predict = model.predict(X=X_test)

# Accuracy, then confusion table (rows: true class, columns: predicted class).
print(accuracy_score(y_true=y_test, y_pred=y_predict))
pd.crosstab(index=y_test, columns=y_predict)

0.9344894026974951


col_0,acc,good,unacc,vgood
outcome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
acc,89,7,5,1
good,2,19,0,0
unacc,15,1,354,1
vgood,0,2,0,23
