In [3]:
import pandas as pd
import numpy as np
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn import model_selection, tree
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NearMiss
from imblearn.combine import SMOTEENN

In [4]:
df=pd.read_csv("final_.csv")
df.head()

Unnamed: 0,GENDER,AGEGRP1,NGROUP,BILL,REGION,CLASS,ORGYN,AFFL,LTIME
0,3,3,2,3000.0,2,2,0,12,39
1,1,1,4,5000.01,1,3,1,8,9
2,2,3,2,5000.01,2,3,0,5,9
3,2,3,2,380.0,1,2,0,6,36
4,1,3,2,5000.01,5,3,0,9,8


In [5]:
df.dtypes

GENDER       int64
AGEGRP1      int64
NGROUP       int64
BILL       float64
REGION       int64
CLASS        int64
ORGYN        int64
AFFL         int64
LTIME        int64
dtype: object

In [6]:
#Fixing variable type

df['GENDER'] = df['GENDER'].astype('category')
df['AGEGRP1'] = df['AGEGRP1'].astype('category')
df['NGROUP'] = df['NGROUP'].astype('category')
df['REGION'] = df['REGION'].astype('category')
df['CLASS'] = df['CLASS'].astype('category')
df['ORGYN'] = df['ORGYN'].astype('category')
df.dtypes

GENDER     category
AGEGRP1    category
NGROUP     category
BILL        float64
REGION     category
CLASS      category
ORGYN      category
AFFL          int64
LTIME         int64
dtype: object

In [7]:
#Checking the imbalance on ORGYN

df.ORGYN.value_counts()

0    16718
1     5505
Name: ORGYN, dtype: int64

In [8]:
df.describe(include='all')

Unnamed: 0,GENDER,AGEGRP1,NGROUP,BILL,REGION,CLASS,ORGYN,AFFL,LTIME
count,22223.0,22223.0,22223.0,22223.0,22223.0,22223.0,22223.0,22223.0,22223.0
unique,3.0,3.0,7.0,,5.0,4.0,2.0,,
top,2.0,2.0,3.0,,4.0,2.0,0.0,,
freq,12149.0,10439.0,4717.0,,8829.0,8572.0,16718.0,,
mean,,,,4420.590041,,,,8.707285,6.559735
std,,,,7559.047522,,,,3.395368,4.645552
min,,,,0.01,,,,1.0,0.0
25%,,,,0.01,,,,6.0,4.0
50%,,,,2000.0,,,,8.0,5.0
75%,,,,6000.0,,,,11.0,8.0


In [9]:
#Separating dataset into training data and test data

X = df[['GENDER','AGEGRP1','NGROUP','BILL','REGION','CLASS','AFFL','LTIME']]
Y = df['ORGYN']

scaler=StandardScaler()
Z1=X.BILL
Z1=pd.DataFrame(Z1)
Z1=scaler.fit_transform(Z1)
Z1=pd.DataFrame(Z1)
X=df.drop("BILL",axis=1)
X["BILL"]=Z1

Z2=X.AFFL
Z2=pd.DataFrame(Z2)
Z2=scaler.fit_transform(Z2)
Z2=pd.DataFrame(Z2)
X=df.drop("AFFL",axis=1)
X["AFFL"]=Z2

Z3=X.LTIME
Z3=pd.DataFrame(Z3)
Z3=scaler.fit_transform(Z3)
Z3=pd.DataFrame(Z3)
X=df.drop("LTIME",axis=1)
X["LTIME"]=Z3

X=X[['GENDER','AGEGRP1','NGROUP','BILL','REGION','CLASS','AFFL','LTIME']]
print(X.head())
print(X.dtypes)
print(X.describe(include='all'))

seed = 9
validation_size = 0.3
X_train, X_test, Y_train, Y_test = model_selection.train_test_split(X, Y, test_size=validation_size, random_state=seed)


  GENDER AGEGRP1 NGROUP     BILL REGION CLASS  AFFL     LTIME
0      3       3      2  3000.00      2     2    12  6.983237
1      1       1      4  5000.01      1     3     8  0.525302
2      2       3      2  5000.01      2     3     5  0.525302
3      2       3      2   380.00      1     2     6  6.337444
4      1       3      2  5000.01      5     3     9  0.310038
GENDER     category
AGEGRP1    category
NGROUP     category
BILL        float64
REGION     category
CLASS      category
AFFL          int64
LTIME       float64
dtype: object
         GENDER  AGEGRP1   NGROUP           BILL   REGION    CLASS  \
count   22223.0  22223.0  22223.0   22223.000000  22223.0  22223.0   
unique      3.0      3.0      7.0            NaN      5.0      4.0   
top         2.0      2.0      3.0            NaN      4.0      2.0   
freq    12149.0  10439.0   4717.0            NaN   8829.0   8572.0   
mean        NaN      NaN      NaN    4420.590041      NaN      NaN   
std         NaN      NaN      NaN 

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [10]:
#Oversampling training data with SMOTE due to imbalance data problem
sm=SMOTEENN(random_state=1)
X1_train,Y1_train=sm.fit_resample(X_train,Y_train)

#Neural Network with SMOTE
NNs = MLPClassifier(hidden_layer_sizes=(50, 50,50), max_iter=1000,alpha=0.01, activation='relu')# 2 layers 30 neurons and 20 neurons
print(NNs)
NNs.fit(X1_train, Y1_train)
NNs_prediction_training=NNs.predict(X1_train)
NNs_prediction_test = NNs.predict(X_test)
print('Akurasi (training data) = ', accuracy_score(Y1_train, NNs_prediction_training))
print('Akurasi (test data) = ', accuracy_score(Y_test, NNs_prediction_test))
print(confusion_matrix(Y_test, NNs_prediction_test))
print(classification_report(Y_test, NNs_prediction_test))

MLPClassifier(activation='relu', alpha=0.01, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(50, 50, 50), learning_rate='constant',
       learning_rate_init=0.001, max_iter=1000, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)
Akurasi (training data) =  0.8132340523528057
Akurasi (test data) =  0.6923653817309134
[[3533 1469]
 [ 582 1083]]
              precision    recall  f1-score   support

           0       0.86      0.71      0.78      5002
           1       0.42      0.65      0.51      1665

   micro avg       0.69      0.69      0.69      6667
   macro avg       0.64      0.68      0.64      6667
weighted avg       0.75      0.69      0.71      6667

