In [49]:
import pandas as pd
from sklearn import preprocessing
import numpy as np
from sklearn.model_selection import train_test_split
import time

# Import mobile price dataset

In [2]:
# Read the mobile price dataset, then report its shape and the number of
# rows per price_range label.
raw_data = pd.read_csv('mobile_price_range.csv')
print("data dimensions:", raw_data.shape)
print("label classes: ", raw_data['price_range'].value_counts())

data dimensions: (2000, 21)
label classes:  1    500
2    500
3    500
0    500
Name: price_range, dtype: int64


In [6]:
# Separate the label column from the remaining feature columns.
y1 = raw_data['price_range']
X1_o = raw_data.drop(columns='price_range')
# Standardise the feature columns.
# NOTE(review): scaling is applied to the full dataset before the train/test
# split below, so test rows contribute to the scaling statistics - confirm
# this is intended.
X1 = preprocessing.scale(X1_o)

In [42]:
# Hold out 40% of the scaled samples for testing; the fixed seed makes the
# shuffle repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X1,
    y1,
    test_size=0.4,
    random_state=18,
)

# 4.1 Clustering with NN

#### 4.1.1 K-means

In [34]:
from sklearn.cluster import KMeans
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import roc_auc_score, accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV

In [43]:
# Seed reused by the clustering / projection models further down.
random_seed = 15
# Number of clusters, picked from the approximate elbow location.
k1 = 5
# Fit k-means with the chosen k on the training split.
kmeans1 = KMeans(n_clusters=k1, random_state=random_seed)
kmeans1.fit(X_train)


In [45]:
# Map both splits through the fitted k-means transform; the transformed
# arrays become the input features for the neural network below.
X_train_kmeans, X_test_kmeans = (
    kmeans1.transform(split) for split in (X_train, X_test)
)

In [46]:
# Baseline: an MLP with default settings (only the seed and iteration cap are
# set), trained on the k-means features and scored on the held-out split.
clf_nn = MLPClassifier(random_state=7, max_iter=10000).fit(X_train_kmeans, y_train)
y_pred = clf_nn.predict(X_test_kmeans)
nn_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(
    'Accuracy of neural network without hyperparameter tuning is %.2f%%'
    % (nn_accuracy * 100)
)

Accuracy of neural network without hyperparameter tuning is 30.38%


In [47]:
# Hyperparameter tuning
# Define grid for grid search after observing validation curves:
# log-spaced values for the MLP's alpha and initial learning rate.
alpha_range = np.logspace(-1, 2, 5)
lr_range = np.logspace(-5, 0, 6)
tuned_params = {'alpha': alpha_range, 'learning_rate_init': lr_range}
# Keep the search under its own name instead of overwriting `clf_nn`.
# Rebinding `clf_nn` made this cell non-idempotent: a second run would wrap a
# GridSearchCV inside another GridSearchCV, which has no `alpha` /
# `learning_rate_init` parameters to tune.
clf_nn_cv = GridSearchCV(clf_nn, param_grid=tuned_params, cv=5, n_jobs=-1)
clf_nn_cv.fit(X_train_kmeans, y_train)
best_clf_nn = clf_nn_cv.best_estimator_
best_params = clf_nn_cv.best_params_
print("Best parameters set found on development set:")
print(best_params)

Best parameters set found on development set:
{'alpha': 3.1622776601683795, 'learning_rate_init': 0.001}


In [50]:
# Time a fit of the tuned network on the k-means features.
# perf_counter() is a monotonic, high-resolution clock, so it is the right
# tool for intervals as short as the millisecond-scale inference below;
# time.time() can be coarse and may jump if the system clock is adjusted.
t0 = time.perf_counter()
best_clf_nn.fit(X_train_kmeans, y_train)
t1 = time.perf_counter()
print('Training time: %f seconds' % (t1 - t0))
# Time prediction on the held-out split.
t0 = time.perf_counter()
y_pred = best_clf_nn.predict(X_test_kmeans)
t1 = time.perf_counter()
test_time = t1 - t0
print('Inference time on test data: %f seconds' % test_time)
best_accuracy = accuracy_score(y_test, y_pred)
print('Best accuracy of neural network is %.2f%%' % (best_accuracy * 100))

Training time: 1.506187 seconds
Inference time on test data: 0.010527 seconds
Best accuracy of neural network is 30.63%


#### 4.1.2 EM

In [52]:
from sklearn.mixture import GaussianMixture

In [53]:
# Fit a 26-component Gaussian mixture on the training split, then use the
# per-component probabilities from predict_proba as the new features.
gmm = GaussianMixture(n_components=26, random_state=random_seed).fit(X_train)
X_train_gmm, X_test_gmm = (
    gmm.predict_proba(split) for split in (X_train, X_test)
)

In [54]:
# Baseline MLP (defaults apart from seed and iteration cap) on the GMM
# features, scored on the held-out split.
clf_nn = MLPClassifier(random_state=7, max_iter=10000).fit(X_train_gmm, y_train)
y_pred = clf_nn.predict(X_test_gmm)
nn_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(
    'Accuracy of neural network without hyperparameter tuning is %.2f%%'
    % (nn_accuracy * 100)
)

Accuracy of neural network without hyperparameter tuning is 24.38%


In [55]:
# Define grid for grid search after observing validation curves.
# alpha is pinned to 0 here, so only the initial learning rate is searched.
alpha_range = np.asarray([0])
lr_range = np.logspace(-5, 0, 6)
tuned_params = {'alpha': alpha_range, 'learning_rate_init': lr_range}
# Keep the search under its own name instead of overwriting `clf_nn`.
# Rebinding `clf_nn` made this cell non-idempotent: a second run would wrap a
# GridSearchCV inside another GridSearchCV, which has no `alpha` /
# `learning_rate_init` parameters to tune.
clf_nn_cv = GridSearchCV(clf_nn, param_grid=tuned_params, cv=5, n_jobs=-1)
clf_nn_cv.fit(X_train_gmm, y_train)
best_clf_nn = clf_nn_cv.best_estimator_
best_params = clf_nn_cv.best_params_
print("Best parameters set found on development set:")
print(best_params)

Best parameters set found on development set:
{'alpha': 0, 'learning_rate_init': 0.01}


In [56]:
# Time a fit of the tuned network on the GMM features.
# perf_counter() is a monotonic, high-resolution clock, so it is the right
# tool for intervals as short as the millisecond-scale inference below;
# time.time() can be coarse and may jump if the system clock is adjusted.
t0 = time.perf_counter()
best_clf_nn.fit(X_train_gmm, y_train)
t1 = time.perf_counter()
print('Training time: %f seconds' % (t1 - t0))
# Time prediction on the held-out split.
t0 = time.perf_counter()
y_pred = best_clf_nn.predict(X_test_gmm)
t1 = time.perf_counter()
test_time = t1 - t0
print('Inference time on test data: %f seconds' % test_time)
best_accuracy = accuracy_score(y_test, y_pred)
print('Best accuracy of neural network is %.2f%%' % (best_accuracy * 100))

Training time: 0.789144 seconds
Inference time on test data: 0.001994 seconds
Best accuracy of neural network is 24.00%


# 4.2 Dimension Reduction with NN

#### 4.2.1 PCA

In [58]:
from sklearn.decomposition import PCA

In [59]:
# Number of principal components, chosen to capture 85% of the variance.
n1 = 16

# Learn the projection from the training split, then apply it to both splits.
pca1 = PCA(n_components=n1)
pca1.fit(X_train)

X_train_pca, X_test_pca = pca1.transform(X_train), pca1.transform(X_test)

In [63]:
# Baseline MLP (defaults apart from seed and iteration cap) on the PCA
# features, scored on the held-out split.
clf_nn = MLPClassifier(random_state=7, max_iter=10000).fit(X_train_pca, y_train)
y_pred = clf_nn.predict(X_test_pca)
nn_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(
    'Accuracy of neural network without hyperparameter tuning is %.2f%%'
    % (nn_accuracy * 100)
)

Accuracy of neural network without hyperparameter tuning is 91.25%


In [64]:
# Hyperparameter tuning
# Define grid for grid search after observing validation curves:
# log-spaced values for the MLP's alpha and initial learning rate.
alpha_range = np.logspace(-1, 2, 5)
lr_range = np.logspace(-5, 0, 6)
tuned_params = {'alpha': alpha_range, 'learning_rate_init': lr_range}
# Keep the search under its own name instead of overwriting `clf_nn`.
# Rebinding `clf_nn` made this cell non-idempotent: a second run would wrap a
# GridSearchCV inside another GridSearchCV, which has no `alpha` /
# `learning_rate_init` parameters to tune.
clf_nn_cv = GridSearchCV(clf_nn, param_grid=tuned_params, cv=5, n_jobs=-1)
clf_nn_cv.fit(X_train_pca, y_train)
best_clf_nn = clf_nn_cv.best_estimator_
best_params = clf_nn_cv.best_params_
print("Best parameters set found on development set:")
print(best_params)

Best parameters set found on development set:
{'alpha': 3.1622776601683795, 'learning_rate_init': 0.001}


In [65]:
# Time a fit of the tuned network on the PCA features.
# perf_counter() is a monotonic, high-resolution clock, so it is the right
# tool for intervals as short as the millisecond-scale inference below;
# time.time() can be coarse and may jump if the system clock is adjusted.
t0 = time.perf_counter()
best_clf_nn.fit(X_train_pca, y_train)
t1 = time.perf_counter()
print('Training time: %f seconds' % (t1 - t0))
# Time prediction on the held-out split.
t0 = time.perf_counter()
y_pred = best_clf_nn.predict(X_test_pca)
t1 = time.perf_counter()
test_time = t1 - t0
print('Inference time on test data: %f seconds' % test_time)
best_accuracy = accuracy_score(y_test, y_pred)
print('Best accuracy of neural network is %.2f%%' % (best_accuracy * 100))

Training time: 5.380371 seconds
Inference time on test data: 0.001964 seconds
Best accuracy of neural network is 92.88%


#### 4.2.2 ICA

In [66]:
from sklearn.decomposition import FastICA

In [67]:
# Number of independent components to extract (same count as the PCA
# section). The earlier "85% of the variance" wording was copied from the PCA
# cell and does not describe how this value was chosen for ICA.
n1 = 16

# Fit FastICA on the training split, then project both splits.
# n_components now reads n1 instead of repeating the literal 16, so the
# setting above actually controls the model.
ica1 = FastICA(n_components=n1, random_state=random_seed).fit(X_train)

X_train_ica = ica1.transform(X_train)
X_test_ica = ica1.transform(X_test)

In [68]:
# Baseline MLP (defaults apart from seed and iteration cap) on the ICA
# features, scored on the held-out split.
clf_nn = MLPClassifier(random_state=7, max_iter=10000).fit(X_train_ica, y_train)
y_pred = clf_nn.predict(X_test_ica)
nn_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(
    'Accuracy of neural network without hyperparameter tuning is %.2f%%'
    % (nn_accuracy * 100)
)

Accuracy of neural network without hyperparameter tuning is 92.88%


In [69]:
# Hyperparameter tuning
# Define grid for grid search after observing validation curves:
# log-spaced values for the MLP's alpha and initial learning rate.
alpha_range = np.logspace(-1, 2, 5)
lr_range = np.logspace(-5, 0, 6)
tuned_params = {'alpha': alpha_range, 'learning_rate_init': lr_range}
# Keep the search under its own name instead of overwriting `clf_nn`.
# Rebinding `clf_nn` made this cell non-idempotent: a second run would wrap a
# GridSearchCV inside another GridSearchCV, which has no `alpha` /
# `learning_rate_init` parameters to tune.
clf_nn_cv = GridSearchCV(clf_nn, param_grid=tuned_params, cv=5, n_jobs=-1)
clf_nn_cv.fit(X_train_ica, y_train)
best_clf_nn = clf_nn_cv.best_estimator_
best_params = clf_nn_cv.best_params_
print("Best parameters set found on development set:")
print(best_params)

Best parameters set found on development set:
{'alpha': 0.1, 'learning_rate_init': 0.001}


In [71]:
# Time a fit of the tuned network on the ICA features.
# perf_counter() is a monotonic, high-resolution clock, so it is the right
# tool for intervals as short as the millisecond-scale inference below;
# time.time() can be coarse and may jump if the system clock is adjusted.
t0 = time.perf_counter()
best_clf_nn.fit(X_train_ica, y_train)
t1 = time.perf_counter()
print('Training time: %f seconds' % (t1 - t0))
# Time prediction on the held-out split.
t0 = time.perf_counter()
y_pred = best_clf_nn.predict(X_test_ica)
t1 = time.perf_counter()
test_time = t1 - t0
print('Inference time on test data: %f seconds' % test_time)
best_accuracy = accuracy_score(y_test, y_pred)
print('Best accuracy of neural network is %.2f%%' % (best_accuracy * 100))

Training time: 10.875093 seconds
Inference time on test data: 0.001974 seconds
Best accuracy of neural network is 93.38%


#### 4.2.3 RP

In [70]:
from sklearn.random_projection import GaussianRandomProjection

In [75]:
# Gaussian random projection down to 18 components, fitted on the training
# split and then applied to both splits.
grp1 = GaussianRandomProjection(n_components=18, random_state=random_seed)
grp1.fit(X_train)
X_train_grp, X_test_grp = (
    grp1.transform(split) for split in (X_train, X_test)
)

In [80]:
# Baseline MLP (defaults apart from seed and iteration cap) on the randomly
# projected features, scored on the held-out split.
clf_nn = MLPClassifier(random_state=7, max_iter=10000).fit(X_train_grp, y_train)
y_pred = clf_nn.predict(X_test_grp)
nn_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(
    'Accuracy of neural network without hyperparameter tuning is %.2f%%'
    % (nn_accuracy * 100)
)

Accuracy of neural network without hyperparameter tuning is 72.12%


In [81]:
# Hyperparameter tuning
# Grid chosen after inspecting validation curves: five log-spaced alpha
# values and six log-spaced initial learning rates.
alpha_range = np.logspace(start=-1, stop=2, num=5)
lr_range = np.logspace(start=-5, stop=0, num=6)
tuned_params = {'alpha': alpha_range, 'learning_rate_init': lr_range}
# 5-fold cross-validated search over the grid, using all available cores.
clf_nn_cv = GridSearchCV(
    clf_nn, param_grid=tuned_params, cv=5, n_jobs=-1
).fit(X_train_grp, y_train)
best_clf_nn, best_params = clf_nn_cv.best_estimator_, clf_nn_cv.best_params_
print("Best parameters set found on development set:", best_params, sep="\n")

Best parameters set found on development set:
{'alpha': 0.5623413251903491, 'learning_rate_init': 0.1}


In [82]:
# Time a fit of the tuned network on the randomly projected features.
# perf_counter() is a monotonic, high-resolution clock, so it is the right
# tool for intervals as short as the millisecond-scale inference below;
# time.time() can be coarse and may jump if the system clock is adjusted.
t0 = time.perf_counter()
best_clf_nn.fit(X_train_grp, y_train)
t1 = time.perf_counter()
print('Training time: %f seconds' % (t1 - t0))
# Time prediction on the held-out split.
t0 = time.perf_counter()
y_pred = best_clf_nn.predict(X_test_grp)
t1 = time.perf_counter()
test_time = t1 - t0
print('Inference time on test data: %f seconds' % test_time)
best_accuracy = accuracy_score(y_test, y_pred)
print('Best accuracy of neural network is %.2f%%' % (best_accuracy * 100))

Training time: 1.128522 seconds
Inference time on test data: 0.002007 seconds
Best accuracy of neural network is 78.00%


#### 4.2.4 Lasso

In [83]:
# L1-penalised (lasso-style) feature selection: estimator and selector imports
from sklearn.linear_model import Lasso, LogisticRegression
from sklearn.feature_selection import SelectFromModel

In [93]:
# Feature selection with an L1-penalised logistic regression: SelectFromModel
# fits the model and exposes a boolean mask of the features it keeps.
# The liblinear solver shuffles the data internally, so it is seeded to keep
# the selected feature set repeatable across runs.
sel_ = SelectFromModel(
    LogisticRegression(
        C=0.1, penalty='l1', solver='liblinear', random_state=random_seed
    )
)
sel_.fit(X_train, y_train)
Xtrain_df = pd.DataFrame(data=X_train)
Xtest_df = pd.DataFrame(data=X_test)

# Take the selected column labels from the frame built in this cell. The
# previous code indexed `X1_df.columns`, but `X1_df` is not defined anywhere
# in the visible notebook, so the cell raised NameError on a fresh kernel.
selected_feat1 = Xtrain_df.columns[sel_.get_support()]
X_train_lasso = Xtrain_df[selected_feat1]
X_test_lasso = Xtest_df[selected_feat1]

In [95]:
# Baseline MLP (defaults apart from seed and iteration cap) on the selected
# feature subset, scored on the held-out split.
clf_nn = MLPClassifier(random_state=7, max_iter=10000).fit(X_train_lasso, y_train)
y_pred = clf_nn.predict(X_test_lasso)
nn_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(
    'Accuracy of neural network without hyperparameter tuning is %.2f%%'
    % (nn_accuracy * 100)
)

Accuracy of neural network without hyperparameter tuning is 91.88%


In [96]:
# Hyperparameter tuning
# Grid chosen after inspecting validation curves: five log-spaced alpha
# values and six log-spaced initial learning rates.
alpha_range = np.logspace(start=-1, stop=2, num=5)
lr_range = np.logspace(start=-5, stop=0, num=6)
tuned_params = {'alpha': alpha_range, 'learning_rate_init': lr_range}
# 5-fold cross-validated search over the grid, using all available cores.
clf_nn_cv = GridSearchCV(
    clf_nn, param_grid=tuned_params, cv=5, n_jobs=-1
).fit(X_train_lasso, y_train)
best_clf_nn, best_params = clf_nn_cv.best_estimator_, clf_nn_cv.best_params_
print("Best parameters set found on development set:", best_params, sep="\n")

Best parameters set found on development set:
{'alpha': 3.1622776601683795, 'learning_rate_init': 0.001}


In [97]:
# Time a fit of the tuned network on the selected feature subset.
# perf_counter() is a monotonic, high-resolution clock, so it is the right
# tool for intervals as short as the millisecond-scale inference below;
# time.time() can be coarse and may jump if the system clock is adjusted.
t0 = time.perf_counter()
best_clf_nn.fit(X_train_lasso, y_train)
t1 = time.perf_counter()
print('Training time: %f seconds' % (t1 - t0))
# Time prediction on the held-out split.
t0 = time.perf_counter()
y_pred = best_clf_nn.predict(X_test_lasso)
t1 = time.perf_counter()
test_time = t1 - t0
print('Inference time on test data: %f seconds' % test_time)
best_accuracy = accuracy_score(y_test, y_pred)
print('Best accuracy of neural network is %.2f%%' % (best_accuracy * 100))

Training time: 5.159770 seconds
Inference time on test data: 0.004307 seconds
Best accuracy of neural network is 96.75%


In [98]:
# Display the alpha grid left over from the most recent grid-search cell.
alpha_range

array([  0.1       ,   0.56234133,   3.16227766,  17.7827941 ,
       100.        ])

In [99]:
# Display the learning-rate grid left over from the most recent grid-search cell.
lr_range

array([1.e-05, 1.e-04, 1.e-03, 1.e-02, 1.e-01, 1.e+00])