In [1]:
# import basic libraries
import pandas as pd
import numpy as np

In [2]:
# set labels
feature_names = ["battery_power","blue","clock_speed","dual_sim","fc","four_g","int_memory","m_dep","mobile_wt","n_cores","pc","px_height","px_width","ram","sc_h","sc_w","talk_time","three_g","touch_screen","wifi","price_range"]
class_names = ["very cheap", # label 0
               "cheap", # label 1 
               "expensive", # label 2
               "very expensive"]

### load training data

In [3]:
# load from the provided csv-file ( source: https://www.kaggle.com/iabhishekofficial/mobile-price-classification )
trainingsdata = pd.read_csv("../data/mobile-price-classification/train.csv")

In [4]:
# trainings label are provided by column price_range
labels = trainingsdata["price_range"]

In [5]:
# drop coulmn of the trainings label to get only the trainings data 
trainingsdata = trainingsdata.drop("price_range", axis=1)
trainingsdata

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,pc,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi
0,842,0,2.2,0,1,0,7,0.6,188,2,2,20,756,2549,9,7,19,0,0,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,6,905,1988,2631,17,3,7,1,1,0
2,563,1,0.5,1,2,1,41,0.9,145,5,6,1263,1716,2603,11,2,9,1,1,0
3,615,1,2.5,0,0,0,10,0.8,131,6,9,1216,1786,2769,16,8,11,1,0,0
4,1821,1,1.2,0,13,1,44,0.6,141,2,14,1208,1212,1411,8,2,15,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,794,1,0.5,1,0,1,2,0.8,106,6,14,1222,1890,668,13,4,19,1,1,0
1996,1965,1,2.6,1,0,0,39,0.2,187,4,3,915,1965,2032,11,10,16,1,1,1
1997,1911,0,0.9,1,1,1,36,0.7,108,8,3,868,1632,3057,9,1,5,1,1,0
1998,1512,0,0.9,0,4,1,46,0.1,145,5,5,336,670,869,18,10,19,1,1,1


### load test data

In [6]:
# load additional test data, here no labels for a check are provided
testdata = pd.read_csv("../data/mobile-price-classification/test.csv")
testdata.drop("id", axis=1)

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,pc,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi
0,1043,1,1.8,1,14,0,5,0.1,193,3,16,226,1412,3476,12,7,2,0,1,0
1,841,1,0.5,1,4,1,61,0.8,191,5,12,746,857,3895,6,0,7,1,0,0
2,1807,1,2.8,0,1,0,27,0.9,186,3,4,1270,1366,2396,17,10,10,0,1,1
3,1546,0,0.5,1,18,1,25,0.5,96,8,20,295,1752,3893,10,0,7,1,1,0
4,1434,0,1.4,0,11,1,49,0.5,108,6,18,749,810,1773,15,8,7,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,1700,1,1.9,0,0,1,54,0.5,170,7,17,644,913,2121,14,8,15,1,1,0
996,609,0,1.8,1,0,0,13,0.9,186,4,2,1152,1632,1933,8,1,19,0,1,1
997,1185,0,1.4,0,1,1,8,0.5,80,1,12,477,825,1223,5,0,14,1,0,0
998,1533,1,0.5,1,0,0,50,0.4,171,2,12,38,832,2509,15,11,6,0,1,0


### Train net

In [7]:
# get the range for the labels
print("min:", min(labels),"max:", max(labels))

min: 0 max: 3


In [8]:
# create a target vector ( hot encoding ) to train a neural net
def target_vector(i):
    ret = [0 for i in range(3+1)]
    ret[i] = 1
    return ret

In [9]:
# transform the labels into a hot encoded vector
X_enc = trainingsdata
y_enc = []
for k in labels:
    y_enc.append((target_vector(k)))

y_enc = np.array(y_enc)

## Train a blackbox classification System

In [10]:
#import the base package and the pipeline model
import sklearn
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline

# decomposition for preprocess the data
from sklearn.decomposition import PCA

# simple regression models 
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression

# classifier models 
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

In [11]:
# prepare the data for training by splitting up the labeled data into two sets. the first to train the model (67 % of all)
# and 33% to validate the efficiency of the model

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(trainingsdata, labels, test_size=0.33, random_state=101)

# to handle the data more easily the index will be reassigned
X_train.index = [i for i in range(1340)]

In [15]:
#first analyse the data with a linear regression
lm = LinearRegression()
lm.fit(X_train,y_train)
print("linear regression score for the trainingdata:", lm.score(X_train,y_train))


linear regression score for the trainingdata: 0.9202456563636244


In [14]:
print("linear regression score for the testdata:", lm.score(X_test,y_test))

linear regression score for the testdata: 0.9132801488185275


In [16]:
# analyse the data with an appropriate model for instance a random forest classifier
model = Pipeline
pca = PCA() # preproccess the data
rf = RandomForestClassifier(n_estimators=100, n_jobs=-1)
model = Pipeline([('pca', pca), ('rf', rf)])
model.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('pca',
                 PCA(copy=True, iterated_power='auto', n_components=None,
                     random_state=None, svd_solver='auto', tol=0.0,
                     whiten=False)),
                ('rf',
                 RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                        class_weight=None, criterion='gini',
                                        max_depth=None, max_features='auto',
                                        max_leaf_nodes=None, max_samples=None,
                                        min_impurity_decrease=0.0,
                                        min_impurity_split=None,
                                        min_samples_leaf=1, min_samples_split=2,
                                        min_weight_fraction_leaf=0.0,
                                        n_estimators=100, n_jobs=-1,
                                        oob_score=False, random_state=None,
                    

In [17]:
# get an prediction of the model for all data
pred = model.predict(trainingsdata)
print("Overall score for modell: ", sklearn.metrics.f1_score(labels, pred, average='weighted'))

0.9527174332736281


In [20]:
# get an prediction of the model for all data
pred = model.predict(X_train)
print("Score for trainingsdata: ", sklearn.metrics.f1_score(y_train, pred, average='weighted'))

Score for trainingsdata:  1.0


In [21]:
# get an prediction of the model for all data
pred = model.predict(X_test)
print("Score for testdata: ", sklearn.metrics.f1_score(y_test, pred, average='weighted'))

Score for testdata:  0.8558857868488424


In [27]:
# prediction influence which 
np.set_printoptions(threshold=30)
print(model.predict_proba(X_train).round(3))

[[0.93 0.07 0.   0.  ]
 [0.93 0.06 0.01 0.  ]
 [0.   0.01 0.05 0.94]
 ...
 [0.88 0.1  0.02 0.  ]
 [0.02 0.   0.77 0.21]
 [0.89 0.05 0.02 0.04]]


## Lime Blackbox explainer

In [29]:
# import the LimeTabular Explainer to explain which properties leads to the classfication
from interpret.blackbox import LimeTabular
from interpret import show

# provide the prediction funciotn (model.predict_proba) and the trainingsdata to the explainer
lime = LimeTabular(predict_fn=model.predict_proba, data=X_train, random_state=1)


#create the explanation in a graph for the first 30 instances of the test dataset 
lime_local = lime.explain_local(X_test[:30], y_test[:30], name='LIME')

show(lime_local)


## Conclusion

As expected for this simple classification model the classification for the mobile prices can be done by respecting only few properties as for example the RAM.
For a better classficiation the model should be adapted.