In [None]:
# Mounting Google Drive (currently disabled).
# NOTE(review): the read_csv call in this notebook loads from '/content/drive/My Drive/...',
# so on Colab these lines presumably need to be uncommented and run first - confirm.
# from google.colab import drive
# drive.mount('/content/drive')

Goal:

**Phase 1: Strategic Understanding**

The mobile price classification dataset contains the attributes needed to predict a phone's price range from its features. The mobile phone industry can use this classification model to make data-driven decisions, improve pricing accuracy, plan marketing campaigns, and manage inventory efficiently. The objective of the model is to categorize mobile phones into low-cost, medium-cost, high-cost, and very-high-cost categories based on features such as RAM, battery power, and pixel height. Strategic pricing is required to cater to diverse customers in the mobile industry. Through data analysis, companies can determine which phone characteristics contribute to each price segment. Businesses can then optimize their pricing strategy, make informed decisions on discounts and promotions, and plan product bundles. This dataset also reveals rising demand for storage space, battery life, and advanced cameras. Insights into feature-based mobile price classification will help businesses understand customer purchasing motivation. The model will also help customers make informed decisions about phone purchases.

**Phase 2: Data Collection and Data Insights**

The data is sourced from Kaggle in a csv file.

In [None]:
# Load the Kaggle training CSV from the Google Drive folder
import pandas as pd

TRAIN_CSV_PATH = '/content/drive/My Drive/Colab_Notebooks/Mobile_Classification/train.csv'
raw_data = pd.read_csv(TRAIN_CSV_PATH)

# Preview the first rows of the loaded frame
raw_data.head()

In [None]:
# Frequency of each value in the 'fc' column.
# NOTE: this relies on the original column name; once the rename cell has run,
# the column is called 'frnt_cam' and this lookup raises a KeyError.
print(raw_data['fc'].value_counts())

In [None]:
# Dimensions of the raw dataset as a (rows, columns) tuple
raw_data.shape

In [None]:
# Per-column summary of the raw dataset (non-null counts and dtypes)
raw_data.info()

In [None]:
# Give the abbreviated columns more descriptive names so the data description is easier to follow
column_aliases = {
    'blue': 'bluetooth',
    'fc': 'frnt_cam',
    'm_dep': 'moble_depth',
    'pc': 'prmry_cam',
    'sc_h': 'scrn_height',
    'sc_w': 'scrn_width',
}
raw_data = raw_data.rename(columns=column_aliases)

In [None]:
# Distinct values present in the target column
raw_data['price_range'].unique()

Price Range is the target variable and is of int data type. For better model explainability, it is converted to categorical labels: low - 0, medium - 1, high - 2, and very high - 3.

In [None]:
# Translate the numeric price_range codes into descriptive cost labels
price_map = {
    0: 'Low Cost',
    1: 'Medium Cost',
    2: 'High Cost',
    3: 'Very High Cost',
}

# Build a labelled copy so raw_data keeps its numeric target column
raw_data_converted = raw_data.assign(
    price_range=raw_data['price_range'].map(price_map)
)
raw_data_converted['price_range'].info()

In [None]:
# Preview the first four rows of the frame with the labelled price_range
raw_data_converted.head(4)

**Data Cleaning**

**1. There are no missing values in the dataset**

In [None]:
# Flag each column that contains at least one missing value, then print only the flagged columns
columns_with_missing = raw_data_converted.isna().any()
print(columns_with_missing[columns_with_missing])

**2. There are no duplicate values in the dataset**

In [None]:
# Number of rows flagged by DataFrame.duplicated() (repeats of an earlier row)
raw_data_converted.duplicated().sum()

**Class Imbalance**

**Target Class is balanced** - The price range attribute has equally distributed number of records.

In [None]:
# Number of records per price_range label, used to check class balance
raw_data_converted['price_range'].value_counts()

**Phase 3: Determining and Depicting Outliers:**

In [None]:
# Summary statistics of the dataset, used to spot implausible values
raw_data_converted.describe()

From the data we see that some records have no primary camera (0 megapixels) but still have a pixel resolution height and width. This can be treated as a manual entry error, and hence those records are deleted.

Some records don't have a pixel resolution height but still have a pixel resolution width, which is impractical; hence these records can be deleted.

In [None]:
#There are 101 records with 0 primary camera
# Select only the prmry_cam values of the rows where prmry_cam is 0 and count them
zero_primary_cam = raw_data_converted['prmry_cam'] == 0
raw_data_converted.loc[zero_primary_cam, 'prmry_cam'].value_counts()

In [None]:
# Count records with no primary camera (prmry_cam == 0) that still report a non-zero pixel height.
# Summing the boolean mask counts the matching rows directly; DataFrame.value_counts().sum()
# would group every full row first and leave out rows that contain a missing value.
zero_cam_with_px_height = (raw_data_converted['prmry_cam'] == 0) & (raw_data_converted['px_height'] != 0)
zero_cam_with_px_height.sum()

In [None]:
# Count records with no primary camera (prmry_cam == 0) that still report a non-zero pixel width.
# Summing the boolean mask counts the matching rows directly; DataFrame.value_counts().sum()
# would group every full row first and leave out rows that contain a missing value.
zero_cam_with_px_width = (raw_data_converted['prmry_cam'] == 0) & (raw_data_converted['px_width'] != 0)
zero_cam_with_px_width.sum()

In [None]:
# Violin plot: distribution of pixel resolution height for each primary-camera megapixel value
import seaborn as sns
import matplotlib.pyplot as plt

fig, ax = plt.subplots()
sns.violinplot(
    data=raw_data_converted,
    x='prmry_cam',
    y='px_height',
    hue='prmry_cam',
    palette='viridis',
    legend=False,
    ax=ax,
)
ax.set_xlabel('Primary camera mega pixel', fontweight='bold')
ax.set_ylabel('Pixel resolution height', fontweight='bold')
ax.set_title('Distribution of Pixel Height by Camera Megapixel', fontweight='bold')
plt.show()

In [None]:
# Violin plot: distribution of pixel resolution width for each primary-camera megapixel value
fig, ax = plt.subplots()
sns.violinplot(
    data=raw_data_converted,
    x='prmry_cam',
    y='px_width',
    hue='prmry_cam',
    palette='viridis',
    legend=False,
    ax=ax,
)
ax.set_xlabel('Primary camera mega pixel', fontweight='bold')
ax.set_ylabel('Pixel resolution width', fontweight='bold')
ax.set_title('Distribution of Pixel Width by Camera Megapixel', fontweight='bold')
plt.show()

In [None]:
# Keep only the records whose primary camera value is not 0
has_primary_cam = raw_data_converted['prmry_cam'] != 0
clean_data = raw_data_converted[has_primary_cam]
clean_data.shape

In [None]:
# Bar plot of screen height against screen width, to see how the two dimensions relate
fig, ax = plt.subplots()
sns.barplot(
    data=clean_data,
    x='scrn_width',
    y='scrn_height',
    hue='scrn_width',
    palette='viridis',
    legend=False,
    ax=ax,
)
ax.set_xlabel('Screen Width', fontweight='bold')
ax.set_ylabel('Screen Height', fontweight='bold')
ax.set_title('Distribution of screen width and height', fontweight='bold')
plt.show()

We have records with a 0 cm screen width that still have a screen height, which seems like a manual error; hence these records can be deleted.

In [None]:
# Count records with a screen width of 0 that still report a non-zero screen height.
# Summing the boolean mask counts the matching rows directly; DataFrame.value_counts().sum()
# would group every full row first and leave out rows that contain a missing value.
zero_width_with_height = (clean_data['scrn_width'] == 0) & (clean_data['scrn_height'] != 0)
zero_width_with_height.sum()

Dropping 167 records with 0 screen width but still having screen height.

In [None]:
# Keep a record when its screen width is non-zero or its screen height is zero,
# i.e. drop the rows that have a zero width together with a non-zero height
plausible_screen = (clean_data['scrn_width'] != 0) | (clean_data['scrn_height'] == 0)
clean_data = clean_data[plausible_screen]

In [None]:
# Dimensions of clean_data after the filtering applied so far
clean_data.shape

**Exploratory Data Analysis**

**Correlation Heatmap**

In [None]:
# Pairwise correlations computed on raw_data, where price_range is still numeric.
# NOTE: this is the unfiltered frame, not clean_data, so the records dropped during
# cleaning are still included in these correlations.
correlation_matrix = raw_data.corr()
# plt.figure(figsize=(15,10))
# sns.heatmap(correlation_matrix, annot =True, cmap = 'Purples',linewidths = 0.2)
# plt.title('Correlation Heatmap', fontweight = 'bold')
# plt.show()
#Removed plot image from here and added in read.me file as the image was huge in size.

A correlation heatmap makes it easy to see the patterns and relationships in the dataset. The heatmap shows that the feature pairs below are positively correlated; for example, as ram increases, price_range increases.

1. ram and the target variable price_range.

2. three_g and four_g

3. prmry_cam and frnt_cam

4. px_width and px_height

**Phase 4: Model Design**

In [None]:
# Separate the features from the target, then hold out 10% of the records for testing
from sklearn.model_selection import train_test_split

y = clean_data['price_range']
X = clean_data.drop(columns='price_range')

# 90% train / 10% test, with a fixed seed so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=1
)

In [None]:
#1. Building Support Vector Machines
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# Fit an SVC with default settings on the training split and predict the test split
svc_clf = SVC(random_state = 1)
svc_clf.fit(X_train, y_train)
y_predict = svc_clf.predict(X_test)

# Confusion matrix with rows = actual class and columns = predicted class,
# both in the order given by class_order
class_order = ['Medium Cost', 'High Cost', 'Very High Cost', 'Low Cost']
cm_values = confusion_matrix(y_test, y_predict, labels=class_order)
cm_dct = pd.DataFrame(
    cm_values,
    index=['actual Medium', 'actual High', 'actual Very High', 'actual Low'],
    columns=['predicted medium', 'predicted high', 'predicted very high', 'predicted low'],
)
print('SVM Confusion Matrix :\n', cm_dct)

# Accuracy, followed by the class-weighted precision, recall and F1 score
print('Accuracy :\n', accuracy_score(y_test, y_predict))
weighted_metrics = (
    ('Precision : \n', precision_score),
    ('Recall : \n', recall_score),
    ('F1 Score : \n', f1_score),
)
for heading, metric_fn in weighted_metrics:
    print(heading, metric_fn(y_test, y_predict, average='weighted'))


In [None]:
#2. Logistic Regression
from sklearn.linear_model import LogisticRegression

#splitting the data into 90% train and 10% test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state = 1)

# Fit on the training split only, so the test split stays unseen and the
# metrics below measure generalisation rather than memorisation
lr_clf = LogisticRegression(random_state = 1)
lr_clf.fit(X_train, y_train)
y_predict = lr_clf.predict(X_test)

#Analysing models performance via confusion matrix, accuracy, precision, recall, and F1 score
# Rows are the actual classes and columns the predicted classes, in the order given by `labels`
cm_dct = pd.DataFrame(confusion_matrix(y_test, y_predict, labels =['Medium Cost','High Cost', 'Very High Cost', 'Low Cost']),
                          index = ['actual Medium', 'actual High', 'actual Very High','actual Low'],
                          columns = ['predicted medium','predicted high','predicted very high','predicted low'])
print('Logistic Regression Confusion Matrix :\n',cm_dct )
print('Accuracy :\n', accuracy_score(y_test, y_predict))
# Precision, recall and F1 are averaged across classes, weighted by class support
print('Precision : \n', precision_score(y_test, y_predict, average = 'weighted'))
print('Recall : \n', recall_score(y_test, y_predict,average = 'weighted'))
print('F1 Score : \n', f1_score(y_test, y_predict, average = 'weighted'))

In [None]:
#3. Decision Tree
from sklearn.tree import DecisionTreeClassifier

# Train a decision tree with a fixed seed on the training split, then predict the test split
dtc_clf = DecisionTreeClassifier(random_state = 1)
dtc_clf.fit(X_train, y_train)
y_predict = dtc_clf.predict(X_test)

# Confusion matrix laid out as actual (rows) versus predicted (columns)
cost_labels = ['Medium Cost', 'High Cost', 'Very High Cost', 'Low Cost']
actual_names = ['actual Medium', 'actual High', 'actual Very High', 'actual Low']
predicted_names = ['predicted medium', 'predicted high', 'predicted very high', 'predicted low']
cm_dct = pd.DataFrame(
    confusion_matrix(y_test, y_predict, labels=cost_labels),
    index=actual_names,
    columns=predicted_names,
)
print('Decision Tree Classifier Confusion Matrix :\n', cm_dct)

# Overall accuracy plus class-weighted precision, recall and F1 score
print('Accuracy :\n', accuracy_score(y_test, y_predict))
weighted = {'average': 'weighted'}
print('Precision : \n', precision_score(y_test, y_predict, **weighted))
print('Recall : \n', recall_score(y_test, y_predict, **weighted))
print('F1 Score : \n', f1_score(y_test, y_predict, **weighted))

In [None]:
#decision tree visualization (kept disabled)
# NOTE: class_names must follow the sorted order of dtc_clf.classes_
# ('High Cost', 'Low Cost', 'Medium Cost', 'Very High Cost'), hence ['H', 'L', 'M', 'VH'];
# any other order mislabels the nodes of the plotted tree.
# from sklearn.tree import plot_tree
# plt.figure(figsize=(30, 20))
# plot_tree(dtc_clf, filled=True, feature_names = X.columns, class_names=['H', 'L', 'M', 'VH'], rounded = True, fontsize=14, max_depth=2)
# plt.show()

In [None]:
# Graphviz rendering of the fitted decision tree (kept disabled).
# NOTE: class_names must follow the sorted order of dtc_clf.classes_
# ('High Cost', 'Low Cost', 'Medium Cost', 'Very High Cost'), hence ['H', 'L', 'M', 'VH'];
# any other order mislabels the nodes of the exported tree.
# from sklearn.tree import export_graphviz
# import graphviz

# # Export tree as dot format
# dot_data = export_graphviz(dtc_clf, out_file=None,
#                            feature_names=X.columns,
#                            class_names=['H', 'L', 'M', 'VH'],
#                            filled=True, rounded=True,
#                            special_characters=True)

# # Visualize using graphviz
# graph = graphviz.Source(dot_data)
# #graph.render("decision_tree")  # saves as PDF
# graph

The interpretability of decision trees makes it clear that instances with greater RAM, battery power, and pixel resolution height and width are pushed into the High or Very High Cost range.

**Phase 5: Model Analysis, Model Refinement, Model Review, and Model Performance**

In [None]:
#1. SHAP analysis to get feature importance -  Support Vector Machines
import shap

# Set up the JavaScript needed for SHAP visualisations in the notebook
shap.initjs()

# Summarise the training split with shap.kmeans (10 requested) to serve as the explainer's background data
background_summary = shap.kmeans(X_train, 10)

# KernelExplainer is model-agnostic; it explains the SVC through its decision_function scores
explainer = shap.KernelExplainer(svc_clf.decision_function, background_summary)
shap_values = explainer.shap_values(X_test)

# Bar-style summary plot showing global feature importance
shap.summary_plot(shap_values, X_test, plot_type="bar")

From the SHAP analysis it is evident that the most important features contributing to the prediction of price range are ram, battery_power, px_height, and px_width.

In [None]:
# Restrict the feature matrix to the four features highlighted by the SHAP analysis
important_features = ['battery_power', 'ram', 'px_height', 'px_width']
X = clean_data[important_features]
X.shape

In [None]:
#Re-building the model considering only the important features from SHAP analysis
from sklearn.svm import SVC

# Features limited to the four most important ones; the target is unchanged
X = clean_data[['battery_power', 'ram', 'px_height', 'px_width']]
y = clean_data['price_range']

# Same 90% train / 10% test split and seed as the earlier models
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size =0.1,random_state =1)

svc_model_1 = SVC(random_state = 1)
svc_model_1.fit(X_train,y_train)
y_predict_svc = svc_model_1.predict(X_test)

#analysing the models performance using accuracy, precision, recall, and F1-score
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# confusion_matrix expects (y_true, y_pred): passing the true labels first makes the rows
# the actual classes and the columns the predicted classes, matching the index/columns below
cm_svc = pd.DataFrame(confusion_matrix(y_test, y_predict_svc, labels= ['Medium Cost', 'High Cost', 'Very High Cost', 'Low Cost']),
                      index = ['actual Medium', 'actual High', 'actual Very High', 'actual Low'],
                      columns = ['predicted Medium', 'predicted High', 'predicted Very High', 'predicted Low'])
print('Confusion Matrix of Support Vector Machine Model 1:\n', cm_svc)
print('Accuracy Score :\n', accuracy_score(y_test, y_predict_svc))
# Precision, recall and F1 are averaged across classes, weighted by class support
print('Precision Score :\n', precision_score(y_test, y_predict_svc, average='weighted'))
print('Recall Score :\n', recall_score(y_test, y_predict_svc, average='weighted'))
print('F1 Score :\n', f1_score(y_test,y_predict_svc, average = 'weighted'))

Since there is no difference in the model's performance between using all the features and using only the most important features, we can use model 1 for further performance tuning.

In [None]:
#2. Hyper parameter tuning using GridSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold

#Lower C = stronger regularization (simpler model, more tolerance for misclassified training points)
#Higher C = weaker regularization (misclassified training points are penalized more heavily; can lead to overfitting)
#gamma controls how far the influence of a single training example reaches:
#Low gamma = far influence (smooth decision boundary)
#High gamma = close influence (can lead to overfitting)


# param_grid = {#regularization parameter
#     'C': [0.01, 0.1, 1, 10, 100], 'gamma': ['scale','auto'],'kernel' : ['rbf', 'linear']}

# #verbose=1 shows basic progress messages while training.
# grid = GridSearchCV(SVC(random_state = 1),param_grid,cv=StratifiedKFold(n_splits=3), scoring = 'accuracy', verbose = 1, n_jobs=-1)
# grid.fit(X_train, y_train)

# # Best parameters and score
# print("Best parameters:", grid.best_params_)
# print("Best cross-validation score:", grid.best_score_)

# # Predict on test data
# y_pred_test = grid.predict(X_test)