In [1]:
!pip install --quiet umap-learn

In [2]:
import pandas as pd

# Name of the label column, followed by the Kaggle input locations of the two CSV files.
TARGET = 'price_range'

TRAIN = '/kaggle/input/mobile-price-classification/train.csv'
TEST = '/kaggle/input/mobile-price-classification/test.csv'

train_df, test_df = (pd.read_csv(csv_path) for csv_path in (TRAIN, TEST))

# Every column of the training frame except the label is treated as a feature.
COLUMNS = train_df.columns[train_df.columns != TARGET].tolist()
train_df.head()

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,1,0,7,0.6,188,2,...,20,756,2549,9,7,19,0,0,1,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,...,905,1988,2631,17,3,7,1,1,0,2
2,563,1,0.5,1,2,1,41,0.9,145,5,...,1263,1716,2603,11,2,9,1,1,0,2
3,615,1,2.5,0,0,0,10,0.8,131,6,...,1216,1786,2769,16,8,11,1,0,0,2
4,1821,1,1.2,0,13,1,44,0.6,141,2,...,1208,1212,1411,8,2,15,1,1,0,1


In [3]:
# Number of training rows per target class, shown as a plain {class: count} dict.
class_counts = train_df[TARGET].value_counts()
class_counts.to_dict()

{1: 500, 2: 500, 3: 500, 0: 500}

Our target classes are balanced.

In [4]:
import arrow
from umap import UMAP

# Fit UMAP on the feature columns and keep the two resulting coordinates
# on train_df as 'x' and 'y' so they can be plotted later.
time_start = arrow.now()
reducer = UMAP(
    n_epochs=201,
    low_memory=False,
    n_jobs=1,
    random_state=2024,
    verbose=False,
)
embedding = reducer.fit_transform(X=train_df[COLUMNS])
train_df[['x', 'y']] = embedding
elapsed = arrow.now() - time_start
print(f'done with UMAP in {elapsed}')

done with UMAP in 0:00:11.492391


In [5]:
from plotly import express
# Scatter plot of the UMAP coordinates, coloured by the target column.
express.scatter(
    data_frame=train_df,
    x='x',
    y='y',
    color=TARGET,
    height=600,
)

Honestly, this projection looks like a situation where a support vector classifier might work well, but let's start with logistic regression.

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split


# Hold out a stratified 20% of the labelled rows; the same split is reused by the later cells.
X_train, X_test, y_train, y_test = train_test_split(train_df[COLUMNS], train_df[TARGET], test_size=0.2, random_state=2024, stratify=train_df[TARGET])

# NOTE(review): the recorded run of this cell ends with an lbfgs "failed to converge"
# warning, so the scores below come from a model that had not converged. The warning
# itself suggests scaling the data or raising max_iter; the model is left unchanged
# here so that `logreg` stays a plain LogisticRegression for the cells that follow.
logreg = LogisticRegression(max_iter=100000, tol=1e-12).fit(X_train, y_train)
print('model fit in {} iterations'.format(logreg.n_iter_[0]))

# Predict on the hold-out set once and reuse the labels for every metric.
logreg_pred = logreg.predict(X=X_test)
print('accuracy: {:5.4f} f1: {:5.4f}'.format(accuracy_score(y_true=y_test, y_pred=logreg_pred), f1_score(average='weighted', y_true=y_test, y_pred=logreg_pred, zero_division=0)))
print(classification_report(y_true=y_test, y_pred=logreg_pred, zero_division=0))

model fit in 13593 iterations
accuracy: 0.7950 f1: 0.7970
              precision    recall  f1-score   support

           0       0.94      0.88      0.91       100
           1       0.74      0.76      0.75       100
           2       0.67      0.71      0.69       100
           3       0.86      0.83      0.84       100

    accuracy                           0.80       400
   macro avg       0.80      0.80      0.80       400
weighted avg       0.80      0.80      0.80       400




lbfgs failed to converge (status=1):
STOP: TOTAL NO. of f AND g EVALUATIONS EXCEEDS LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



Let's try using a support vector classifier.

In [7]:
from sklearn.svm import SVC

# Linear-kernel support vector classifier fitted on the same training split.
svc = SVC(kernel='linear', C=0.025, random_state=2024).fit(X=X_train, y=y_train)

# Predict on the hold-out set once and reuse the labels for every metric.
svc_pred = svc.predict(X=X_test)
print('accuracy: {:5.4f} f1: {:5.4f}'.format(accuracy_score(y_true=y_test, y_pred=svc_pred), f1_score(average='weighted', y_true=y_test, y_pred=svc_pred, zero_division=0)))
print(classification_report(y_true=y_test, y_pred=svc_pred, zero_division=0))


accuracy: 0.9850 f1: 0.9850
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       100
           1       0.98      0.99      0.99       100
           2       0.99      0.97      0.98       100
           3       0.98      0.99      0.99       100

    accuracy                           0.98       400
   macro avg       0.99      0.99      0.98       400
weighted avg       0.99      0.98      0.98       400



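As expected from our analysis of the scatter plot above, we do much better with SVC. Note, though, that the logistic regression above ended with a convergence warning on the unscaled features, so its score may understate what a converged model could achieve.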

In [8]:
# Per-row diagnostics for the logistic regression on the hold-out set:
# highest class probability, true and predicted labels, whether they agree,
# and the row's position in the UMAP embedding.
test_proba = logreg.predict_proba(X=X_test)
probability_df = pd.DataFrame({
    'probability': test_proba.max(axis=1),
    'true': y_test.tolist(),
    'pred': logreg.predict(X=X_test),
})
probability_df['correct'] = probability_df['true'].eq(probability_df['pred'])
probability_df[['x', 'y']] = reducer.transform(X=X_test)

probability_df.head()

Unnamed: 0,probability,true,pred,correct,x,y
0,0.646253,2,3,False,0.645742,2.411383
1,0.983913,0,0,True,9.474147,2.908605
2,0.703979,1,1,True,6.251802,4.212541
3,0.669653,2,3,False,1.334082,4.627661
4,0.715534,2,2,True,-0.321622,5.363823
