In [1]:
import pandas as pd

# Kaggle input path of the almond measurements CSV.
DATA = '/kaggle/input/almond-types-classification/Almond.csv'

# The first CSV column is used as the row index rather than as a feature.
df = pd.read_csv(DATA, index_col=[0])
df.head()

Unnamed: 0,Length (major axis),Width (minor axis),Thickness (depth),Area,Perimeter,Roundness,Solidity,Compactness,Aspect Ratio,Eccentricity,Extent,Convex hull(convex area),Type
0,,227.940628,127.759132,22619.0,643.813269,,0.973384,1.458265,,,0.681193,23237.5,MAMRA
1,,234.188126,128.199509,23038.0,680.984841,,0.957304,1.601844,,,0.656353,24065.5,MAMRA
2,,229.41861,125.796547,22386.5,646.943212,,0.96727,1.487772,,,0.68362,23144.0,MAMRA
3,,232.763153,125.918808,22578.5,661.227483,,0.965512,1.540979,,,0.68536,23385.0,MAMRA
4,,230.150742,107.253448,19068.0,624.842706,,0.95145,1.629395,,,0.7148,20041.0,MAMRA


Let's see if we can make any headway classifying almonds based on just their size.

In [2]:
# Summarize each column's dtype and non-null count to see which features are complete.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2803 entries, 0 to 2802
Data columns (total 13 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Length (major axis)       1946 non-null   float64
 1   Width (minor axis)        1861 non-null   float64
 2   Thickness (depth)         1799 non-null   float64
 3   Area                      2803 non-null   float64
 4   Perimeter                 2803 non-null   float64
 5   Roundness                 1946 non-null   float64
 6   Solidity                  2803 non-null   float64
 7   Compactness               2803 non-null   float64
 8   Aspect Ratio              1004 non-null   float64
 9   Eccentricity              1004 non-null   float64
 10  Extent                    2803 non-null   float64
 11  Convex hull(convex area)  2803 non-null   float64
 12  Type                      2803 non-null   object 
dtypes: float64(12), object(1)
memory usage: 306.6+ KB


Our data has a lot of nulls. Let's try using just the variables for which we have no nulls.

In [3]:
# Feature columns that have no missing values in the dataset.
COLUMNS = [
    'Area',
    'Perimeter',
    'Solidity',
    'Compactness',
    'Extent',
    'Convex hull(convex area)',
]
# Name of the label column we are trying to predict.
TARGET = 'Type'

Is our target class balanced?

In [4]:
# Share of rows per class; use the TARGET constant so this check follows the
# label column used by every later cell instead of a hard-coded name.
df[TARGET].value_counts(normalize=True).to_dict()

{'SANORA': 0.3364252586514449,
 'MAMRA': 0.3328576525151623,
 'REGULAR': 0.3307170888333928}

Yes. That's good news. Let's visualize our dataset using dimension reduction.

In [5]:
import arrow
from umap import UMAP

time_start = arrow.now()

# Seeded UMAP reducer; every non-default setting is spelled out here.
umap = UMAP(
    n_epochs=500,
    low_memory=False,
    n_jobs=1,
    random_state=2024,
    verbose=True,
)

# NOTE(review): the features are passed in unscaled, so large-magnitude columns
# such as Area may dominate the distances UMAP sees -- consider standardizing.
embedding = umap.fit_transform(X=df[COLUMNS])

# Keep the two embedding coordinates on the frame for plotting.
df[['x', 'y']] = embedding

elapsed = arrow.now() - time_start
print('done with UMAP in {}'.format(elapsed))

2024-08-02 12:29:34.293140: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-02 12:29:34.293310: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-02 12:29:34.431975: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


UMAP(low_memory=False, n_epochs=500, n_jobs=1, random_state=2024, verbose=True)
Fri Aug  2 12:29:45 2024 Construct fuzzy simplicial set
Fri Aug  2 12:29:49 2024 Finding Nearest Neighbors
Fri Aug  2 12:29:54 2024 Finished Nearest Neighbor Search
Fri Aug  2 12:29:57 2024 Construct embedding


Epochs completed:   0%|            0/500 [00:00]

	completed  0  /  500 epochs
	completed  50  /  500 epochs
	completed  100  /  500 epochs
	completed  150  /  500 epochs
	completed  200  /  500 epochs
	completed  250  /  500 epochs
	completed  300  /  500 epochs
	completed  350  /  500 epochs
	completed  400  /  500 epochs
	completed  450  /  500 epochs
Fri Aug  2 12:30:02 2024 Finished embedding
done with UMAP in 0:00:17.693127


In [6]:
import warnings
from plotly import express

# Silence FutureWarning messages from this point on.
warnings.filterwarnings('ignore', category=FutureWarning)

# One facet (and one color) per class so the embeddings can be compared side by side.
express.scatter(df, x='x', y='y', color=TARGET, facet_col=TARGET)

This is not encouraging. From this perspective our three classes look almost identical. Let's build a model.

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Hold out 20% of the rows, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    df[COLUMNS],
    df[TARGET],
    test_size=0.2,
    stratify=df[TARGET],
    random_state=2024,
)

# NOTE(review): features are fit unscaled; the coefficients of columns with very
# different magnitudes are therefore not directly comparable -- confirm intent.
model = LogisticRegression(tol=1e-12, max_iter=100)
model.fit(X_train, y_train)
print('model fit in {} iterations'.format(model.n_iter_[0]))

accuracy = accuracy_score(y_true=y_test, y_pred=model.predict(X=X_test))
print('accuracy: {:5.4f}'.format(accuracy))

model fit in 46 iterations
accuracy: 0.4688


Our model is doing better than guessing, but not a lot better. Let's look at the per-class precision and recall.

In [8]:
from sklearn.metrics import classification_report

# Per-class precision, recall and F1 for the logistic model on the held-out rows.
report = classification_report(
    y_true=y_test,
    y_pred=model.predict(X=X_test),
    zero_division=0,
)
print(report)

              precision    recall  f1-score   support

       MAMRA       0.55      0.42      0.47       187
     REGULAR       0.44      0.65      0.53       185
      SANORA       0.45      0.34      0.39       189

    accuracy                           0.47       561
   macro avg       0.48      0.47      0.46       561
weighted avg       0.48      0.47      0.46       561



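Our model has much higher recall for regular almonds (0.65) than for the other two classes, but also the lowest precision (0.44), so it is over-predicting REGULAR rather than recognizing it more reliably. Is this surprising?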

Let's look at the regression coefficients.

In [9]:
from plotly import express

# The fitted model has one row of coefficients per class, so reshape all of them
# into long form (one row per class/feature pair) instead of plotting only the
# first class's row.
coefficients = (
    pd.DataFrame(data=model.coef_, columns=COLUMNS)
    .assign(almond_type=model.classes_)
    .melt(id_vars='almond_type', var_name='feature', value_name='coefficient')
)

# A grouped bar chart shows one bar per class for every feature; a histogram
# would aggregate the values and label the axis as a sum.
# NOTE(review): the model was fit on unscaled features, so bar heights across
# features of different magnitude are not directly comparable.
express.bar(
    data_frame=coefficients,
    x='feature',
    y='coefficient',
    color='almond_type',
    barmode='group',
)

They don't tell us much. Let's try another model.

In [10]:
from sklearn.ensemble import RandomForestClassifier

# Small, shallow, seeded forest that considers a single feature at each split.
forest = RandomForestClassifier(
    n_estimators=30,
    max_depth=5,
    max_features=1,
    random_state=2024,
)
forest.fit(X=X_train, y=y_train)

# Same per-class report as for the logistic model, on the same held-out rows.
forest_report = classification_report(
    y_true=y_test,
    y_pred=forest.predict(X=X_test),
    zero_division=0,
)
print(forest_report)

              precision    recall  f1-score   support

       MAMRA       0.67      0.64      0.65       187
     REGULAR       0.52      0.58      0.55       185
      SANORA       0.58      0.54      0.56       189

    accuracy                           0.59       561
   macro avg       0.59      0.59      0.59       561
weighted avg       0.59      0.59      0.59       561



A random forest model does substantially better, but still not especially well.