# Model finding

This notebook will work finding the best model to make predictions on our datas 

**Main goals are**: 
- Find or more suitable models
- Tune their parameters to improve their performances
- Evaluate their performance using different metrics

In [7]:
# Retreiving a significative amount of data
from local_lib.dataset import retreiveDatasetFromCsv

dataset = retreiveDatasetFromCsv(50)

In [2]:
# Retreiving a significative amount of data
from local_lib.dataset_images import retreiveDatasetFromImages

dataset = retreiveDatasetFromImages(30)

In [3]:
dataset.iloc[0]["pixel_value"]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [4]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(dataset["pixel_value"], dataset["class"], test_size=0.33, random_state=0)

In [5]:
import numpy as np
np.shape(X_train)


(20,)

In [6]:
X_train_array = np.array(X_train.values.reshape(-1, 1))
np.shape(X_train_array)
X_train_array[0]

array([array([141, 136, 131, ...,   2,   0,   1], dtype=int64)],
      dtype=object)

In [7]:
from local_lib.transformer import MaskSeuilTransformer
from sklearn.pipeline import Pipeline

pipeline = Pipeline([
    ('image_mask', MaskSeuilTransformer(strategy='combined')),
])


# Fit and transform data
X_train_transformed = pipeline.fit_transform(X_train)

In [19]:
import numpy as np

# Convert X_train and y_train to numpy arrays
X_train_array = np.array(X_train.values.reshape(-1, 1))  # Reshape if needed
y_train_array = np.array(list(y_train.values))

# Convert X_test and y_test to numpy arrays
X_test_array = np.array(X_test.values.reshape(-1, 1)) # Reshape if needed
y_test_array = np.array(list(y_test.values))


In [20]:
X_train_array

array([[array([141, 136, 131, ...,   2,   0,   1], dtype=int64)],
       [array([0, 0, 0, ..., 0, 0, 0], dtype=int64)],
       [array([0, 0, 0, ..., 0, 0, 0], dtype=int64)],
       [array([ 1, 45, 54, ...,  0,  0,  0], dtype=int64)],
       [array([62, 62, 62, ..., 26, 26, 26], dtype=int64)],
       [array([156, 157, 158, ...,  85,  85,  85], dtype=int64)],
       [array([36, 37, 38, ..., 15, 16, 17], dtype=int64)],
       [array([44, 44, 44, ...,  0,  0,  0], dtype=int64)],
       [array([0, 0, 0, ..., 0, 0, 0], dtype=int64)],
       [array([5, 5, 5, ..., 0, 0, 0], dtype=int64)],
       [array([32, 31, 29, ...,  0,  0,  0], dtype=int64)],
       [array([5, 4, 3, ..., 2, 2, 2], dtype=int64)],
       [array([44, 45, 45, ..., 32, 33, 34], dtype=int64)],
       [array([ 94,  99, 107, ...,   0,   0,   0], dtype=int64)],
       [array([112, 111, 109, ...,  66,  66,  67], dtype=int64)],
       [array([ 3,  3,  3, ..., 13, 13, 13], dtype=int64)],
       [array([0, 0, 0, ..., 0, 0, 0], dtype=i

In [9]:
X_train_transformed

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [16]:
# First of all we will use a DummyClassifier to get an idea of what a uncorrelated model looks like
from sklearn.dummy import DummyClassifier

dummy_clf = DummyClassifier(strategy="stratified")
dummy_clf.fit(X_train_array, y_train_array)
ac = dummy_clf.score(X_test_array, y_test_array)
print("Using randomly generated answer we get " + str(ac) + "% Accuracy")

dummy_clf = DummyClassifier(strategy="most_frequent")
dummy_clf.fit(X_train_array, y_train_array)
ac = round(dummy_clf.score(X_test_array, y_test_array) * 100, 2)
print("Using only most frequent number as answer we get " + str(ac) + "% Accuracy")

dummy_clf = DummyClassifier(strategy="uniform")
dummy_clf.fit(X_train_array, y_train_array)
ac = round(dummy_clf.score(X_test_array, y_test_array) * 100, 2)
print("Using uniform distribution answer we get " + str(ac) + "% Accuracy")

Using randomly generated answer we get 0.3% Accuracy
Using only most frequent number as answer we get 30.0% Accuracy
Using uniform distribution answer we get 40.0% Accuracy


In [23]:
from sklearn.neighbors import KNeighborsClassifier
from local_lib.transformer import MaskSeuilTransformer
from sklearn.pipeline import Pipeline

pipeline = Pipeline([
    ('image_mask', MaskSeuilTransformer(strategy='combined')),
    ('knn', KNeighborsClassifier())
])

# knn_clf = KNeighborsClassifier()
pipeline.fit(dataset["pixel_value"], dataset["class"])
ac = round(pipeline.score(X_test_array, y_test_array) * 100, 2)
print("Base knn accuracy is around " + str(ac) + "% Accuracy")

ValueError: Unknown label type: unknown. Maybe you are trying to fit a classifier, which expects discrete classes on a regression target with continuous values.

In [68]:
print("X_train_array:", type(X_train_array), X_train_array.shape)
print("y_train_array:", type(y_train_array), y_train_array.shape)
print("X_test_array:", type(X_test_array), X_test_array.shape)
print("y_test_array:", type(y_test_array), y_test_array.shape)

X_train_array: <class 'numpy.ndarray'> (670, 1)
y_train_array: <class 'numpy.ndarray'> (670,)
X_test_array: <class 'numpy.ndarray'> (330, 1)
y_test_array: <class 'numpy.ndarray'> (330,)


In [69]:
print("Unique values in y_train_array:", np.unique(y_train_array))
print("Unique values in y_test_array:", np.unique(y_test_array))

Unique values in y_train_array: [0 1 2]
Unique values in y_test_array: [0 1 2]
