<a href="https://colab.research.google.com/github/mdzikrim/Hands-on_DL/blob/main/Chapter_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

##MNIST Classifier

In [2]:
import numpy as np
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score


In [3]:
mnist = fetch_openml('mnist_784', version=1, as_frame=False)
X, y = mnist.data, mnist.target.astype(np.uint8)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=10000, stratify=y, random_state=42)


In [4]:
knn_pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("knn", KNeighborsClassifier())
])

param_grid = {
    "knn__n_neighbors": [3, 4, 5],
    "knn__weights": ["uniform", "distance"]
}

grid_search = GridSearchCV(knn_pipeline, param_grid, cv=3, scoring="accuracy", verbose=2, n_jobs=-1)
grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
print("Best Params:", grid_search.best_params_)
print("Test Accuracy:", accuracy_score(y_test, y_pred))


Fitting 3 folds for each of 6 candidates, totalling 18 fits
Best Params: {'knn__n_neighbors': 4, 'knn__weights': 'distance'}
Test Accuracy: 0.9498


##MNIST Data Augmentation

In [5]:
from scipy.ndimage import shift

def shift_image(image, dx, dy):
    image = image.reshape(28, 28)
    shifted = shift(image, [dy, dx], cval=0, mode="constant")
    return shifted.reshape(-1)


In [6]:
X_train_augmented = [X_train]
y_train_augmented = [y_train]

for dx, dy in ((1, 0), (-1, 0), (0, 1), (0, -1)):
    shifted = np.apply_along_axis(shift_image, axis=1, arr=X_train, dx=dx, dy=dy)
    X_train_augmented.append(shifted)
    y_train_augmented.append(y_train)

X_train_aug = np.concatenate(X_train_augmented)
y_train_aug = np.concatenate(y_train_augmented)
print("Ukuran setelah augmentasi:", X_train_aug.shape)


Ukuran setelah augmentasi: (300000, 784)


In [7]:
aug_pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("knn", KNeighborsClassifier(n_neighbors=3, weights="distance"))
])

aug_pipeline.fit(X_train_aug, y_train_aug)
y_pred_aug = aug_pipeline.predict(X_test)
print("Accuracy after augmentation:", accuracy_score(y_test, y_pred_aug))


Accuracy after augmentation: 0.9636


##TItanic Dataset Classifier

In [9]:
# Upload 'train.csv' dari Kaggle ke environment Colab
import pandas as pd

titanic = pd.read_csv("train.csv")

titanic["Sex"] = titanic["Sex"].map({"male": 0, "female": 1})
titanic["Embarked"].fillna("S", inplace=True)
titanic["Embarked"] = titanic["Embarked"].map({"S": 0, "C": 1, "Q": 2})
titanic["Age"].fillna(titanic["Age"].median(), inplace=True)
titanic["Fare"].fillna(titanic["Fare"].median(), inplace=True)

features = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]
X = titanic[features]
y = titanic["Survived"]


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  titanic["Embarked"].fillna("S", inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  titanic["Age"].fillna(titanic["Age"].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are s

In [10]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

model = RandomForestClassifier(random_state=42)
scores = cross_val_score(model, X, y, cv=5)
print("Cross-val accuracy:", scores.mean())


Cross-val accuracy: 0.8114682066411399


##Spam Classifier

In [11]:
spam = ["Free money!!!", "You won a prize!", "Important notice"]
ham = ["Let's have lunch tomorrow.", "See you at the meeting.", "Happy birthday!"]

X = spam + ham
y = [1]*len(spam) + [0]*len(ham)


In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

vectorizer = TfidfVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

model = LogisticRegression()
model.fit(X_train_vec, y_train)

print("Spam classifier accuracy:", model.score(X_test_vec, y_test))


Spam classifier accuracy: 0.0
