# Random Forest Classifier

In [26]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
# imports and path setup
import os
import sys
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

import numpy as np
import tqdm
from sklearn.utils import shuffle
from joblib import Parallel, delayed
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import RidgeClassifier

from userkits.features import *
from userkits.utils import *

FileNotFoundError: [WinError 3] The system cannot find the path specified: 'dataset/train'

In [None]:
HALF_SIZE = False

In [None]:
# load data from train and eval directories
# set half=True to resize images to half to reduce memory usage
X, y = load_train_data(data_dir='./train_data', half=HALF_SIZE)
X, y = shuffle(X, y, random_state=42)

Loading train data: 100%|██████████| 29/29 [00:48<00:00,  1.68s/it]


In [None]:
X

[array([[[255, 177, 138],
         [255, 177, 138],
         [255, 177, 138],
         ...,
         [255, 179, 141],
         [255, 179, 141],
         [255, 179, 141]],
 
        [[255, 177, 138],
         [255, 177, 138],
         [255, 177, 138],
         ...,
         [255, 179, 141],
         [255, 179, 141],
         [255, 179, 141]],
 
        [[255, 177, 138],
         [255, 177, 138],
         [255, 177, 138],
         ...,
         [255, 179, 141],
         [255, 179, 141],
         [255, 179, 141]],
 
        ...,
 
        [[ 28,  51,  37],
         [ 28,  52,  38],
         [ 28,  52,  38],
         ...,
         [ 37,  71,  56],
         [ 37,  71,  56],
         [ 37,  71,  56]],
 
        [[ 28,  52,  38],
         [ 37,  69,  50],
         [ 38,  69,  50],
         ...,
         [ 37,  71,  56],
         [ 37,  71,  56],
         [ 37,  71,  56]],
 
        [[ 38,  70,  51],
         [ 38,  70,  51],
         [ 38,  70,  51],
         ...,
         [ 44,  87,  67],
  

In [None]:
def extract_features(images):
    features_list = []
    def process_image(img):
        feats = []
        # add feature functions here
        feats.extend(color_histogram(img))
        feats.extend(lbp_texture_features(img))
        feats.extend(find_mean(img))
        feats.extend(find_stddev(img))
        feats.append(edge_density(img))
        feats.append(green_pixel_ratio(img))
        feats.append(brightness(img))
        feats.append(shannon_entropy(img))

        return feats

    features_list = Parallel(n_jobs=-1)(delayed(process_image)(img) for img in tqdm.tqdm(images, desc="Extracting features"))
    return np.array(features_list)

In [None]:
X_features = extract_features(X)
X_features.shape

Extracting features: 100%|██████████| 1483/1483 [02:32<00:00,  9.70it/s]


(1483, 532)

In [None]:
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_features, y_encoded, test_size=0.1) # you can change test_size
clf = XGBClassifier(n_estimators = 1250, subsample = 0.85, max_depth = 20, learning_rate = 0.075, min_child_weight = 3)  # you can tune hyperparameters here
clf.fit(X_train, y_train)
print("Train Accuracy:", clf.score(X_train, y_train))
print("Test Accuracy:", clf.score(X_test, y_test))

Train Accuracy: 1.0
Test Accuracy: 0.7785234899328859


## Evaluate

In [None]:
# load eval data
# set half=True to resize images to half to reduce memory usage
X_eval, file_ids = load_eval_data("./eval_data", half=HALF_SIZE) 

Loading eval data: 100%|██████████| 1486/1486 [00:39<00:00, 38.05it/s]


In [None]:
X_eval_features = extract_features(X_eval)
eval_predictions = clf.predict(X_eval_features)
print(eval_predictions[:5])

Extracting features: 100%|██████████| 1486/1486 [02:40<00:00,  9.23it/s]


[19 20 24 11 11]


In [None]:
try:
    preds = label_encoder.inverse_transform(eval_predictions)
except Exception:
    preds = eval_predictions

save_predictions(preds, file_ids, output_file='./output/submission.csv')

Saved ./output/submission.csv
