In [2]:
import os
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from tensorflow.keras.preprocessing.image import img_to_array, load_img
from collections import Counter
import joblib

# Directory for the dataset.
# The original machine-specific location remains the default; set the
# ANIMAL_DATASET_DIR environment variable to run this notebook on another
# machine without editing the cell.
data_dir = os.environ.get(
    "ANIMAL_DATASET_DIR", "/Users/samikazi/Desktop/GithubProject/dataset"
)

# Set image parameters (height and width, in pixels, used when loading images)
img_height, img_width = 128, 128

# Helper function to load and preprocess images
def load_images_from_directory(directory, target_size):
    """Load the images below ``directory`` as flattened feature vectors.

    Every sub-directory of ``directory`` is treated as one class. Files that
    cannot be loaded are reported with ``print`` and skipped, and a class that
    ends up with fewer than two loaded images is dropped entirely.

    Args:
        directory: Path whose sub-directories each contain one class of images.
        target_size: Value forwarded to ``load_img`` as ``target_size``.

    Returns:
        A tuple ``(images, labels, class_names)``:

        * ``images`` - ``np.ndarray`` built from one flattened array per image.
        * ``labels`` - ``np.ndarray`` of ints; each label is the position of
          the image's class inside ``class_names``.
        * ``class_names`` - the sorted listing of ``directory``. It contains
          every entry (stray files and dropped classes included), so the
          labels that actually occur are not necessarily contiguous.
    """
    images = []
    labels = []
    class_names = sorted(os.listdir(directory))

    for class_index, class_name in enumerate(class_names):
        class_path = os.path.join(directory, class_name)

        # Only directories are classes; stray files next to them are ignored.
        if not os.path.isdir(class_path):
            continue

        class_images = []
        # os.listdir returns entries in arbitrary order. Sorting keeps the
        # sample order stable across runs and machines, so a seeded split of
        # the returned arrays is reproducible.
        for img_name in sorted(os.listdir(class_path)):
            img_path = os.path.join(class_path, img_name)
            try:
                img = load_img(img_path, target_size=target_size)
                # Flatten the image array into a 1D feature vector
                class_images.append(img_to_array(img).flatten())
            except Exception as e:
                # Best effort: one unreadable file must not abort the whole load.
                print(f"Could not load image {img_path}: {e}")

        # Add class images if it has at least two samples
        if len(class_images) >= 2:
            images.extend(class_images)
            labels.extend([class_index] * len(class_images))
        else:
            print(f"Skipping class '{class_name}' due to insufficient images.")

    return np.array(images), np.array(labels), class_names

# Load and preprocess images
images, labels, class_names = load_images_from_directory(data_dir, target_size=(img_height, img_width))

# Fail early with an actionable message instead of letting train_test_split
# raise a less obvious error when nothing could be loaded.
if len(images) == 0:
    raise ValueError(f"No usable images were loaded from '{data_dir}'.")

# Check class distribution after filtering
# (keys are label indices into class_names, not class names)
class_counts = Counter(labels)
print("Class distribution after filtering:", class_counts)

# Split data into training and validation sets.
# stratify=labels asks for the same class proportions in both splits.
X_train, X_val, y_train, y_val = train_test_split(
    images, labels, test_size=0.2, random_state=42, stratify=labels
)

# Initialize and train a Random Forest Classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Make predictions and calculate accuracy on the validation set
y_pred = rf_model.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)
print(f"Validation Accuracy: {accuracy * 100:.2f}%")

# Save the trained model.
# NOTE: the model predicts integer labels; class_names (not saved here) is
# needed to map those labels back to class names.
model_path = "animal_classifier_rf_model.pkl"
joblib.dump(rf_model, model_path)
print(f"Model saved as '{model_path}'")


Skipping class 'cat' due to insufficient images.
Skipping class 'dog' due to insufficient images.
Class distribution after filtering: Counter({4: 89, 2: 77, 5: 72})
Validation Accuracy: 75.00%
Model saved as 'animal_classifier_rf_model.pkl'


In [8]:
import os
from icrawler.builtin import GoogleImageCrawler

# Root folder of the dataset and the classes that each get a subfolder in it
main_folder = "./dataset"
classes = ['dog', 'cow', 'cat', 'lamb', 'zebra']

# Create the root folder, then one subfolder per class.
# exist_ok=True makes re-running this cell harmless.
os.makedirs(main_folder, exist_ok=True)
for class_name in classes:
    class_dir = os.path.join(main_folder, class_name)
    os.makedirs(class_dir, exist_ok=True)

def download_images(query, limit, output_dir):
    """Crawl images for ``query`` with ``GoogleImageCrawler``.

    Args:
        query: Search keyword handed to the crawler.
        limit: Passed to ``crawl`` as ``max_num``.
        output_dir: Used as the crawler storage ``root_dir``.
    """
    storage_config = {'root_dir': output_dir}
    GoogleImageCrawler(storage=storage_config).crawl(keyword=query, max_num=limit)

# Download images for every class into its own subfolder.
# Looping over `classes` keeps the downloads in sync with the folders created
# for that list, instead of repeating one call per class by hand.
images_per_class = 100  # passed to download_images as the per-class limit
for class_name in classes:
    download_images(class_name, images_per_class, os.path.join(main_folder, class_name))

print("Image downloading completed!")


ERROR:downloader:Response status code 401, file https://i.guim.co.uk/img/media/595623e12934b89a84bb3a739c0e080f77e0d69e/0_346_5184_3110/master/5184.jpg
ERROR:downloader:Response status code 400, file https://media.gettyimages.com/id/1164046558/photo/baby-sheep-close-up.jpg
ERROR:downloader:Exception caught when downloading file https://extension.msstate.edu/sites/default/files/styles/feature/public/blog/lamb%20pics.png, error: HTTPSConnectionPool(host='extension.msstate.edu', port=443): Max retries exceeded with url: /sites/default/files/styles/feature/public/blog/lamb%20pics.png (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1000)'))), remaining retry times: 2
ERROR:downloader:Exception caught when downloading file https://extension.msstate.edu/sites/default/files/styles/feature/public/blog/lamb%20pics.png, error: HTTPSConnectionPool(host='extension.msstate.edu', port=443): Max

Image downloading completed!
