<a href="https://colab.research.google.com/github/pavitraa0625/ML-LAB/blob/main/Copy_of_ml_lab_7.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# --------------------------
# PlantVillage CSV Generator
# --------------------------

from google.colab import drive
import zipfile
import os
import pandas as pd

# 1️⃣ Mount Google Drive
drive.mount('/content/drive')

# 2️⃣ Set ZIP file path in Drive
zip_path = '/content/drive/MyDrive/PlantVillage-Dataset.zip'
extract_path = '/content/PlantVillage'

# 3️⃣ Extract ZIP file
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)

print("Extraction completed!")

# 4️⃣ Automatically locate the folder containing class subfolders
# Usually inside 'PlantVillage' folder
subfolders = os.listdir(extract_path)
print("Top-level folders:", subfolders)

# Assuming the images are inside the first folder
dataset_path = os.path.join(extract_path, subfolders[0])
print("Using dataset folder:", dataset_path)
print("Sample classes:", os.listdir(dataset_path)[:10])

# 5️⃣ Generate CSV from image paths and labels
image_paths = []
labels = []

for root, dirs, files in os.walk(dataset_path):
    for file in files:
        if file.lower().endswith(('.jpg', '.jpeg', '.png')):
            image_paths.append(os.path.join(root, file))
            labels.append(os.path.basename(root))  # folder name = label

df = pd.DataFrame({'image_path': image_paths, 'label': labels})

# 6️⃣ Save CSV to Drive
csv_path = '/content/drive/MyDrive/plant_dataset.csv'
df.to_csv(csv_path, index=False)

print(f"CSV file created at: {csv_path}")
print(df.head())
print("Total images found:", len(df))
print("Class distribution:\n", df['label'].value_counts())


Mounted at /content/drive
Extraction completed!
Top-level folders: ['PlantVillage-Dataset']
Using dataset folder: /content/PlantVillage/PlantVillage-Dataset
Sample classes: ['create_data_distribution.py', 'generate_data_color-50-50.sh', 'README.md', 'generate_data_color-80-20.sh', 'slurm-476493.out', 'slurm-476489.out', 'slurm-476485.out', 'generate_data_color-60-40.sh', '.git', 'slurm-476490.out']
CSV file created at: /content/drive/MyDrive/plant_dataset.csv
                                          image_path label
0  /content/PlantVillage/PlantVillage-Dataset/dat...    34
1  /content/PlantVillage/PlantVillage-Dataset/dat...    34
2  /content/PlantVillage/PlantVillage-Dataset/dat...    34
3  /content/PlantVillage/PlantVillage-Dataset/dat...    34
4  /content/PlantVillage/PlantVillage-Dataset/dat...    34
Total images found: 182214
Class distribution:
 label
Orange___Haunglongbing_(Citrus_greening)    16521
Tomato___Tomato_Yellow_Leaf_Curl_Virus      16071
Soybean___healthy           

In [None]:
from sklearn.model_selection import train_test_split

# Load the CSV
df = pd.read_csv('/content/drive/MyDrive/plant_dataset.csv')

# Split 80% train, 20% test
train_df, test_df = train_test_split(df, test_size=0.2, stratify=df['label'], random_state=42)

# Save to CSV
train_csv = '/content/drive/MyDrive/plant_train.csv'
test_csv = '/content/drive/MyDrive/plant_test.csv'

train_df.to_csv(train_csv, index=False)
test_df.to_csv(test_csv, index=False)

print("Train CSV saved at:", train_csv)
print("Test CSV saved at:", test_csv)
print("Train samples:", len(train_df))
print("Test samples:", len(test_df))


Train CSV saved at: /content/drive/MyDrive/plant_train.csv
Test CSV saved at: /content/drive/MyDrive/plant_test.csv
Train samples: 145771
Test samples: 36443


In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Load CSV
train_df = pd.read_csv('/content/drive/MyDrive/plant_train.csv')
test_df  = pd.read_csv('/content/drive/MyDrive/plant_test.csv')

print("Train shape:", train_df.shape)
print("Test shape:", test_df.shape)
print("Class distribution in train:\n", train_df['label'].value_counts())

# Encode labels
le = LabelEncoder()
train_df['label_enc'] = le.fit_transform(train_df['label'])
test_df['label_enc']  = le.transform(test_df['label'])

# Features are image paths, target is label_enc
X_train = train_df['image_path']
y_train = train_df['label_enc']
X_test  = test_df['image_path']
y_test  = test_df['label_enc']


Train shape: (145771, 2)
Test shape: (36443, 2)
Class distribution in train:
 label
Orange___Haunglongbing_(Citrus_greening)    13217
Tomato___Tomato_Yellow_Leaf_Curl_Virus      12857
Soybean___healthy                           12216
Peach___Bacterial_spot                       5513
Tomato___Bacterial_spot                      5105
                                            ...  
14                                            117
36                                            114
2                                              79
17                                             76
22                                             33
Name: count, Length: 76, dtype: int64


In [None]:
!pip install catboost


Collecting catboost
  Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [None]:
import pandas as pd
import numpy as np
import os
import cv2
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
import xgboost as xgb
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score


In [None]:
# Load the full CSV
df = pd.read_csv('/content/drive/MyDrive/plant_dataset.csv')

# Random sample 5000 images for faster processing
df_sample = df.sample(n=5000, random_state=42).reset_index(drop=True)

# Split into train/test 80:20
train_df, test_df = train_test_split(df_sample, test_size=0.2, stratify=df_sample['label'], random_state=42)

# Encode labels
le = LabelEncoder()
train_df['label_enc'] = le.fit_transform(train_df['label'])
test_df['label_enc']  = le.transform(test_df['label'])

X_train_paths = train_df['image_path']
y_train = train_df['label_enc']
X_test_paths  = test_df['image_path']
y_test  = test_df['label_enc']

print("Train samples:", len(X_train_paths))
print("Test samples:", len(X_test_paths))


Train samples: 4000
Test samples: 1000


In [None]:
# Function to load and resize images
def load_and_preprocess(img_paths, size=(32,32)):
    features = []
    for path in img_paths:
        img = cv2.imread(path)
        if img is not None:
            img = cv2.resize(img, size)
            features.append(img.flatten())
    return np.array(features)

# Convert train/test images to features
X_train_features = load_and_preprocess(X_train_paths)
X_test_features  = load_and_preprocess(X_test_paths)

print("Feature shapes:", X_train_features.shape, X_test_features.shape)


Feature shapes: (4000, 3072) (1000, 3072)


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

# Hyperparameter grid
param_dist = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10]
}

# Initialize RandomForest
rf = RandomForestClassifier(random_state=42)

# RandomizedSearchCV
rf_random = RandomizedSearchCV(rf, param_distributions=param_dist,
                               n_iter=3, scoring='accuracy', cv=3,
                               random_state=42, n_jobs=-1)
rf_random.fit(X_train_features, y_train)

print("Best RandomForest params:", rf_random.best_params_)




Best RandomForest params: {'n_estimators': 150, 'min_samples_split': 10, 'max_depth': None}


In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Load full CSV
df = pd.read_csv('/content/drive/MyDrive/plant_dataset.csv')

# Random sample 5000 images
df_sample = df.sample(n=5000, random_state=42).reset_index(drop=True)

# Split into train/test 80:20
train_df, test_df = train_test_split(df_sample, test_size=0.2,
                                     stratify=df_sample['label'], random_state=42)

# Encode labels
le = LabelEncoder()
train_df['label_enc'] = le.fit_transform(train_df['label'])
test_df['label_enc']  = le.transform(test_df['label'])

X_train_paths = train_df['image_path']
y_train = train_df['label_enc']
X_test_paths  = test_df['image_path']
y_test  = test_df['label_enc']

print("Train samples:", len(X_train_paths))
print("Test samples:", len(X_test_paths))


Train samples: 4000
Test samples: 1000


In [None]:
import cv2
import numpy as np

def load_and_preprocess(img_paths, size=(16,16)):
    features = []
    for path in img_paths:
        img = cv2.imread(path)
        if img is not None:
            img = cv2.resize(img, size)
            features.append(img.flatten())
    return np.array(features)

X_train_features = load_and_preprocess(X_train_paths)
X_test_features  = load_and_preprocess(X_test_paths)

print("Feature shapes:", X_train_features.shape, X_test_features.shape)


Feature shapes: (4000, 768) (1000, 768)


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

param_dist = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10]
}

rf = RandomForestClassifier(random_state=42)

rf_random = RandomizedSearchCV(rf, param_distributions=param_dist,
                               n_iter=3, scoring='accuracy', cv=2,
                               random_state=42, n_jobs=-1)
rf_random.fit(X_train_features, y_train)

print("Best RandomForest params:", rf_random.best_params_)


Best RandomForest params: {'n_estimators': 150, 'min_samples_split': 10, 'max_depth': None}


In [None]:
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
import xgboost as xgb
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score

# Optimized classifier list
classifiers = {
    'DecisionTree': DecisionTreeClassifier(),
    'RandomForest': rf_random.best_estimator_,
    'AdaBoost': AdaBoostClassifier(),
    'NaiveBayes': GaussianNB(),
    # Optional slower classifiers (uncomment if needed)
    # 'SVM': SVC(),
    # 'XGBoost': xgb.XGBClassifier(eval_metric='mlogloss'),
    # 'CatBoost': CatBoostClassifier(verbose=0),
    # 'MLP': MLPClassifier(max_iter=300)
}

results = []

for name, clf in classifiers.items():
    clf.fit(X_train_features, y_train)
    y_train_pred = clf.predict(X_train_features)
    y_test_pred  = clf.predict(X_test_features)
    results.append({
        'Model': name,
        'Train Accuracy': round(accuracy_score(y_train, y_train_pred), 4),
        'Test Accuracy': round(accuracy_score(y_test, y_test_pred), 4)
    })

results_df = pd.DataFrame(results)
print(results_df)


          Model  Train Accuracy  Test Accuracy
0  DecisionTree          0.9990          0.175
1  RandomForest          0.9962          0.381
2      AdaBoost          0.1442          0.145
3    NaiveBayes          0.1655          0.080


In [None]:
from sklearn.decomposition import PCA

# Reduce features to 50 dimensions
pca = PCA(n_components=50, random_state=42)
X_pca = pca.fit_transform(X_train_features)

print("PCA reduced feature shape:", X_pca.shape)


PCA reduced feature shape: (4000, 50)


In [None]:
from sklearn.cluster import AgglomerativeClustering
import numpy as np

# Agglomerative clustering
agglo = AgglomerativeClustering(n_clusters=10, linkage='ward')  # You can change n_clusters
agglo_labels = agglo.fit_predict(X_pca)

# Count samples per cluster
unique, counts = np.unique(agglo_labels, return_counts=True)
print("Agglomerative cluster distribution:", dict(zip(unique, counts)))


Agglomerative cluster distribution: {np.int64(0): np.int64(401), np.int64(1): np.int64(529), np.int64(2): np.int64(182), np.int64(3): np.int64(688), np.int64(4): np.int64(397), np.int64(5): np.int64(319), np.int64(6): np.int64(359), np.int64(7): np.int64(445), np.int64(8): np.int64(337), np.int64(9): np.int64(343)}


In [None]:
from sklearn.cluster import DBSCAN

# DBSCAN parameters (tune eps for your dataset)
dbscan = DBSCAN(eps=15, min_samples=5)  # eps may need tuning
dbscan_labels = dbscan.fit_predict(X_pca)

# Count samples per cluster
unique, counts = np.unique(dbscan_labels, return_counts=True)
print("DBSCAN cluster distribution:", dict(zip(unique, counts)))


DBSCAN cluster distribution: {np.int64(-1): np.int64(4000)}
