<a href="https://colab.research.google.com/github/mdsiam135/research/blob/main/SVM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install numpy pandas scikit-learn




In [2]:
#Support Vector Machine (SVM)

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, classification_report
)
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.pipeline import Pipeline

In [4]:
# --- Step 1: Load Dataset ---
df = pd.read_csv('/content/drive/MyDrive/Yearbook of Agricultural Statistics/train2.csv')  # Replace with your dataset

In [5]:
# --- Step 2: Binning using quantiles ---
labels = ['very low', 'low', 'medium', 'high', 'very high']
df['Production_Label'], bin_edges = pd.qcut(df['Production (M.Ton)'], q=5, labels=labels, retbins=True)


In [6]:
# --- Display bin ranges ---
print("Label Ranges (Quantiles):")
for i in range(len(bin_edges) - 1):
    print(f"{labels[i].capitalize()}: {bin_edges[i]:.2f} - {bin_edges[i+1]:.2f}")

Label Ranges (Quantiles):
Very low: 0.00 - 545.38
Low: 545.38 - 1452.00
Medium: 1452.00 - 3050.00
High: 3050.00 - 6447.40
Very high: 6447.40 - 544979.54


In [7]:
# --- Label counts ---
print("\nLabel Counts:")
print(df['Production_Label'].value_counts())


Label Counts:
Production_Label
low          1025
very low     1024
medium       1024
very high    1024
high         1023
Name: count, dtype: int64


In [8]:
# --- Step 3: Prepare features and encoded labels ---
X = df.drop(columns=['Production (M.Ton)', 'Production_Label'])
y = df['Production_Label']

In [9]:
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

In [10]:
# --- Step 4: Split into train/val/test sets ---
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y_encoded, test_size=0.15, stratify=y_encoded, random_state=42
)
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.1765, stratify=y_temp, random_state=42
)  # ≈15% of full data


In [11]:
# --- Step 5: Define SVM pipeline ---
model = Pipeline([
    ('scaler', StandardScaler()),
    ('svm', SVC(kernel='rbf', probability=True, random_state=42))
])


In [12]:
# --- Step 6: Training with Epochs ---
epochs = 10
best_val_f1 = 0
best_model = None

epoch_history = []  # Store metrics for each epoch

for epoch in range(epochs):
    print(f"\nEpoch {epoch + 1}/{epochs}")
    model.fit(X_train, y_train)
    y_val_pred = model.predict(X_val)

    val_acc = accuracy_score(y_val, y_val_pred)
    val_precision = precision_score(y_val, y_val_pred, average='weighted', zero_division=0)
    val_recall = recall_score(y_val, y_val_pred, average='weighted', zero_division=0)
    val_f1_weighted = f1_score(y_val, y_val_pred, average='weighted', zero_division=0)
    val_f1_macro = f1_score(y_val, y_val_pred, average='macro', zero_division=0)

    print(f"Validation Accuracy: {val_acc:.4f}")
    print(f"Weighted Precision: {val_precision:.4f}, Weighted Recall: {val_recall:.4f}, Weighted F1: {val_f1_weighted:.4f}")
    print(f"Macro F1 Score: {val_f1_macro:.4f}")

    # Save best model
    if val_f1_weighted > best_val_f1:
        best_val_f1 = val_f1_weighted
        best_model = model

    # Store results
    epoch_history.append({
        'Epoch': epoch + 1,
        'Accuracy': val_acc,
        'Precision': val_precision,
        'Recall': val_recall,
        'F1_Weighted': val_f1_weighted,
        'F1_Macro': val_f1_macro
    })



Epoch 1/10
Validation Accuracy: 0.7503
Weighted Precision: 0.7504, Weighted Recall: 0.7503, Weighted F1: 0.7489
Macro F1 Score: 0.7487

Epoch 2/10
Validation Accuracy: 0.7503
Weighted Precision: 0.7504, Weighted Recall: 0.7503, Weighted F1: 0.7489
Macro F1 Score: 0.7487

Epoch 3/10
Validation Accuracy: 0.7503
Weighted Precision: 0.7504, Weighted Recall: 0.7503, Weighted F1: 0.7489
Macro F1 Score: 0.7487

Epoch 4/10
Validation Accuracy: 0.7503
Weighted Precision: 0.7504, Weighted Recall: 0.7503, Weighted F1: 0.7489
Macro F1 Score: 0.7487

Epoch 5/10
Validation Accuracy: 0.7503
Weighted Precision: 0.7504, Weighted Recall: 0.7503, Weighted F1: 0.7489
Macro F1 Score: 0.7487

Epoch 6/10
Validation Accuracy: 0.7503
Weighted Precision: 0.7504, Weighted Recall: 0.7503, Weighted F1: 0.7489
Macro F1 Score: 0.7487

Epoch 7/10
Validation Accuracy: 0.7503
Weighted Precision: 0.7504, Weighted Recall: 0.7503, Weighted F1: 0.7489
Macro F1 Score: 0.7487

Epoch 8/10
Validation Accuracy: 0.7503
Weighted

In [13]:
# --- Step 7: Print epoch effects table ---
print("\n--- Epoch-wise Validation Metrics ---")
df_history = pd.DataFrame(epoch_history)
print(df_history.to_string(index=False))


--- Epoch-wise Validation Metrics ---
 Epoch  Accuracy  Precision   Recall  F1_Weighted  F1_Macro
     1  0.750325   0.750412 0.750325     0.748873  0.748721
     2  0.750325   0.750412 0.750325     0.748873  0.748721
     3  0.750325   0.750412 0.750325     0.748873  0.748721
     4  0.750325   0.750412 0.750325     0.748873  0.748721
     5  0.750325   0.750412 0.750325     0.748873  0.748721
     6  0.750325   0.750412 0.750325     0.748873  0.748721
     7  0.750325   0.750412 0.750325     0.748873  0.748721
     8  0.750325   0.750412 0.750325     0.748873  0.748721
     9  0.750325   0.750412 0.750325     0.748873  0.748721
    10  0.750325   0.750412 0.750325     0.748873  0.748721


In [14]:
# --- Step 8: Final Evaluation on Test Set ---
print("\n--- Final Evaluation on Test Data ---")
y_test_pred = best_model.predict(X_test)

accuracy = accuracy_score(y_test, y_test_pred)
precision = precision_score(y_test, y_test_pred, average='weighted', zero_division=0)
recall = recall_score(y_test, y_test_pred, average='weighted', zero_division=0)
f1_weighted = f1_score(y_test, y_test_pred, average='weighted', zero_division=0)
f1_macro = f1_score(y_test, y_test_pred, average='macro', zero_division=0)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision (Weighted): {precision:.4f}")
print(f"Recall (Weighted): {recall:.4f}")
print(f"F1 Score (Weighted): {f1_weighted:.4f}")
print(f"Macro F1 Score: {f1_macro:.4f}")

# Classification report
print("\nClassification Report:")
print(classification_report(y_test, y_test_pred, target_names=label_encoder.classes_, zero_division=0))


--- Final Evaluation on Test Data ---
Accuracy: 0.7865
Precision (Weighted): 0.7843
Recall (Weighted): 0.7865
F1 Score (Weighted): 0.7846
Macro F1 Score: 0.7844

Classification Report:
              precision    recall  f1-score   support

        high       0.72      0.69      0.71       153
         low       0.78      0.72      0.75       154
      medium       0.75      0.73      0.74       153
   very high       0.83      0.86      0.85       154
    very low       0.84      0.92      0.88       154

    accuracy                           0.79       768
   macro avg       0.78      0.79      0.78       768
weighted avg       0.78      0.79      0.78       768



In [16]:
# --- Step 9: Predict on Unknown Test Data ---
try:
    unknown_data = pd.read_csv('/content/drive/MyDrive/Yearbook of Agricultural Statistics/test.csv')  # Same features as X
    # Drop the 'Production (M.Ton)' column from the unknown data
    # This makes the columns match the data the model was trained on (X_train)
    unknown_data_processed = unknown_data.drop(columns=['Production (M.Ton)'], errors='ignore') # Use errors='ignore' in case the column is not present

    unknown_preds = best_model.predict(unknown_data_processed)
    decoded_preds = label_encoder.inverse_transform(unknown_preds)

    print("\n--- Predictions on Unknown Test Data ---")
    # It might be useful to print the predictions alongside the original data if possible
    # For this example, we'll just print the predictions
    for i, pred in enumerate(decoded_preds):
        print(f"Sample {i + 1}: {pred}")

except FileNotFoundError:
    print("\nUnknown test data file not found. Skipping predictions.")
except KeyError as e:
    print(f"\nError processing unknown data: {e}. Make sure the test file has the 'Production (M.Ton)' column if needed for dropping.")


--- Predictions on Unknown Test Data ---
Sample 1: low
Sample 2: high
Sample 3: medium
Sample 4: very low
Sample 5: very low
Sample 6: very high
Sample 7: medium
Sample 8: low
Sample 9: high
Sample 10: very low
Sample 11: high
Sample 12: very high
Sample 13: very high
Sample 14: high
Sample 15: medium
Sample 16: low
Sample 17: very low
Sample 18: low
Sample 19: very high
Sample 20: low
Sample 21: very low
Sample 22: high
Sample 23: low
Sample 24: very high
Sample 25: very high
Sample 26: very high
Sample 27: high
Sample 28: medium
Sample 29: high
Sample 30: very low
Sample 31: very low
Sample 32: very low
Sample 33: low
Sample 34: very low
Sample 35: low
Sample 36: high
Sample 37: high
Sample 38: medium
Sample 39: high
Sample 40: low
Sample 41: low
Sample 42: medium
Sample 43: high
Sample 44: very high
Sample 45: very low
Sample 46: low
Sample 47: very low
Sample 48: very high
Sample 49: very low
Sample 50: low
Sample 51: very low
Sample 52: medium
Sample 53: medium
Sample 54: very hi