# Stratified Sampling, DBSCAN, and Isolation Forest Implementation

In [None]:
import pandas as pd
import numpy as np
from sklearn.cluster import DBSCAN
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns

## Load Data

In [None]:
# Read the paddy dataset from a path relative to the working directory.
# `df` is the notebook-wide frame that the later cells read and add columns to.
df = pd.read_csv('data/paddydataset.csv')
print(f"Dataset shape: {df.shape}")
# Trailing expression: presumably shown as the cell output when run in a notebook.
df.head()

## Stratified Sampling (From Scratch)

In [None]:
def stratified_sample(data, strata_column, sample_size=None, sample_fraction=0.2, random_seed=42):
    """Draw a random sample, without replacement, from every stratum of ``data``.

    Rows are grouped by the distinct values of ``strata_column`` and each
    group is sampled independently, so every stratum present in the input is
    represented in the output.

    Args:
        data: DataFrame to sample from.
        strata_column: Name of the column whose values define the strata.
        sample_size: Number of rows to take per stratum, capped at the
            stratum size. A falsy value (``None`` or ``0``) means
            ``sample_fraction`` is used instead.
        sample_fraction: Fraction of each stratum to take when ``sample_size``
            is not given; at least one row is always taken per stratum.
        random_seed: Seed of the sampler; the same seed selects the same rows.

    Returns:
        A new DataFrame holding the sampled rows of all strata, concatenated
        in order of first appearance of each stratum and re-indexed from 0.
        Rows whose stratum value is missing (NaN/None) are never sampled. An
        empty frame with the input's columns is returned when there is
        nothing to sample.

    Raises:
        KeyError: If ``strata_column`` is not a column of ``data``.
    """
    # A private generator seeded with `random_seed` produces the same draws as
    # `np.random.seed(random_seed)` would, but leaves the global NumPy random
    # state of the caller untouched.
    rng = np.random.RandomState(random_seed)

    strata_values = data[strata_column]
    sampled_data = []

    # Missing values never compare equal to anything, so a NaN stratum would
    # be empty and could not supply the minimum of one row; skip it up front.
    for stratum in strata_values.dropna().unique():
        # Work with row positions rather than index labels so that duplicate
        # index labels cannot pull in extra rows.
        positions = np.flatnonzero((strata_values == stratum).to_numpy())
        stratum_size = len(positions)

        if sample_size:
            n_samples = min(sample_size, stratum_size)
        else:
            n_samples = max(1, int(stratum_size * sample_fraction))

        chosen = rng.choice(positions, size=n_samples, replace=False)
        sampled_data.append(data.iloc[chosen])

    if not sampled_data:
        # pd.concat refuses an empty list; hand back an empty frame instead.
        return data.iloc[0:0].reset_index(drop=True)

    return pd.concat(sampled_data, ignore_index=True)

In [None]:
# Sample 30% of the rows of every 'Variety' stratum (at least one row each).
stratified_data = stratified_sample(df, strata_column='Variety', sample_fraction=0.3)
# Report the sizes and per-variety counts of the full data and of the sample
# so the two distributions can be compared by eye.
print(f"\nOriginal dataset size: {len(df)}")
print(f"Stratified sample size: {len(stratified_data)}")
print(f"\nOriginal variety distribution:\n{df['Variety'].value_counts()}")
print(f"\nStratified sample variety distribution:\n{stratified_data['Variety'].value_counts()}")

## Data Preprocessing for Clustering and Anomaly Detection

In [None]:
def prepare_features(data):
    """Build a standardized numeric feature matrix from ``data``.

    Only the numeric columns are kept. Missing values are replaced by the
    mean of their column and the result is passed through a freshly fitted
    ``StandardScaler``.

    Args:
        data: DataFrame holding the raw observations.

    Returns:
        A ``(scaled, names)`` tuple: the scaled feature array returned by the
        scaler and the list of numeric column names it was built from.
    """
    numeric_cols = list(data.select_dtypes(include=[np.number]).columns)

    # Take a copy of the numeric part so the caller's frame is never modified.
    numeric_part = data[numeric_cols].copy()
    column_means = numeric_part.mean()
    imputed = numeric_part.fillna(column_means)

    features_scaled = StandardScaler().fit_transform(imputed)
    return features_scaled, numeric_cols

In [None]:
# Build the scaled feature matrix from the full dataset `df`; the clustering
# and anomaly-detection cells below both consume `features_scaled`.
features_scaled, feature_names = prepare_features(df)
print(f"Number of features: {len(feature_names)}")
print(f"Features shape: {features_scaled.shape}")

## DBSCAN Clustering

In [None]:
def apply_dbscan(features, eps=0.5, min_samples=5):
    """Cluster ``features`` with DBSCAN and summarise the outcome.

    Args:
        features: Feature matrix handed to ``DBSCAN``.
        eps: Neighbourhood radius forwarded to ``DBSCAN``.
        min_samples: Minimum neighbourhood size forwarded to ``DBSCAN``.

    Returns:
        A ``(labels, n_clusters, n_noise)`` tuple: the per-sample cluster
        labels, the number of distinct labels other than ``-1``, and the
        number of samples labelled ``-1``.
    """
    labels = DBSCAN(eps=eps, min_samples=min_samples).fit(features).labels_

    # The label -1 is counted as noise, not as a cluster of its own.
    cluster_ids = set(labels)
    cluster_ids.discard(-1)
    n_clusters = len(cluster_ids)

    n_noise = sum(1 for label in labels if label == -1)

    return labels, n_clusters, n_noise

In [None]:
# Run DBSCAN on the scaled features with a wider radius and a larger minimum
# neighbourhood than the defaults of apply_dbscan.
clusters, n_clusters, n_noise = apply_dbscan(features_scaled, eps=3.0, min_samples=10)

# Store the label of every row on `df`; features_scaled was built from `df`,
# so the labels line up with its rows.
df['cluster'] = clusters
print(f"Number of clusters: {n_clusters}")
print(f"Number of noise points: {n_noise}")
# Size of every label group, ordered by label (noise, labelled -1, comes first).
print(f"\nCluster distribution:\n{pd.Series(clusters).value_counts().sort_index()}")

In [None]:
def visualize_clusters(features, clusters, title="DBSCAN Clustering"):
    """Show the cluster labels on a 2-D PCA projection of ``features``.

    Args:
        features: Feature matrix to project onto its first two principal
            components.
        clusters: One label per row of ``features``; used to colour the points.
        title: Title of the figure.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    from sklearn.decomposition import PCA

    projected = PCA(n_components=2).fit_transform(features)
    pc1, pc2 = projected[:, 0], projected[:, 1]

    fig, ax = plt.subplots(figsize=(10, 6))
    points = ax.scatter(pc1, pc2, c=clusters, cmap='viridis', alpha=0.6)
    fig.colorbar(points, ax=ax, label='Cluster')
    ax.set_xlabel('PC1')
    ax.set_ylabel('PC2')
    ax.set_title(title)
    fig.tight_layout()
    plt.show()

In [None]:
# Plot the DBSCAN labels computed above on the PCA projection of the features.
visualize_clusters(features_scaled, clusters)

## Isolation Forest Anomaly Detection

In [None]:
def detect_anomalies(features, contamination=0.1, random_state=42):
    """Fit an Isolation Forest on ``features`` and score the same samples.

    Args:
        features: Feature matrix used both to fit the model and to score.
        contamination: Forwarded to ``IsolationForest`` as ``contamination``.
        random_state: Forwarded to ``IsolationForest`` as ``random_state``.

    Returns:
        A ``(predictions, anomaly_scores)`` tuple: the model's per-sample
        predictions and the values returned by ``score_samples``.
    """
    model = IsolationForest(contamination=contamination, random_state=random_state)
    model.fit(features)

    labels = model.predict(features)
    sample_scores = model.score_samples(features)
    return labels, sample_scores

In [None]:
# Run the Isolation Forest on the scaled features with a contamination of 0.05.
predictions, anomaly_scores = detect_anomalies(features_scaled, contamination=0.05)

# Store the per-row results on `df` so the summary cell can filter and sort on them.
df['anomaly'] = predictions
df['anomaly_score'] = anomaly_scores

# This notebook treats a prediction of -1 as an anomaly and 1 as a normal point.
n_anomalies = (predictions == -1).sum()
n_normal = (predictions == 1).sum()

print(f"Normal points: {n_normal}")
print(f"Anomalies detected: {n_anomalies}")
print(f"Percentage of anomalies: {n_anomalies/len(df)*100:.2f}%")

In [None]:
def visualize_anomalies(features, predictions, scores, title="Isolation Forest Anomalies"):
    """Show anomaly predictions and scores on a 2-D PCA projection.

    The left panel colours every sample by its class (blue for normal, red
    for anomaly); the right panel colours every sample by its anomaly score.

    Args:
        features: Feature matrix to project onto its first two principal
            components.
        predictions: One label per row of ``features``; ``-1`` marks an
            anomaly and any other value is drawn as normal.
        scores: One score per row of ``features``, used for the colour map of
            the right panel.
        title: Prefix of both panel titles.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    from sklearn.decomposition import PCA

    features_2d = PCA(n_components=2).fit_transform(features)
    is_anomaly = np.asarray(predictions) == -1

    fig, axes = plt.subplots(1, 2, figsize=(15, 5))

    # One scatter call per class: a single scatter yields a single legend
    # handle, so passing two labels to legend() would show only "Normal" and
    # silently drop "Anomaly". Labelled artists give one correctly coloured
    # legend entry per class.
    axes[0].scatter(features_2d[~is_anomaly, 0], features_2d[~is_anomaly, 1],
                    c='blue', alpha=0.6, label='Normal')
    axes[0].scatter(features_2d[is_anomaly, 0], features_2d[is_anomaly, 1],
                    c='red', alpha=0.6, label='Anomaly')
    axes[0].set_xlabel('PC1')
    axes[0].set_ylabel('PC2')
    axes[0].set_title(f'{title} - Classification')
    axes[0].legend()

    scatter = axes[1].scatter(features_2d[:, 0], features_2d[:, 1], c=scores, cmap='coolwarm', alpha=0.6)
    axes[1].set_xlabel('PC1')
    axes[1].set_ylabel('PC2')
    axes[1].set_title(f'{title} - Anomaly Scores')
    plt.colorbar(scatter, ax=axes[1], label='Score')

    plt.tight_layout()
    plt.show()

In [None]:
# Plot the Isolation Forest predictions and scores computed above.
visualize_anomalies(features_scaled, predictions, anomaly_scores)

## Summary Statistics

In [None]:
# Split the rows by Isolation Forest prediction; the copies keep the filtered
# frames independent of `df`.
anomalies_df = df[df['anomaly'] == -1].copy()
normal_df = df[df['anomaly'] == 1].copy()

print("Top 10 anomalies by score:")
# nsmallest picks the ten rows with the lowest anomaly_score
# (per the scikit-learn docs, a lower score_samples value means more abnormal).
print(anomalies_df.nsmallest(10, 'anomaly_score')[['Variety', 'Paddy yield(in Kg)', 'anomaly_score', 'cluster']])