In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTEENN, SMOTETomek
from collections import Counter

def compare_sampling_strategies(X, y):
    """
    Compare different sampling strategies for handling imbalanced data
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )
    clf = RandomForestClassifier(
        n_estimators=200,
        class_weight='balanced',
        random_state=42
    )

    samplers = {
        'SMOTE': SMOTE(random_state=42),
        'ADASYN': ADASYN(random_state=42),
        'Random Under-sampling': RandomUnderSampler(random_state=42),
        'SMOTEENN': SMOTEENN(random_state=42),
        'SMOTETomek': SMOTETomek(random_state=42),
        'Combined': None  
    }
    
    results = {}
    
    for name, sampler in samplers.items():
        if name == 'Combined':
            over = SMOTE(sampling_strategy={1: 2000, 2: 3000, 3: 4000, 4: 3000, 5: 2000}, random_state=42)
            under = RandomUnderSampler(sampling_strategy={1: 2000, 2: 3000, 3: 3000, 4: 3000, 5: 2000}, random_state=42)
            X_res, y_res = over.fit_resample(X_train, y_train)
            X_res, y_res = under.fit_resample(X_res, y_res)
        else:
            X_res, y_res = sampler.fit_resample(X_train, y_train)
        
        clf.fit(X_res, y_res)
        y_pred = clf.predict(X_test)
        
        # Store results
        results[name] = {
            'class_distribution': Counter(y_res),
            'classification_report': classification_report(y_test, y_pred, output_dict=True)
        }
        
        print(f"\n{name} Results:")
        print(f"Resampled class distribution: {Counter(y_res)}")
        print("\nClassification Report:")
        print(classification_report(y_test, y_pred))

    return results

def create_custom_sampler(X, y, target_ratios):
    """
    Create a custom sampling strategy based on target ratios
    """
    total_samples = len(y)
    sampling_strategy = {}
    
    for class_label, ratio in target_ratios.items():
        sampling_strategy[class_label] = int(total_samples * ratio)
    
    over = SMOTE(sampling_strategy=sampling_strategy, random_state=42)
    X_res, y_res = over.fit_resample(X, y)
    
    return X_res, y_res

# Example usage with custom ratios
custom_ratios = {
    1: 0.15,  # 15% of total samples
    2: 0.25,  # 25% of total samples
    3: 0.25,  # 25% of total samples
    4: 0.20,  #
    5: 0.15   
}

def analyze_sampling_impact(X, y, sampling_strategy):
    """
    Analyze the impact of sampling on model performance
    """
    # Original data distribution
    print("Original class distribution:", Counter(y))
    
    # Apply sampling
    X_resampled, y_resampled = sampling_strategy.fit_resample(X, y)
    print("Resampled class distribution:", Counter(y_resampled))
    
    # Compare model performance
    clf_original = RandomForestClassifier(random_state=42)
    clf_resampled = RandomForestClassifier(random_state=42)
    
    # Cross-validation scores
    original_scores = cross_val_score(clf_original, X, y, cv=5)
    resampled_scores = cross_val_score(clf_resampled, X_resampled, y_resampled, cv=5)
    
    print("\nOriginal Data CV Scores:", original_scores.mean())
    print("Resampled Data CV Scores:", resampled_scores.mean())

# Example usage:


In [None]:
df = pd.read_csv('./data/urban_development_dataset.csv')
X = df.drop('development_trend_score', axis=1)
y = df['development_trend_score']
results = compare_sampling_strategies(X, y)
X_custom, y_custom = create_custom_sampler(X, y, custom_ratios)

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

def preprocess_and_train(df):
    """
    Preprocess data including categorical variables and handle imbalanced data
    """
    # Separate features and target
    X = df.drop('development_trend_score', axis=1)
    y = df['development_trend_score']

    # Identify numeric and categorical columns
    numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
    categorical_features = X.select_dtypes(include=['object']).columns

    # Create preprocessor
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numeric_features),
            ('cat', OneHotEncoder(drop='first', sparse=False), categorical_features)
        ])

    # Create pipeline
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('smote', SMOTE(random_state=42)),
        ('classifier', RandomForestClassifier(
            n_estimators=200,
            max_depth=15,
            min_samples_split=5,
            min_samples_leaf=2,
            class_weight='balanced',
            random_state=42
        ))
    ])

    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # Fit and predict
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    # Print results
    print("\nClass distribution in training set:")
    print(y_train.value_counts())

    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

    return pipeline

In [4]:
df = pd.read_csv('./data/urban_development_dataset.csv')
# X = df.drop('development_trend_score', axis=1)
# y = df['development_trend_score']
# results = compare_sampling_strategies(X, y)
# X_custom, y_custom = create_custom_sampler(X, y, custom_ratios)
pipeline = preprocess_and_train(df)


TypeError: OneHotEncoder.__init__() got an unexpected keyword argument 'sparse'