In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report

# Load dataset
file_path = 'MC.csv'
df = pd.read_csv(file_path)

# Handle missing values (fill income with median)
df['Income'] = df['Income'].fillna(df['Income'].median())

# Drop ID and Dt_Customer (not useful for ML)
df = df.drop(['ID', 'Dt_Customer'], axis=1)

# Encode categorical variables
le = LabelEncoder()
for col in ['Education', 'Marital_Status']:
    df[col] = le.fit_transform(df[col])

# Features and target
X = df.drop('Response', axis=1)
y = df['Response']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Random Forest Model
rf = RandomForestClassifier(random_state=42)

# Hyperparameter grid
param_dist = {
    'n_estimators': [100, 200, 300, 500],
    'max_depth': [None, 10, 20, 30, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2', None]
}

# RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    n_iter=20,  # number of parameter settings sampled
    cv=5,
    verbose=2,
    random_state=42,
    n_jobs=-1
)

# Fit model
random_search.fit(X_train, y_train)

# Best parameters
print("Best Parameters:", random_search.best_params_)

# Evaluate on test set
y_pred = random_search.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best Parameters: {'n_estimators': 300, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': None}
Accuracy: 0.8883928571428571
              precision    recall  f1-score   support

           0       0.89      0.98      0.94       381
           1       0.79      0.34      0.48        67

    accuracy                           0.89       448
   macro avg       0.84      0.66      0.71       448
weighted avg       0.88      0.89      0.87       448



In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Classifiers
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

# Load dataset
file_path = 'MC.csv'
df = pd.read_csv(file_path)

# Handle missing values
df['Income'] = df['Income'].fillna(df['Income'].median())

# Drop ID and Dt_Customer
df = df.drop(['ID', 'Dt_Customer'], axis=1)

# Encode categorical variables
le = LabelEncoder()
for col in ['Education', 'Marital_Status']:
    df[col] = le.fit_transform(df[col])

# Features and target
X = df.drop('Response', axis=1)
y = df['Response']

# Scale features (important for SVM & MLP)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42, stratify=y)

# Models to evaluate
models = {
    'SVM': SVC(kernel='rbf', probability=True, random_state=42),
    'DecisionTree': DecisionTreeClassifier(random_state=42),
    'RandomForest': RandomForestClassifier(random_state=42),
    'AdaBoost': AdaBoostClassifier(random_state=42),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42),
    'CatBoost': CatBoostClassifier(verbose=0, random_state=42),
    'NaiveBayes': GaussianNB(),
    'MLP': MLPClassifier(max_iter=500, random_state=42)
}

# Function to evaluate models
def evaluate_model(model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    metrics = {
        'Train_Accuracy': accuracy_score(y_train, y_train_pred),
        'Test_Accuracy': accuracy_score(y_test, y_test_pred),
        'Precision': precision_score(y_test, y_test_pred, zero_division=0),
        'Recall': recall_score(y_test, y_test_pred, zero_division=0),
        'F1_Score': f1_score(y_test, y_test_pred, zero_division=0)
    }
    return metrics

# Evaluate all models
results = {}
for name, model in models.items():
    results[name] = evaluate_model(model, X_train, y_train, X_test, y_test)

# Convert results to DataFrame
results_df = pd.DataFrame(results).T
print(results_df)


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


              Train_Accuracy  Test_Accuracy  Precision    Recall  F1_Score
SVM                 0.911272       0.881696   0.791667  0.283582  0.417582
DecisionTree        0.994978       0.830357   0.432836  0.432836  0.432836
RandomForest        0.994978       0.883929   0.800000  0.298507  0.434783
AdaBoost            0.889509       0.875000   0.703704  0.283582  0.404255
XGBoost             0.994420       0.881696   0.705882  0.358209  0.475248
CatBoost            0.974888       0.886161   0.785714  0.328358  0.463158
NaiveBayes          0.832589       0.808036   0.388235  0.492537  0.434211
MLP                 0.986049       0.859375   0.538462  0.417910  0.470588




In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Regressors
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor

# Load dataset
file_path = 'MC.csv'
df = pd.read_csv(file_path)

# Handle missing values
df['Income'] = df['Income'].fillna(df['Income'].median())

# Drop ID and Dt_Customer
df = df.drop(['ID', 'Dt_Customer'], axis=1)

# Encode categorical variables
le = LabelEncoder()
for col in ['Education', 'Marital_Status']:
    df[col] = le.fit_transform(df[col])

# Features and target (regression: predict Income)
X = df.drop('Income', axis=1)
y = df['Income']

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Models
models = {
    'SVR': SVR(),
    'DecisionTreeRegressor': DecisionTreeRegressor(random_state=42),
    'RandomForestRegressor': RandomForestRegressor(random_state=42),
    'AdaBoostRegressor': AdaBoostRegressor(random_state=42),
    'XGBRegressor': XGBRegressor(random_state=42, objective='reg:squarederror'),
    'MLPRegressor': MLPRegressor(max_iter=500, random_state=42)
}

# Evaluation function
def evaluate_regressor(model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    metrics = {
        'Train_R2': r2_score(y_train, y_train_pred),
        'Test_R2': r2_score(y_test, y_test_pred),
        'MAE': mean_absolute_error(y_test, y_test_pred),
        'MSE': mean_squared_error(y_test, y_test_pred),
        'RMSE': np.sqrt(mean_squared_error(y_test, y_test_pred))
    }
    return metrics

# Evaluate
results = {}
for name, model in models.items():
    results[name] = evaluate_regressor(model, X_train, y_train, X_test, y_test)

# Results DataFrame
results_df = pd.DataFrame(results).T
print(results_df)



                       Train_R2   Test_R2           MAE           MSE  \
SVR                    0.006159  0.000060  17076.205580  4.261488e+08   
DecisionTreeRegressor  0.999624 -3.413130   9264.301339  1.880764e+09   
RandomForestRegressor  0.935966  0.851522   5461.710110  6.327757e+07   
AdaBoostRegressor      0.835005  0.701323   8997.720944  1.272885e+08   
XGBRegressor           0.998378  0.808494   5443.896064  8.161515e+07   
MLPRegressor          -2.033291 -3.549796  40924.296698  1.939007e+09   

                               RMSE  
SVR                    20643.371901  
DecisionTreeRegressor  43367.773683  
RandomForestRegressor   7954.719880  
AdaBoostRegressor      11282.219084  
XGBRegressor            9034.110147  
MLPRegressor           44034.157679  




In [14]:
import pandas as pd
from sklearn.cluster import AgglomerativeClustering, DBSCAN, OPTICS
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

# Load dataset
df = pd.read_csv("MC.csv")

# Drop target column if exists (last column)
X = df.drop(columns=[df.columns[-1]])

# Convert categorical columns to numeric using One-Hot Encoding
X = pd.get_dummies(X, drop_first=True)

# Handle missing values (impute with mean for numeric features)
imputer = SimpleImputer(strategy="mean")
X_imputed = imputer.fit_transform(X)

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_imputed)

# Dictionary of clustering algorithms
clustering_algorithms = {
    "Agglomerative (Ward)": AgglomerativeClustering(n_clusters=3, linkage='ward'),
    "Agglomerative (Average)": AgglomerativeClustering(n_clusters=3, linkage='average'),
    "DBSCAN": DBSCAN(eps=0.5, min_samples=5),
    "OPTICS": OPTICS(min_samples=5)
}

# Results storage
results = []

for name, model in clustering_algorithms.items():
    labels = model.fit_predict(X_scaled)

    # Handle cases where clustering fails (all points in one cluster or all noise)
    if len(set(labels)) > 1 and len(set(labels)) < len(X_scaled):
        silhouette = silhouette_score(X_scaled, labels)
        db_index = davies_bouldin_score(X_scaled, labels)
        ch_index = calinski_harabasz_score(X_scaled, labels)
    else:
        silhouette, db_index, ch_index = None, None, None

    results.append({
        "Model": name,
        "Silhouette Score": silhouette,
        "Davies-Bouldin": db_index,
        "Calinski-Harabasz": ch_index
    })

# Convert to DataFrame
results_df = pd.DataFrame(results)
print(results_df)


                     Model  Silhouette Score  Davies-Bouldin  \
0     Agglomerative (Ward)          0.003862       16.064879   
1  Agglomerative (Average)          0.342164        0.520901   
2                   DBSCAN               NaN             NaN   
3                   OPTICS          0.176422        1.756949   

   Calinski-Harabasz  
0           5.667667  
1           3.418561  
2                NaN  
3           3.961352  
