In [None]:
import dask.dataframe as dd
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from concurrent.futures import ThreadPoolExecutor
import joblib
from sklearn.impute import SimpleImputer

# Load the feature table with Dask.
data = dd.read_parquet('sequence_features_val.parquet')

# Drop the 'ID' column if present and cast every remaining column to float32
# (memory optimisation), then materialise as an in-memory pandas DataFrame.
data = data.drop(columns=['ID'], errors='ignore').astype('float32')
data = data.compute()  # Convert to a Pandas DataFrame

# Labels are assigned purely by row position: the first
# `num_positive_samples` rows get label 1, every remaining row gets label 0.
# NOTE(review): this presumes the parquet rows are stored positives-first --
# confirm against how the file was produced.
num_positive_samples = 1249857
num_rows = len(data)

# Validate up front with a real exception: `assert` is stripped under
# `python -O`, and checking before construction lets the message report the
# actual counts instead of a bare length mismatch.
if num_rows < num_positive_samples:
    raise ValueError(
        "Mismatch between features and labels! "
        f"Expected at least {num_positive_samples} rows, loaded {num_rows}."
    )

# Build the label vector directly instead of concatenating two large Python
# lists; the result is an int64 Series on a default RangeIndex.
labels = pd.Series(0, index=pd.RangeIndex(num_rows), name='label')
labels.iloc[:num_positive_samples] = 1

# Separate features and labels
features = data

# Replace missing feature values with 0 before handing the data to the models.
features = features.fillna(0)


# Split into train and test sets with stratification to maintain class balance
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.3,
    random_state=42,
    shuffle=True,
    stratify=labels  # Ensures both train and test have proportional classes
)

# Function for training and evaluating models
def train_and_evaluate(model, model_name):
    """Fit ``model`` on the training split and score it on the test split.

    Reads the module-level ``X_train``, ``y_train``, ``X_test`` and
    ``y_test``; they must be defined before this function is called.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        model_name: Label stored under the ``'Model'`` key of the result.

    Returns:
        dict with keys ``'Model'``, ``'Accuracy'``, ``'Precision'``,
        ``'Recall'``, ``'F1-Score'`` (the latter three weighted-averaged),
        ``'Predictions'`` (test-set predictions) and ``'Trained_Model'``
        (the fitted ``model`` itself).
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Metrics that share the same weighted-average call signature.
    weighted_metrics = {
        'Precision': precision_score,
        'Recall': recall_score,
        'F1-Score': f1_score,
    }

    report = {
        'Model': model_name,
        'Accuracy': accuracy_score(y_test, y_pred),
    }
    for metric_label, metric_fn in weighted_metrics.items():
        report[metric_label] = metric_fn(y_test, y_pred, average='weighted')

    report['Predictions'] = y_pred
    report['Trained_Model'] = model
    return report

# Define models as (estimator, display name) pairs. The display name is used
# both in the results table and to derive the serialized model's file name.
models = [
    (RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=42), 'Random Forest'),
    (XGBClassifier(n_estimators=100, tree_method='hist', random_state=42), 'XGBoost'),
    (MLPClassifier(hidden_layer_sizes=(100,), max_iter=500, random_state=42), 'Neural Network')
]

# Train models in parallel, one thread per submitted model.
with ThreadPoolExecutor() as executor:
    futures = [executor.submit(train_and_evaluate, model, name) for model, name in models]
    # Collect in submission order so `results` lines up with `models`;
    # .result() re-raises any exception raised inside the worker.
    results = [future.result() for future in futures]

# Compile the scalar metrics into a DataFrame (predictions and the fitted
# estimators stay in `results` and are not written to the CSV).
metric_columns = ['Model', 'Accuracy', 'Precision', 'Recall', 'F1-Score']
results_df = pd.DataFrame(
    [{column: result[column] for column in metric_columns} for result in results],
    columns=metric_columns,
)

# Save results
results_df.to_csv('Final_results_ML_DL.csv', index=False)

# Save feature importances for the Random Forest model. Look the model up by
# name rather than by position so reordering `models` cannot silently pick a
# different estimator.
rf_trained_model = next(
    result['Trained_Model'] for result in results if result['Model'] == 'Random Forest'
)
rf_importance_df = pd.DataFrame({
    'Feature': features.columns,
    'Importance': rf_trained_model.feature_importances_
})
rf_importance_df.sort_values(by='Importance', ascending=False, inplace=True)
rf_importance_df.to_csv('feature_importances_ML_DL.csv', index=False)

# Save models, e.g. 'Random Forest' -> 'random_forest_model.joblib'
for result in results:
    joblib.dump(result['Trained_Model'], f"{result['Model'].replace(' ', '_').lower()}_model.joblib")

print("Models trained in parallel, results saved, and models serialized.")


In [None]:
import dask.dataframe as dd
import pandas as pd
import numpy as np
from dask_ml.model_selection import train_test_split as dask_train_test_split
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score, f1_score
import joblib
from sklearn.neural_network import MLPClassifier


# Step 1: Load the data.
# Alternative input pair kept for reference: 'sampled_Data.parquet' together
# with 'sample_Data_labels_val.csv'.
parquet_file = 'sequence_features_val.parquet'

# Features: Dask DataFrame, with the 'ID' column removed if it is present.
data = dd.read_parquet(parquet_file).drop(columns=['ID'], errors='ignore')

# Labels: plain pandas DataFrame read from CSV.
labels_df = pd.read_csv('labels_Val.csv')

# Attach the labels with an index-on-index inner join, so only index values
# present on both sides are kept.
# NOTE(review): labels_df carries the default pandas index from read_csv;
# confirm the parquet index is unique and lines up with it row-for-row.
data = data.join(labels_df, how='inner')

# Step 2: Data preprocessing -- separate the target column from the features.
labels = data['label']
features = data.drop('label', axis=1)

# Split into train and test sets (20% held out, fixed seed).
X_train, X_test, y_train, y_test = dask_train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=42,
)

# Materialise each split exactly once. Every `.compute()` on a Dask collection
# re-executes its whole task graph (parquet read, join, split), so calling it
# inline for each fit/predict/metric repeats that work many times over.
# Missing feature values are zero-filled because MLPClassifier rejects NaN
# input; filling here means all three models train on identical data.
X_train_pd = X_train.compute().fillna(0)
X_test_pd = X_test.compute().fillna(0)
y_train_pd = y_train.compute()
y_test_pd = y_test.compute()

with joblib.parallel_backend('threading'):
    # Step 1: Train Random Forest Classifier and capture its feature importances
    rf_model = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=42)
    rf_model.fit(X_train_pd, y_train_pd)
    importance_df = pd.DataFrame({'Feature': features.columns, 'Importance': rf_model.feature_importances_})

    # Step 2: Train XGBoost Classifier
    xgb_model = XGBClassifier(n_estimators=100, tree_method='hist', random_state=42)  # GPU: 'gpu_hist' for faster training
    xgb_model.fit(X_train_pd, y_train_pd)

    # Step 3: Train Neural Network Classifier
    nn_model = MLPClassifier(hidden_layer_sizes=(100,), max_iter=500, random_state=42)
    nn_model.fit(X_train_pd, y_train_pd)

# Step 4: Evaluate Models
rf_preds = rf_model.predict(X_test_pd)
xgb_preds = xgb_model.predict(X_test_pd)
nn_preds = nn_model.predict(X_test_pd)

# Calculate metrics
rf_accuracy = accuracy_score(y_test_pd, rf_preds)
xgb_accuracy = accuracy_score(y_test_pd, xgb_preds)
nn_accuracy = accuracy_score(y_test_pd, nn_preds)

# Prepare results DataFrame; every list follows the order of `model_names`.
model_names = ['Random Forest', 'XGBoost', 'Neural Network']
all_preds = [rf_preds, xgb_preds, nn_preds]
results_df = pd.DataFrame({
    'Model': model_names,
    'Accuracy': [rf_accuracy, xgb_accuracy, nn_accuracy],
    'Precision': [precision_score(y_test_pd, preds, average='weighted') for preds in all_preds],
    'Recall': [recall_score(y_test_pd, preds, average='weighted') for preds in all_preds],
    'F1-Score': [f1_score(y_test_pd, preds, average='weighted') for preds in all_preds]
})

# Save results to CSV
results_df.to_csv('Final_results_ML_DL.csv', index=False)

# Step 5: Save Classification Reports (as nested dicts, one per model)
rf_classification_report = classification_report(y_test_pd, rf_preds, output_dict=True)
xgb_classification_report = classification_report(y_test_pd, xgb_preds, output_dict=True)
nn_classification_report = classification_report(y_test_pd, nn_preds, output_dict=True)

# Convert classification reports to DataFrame and save. Each report dict is
# written into a single CSV cell as its string representation.
classification_reports_df = pd.DataFrame({
    'Model': model_names,
    'Classification Report': [
        rf_classification_report,
        xgb_classification_report,
        nn_classification_report
    ]
})

classification_reports_df.to_csv('classification_result_ML_DL.csv', index=False)


# Print the classification reports
print("\nRandom Forest Classification Report:")
print(rf_classification_report)

print("\nXGBoost Classification Report:")
print(xgb_classification_report)

print("\nNN Classification Report:")
print(nn_classification_report)

# Save feature importances to CSV, most important feature first
importance_df.sort_values(by='Importance', ascending=False, inplace=True)
importance_df.to_csv('feature_importances_ML_DL.csv', index=False)

# Step 6: Save the trained models
joblib.dump(rf_model, 'rf_model.joblib')
joblib.dump(xgb_model, 'xgb_model.joblib')
joblib.dump(nn_model, 'nn_model.joblib')

print("Models trained, results saved, and models serialized.")


