# In [23]:  (Jupyter notebook cell prompt, kept as a comment so the file parses as Python)
import json
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

# Feature-reduction strategies accepted in the config's 'feature_reduction' key.
_REDUCTION_TYPES = ('No Reduction', 'Corr with Target', 'Tree-based', 'PCA')

# Minimum absolute correlation with the target for 'Corr with Target'.
_CORR_THRESHOLD = 0.1

# Number of top-ranked features kept by 'Tree-based'.
_TOP_K_FEATURES = 5

# Upper bound on the number of components kept by 'PCA'.
_PCA_COMPONENTS = 2


def _reduce_features(data, features, target, reduction_type):
    """Return the feature matrix selected by *reduction_type*.

    Args:
        data: DataFrame holding the (already imputed) feature columns and
            the target column.
        features: Column names to consider as model inputs.
        target: Name of the target column.
        reduction_type: One of ``_REDUCTION_TYPES``.

    Returns:
        A DataFrame of selected columns, or for 'PCA' the array returned by
        ``PCA.fit_transform``.

    Raises:
        ValueError: If *reduction_type* is unknown, if 'Corr with Target' is
            requested for a non-numeric target, or if no feature passes the
            correlation threshold.
    """
    if reduction_type == 'No Reduction':
        return data[features]

    if reduction_type == 'Corr with Target':
        if not pd.api.types.is_numeric_dtype(data[target]):
            raise ValueError(
                f"'Corr with Target' needs a numeric target column, but "
                f"{target!r} has dtype {data[target].dtype}"
            )
        # Correlate only the configured features against the target. Using
        # data.corr()[target] would also score the target against itself
        # (correlation 1.0) and leak the label into the feature matrix.
        corr = data[features].corrwith(data[target].astype(float)).abs()
        relevant_features = corr[corr > _CORR_THRESHOLD].index.tolist()
        if not relevant_features:
            raise ValueError(
                f"No feature has |correlation| > {_CORR_THRESHOLD} "
                f"with target {target!r}"
            )
        return data[relevant_features]

    if reduction_type == 'Tree-based':
        # Rank features by random-forest importance, most important first.
        ranker = RandomForestClassifier()
        ranker.fit(data[features], data[target])
        ranked = ranker.feature_importances_.argsort()[::-1]
        return data[features].iloc[:, ranked[:_TOP_K_FEATURES]]

    if reduction_type == 'PCA':
        # PCA cannot keep more components than there are features or samples.
        n_components = min(_PCA_COMPONENTS, len(features), len(data))
        return PCA(n_components=n_components).fit_transform(data[features])

    raise ValueError(
        f"Unknown feature_reduction {reduction_type!r}; "
        f"expected one of {_REDUCTION_TYPES}"
    )


def run_ml_pipeline(json_file, csv_file):
    """Run the ML pipeline described by a JSON config against a CSV dataset.

    The config is read for these keys: ``'target'`` (target column name),
    ``'features'`` (feature column names), ``'feature_reduction'`` (one of
    'No Reduction', 'Corr with Target', 'Tree-based', 'PCA') and ``'models'``
    (a list of dicts with ``'is_selected'``, ``'type'`` and
    ``'hyperparameters'``). Only the model types 'RandomForest' and
    'LogisticRegression' are built; other selected types are skipped.

    Each selected model is tuned with 5-fold ``GridSearchCV`` and a
    classification report is printed.

    NOTE: the report is computed on the same data the model was fitted on,
    so the printed scores are optimistic and are not a held-out estimate.

    Args:
        json_file: Path to the JSON configuration file.
        csv_file: Path to the CSV dataset.

    Raises:
        ValueError: If ``'feature_reduction'`` is not a supported value, or
            if the chosen reduction cannot be applied to the data.
        KeyError: If a required config key is missing.
    """
    # Step 1: Read JSON file (JSON text is UTF-8 by specification).
    with open(json_file, 'r', encoding='utf-8') as f:
        config = json.load(f)

    # Step 2: Load dataset
    data = pd.read_csv(csv_file)

    # Extract target, features and reduction strategy from JSON. Validate the
    # strategy up front so a typo fails fast with a clear message instead of
    # an UnboundLocalError after imputation.
    target = config['target']
    features = config['features']
    reduction_type = config['feature_reduction']
    if reduction_type not in _REDUCTION_TYPES:
        raise ValueError(
            f"Unknown feature_reduction {reduction_type!r}; "
            f"expected one of {_REDUCTION_TYPES}"
        )

    # Step 3: Handle missing values (mean imputation of the feature columns)
    imputer = SimpleImputer(strategy='mean')
    data[features] = imputer.fit_transform(data[features])

    # Step 4: Feature reduction
    X = _reduce_features(data, features, target, reduction_type)
    y = data[target]

    # Step 5: Build a grid search for every selected, supported model
    models = []
    for model_config in config['models']:
        if not model_config['is_selected']:
            continue

        if model_config['type'] == 'RandomForest':
            model = RandomForestClassifier()
        elif model_config['type'] == 'LogisticRegression':
            model = LogisticRegression()
        else:
            # Unsupported model types are skipped.
            continue

        # Hyperparameter tuning with GridSearchCV
        param_grid = model_config['hyperparameters']
        grid_search = GridSearchCV(model, param_grid, cv=5)
        models.append((grid_search, model_config['type']))

    # Step 6: Fit and report (on the training data; see NOTE in docstring)
    for grid_search, model_type in models:
        grid_search.fit(X, y)
        predictions = grid_search.predict(X)
        print(f"Model: {model_type}")
        print(classification_report(y, predictions))

# Example usage. Guarded so that importing this module does not start a run.
# NOTE(review): the config path ends in ".rtf", but run_ml_pipeline parses it
# with json.load, which needs plain JSON text — confirm the file is not
# actually saved as Rich Text Format.
if __name__ == "__main__":
    run_ml_pipeline(
        r'C:\Users\mukta\Downloads\DS_Assignment - internship\Screening Test - DS\algoparams_from_ui.json.rtf',
        r'C:\Users\mukta\Downloads\DS_Assignment - internship\Screening Test - DS/iris.csv',
    )