In [None]:
!pip install streamlit pyngrok seaborn scikit-learn openpyxl

# Import necessary modules
import os
import subprocess
import time

from pyngrok import ngrok
# Create the Streamlit script
streamlit_script = """
import streamlit as st
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder, label_binarize
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, plot_tree
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.svm import SVC, SVR
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, mean_squared_error, r2_score, precision_recall_curve, average_precision_score
from sklearn.multiclass import OneVsRestClassifier
import seaborn as sns
import matplotlib.pyplot as plt
import time

# Set the project title (this is the first Streamlit call in the script)
st.set_page_config(page_title="Automated ML Pipeline")

# Initialize session state to store results: each slot starts as an empty
# dict and is only created when it is not already present in the session.
for state_key in ('model_accuracies', 'pipeline_times', 'results'):
    if state_key not in st.session_state:
        st.session_state[state_key] = {}

# Function to create preprocessing pipeline
def create_preprocessing_pipeline(data, numeric_strategy, fill_value=None):
    '''Build an impute-then-scale transformer for the numeric columns of data.

    Args:
        data: DataFrame of features (target column already removed).
        numeric_strategy: 'mean', 'median' or 'most_frequent' (passed straight
            to SimpleImputer), or 'custom' to fill with a constant.
        fill_value: constant used when numeric_strategy is 'custom'.

    Returns:
        (preprocessor, feature_names): an unfitted ColumnTransformer and the
        list of columns it is configured to transform, in order.
    '''
    # Select every numeric dtype, not only int64/float64, so that columns
    # stored as int32/float32/etc. are not silently dropped.
    numeric_features = data.select_dtypes(include=[np.number]).columns.tolist()

    # 'custom' is a UI-level name; SimpleImputer calls it 'constant'.
    if numeric_strategy == 'custom':
        imputer = SimpleImputer(strategy='constant', fill_value=fill_value)
    else:
        imputer = SimpleImputer(strategy=numeric_strategy)

    numeric_transformer = Pipeline(steps=[
        ('imputer', imputer),
        ('scaler', StandardScaler())
    ])

    # Only the 'num' transformer is registered, so columns outside
    # numeric_features are dropped by ColumnTransformer's default remainder.
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features)
        ])

    # Return only the columns the transformer is given. Appending the
    # non-numeric column names here would make the list longer than the
    # transformed matrix is wide.
    return preprocessor, numeric_features

# Function to encode categorical features
def encode_categorical(data):
    '''Label-encode every object, bool and category column of data.

    The input frame is left untouched; encoding is applied to a copy.

    Args:
        data: DataFrame that may contain non-numeric columns.

    Returns:
        (encoded, label_encoders): the encoded copy of data, and a dict
        mapping each encoded column name to its fitted LabelEncoder.
    '''
    # Work on a copy so the caller's raw data is not overwritten as a side effect.
    encoded = data.copy()
    label_encoders = {}
    categorical_features = encoded.select_dtypes(include=['object', 'bool', 'category']).columns.tolist()
    for col in categorical_features:
        encoder = LabelEncoder()
        encoded[col] = encoder.fit_transform(encoded[col])
        label_encoders[col] = encoder
    return encoded, label_encoders

# Streamlit app

# UI label -> factory returning a fresh, unfitted estimator. Dict order is the
# order in which the options are listed in the select boxes.
CLASSIFIERS = {
    "Logistic Regression": lambda: OneVsRestClassifier(LogisticRegression(max_iter=1000, random_state=42)),
    "Decision Tree": lambda: DecisionTreeClassifier(random_state=42),
    "Random Forest": lambda: RandomForestClassifier(random_state=42),
    # probability=True makes SVC expose predict_proba, needed for the PR curves
    "Support Vector Machine": lambda: OneVsRestClassifier(SVC(probability=True, random_state=42)),
    "K-Nearest Neighbors": KNeighborsClassifier,
    "Naive Bayes": GaussianNB,
}
REGRESSORS = {
    "Linear Regression": LinearRegression,
    "Decision Tree": lambda: DecisionTreeRegressor(random_state=42),
    "Random Forest": lambda: RandomForestRegressor(random_state=42),
    "Support Vector Machine": SVR,
    "K-Nearest Neighbors": KNeighborsRegressor,
}


def read_uploaded_file(uploaded_file):
    # Load one uploaded file into a DataFrame. The extension test is
    # case-insensitive so that e.g. DATA.CSV is not handed to the Excel reader.
    if uploaded_file.name.lower().endswith('.csv'):
        return pd.read_csv(uploaded_file)
    return pd.read_excel(uploaded_file)


def show_figure(fig):
    # Render a matplotlib figure, then close it so figures do not pile up
    # in memory across script reruns.
    st.pyplot(fig)
    plt.close(fig)


def plot_precision_recall_curves(model, X_test, y_test):
    # Draw one precision-recall curve per class known to the fitted model.
    classes = model.classes_
    y_prob = model.predict_proba(X_test)

    # Binarize against model.classes_ so column i of the indicator matrix and
    # column i of predict_proba always describe the same class, even when the
    # test split is missing some classes.
    y_test_bin = label_binarize(y_test, classes=classes)
    if len(classes) == 2:
        # For two classes label_binarize returns a single column (the second
        # class); expand it to one indicator column per class.
        y_test_bin = np.hstack([1 - y_test_bin, y_test_bin])

    fig, ax = plt.subplots(figsize=(10, 7))
    colors = plt.cm.rainbow(np.linspace(0, 1, len(classes)))
    for i, (class_label, color) in enumerate(zip(classes, colors)):
        y_true_i = y_test_bin[:, i]
        if y_true_i.sum() == 0:
            # No positive test samples for this class: a PR curve is undefined.
            continue
        precision, recall, _ = precision_recall_curve(y_true_i, y_prob[:, i])
        average_precision = average_precision_score(y_true_i, y_prob[:, i])
        ax.plot(recall, precision, color=color, lw=2, label=f'PR curve (AP = {average_precision:0.2f}) for class {class_label}')
    ax.set_xlabel('Recall')
    ax.set_ylabel('Precision')
    ax.set_title('Precision-Recall Curve')
    ax.legend(loc="lower left")
    show_figure(fig)


def plot_bar_comparison(values_by_dataset, ylabel, title):
    # Bar chart of one value per dataset name.
    fig, ax = plt.subplots(figsize=(12, 8))
    ax.bar(list(values_by_dataset.keys()), list(values_by_dataset.values()))
    ax.set_ylabel(ylabel)
    ax.set_title(title)
    plt.xticks(rotation=45, ha='right')
    show_figure(fig)


st.title("Automated Data Preparation and Machine Learning Pipeline")

# Upload datasets
uploaded_files = st.file_uploader("Upload your CSV or Excel files", type=["csv", "xlsx"], accept_multiple_files=True)

if uploaded_files:
    datasets = {uploaded_file.name: read_uploaded_file(uploaded_file) for uploaded_file in uploaded_files}

    for name, data in datasets.items():
        st.write(f"Uploaded Dataset - {name}:", data.head())

        # Show missing values
        missing_values = data.isnull().sum()

        # Defaults used when nothing is missing
        numeric_strategy = 'mean'
        fill_value = None
        if missing_values.sum() > 0:
            st.write(f"Missing Values in {name}:", missing_values)
            # Handle missing values
            numeric_strategy = st.selectbox(f"Select strategy for handling missing values in numerical columns for {name}", ["mean", "median", "most_frequent", "custom"], key=f"strategy_{name}")
            if numeric_strategy == 'custom':
                numeric_data = data.select_dtypes(include=[np.number])
                min_val = float(numeric_data.min().min())
                max_val = float(numeric_data.max().max())
                fill_value = st.slider(f"Select a value to fill missing values in {name}", min_val, max_val, key=f"fill_value_{name}")
        else:
            st.success(f"No missing values in {name}!")

        # Encode categorical columns to numerical
        data_encoded, label_encoders = encode_categorical(data)
        st.write(f"Dataset after encoding categorical features for {name}:", data_encoded.head())

        # Choose target column
        target_column = st.selectbox(f"Select the target column for {name}", data.columns, key=f"target_{name}")
        if not target_column:
            continue

        y = data_encoded[target_column]
        X = data_encoded.drop(target_column, axis=1)
        preprocessor, feature_names = create_preprocessing_pipeline(X, numeric_strategy, fill_value)

        # Option to plot pair plots before and after transformation
        if st.button(f"Plot Pair Plots Before Transformation for {name}"):
            st.write(f"Pair Plot Before Transformation for {name}:")
            sns.pairplot(data_encoded, diag_kind=None)
            show_figure(plt.gcf())

        if st.button(f"Plot Pair Plots After Transformation for {name}"):
            st.write(f"Pair Plot After Transformation for {name}:")
            # Visualisation only: here the preprocessing is fitted on all rows.
            X_preprocessed = preprocessor.fit_transform(X)
            transformed_data = pd.DataFrame(X_preprocessed, columns=feature_names)
            transformed_data[target_column] = y.values
            sns.pairplot(transformed_data, diag_kind=None)
            show_figure(plt.gcf())

        # Select task type
        task_type = st.selectbox(f"Select the task type for {name}", ["Classification", "Regression"], key=f"task_{name}")

        if task_type == "Classification":
            model_name = st.selectbox(f"Select the classification model for {name}", list(CLASSIFIERS), key=f"class_model_{name}")
            model = CLASSIFIERS[model_name]()
        else:
            model_name = st.selectbox(f"Select the regression model for {name}", list(REGRESSORS), key=f"reg_model_{name}")
            model = REGRESSORS[model_name]()

        if st.button(f"Run Task for {name}"):
            start_time = time.time()
            # Split first, then fit the preprocessing on the training rows only,
            # so imputation and scaling statistics never see the test rows.
            X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
            X_train = preprocessor.fit_transform(X_train_raw)
            X_test = preprocessor.transform(X_test_raw)

            model.fit(X_train, y_train)

            # Evaluate the model
            y_pred = model.predict(X_test)
            end_time = time.time()
            elapsed = end_time - start_time

            # Metrics that do not apply to the chosen task stay None.
            metrics = {'accuracy': None, 'mse': None, 'r2': None}

            if task_type == "Classification":
                accuracy = accuracy_score(y_test, y_pred)
                metrics['accuracy'] = accuracy
                st.write(f"Model accuracy for {name}: {accuracy}")
                st.session_state.model_accuracies[name] = accuracy

                # Display confusion matrix
                cm = confusion_matrix(y_test, y_pred)
                st.write(f"Confusion Matrix for {name}:")
                fig, ax = plt.subplots(figsize=(10, 7))
                sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
                show_figure(fig)

                # Plot Precision-Recall Curve (binary and multiclass)
                plot_precision_recall_curves(model, X_test, y_test)
            else:
                mse = mean_squared_error(y_test, y_pred)
                r2 = r2_score(y_test, y_pred)
                metrics['mse'] = mse
                metrics['r2'] = r2
                st.write(f"Mean Squared Error for {name}: {mse}")
                st.write(f"R^2 Score for {name}: {r2}")

                # Line plot of predicted vs actual values
                fig, ax = plt.subplots(figsize=(10, 7))
                ax.plot(y_test.values, label='Actual')
                ax.plot(y_pred, label='Predicted', linestyle='--')
                ax.set_title('Actual vs Predicted Values')
                ax.legend()
                show_figure(fig)

            st.session_state.pipeline_times[name] = elapsed
            st.session_state.results[name] = {**metrics, 'time': elapsed}

    # Button to plot model accuracies and pipeline times
    if st.button("Plot Model Accuracies and Pipeline Times"):
        accuracies = st.session_state.model_accuracies
        pipeline_times = st.session_state.pipeline_times

        # Plot model accuracies
        if accuracies:
            st.write("Model Accuracies Comparison")
            plot_bar_comparison(accuracies, "Accuracy", "Model Accuracies for Different Datasets")

        # Plot pipeline times
        if pipeline_times:
            st.write("Pipeline Processing Time Comparison")
            plot_bar_comparison(pipeline_times, "Time (seconds)", "Pipeline Processing Time for Different Datasets")

        # Combined plot for accuracy and time
        if accuracies and pipeline_times:
            st.write("Combined Comparison of Model Accuracies and Pipeline Processing Time")
            fig, ax1 = plt.subplots(figsize=(12, 8))

            color = 'tab:blue'
            ax1.set_xlabel('Dataset')
            ax1.set_ylabel('Accuracy', color=color)
            ax1.bar(list(accuracies.keys()), list(accuracies.values()), color=color, alpha=0.6, label='Accuracy')
            ax1.tick_params(axis='y', labelcolor=color)
            plt.xticks(rotation=45, ha='right')

            # Second y-axis sharing the same x-axis, for the timing line
            ax2 = ax1.twinx()
            color = 'tab:red'
            ax2.set_ylabel('Time (seconds)', color=color)
            ax2.plot(list(pipeline_times.keys()), list(pipeline_times.values()), color=color, marker='o', linestyle='-', label='Time')
            ax2.tick_params(axis='y', labelcolor=color)

            fig.tight_layout()
            fig.legend(loc='upper left', bbox_to_anchor=(0.1, 0.9))
            show_figure(fig)
"""

# Save the Streamlit script to a file.
# The encoding is given explicitly so the written bytes do not depend on the
# platform's default locale encoding.
with open("streamlit_app.py", "w", encoding="utf-8") as f:
    f.write(streamlit_script)

# Function to run the Streamlit app
def run_streamlit_app():
    """Stop any running Streamlit processes, then launch ``streamlit_app.py``.

    Cleanup is best-effort: any error raised while looking up or killing old
    processes is printed and the launch proceeds regardless.

    Returns:
        The ``subprocess.Popen`` handle of the newly started Streamlit server.
    """
    # Stop any existing Streamlit processes: pgrep -f matches against the full
    # command line and prints one PID per line.
    try:
        lookup = subprocess.run(["pgrep", "-f", "streamlit"], capture_output=True, text=True)
        for stale_pid in lookup.stdout.split():
            subprocess.run(["kill", "-9", stale_pid])
    except Exception as e:
        print(f"Error stopping existing Streamlit processes: {e}")

    # Run the Streamlit app (non-blocking; the server keeps running in the background)
    return subprocess.Popen(["streamlit", "run", "streamlit_app.py"])

# Run the Streamlit app; keep the handle so the server can be stopped later
# with streamlit_process.terminate().
streamlit_process = run_streamlit_app()

# Set up ngrok tunnel
# The ngrok auth token is a credential and must not live in the notebook
# source. It is read from the NGROK_AUTHTOKEN environment variable instead.
# NOTE(review): any token that was ever committed in this file should be
# treated as leaked and rotated in the ngrok dashboard.
ngrok_auth_token = os.environ.get("NGROK_AUTHTOKEN")
if ngrok_auth_token:
    ngrok.set_auth_token(ngrok_auth_token)
else:
    print("NGROK_AUTHTOKEN is not set; relying on any token already saved in the ngrok config.")
public_url = ngrok.connect(8501)
print(f"Streamlit app is running at {public_url}")