In [None]:
!pip install streamlit pyngrok xgboost imbalanced-learn scikit-learn pandas matplotlib seaborn -q

from pyngrok import ngrok
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import streamlit as st
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
from collections import Counter
import io


# Source of the Streamlit dashboard. It is kept as a string so it can be
# written to faculty_app.py and served by `streamlit run` in its own process.
app_code = """
# Faculty career-path dashboard: trains three classifiers on an uploaded CSV
# and recommends likely current institutes from PhD + postdoc institutes.
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
from collections import Counter
import streamlit as st

st.set_page_config(page_title="Faculty Career Path Predictor", layout="wide")
st.title("Faculty Career Path Prediction Dashboard")

st.markdown("Upload your faculty dataset to predict **Current Institute** from PhD + Postdoc details.")

uploaded_file = st.file_uploader("Upload CSV", type=["csv"])

# NOTE(review): Streamlit re-runs this whole script on every widget
# interaction, so the models below are retrained each time. Consider caching
# the training step (e.g. st.cache_resource) if datasets grow.
if uploaded_file is not None:
    df = pd.read_csv(uploaded_file)
    st.subheader("Data Preview")
    st.dataframe(df.head())

    required_cols = ['PhD_institute', 'Postdoc_institute', 'Current_institute']
    if not all(col in df.columns for col in required_cols):
        st.error("Columns missing! Required: PhD_institute, Postdoc_institute, Current_institute")
    else:
        df = df[required_cols].dropna()

        # One encoder per column so predictions can be mapped back to names.
        encoders = {}
        for col in required_cols:
            encoders[col] = LabelEncoder()
            df[col] = encoders[col].fit_transform(df[col])

        X = df[['PhD_institute', 'Postdoc_institute']]
        y = df['Current_institute']

        # A stratified split raises ValueError when a class has fewer than two
        # rows (or the frame is empty); report that instead of a raw traceback.
        try:
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=0.25, random_state=42, stratify=y
            )
        except ValueError as exc:
            st.error(f"Could not build a stratified train/test split: {exc}")
            st.stop()

        class_counts = Counter(y_train)
        min_class_count = min(class_counts.values())
        if min_class_count >= 2:
            # SMOTE needs at least k_neighbors + 1 samples in every class it
            # resamples, so cap k by the rarest class.
            k_neighbors = min(3, min_class_count - 1)
            smote = SMOTE(random_state=42, k_neighbors=k_neighbors)
            X_train_res, y_train_res = smote.fit_resample(X_train, y_train)
            st.info(f"SMOTE applied with k_neighbors={k_neighbors}")
        else:
            # A class with a single training row has no neighbours to
            # interpolate with; train on the original data instead of failing.
            X_train_res, y_train_res = X_train, y_train
            st.warning("SMOTE skipped: at least one class has a single training sample.")

        models = {
            "Logistic Regression": LogisticRegression(max_iter=500),
            "Random Forest": RandomForestClassifier(n_estimators=200, random_state=42),
            "XGBoost": XGBClassifier(eval_metric='mlogloss', random_state=42)
        }

        results = {}
        progress = st.progress(0)
        for i, (name, model) in enumerate(models.items()):
            model.fit(X_train_res, y_train_res)
            y_pred = model.predict(X_test)
            acc = accuracy_score(y_test, y_pred)
            results[name] = acc
            progress.progress((i+1)/len(models))

        st.success("Training Complete!")

        st.subheader("Model Accuracy Comparison")
        acc_df = pd.DataFrame(list(results.items()), columns=["Model", "Accuracy"])
        st.bar_chart(acc_df.set_index("Model"))

        best_model_name = max(results, key=results.get)
        best_model = models[best_model_name]
        st.info(f"Best model: **{best_model_name}** (Accuracy: {results[best_model_name]:.2f})")

        st.subheader("Confusion Matrix")
        y_pred = best_model.predict(X_test)
        cm = confusion_matrix(y_test, y_pred)
        fig, ax = plt.subplots()
        sns.heatmap(cm, cmap="Blues", ax=ax)
        ax.set_xlabel("Predicted")
        ax.set_ylabel("Actual")
        st.pyplot(fig)
        # Release the figure; otherwise one accumulates per script re-run.
        plt.close(fig)

        st.subheader("Career Path Recommendation")
        phd_choice = st.selectbox("Select PhD Institute", encoders['PhD_institute'].classes_)
        postdoc_choice = st.selectbox("Select Postdoc Institute", encoders['Postdoc_institute'].classes_)

        if st.button("Predict Next Institute"):
            phd_enc = encoders['PhD_institute'].transform([phd_choice])[0]
            postdoc_enc = encoders['Postdoc_institute'].transform([postdoc_choice])[0]
            # Use the training column names so the fitted models see the same
            # feature layout they were trained on.
            query = pd.DataFrame([[phd_enc, postdoc_enc]], columns=X.columns)
            probs = best_model.predict_proba(query)[0]
            top_idx = np.argsort(probs)[-3:][::-1]
            # Probability columns follow best_model.classes_; translate column
            # positions to encoded labels before decoding to institute names.
            top_labels = np.asarray(best_model.classes_)[top_idx]
            preds = encoders['Current_institute'].inverse_transform(top_labels)
            st.write("Top Recommended Current Institutes:")
            for i, (p, c) in enumerate(zip(preds, probs[top_idx])):
                st.write(f"{i+1}. {p} (Confidence: {c:.2f})")

else:
    st.info("Please upload your CSV file to begin.")
"""


with open("faculty_app.py", "w") as f:
    f.write(app_code)

ngrok.set_auth_token("34ZDk4NS5FpjMQbBcDd7NOds7Oq_4NDWpZhWcZG1dtZmvRDkX")


public_url = ngrok.connect(8501)
print("Public app URL:", public_url)
!streamlit run faculty_app.py --server.port 8501

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.1/10.1 MB[0m [31m44.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m82.4 MB/s[0m eta [36m0:00:00[0m
🌐 Public app URL: NgrokTunnel: "https://alisson-uncongratulated-friskily.ngrok-free.dev" -> "http://localhost:8501"

Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.
[0m
[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8501[0m
[34m  External URL: [0m[1mhttp://34.106.58.153:8501[0m
[0m
Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Par