In [2]:
# Upload a local file into the Colab runtime; the recorded output of this cell
# shows train.csv being saved into the working directory.
# NOTE(review): `uploaded` is not read by any later cell visible in this notebook;
# the Streamlit app has its own file uploader.
from google.colab import files
uploaded = files.upload()


Saving train.csv to train.csv


In [10]:
# Uninstall conflicting packages
# %pip (rather than !pip) acts on the environment of the running kernel.
%pip uninstall -y torch torchvision torchaudio transformers

# Install compatible versions (all pinned so the environment is reproducible)
%pip install streamlit==1.29.0 pandas==2.2.2 plotly==5.24.1 scikit-learn==1.5.2 transformers==4.44.2 torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 pyngrok==7.1.6

# Download and setup ngrok
# -O pins the output filename: without it, re-running this cell makes wget save
# numbered copies (".tgz.1", ".tgz.2", ...) while tar below keeps unpacking the
# stale first download.
!wget -O ngrok-v3-stable-linux-amd64.tgz https://bin.equinox.io/c/bNyj1mQVY4c/ngrok-v3-stable-linux-amd64.tgz
!tar -xvzf ngrok-v3-stable-linux-amd64.tgz

Found existing installation: torch 2.6.0
Uninstalling torch-2.6.0:
  Successfully uninstalled torch-2.6.0
Found existing installation: torchvision 0.21.0
Uninstalling torchvision-0.21.0:
  Successfully uninstalled torchvision-0.21.0
Found existing installation: torchaudio 2.6.0
Uninstalling torchaudio-2.6.0:
  Successfully uninstalled torchaudio-2.6.0
Found existing installation: transformers 4.44.2
Uninstalling transformers-4.44.2:
  Successfully uninstalled transformers-4.44.2
Collecting transformers==4.44.2
  Using cached transformers-4.44.2-py3-none-any.whl.metadata (43 kB)
Collecting torch==2.6.0
  Using cached torch-2.6.0-cp311-cp311-manylinux1_x86_64.whl.metadata (28 kB)
Collecting torchvision==0.21.0
  Using cached torchvision-0.21.0-cp311-cp311-manylinux1_x86_64.whl.metadata (6.1 kB)
Collecting torchaudio==2.6.0
  Using cached torchaudio-2.6.0-cp311-cp311-manylinux1_x86_64.whl.metadata (6.6 kB)
Using cached transformers-4.44.2-py3-none-any.whl (9.5 MB)
Using cached torch-2.6.0

--2025-07-20 00:59:12--  https://bin.equinox.io/c/bNyj1mQVY4c/ngrok-v3-stable-linux-amd64.tgz
Resolving bin.equinox.io (bin.equinox.io)... 75.2.60.68, 13.248.244.96, 99.83.220.108, ...
Connecting to bin.equinox.io (bin.equinox.io)|75.2.60.68|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 9260570 (8.8M) [application/octet-stream]
Saving to: ‘ngrok-v3-stable-linux-amd64.tgz.4’


2025-07-20 00:59:12 (163 MB/s) - ‘ngrok-v3-stable-linux-amd64.tgz.4’ saved [9260570/9260570]

ngrok


In [2]:
# NOTE(review): this repeats the upload cell at the top of the notebook. The
# recorded output shows the second upload being stored as "train (2).csv", so the
# runtime now holds two copies of the dataset; consider removing this cell.
from google.colab import files
uploaded = files.upload()


Saving train.csv to train (2).csv


In [11]:
%%writefile app.py
import streamlit as st
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report
import torch

# Streamlit app configuration
# set_page_config has to be the first Streamlit command of the script, so it runs
# before the import fallback below (which may call st.error).
st.set_page_config(page_title="Airline Passenger Satisfaction App", layout="wide")

# Try importing transformers.pipeline, with fallback
# RuntimeError is caught as well: transformers' lazy module loader can surface a
# broken backend (e.g. a mismatched torch build) as RuntimeError rather than
# ImportError, and the app should still degrade to template-only mode.
try:
    from transformers import pipeline
except (ImportError, RuntimeError) as e:
    st.error(f"Failed to import transformers.pipeline: {str(e)}. Using fallback mode.")
    pipeline = None

st.title("✈️ Airline Passenger Satisfaction Dashboard")
st.markdown("Upload `train.csv` to explore insights, generate comments, and interact with a chatbot.")

# Initialize DistilGPT2 or fallback
@st.cache_resource
def load_nlp_model():
    """Build the DistilGPT2 text-generation pipeline on CPU.

    Returns:
        The pipeline object, or None when `transformers.pipeline` was not
        importable or the model could not be loaded (the failure is shown
        with st.error).
    """
    if pipeline is not None:
        try:
            generator = pipeline("text-generation", model="distilgpt2", device=-1)  # CPU
        except Exception as e:
            st.error(f"Failed to load DistilGPT2: {str(e)}. Using fallback mode.")
        else:
            return generator
    # Either the import fallback is active or loading failed above.
    return None

# Shared text generator (None means template-only fallback mode)
nlp = load_nlp_model()

# Sidebar for file upload and filters
st.sidebar.header("Data Upload & Filters")
uploaded_file = st.sidebar.file_uploader("Upload train.csv", type="csv")

# Initialize session state: each key starts as None until data is uploaded
for state_key in ("data", "model", "feature_columns"):
    if state_key not in st.session_state:
        st.session_state[state_key] = None

# Load and preprocess data
def load_and_preprocess_data(file):
    """Read the passenger CSV and return a cleaned DataFrame.

    Steps: parse the CSV, coerce the rating/measurement columns to numbers,
    drop every row that has a missing value, then add the derived
    "Age Group" and "Distance Group" columns.

    Args:
        file: Path or file-like object accepted by pandas.read_csv.

    Returns:
        The cleaned DataFrame, or None if anything fails (the error is
        reported with st.error instead of being raised).
    """
    try:
        df = pd.read_csv(file)
        numeric_cols = ["Age", "Flight Distance", "Inflight wifi service", "Departure/Arrival time convenient",
                        "Ease of Online booking", "Gate location", "Food and drink", "Online boarding",
                        "Seat comfort", "Inflight entertainment", "On-board service", "Leg room service",
                        "Baggage handling", "Checkin service", "Inflight service", "Cleanliness",
                        "Departure Delay in Minutes", "Arrival Delay in Minutes"]
        for col in numeric_cols:
            df[col] = pd.to_numeric(df[col], errors="coerce")
        # Drop rows only AFTER coercion: errors="coerce" turns unparseable values
        # into NaN, and dropping beforehand would let those NaNs reach the
        # grouping and model-training steps.
        df = df.dropna()
        df["Age Group"] = pd.cut(df["Age"], bins=[0, 20, 40, 60, 100], labels=["<20", "20-40", "40-60", "60+"])
        df["Distance Group"] = pd.cut(df["Flight Distance"], bins=[0, 500, 1500, 5000], labels=["<500km", "500-1500km", "1500+km"])
        return df
    except Exception as e:
        st.error(f"Error loading data: {str(e)}. Ensure train.csv has the correct format (24 columns).")
        return None

# Train Random Forest model
def train_model(df):
    """Fit a preprocessing + RandomForest pipeline that predicts satisfaction.

    The target is the "satisfaction" column mapped to 1 ("satisfied") / 0
    ("neutral or dissatisfied"). Numeric features are standardized and the
    four categorical features are one-hot encoded. Accuracy and a
    classification report for a 20% hold-out split are written to the page.

    Side effects:
        Stores the ordered feature column names in
        st.session_state["feature_columns"] once fitting has succeeded.

    Args:
        df: Preprocessed passenger DataFrame.

    Returns:
        The fitted sklearn Pipeline, or None on failure (reported with
        st.error).
    """
    try:
        y = df["satisfaction"].map({"satisfied": 1, "neutral or dissatisfied": 0})
        # Columns that are not model inputs. "Unnamed: 0" is what pandas names a
        # saved index column; the chatbot form never supplies it, so keeping it
        # as a feature would make every prediction fail the missing-column check.
        # errors="ignore" tolerates files that lack some of these columns.
        non_feature_cols = ["Unnamed: 0", "id", "satisfaction", "Age Group", "Distance Group"]
        X = df.drop(columns=non_feature_cols, errors="ignore")
        categorical_cols = ["Gender", "Customer Type", "Type of Travel", "Class"]
        numeric_cols = [col for col in X.columns if col not in categorical_cols]
        preprocessor = ColumnTransformer(
            transformers=[
                ("num", StandardScaler(), numeric_cols),
                # handle_unknown="ignore": a category that did not land in the
                # training split must not crash predict() later.
                ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols)
            ])
        model = Pipeline([
            ("preprocessor", preprocessor),
            ("classifier", RandomForestClassifier(n_estimators=100, random_state=42))
        ])
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        model.fit(X_train, y_train)
        # Publish the feature layout only once a model trained on it exists.
        st.session_state["feature_columns"] = X.columns.tolist()
        y_pred = model.predict(X_test)
        st.write("**Model Performance**")
        st.write(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")
        st.write("Classification Report:")
        # The report is a whitespace-aligned plain-text table; st.text keeps it
        # monospaced instead of letting markdown collapse the columns.
        st.text(classification_report(y_test, y_pred, target_names=["Dissatisfied", "Satisfied"]))
        return model
    except Exception as e:
        st.error(f"Error training model: {str(e)}")
        return None

# Generate summary
def generate_summary(df, filter_col, filter_val):
    """Summarize satisfaction for the rows where `filter_col` equals `filter_val`.

    Builds a template with the satisfaction rate and the mean Wi-Fi, seat
    comfort and entertainment ratings. When the text generator `nlp` is
    available the template is passed to it as a prompt and the generated text
    is returned; otherwise the template itself is returned.

    Args:
        df: Passenger DataFrame (may already be narrowed by sidebar filters).
        filter_col: Column to filter on.
        filter_val: Value of `filter_col` to keep.

    Returns:
        A summary string. If no row matches, a short notice is returned
        instead of a summary.
    """
    filtered_df = df[df[filter_col] == filter_val]
    if filtered_df.empty:
        # Without this guard the rate below divides by zero; this is reachable
        # because the selectable values come from the unfiltered data while `df`
        # may already be narrowed by the sidebar filters.
        return f"No passengers match {filter_col} = {filter_val} under the current filters."
    satisfied_count = (filtered_df["satisfaction"] == "satisfied").sum()
    satisfaction_rate = satisfied_count / len(filtered_df) * 100
    avg_ratings = filtered_df[["Inflight wifi service", "Seat comfort", "Inflight entertainment"]].mean().to_dict()
    template = f"""
    {filter_col} {filter_val} passengers have a {satisfaction_rate:.1f}% satisfaction rate.
    Average ratings: Wi-Fi ({avg_ratings['Inflight wifi service']:.1f}/5), Seat comfort ({avg_ratings['Seat comfort']:.1f}/5),
    Inflight entertainment ({avg_ratings['Inflight entertainment']:.1f}/5). Key trends: {filter_val} passengers are {'more' if satisfaction_rate > 50 else 'less'} satisfied.
    """
    if nlp is None:
        return template
    try:
        prompt = f"Refine this summary to sound natural and executive-style (100-150 words): {template}"
        result = nlp(prompt, max_length=200, num_return_sequences=1, truncation=True)[0]["generated_text"]
        return result
    except Exception as e:
        return f"Error generating summary: {str(e)}. Using template:\n{template}"

# Generate synthetic comments
def generate_synthetic_comment(row):
    """Turn one passenger record into a short comment-like text.

    Args:
        row: Mapping-like record providing the rating, delay, class and
            satisfaction fields read below.

    Returns:
        Text generated by `nlp` from a prompt built around the ratings
        template, the bare template when `nlp` is None, or an error message
        that embeds the template if generation raises.
    """
    wifi = row["Inflight wifi service"]
    seat = row["Seat comfort"]
    entertainment = row["Inflight entertainment"]
    delay_minutes = row["Departure Delay in Minutes"]
    travel_class = row["Class"]
    outcome = row["satisfaction"]
    template = f"""
    Wi-Fi was rated {wifi}/5, seat comfort {seat}/5, entertainment {entertainment}/5.
    The flight was delayed by {delay_minutes} minutes in {travel_class} class. Satisfaction: {outcome}.
    """
    # Fallback mode: no generator loaded, hand back the raw template.
    if nlp is None:
        return template
    try:
        prompt = f"Convert this to a realistic customer comment (50-100 words): {template}"
        generations = nlp(prompt, max_length=150, num_return_sequences=1, truncation=True)
        return generations[0]["generated_text"]
    except Exception as e:
        return f"Error generating comment: {str(e)}. Fallback: {template}"

# Satisfaction explanation chatbot
def explain_satisfaction(row, model):
    """Predict satisfaction for one passenger and describe the prediction.

    Args:
        row: Single-row DataFrame holding the passenger's features.
        model: Fitted pipeline exposing `predict` and the named steps
            "preprocessor" and "classifier".

    Returns:
        A string: generated text when `nlp` is available, the plain details
        template when it is not, or an error message when the feature list is
        not initialized, columns are missing, or prediction/generation fails.
    """
    feature_order = st.session_state.get("feature_columns", [])
    if not feature_order:
        return "Error: Model feature columns not initialized. Please upload and process train.csv first."
    absent = [name for name in feature_order if name not in row.columns]
    if absent:
        return f"Error: Missing columns in input data: {absent}"
    features = row[feature_order]  # Same column order the model was trained with
    try:
        prediction = model.predict(features)[0]
        pred_label = "satisfied" if prediction == 1 else "neutral or dissatisfied"
        # Importances come from the fitted forest as a whole; keep the top three.
        importances = pd.DataFrame({
            "feature": model.named_steps["preprocessor"].get_feature_names_out(),
            "importance": model.named_steps["classifier"].feature_importances_
        })
        top_factors = importances.sort_values("importance", ascending=False).head(3)
        template = f"""
        Passenger predicted as {pred_label}. Details: Age {row["Age"].iloc[0]}, Class {row["Class"].iloc[0]},
        Wi-Fi {row["Inflight wifi service"].iloc[0]}/5, Seat comfort {row["Seat comfort"].iloc[0]}/5,
        Delay {row["Departure Delay in Minutes"].iloc[0]} minutes. Top factors: {top_factors.to_dict(orient="records")}.
        """
        if nlp is None:
            return template
        try:
            prompt = f"Explain this prediction in a natural, concise way (100-150 words): {template}"
            generations = nlp(prompt, max_length=200, num_return_sequences=1, truncation=True)
            return generations[0]["generated_text"]
        except Exception as e:
            return f"Error generating explanation: {str(e)}. Fallback: Predicted {pred_label} due to {top_factors['feature'].iloc[0]}."
    except Exception as e:
        return f"Error predicting satisfaction: {str(e)}. Ensure input data matches training data columns."

# Load data if uploaded
# NOTE(review): this runs on every script execution while a file is present in
# the uploader, so the CSV is re-parsed and the model re-trained each time.
if uploaded_file:
    loaded_frame = load_and_preprocess_data(uploaded_file)
    st.session_state["data"] = loaded_frame
    # Only train when loading succeeded; on failure the previous model is kept.
    if loaded_frame is not None:
        st.session_state["model"] = train_model(loaded_frame)

# Main page: sidebar filters plus four feature tabs. Rendered only once a
# dataset has been loaded into session state; otherwise a prompt is shown.
if st.session_state.get("data") is not None:
    df = st.session_state["data"]
    st.sidebar.subheader("Filters")
    class_filter = st.sidebar.selectbox("Class", ["All"] + list(df["Class"].unique()))
    customer_type_filter = st.sidebar.selectbox("Customer Type", ["All"] + list(df["Customer Type"].unique()))
    satisfaction_filter = st.sidebar.selectbox("Satisfaction", ["All"] + list(df["satisfaction"].unique()))

    # Apply filters ("All" leaves that dimension unrestricted)
    filtered_df = df
    if class_filter != "All":
        filtered_df = filtered_df[filtered_df["Class"] == class_filter]
    if customer_type_filter != "All":
        filtered_df = filtered_df[filtered_df["Customer Type"] == customer_type_filter]
    if satisfaction_filter != "All":
        filtered_df = filtered_df[filtered_df["satisfaction"] == satisfaction_filter]

    # Tabs for features
    tab1, tab2, tab3, tab4 = st.tabs(["Summary Reports", "Synthetic Comments", "Chatbot", "Visualizations"])

    with tab1:
        st.header("Automated Summary Reports")
        filter_col = st.selectbox("Select filter for summary", ["Class", "Age Group", "Distance Group"])
        # dropna: the binned group columns hold NaN for values outside the bins,
        # and NaN is not a usable filter value.
        filter_val = st.selectbox("Select value", df[filter_col].dropna().unique())
        if st.button("Generate Summary"):
            summary = generate_summary(filtered_df, filter_col, filter_val)
            st.write(summary)

    with tab2:
        st.header("Synthetic Customer Comments")
        # Fixed seed so the same three passengers are shown for a given filter set
        sample_rows = filtered_df.sample(3, random_state=42) if len(filtered_df) >= 3 else filtered_df
        for idx, row in sample_rows.iterrows():
            comment = generate_synthetic_comment(row)
            st.subheader(f"Passenger ID: {row['id']}")
            st.write(comment)

    with tab3:
        st.header("Satisfaction Explanation Chatbot")
        st.write("Enter passenger details to get an AI explanation of satisfaction prediction.")
        with st.form("chatbot_form"):
            age = st.number_input("Age", min_value=0, max_value=100, value=30)
            class_input = st.selectbox("Class", ["Eco", "Eco Plus", "Business"])
            wifi = st.slider("Inflight Wi-Fi Service (1-5)", 1, 5, 3)
            seat = st.slider("Seat Comfort (1-5)", 1, 5, 3)
            delay = st.number_input("Departure Delay (minutes)", min_value=0, value=0)
            gender = st.selectbox("Gender", ["Male", "Female"])
            customer_type = st.selectbox("Customer Type", ["Loyal Customer", "disloyal Customer"])
            travel_type = st.selectbox("Type of Travel", ["Business travel", "Personal Travel"])
            flight_distance = st.number_input("Flight Distance (km)", min_value=0, value=1000)
            departure_arrival = st.slider("Departure/Arrival Time Convenient (1-5)", 1, 5, 3)
            online_booking = st.slider("Ease of Online Booking (1-5)", 1, 5, 3)
            gate_location = st.slider("Gate Location (1-5)", 1, 5, 3)
            food_drink = st.slider("Food and Drink (1-5)", 1, 5, 3)
            online_boarding = st.slider("Online Boarding (1-5)", 1, 5, 3)
            entertainment = st.slider("Inflight Entertainment (1-5)", 1, 5, 3)
            onboard_service = st.slider("On-board Service (1-5)", 1, 5, 3)
            leg_room = st.slider("Leg Room Service (1-5)", 1, 5, 3)
            baggage_handling = st.slider("Baggage Handling (1-5)", 1, 5, 3)
            checkin_service = st.slider("Checkin Service (1-5)", 1, 5, 3)
            inflight_service = st.slider("Inflight Service (1-5)", 1, 5, 3)
            cleanliness = st.slider("Cleanliness (1-5)", 1, 5, 3)
            arrival_delay = st.number_input("Arrival Delay (minutes)", min_value=0, value=0)
            submitted = st.form_submit_button("Explain Satisfaction")
            if submitted and st.session_state.get("model") is not None:
                # One-row frame keyed by the training column names
                input_data = pd.DataFrame({
                    "Gender": [gender],
                    "Customer Type": [customer_type],
                    "Age": [age],
                    "Type of Travel": [travel_type],
                    "Class": [class_input],
                    "Flight Distance": [flight_distance],
                    "Inflight wifi service": [wifi],
                    "Departure/Arrival time convenient": [departure_arrival],
                    "Ease of Online booking": [online_booking],
                    "Gate location": [gate_location],
                    "Food and drink": [food_drink],
                    "Online boarding": [online_boarding],
                    "Seat comfort": [seat],
                    "Inflight entertainment": [entertainment],
                    "On-board service": [onboard_service],
                    "Leg room service": [leg_room],
                    "Baggage handling": [baggage_handling],
                    "Checkin service": [checkin_service],
                    "Inflight service": [inflight_service],
                    "Cleanliness": [cleanliness],
                    "Departure Delay in Minutes": [delay],
                    "Arrival Delay in Minutes": [arrival_delay]
                })
                explanation = explain_satisfaction(input_data, st.session_state["model"])
                st.write(explanation)
            elif submitted:
                st.error("Model not initialized. Please upload train.csv first.")

    # Was a second `with tab3:`, which put the charts under the Chatbot tab and
    # left the "Visualizations" tab empty.
    with tab4:
        st.header("Visualizations")
        if filtered_df.empty:
            # Nothing to aggregate or plot for an empty selection.
            st.info("No passengers match the selected filters.")
        else:
            satisfaction_levels = ["satisfied", "neutral or dissatisfied"]
            satisfaction_colors = {"satisfied": "#4CAF50", "neutral or dissatisfied": "#F44336"}

            # Bar chart
            bar_data = filtered_df.groupby(["Class", "Customer Type", "satisfaction"]).size().unstack(fill_value=0)
            # A satisfaction filter removes one level entirely; add it back as
            # zeros so both y columns requested below always exist.
            for level in satisfaction_levels:
                if level not in bar_data.columns:
                    bar_data[level] = 0
            bar_data = bar_data.div(bar_data.sum(axis=1), axis=0) * 100  # row-wise percentages
            bar_data = bar_data.reset_index()
            fig_bar = px.bar(bar_data, x="Class", y=satisfaction_levels,
                             color_discrete_map=satisfaction_colors,
                             facet_col="Customer Type", title="Satisfaction by Class and Customer Type")
            st.plotly_chart(fig_bar, use_container_width=True)

            # Scatter plot
            fig_scatter = px.scatter(filtered_df, x="Age", y="Flight Distance", color="satisfaction",
                                     color_discrete_map=satisfaction_colors,
                                     title="Flight Distance vs. Age")
            st.plotly_chart(fig_scatter, use_container_width=True)

            # Line chart
            rating_cols = ["Inflight wifi service", "Seat comfort", "Inflight entertainment"]
            line_data = filtered_df.groupby("Class")[rating_cols].mean().reset_index()
            fig_line = go.Figure()
            for col in rating_cols:
                fig_line.add_trace(go.Scatter(x=line_data["Class"], y=line_data[col], mode="lines+markers", name=col))
            fig_line.update_layout(title="Average Service Ratings by Class", xaxis_title="Class", yaxis_title="Average Rating")
            st.plotly_chart(fig_line, use_container_width=True)

else:
    st.write("Please upload a CSV file to begin.")

Overwriting app.py


In [15]:
# Register the ngrok authtoken without storing it in the notebook.
# The token is taken from the NGROK_AUTHTOKEN environment variable when set,
# otherwise it is asked for interactively (getpass does not echo the input).
# NOTE(review): the token that was previously hardcoded in this cell should be
# treated as leaked and revoked in the ngrok dashboard.
import os
from getpass import getpass

ngrok_authtoken = os.environ.get("NGROK_AUTHTOKEN") or getpass("ngrok authtoken: ")
!./ngrok authtoken {ngrok_authtoken}

Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


In [17]:
# Start Streamlit server in the background
# The port is passed explicitly so it always matches the tunnel target below.
!streamlit run app.py --server.port 8501 &>/dev/null &

# Start ngrok to tunnel the Streamlit app
from pyngrok import ngrok

# Terminate any existing ngrok tunnels
ngrok.kill()

# Create a new tunnel to port 8501
public_url = ngrok.connect(8501, bind_tls=True)
# ngrok.connect returns a tunnel object; print its URL rather than the object repr
print("Your Streamlit app is live at:", public_url.public_url)

Your Streamlit app is live at: NgrokTunnel: "https://4a2d71a29327.ngrok-free.app" -> "http://localhost:8501"
