<a href="https://colab.research.google.com/github/narenderkulariya/st20269341-CMP7005-PRAC1/blob/main/app_py.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [38]:
!pip install streamlit



In [39]:
!pip install --upgrade Streamlit



In [40]:
import streamlit as st
import pandas as pd
import numpy as np
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import joblib

In [41]:
%%writefile app.py

import streamlit as st
import pandas as pd
import numpy as np
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import joblib

# ==========================================
# 1. Page Configuration & Setup
# ==========================================
st.set_page_config(
    page_title="Air Quality Analysis & Prediction System",
    page_icon="üè≠",
    layout="wide",
    initial_sidebar_state="expanded"
)

# Custom CSS for a professional look
st.markdown("""
    <style>
    .main {
        background-color: #f5f5f5;
    }
    h1 {
        color: #2c3e50;
        text-align: center;
    }
    .stButton>button {
        background-color: #ff4b4b;
        color: white;
        border-radius: 10px;
        width: 100%;
    }
    .stMetric {
        background-color: white;
        padding: 15px;
        border-radius: 5px;
        box-shadow: 2px 2px 5px rgba(0,0,0,0.1);
    }
    </style>
""", unsafe_allow_html=True)

# ==========================================
# 2. Data Loading (Cached for Performance)
# ==========================================
@st.cache_data
def load_data():
    try:
        # Load the cleaned dataset provided
        df = pd.read_csv('/content/drive/MyDrive/CMP7005/all_cities_combined_cleaned.csv')
        df['Date'] = pd.to_datetime(df['Date'])
        return df
    except FileNotFoundError:
        st.error("‚ö†Ô∏è Error: 'all_cities_combined_cleaned.csv' not found. Please place it in the same directory.")
        return None

df = load_data()

# ==========================================
# 3. Model Training (Cached)
# ==========================================
@st.cache_resource
def train_model(data):
    # Features identified from your 'model_prediction.ipynb'
    feature_cols = ['PM2.5', 'PM10', 'NO', 'NO2', 'NOx', 'NH3', 'CO', 'SO2', 'O3', 'Benzene', 'Toluene', 'Xylene']
    target_col = 'AQI'

    # Drop rows where target or features are missing (just in case)
    model_df = data.dropna(subset=feature_cols + [target_col])

    X = model_df[feature_cols]
    y = model_df[target_col]

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Train Random Forest (as per your notebook)
    model = RandomForestRegressor(n_estimators=50, random_state=42, n_jobs=-1)
    model.fit(X_train, y_train)

    # Calculate Metrics
    y_pred = model.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))

    return model, r2, rmse, feature_cols

# ==========================================
# 4. Sidebar Navigation
# ==========================================
st.sidebar.image("https://cdn-icons-png.flaticon.com/512/2965/2965335.png", width=100)
st.sidebar.title("Navigation")
st.sidebar.info("Select a module below:")
page = st.sidebar.radio("Go to", ["üè† Home & Data Overview", "üìä Exploratory Data Analysis (EDA)", "ü§ñ Modelling & Prediction"])

st.sidebar.markdown("---")
st.sidebar.markdown("**Project Info**")
st.sidebar.markdown("Student ID: `st20269341`")
st.sidebar.markdown("Module: `CMP7005 PRAC1`")

# ==========================================
# 5. Main Content Sections
# ==========================================

if df is not None:

    # --------------------------------------
    # SECTION 1: Data Overview
    # --------------------------------------
    if page == "üè† Home & Data Overview":
        st.title("Air Quality Data Analysis System")
        st.markdown("### Welcome to the Air Quality Monitoring Platform")
        st.write("""
        This application provides a comprehensive analysis of air pollution data across **26 Indian Cities** (2015-2020).
        It allows stakeholders to monitor trends, understand pollutant compositions, and predict future Air Quality Index (AQI) levels.
        """)

        st.markdown("---")
        st.subheader("üìã Dataset Overview")

        # Top level metrics
        c1, c2, c3 = st.columns(3)
        c1.metric("Total Records", df.shape[0])
        c2.metric("Total Columns", df.shape[1])
        c3.metric("Cities Covered", df['City'].nunique())

        # Show Dataframe
        with st.expander("üîç View Raw Data (First 100 Rows)", expanded=True):
            st.dataframe(df.head(100), use_container_width=True)

        # Data Stats
        st.subheader("üìä Statistical Summary")
        st.write("Descriptive statistics for key pollutants:")
        st.dataframe(df.describe().T, use_container_width=True)

    # --------------------------------------
    # SECTION 2: Exploratory Data Analysis (EDA)
    # --------------------------------------
    elif page == "üìä Exploratory Data Analysis (EDA)":
        st.title("Exploratory Data Analysis")
        st.markdown("Visualize trends, correlations, and geographical disparities in air quality.")

        # Tabs for better organization
        tab1, tab2, tab3, tab4 = st.tabs(["üìà Time Trends", "üåç City Rankings", "üî• Correlations", "üì¶ Categories"])

        with tab1:
            st.subheader("Yearly & Seasonal Trends")

            # Yearly Trend
            yearly_aqi = df.groupby('Year')['AQI'].mean().reset_index()
            fig_trend = px.line(yearly_aqi, x='Year', y='AQI', markers=True,
                                title='Average AQI Trend (2015-2020)', template="plotly_white")
            st.plotly_chart(fig_trend, use_container_width=True)

            # Interactive City Comparison
            st.markdown("**Compare City Trends:**")
            selected_cities = st.multiselect("Select Cities", df['City'].unique(), default=['Delhi', 'Bengaluru'])
            if selected_cities:
                subset = df[df['City'].isin(selected_cities)]
                # Group by month for smoother plot
                monthly_trend = subset.groupby(['Date', 'City'])['AQI'].mean().reset_index()
                fig_city = px.line(monthly_trend, x='Date', y='AQI', color='City',
                                   title='Daily AQI Trends Comparison')
                st.plotly_chart(fig_city, use_container_width=True)

        with tab2:
            st.subheader("Most Polluted Cities")

            # Top 10 Bar Chart
            top_cities = df.groupby('City')['AQI'].mean().sort_values(ascending=False).head(10).reset_index()
            fig_bar = px.bar(top_cities, x='AQI', y='City', orientation='h', color='AQI',
                             color_continuous_scale='Reds', title='Top 10 Most Polluted Cities')
            fig_bar.update_layout(yaxis={'categoryorder':'total ascending'})
            st.plotly_chart(fig_bar, use_container_width=True)

        with tab3:
            st.subheader("Pollutant Correlation Matrix")
            st.write("Which pollutants have the strongest relationship with AQI?")

            corr_cols = ['AQI', 'PM2.5', 'PM10', 'NO2', 'SO2', 'CO', 'O3']
            corr_matrix = df[corr_cols].corr()

            fig_corr = px.imshow(corr_matrix, text_auto=".2f", aspect="auto", color_continuous_scale="RdBu_r",
                                 title="Correlation Heatmap")
            st.plotly_chart(fig_corr, use_container_width=True)
            st.info("üí° **Insight:** PM2.5 and PM10 show the highest positive correlation with AQI.")

        with tab4:
            st.subheader("AQI Severity Distribution")

            # Stacked Bar Chart (North vs South concept simulated or just all cities)
            bucket_counts = df.groupby(['City', 'AQI_Bucket']).size().reset_index(name='Count')
            # Filter top 5 most polluted vs top 5 cleanest for clarity
            focus_cities = ['Delhi', 'Patna', 'Gurugram', 'Bengaluru', 'Thiruvananthapuram', 'Shillong']
            bucket_subset = bucket_counts[bucket_counts['City'].isin(focus_cities)]

            bucket_order = ['Good', 'Satisfactory', 'Moderate', 'Poor', 'Very Poor', 'Severe']

            fig_stack = px.bar(bucket_subset, x='City', y='Count', color='AQI_Bucket',
                               category_orders={'AQI_Bucket': bucket_order},
                               color_discrete_map={
                                   'Good': 'green', 'Satisfactory': 'lightgreen', 'Moderate': 'yellow',
                                   'Poor': 'orange', 'Very Poor': 'red', 'Severe': 'darkred'
                               },
                               title="Frequency of AQI Severity (Selected Cities)")
            st.plotly_chart(fig_stack, use_container_width=True)

    # --------------------------------------
    # SECTION 3: Modelling & Prediction
    # --------------------------------------
    elif page == "ü§ñ Modelling & Prediction":
        st.title("AI-Based AQI Predictor")
        st.markdown("""
        Predict the Air Quality Index (AQI) based on concentration levels of various pollutants.
        The model used is a **Random Forest Regressor**.
        """)

        # Train model (or load cached)
        with st.spinner("Training/Loading Model... Please wait..."):
            model, r2, rmse, feature_cols = train_model(df)

        # Display Model Performance
        st.subheader("üèÜ Model Performance")
        m1, m2 = st.columns(2)
        m1.metric("R¬≤ Score (Accuracy)", f"{r2:.2f}")
        m2.metric("RMSE (Error Margin)", f"{rmse:.2f}")
        st.success(f"Model trained successfully on {len(feature_cols)} features!")

        st.markdown("---")
        st.subheader("üéõÔ∏è Prediction Simulator")
        st.write("Adjust the sliders below to simulate pollutant levels:")

        # Create input form
        input_data = {}
        cols = st.columns(3) # 3 columns layout

        # Loop through features to create sliders dynamically
        for i, col_name in enumerate(feature_cols):
            # Get typical range from data for slider limits
            min_val = float(df[col_name].min())
            max_val = float(df[col_name].quantile(0.95)) # Use 95th percentile to avoid extreme outliers in slider
            mean_val = float(df[col_name].mean())

            with cols[i % 3]:
                input_data[col_name] = st.slider(f"{col_name}", 0.0, max_val, mean_val)

        # Prediction Button
        if st.button("üöÄ Predict AQI"):
            # Convert input to dataframe
            input_df = pd.DataFrame([input_data])

            # Make prediction
            prediction = model.predict(input_df)[0]

            # Display Result
            st.markdown("### üéØ Predicted Result")

            # Color coding result
            color = "green"
            status = "Good"
            if prediction > 50: status, color = "Satisfactory", "#9acd32"
            if prediction > 100: status, color = "Moderate", "orange"
            if prediction > 200: status, color = "Poor", "red"
            if prediction > 300: status, color = "Very Poor", "darkred"
            if prediction > 400: status, color = "Severe", "black"

            col_res1, col_res2 = st.columns([1, 3])
            with col_res1:
                st.metric("Predicted AQI", f"{prediction:.0f}")
            with col_res2:
                st.markdown(f"### <span style='color:{color}'>{status}</span>", unsafe_allow_html=True)
                st.progress(min(prediction/500, 1.0))

            st.info(f"Based on these pollutant levels, the air quality is expected to be **{status}**.")

Overwriting app.py


In [47]:
!wget -q -O - ipv4.icanhazip.com

34.23.209.138


In [48]:
!streamlit run app.py & npx localtunnel --port 8501

[1G[0K‚†ô[1G[0K‚†π[1G[0K‚†∏[1G[0K‚†º
Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.
[0m
[1G[0K‚†¥[1G[0K‚†¶[1G[0K‚†ß[1G[0K‚†á[1G[0K‚†è[1G[0K‚†ã[1G[0K‚†ô[1G[0K‚†π[1G[0K‚†∏[1G[0K‚†º[1G[0K‚†¥[1G[0K‚†¶[1G[0K‚†ß[1G[0K‚†á[1G[0K‚†è[1G[0K‚†ã[1G[0K‚†ô[1G[0K‚†π[1G[0K‚†∏[1G[0K‚†º[1G[0K‚†¥[1G[0K‚†¶[1G[0K‚†ß[1G[0K‚†á[1G[0K‚†è[1G[0K‚†ã[1G[0Kyour url is: https://five-views-send.loca.lt
[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8501[0m
[34m  External URL: [0m[1mhttp://34.23.209.138:8501[0m
[0m
2025-12-17 14:48:36.727 Please replace `use_container_width` with `width`.

`use_container_width` will be removed after 2025-12-31.

For `use_container_width=True`, use `width='stretch'`. For `use_container_width=False`, use `width='content'`.
2025-12-17 14:48:36.805 Please rep