In [26]:
import pandas as pd
import streamlit as st
import seaborn as sns
import matplotlib.pyplot as plt
import folium
from folium.plugins import MarkerCluster
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.impute import SimpleImputer
import nbformat as nbf

In [27]:
# Install the third-party packages that this notebook imports.
# NOTE(review): this cell sits after the import cell, so on a fresh environment
# it presumably has to be run first; versions are not pinned - confirm both.
%pip install pandas streamlit seaborn matplotlib folium scikit-learn nbformat


Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.



Function to load and preprocess data

In [28]:

def data_handling(data_handling):
    """Load several CSV files, merge them and clean the merged table.

    Steps, in order: read every file, concatenate them into one DataFrame,
    forward-fill missing values, drop duplicate rows and, when a 'year'
    column exists, parse it as datetime, discard rows whose value cannot be
    parsed and add an integer 'Month' column derived from it. Progress is
    reported with ``print``.

    Parameters
    ----------
    data_handling : iterable of paths or file-like objects, or a single one
        Anything ``pd.read_csv`` accepts. A lone path / file object is
        treated as a one-element collection. (The parameter keeps its
        historical name, which shadows the function inside the body.)

    Returns
    -------
    pandas.DataFrame
        The combined, cleaned data. The index is the concatenated index with
        the removed rows missing (it is not renumbered).

    Raises
    ------
    ValueError
        If no file is supplied.

    Notes
    -----
    NOTE(review): ``pd.to_datetime`` interprets purely numeric values as an
    offset from the epoch, so a 'year' column holding integers such as 2013
    would not be read as calendar years - confirm the expected input format.
    """
    # A bare path or file object must not be iterated (a string would be
    # walked character by character), so wrap it into a list first.
    is_single_source = (
        isinstance(data_handling, str)
        or hasattr(data_handling, "__fspath__")
        or hasattr(data_handling, "read")
    )
    sources = [data_handling] if is_single_source else list(data_handling)
    if not sources:
        # pd.concat would fail with a far less helpful message.
        raise ValueError("data_handling() needs at least one CSV file to load.")

    # Read and collect each file
    frames = []
    for source in sources:
        print(f"Reading file: {source}")
        frame = pd.read_csv(source)
        print(f"Shape of current file ({source}): {frame.shape}")
        frames.append(frame)

    # Combine all datasets into one DataFrame
    print("\nCombining all datasets into a single DataFrame...")
    combined = pd.concat(frames, ignore_index=True)
    print(f"Combined DataFrame shape: {combined.shape}")

    # Handle missing values
    print("\nHandling missing values with forward fill...")
    print(f"Missing values before handling:\n{combined.isnull().sum()}")
    # DataFrame.ffill() replaces the deprecated fillna(method='ffill').
    combined = combined.ffill()
    print(f"Missing values after handling:\n{combined.isnull().sum()}")

    # Remove duplicate entries
    print("\nRemoving duplicate entries...")
    print(f"Number of duplicates before removal: {combined.duplicated().sum()}")
    combined = combined.drop_duplicates()
    print(f"Number of duplicates after removal: {combined.duplicated().sum()}")

    # Feature engineering (e.g., create 'Month' from 'year')
    if 'year' in combined.columns:
        print("\nFeature engineering: Extracting 'Month' from 'year' column...")
        print("Converting 'year' column to datetime...")
        combined = combined.assign(
            year=pd.to_datetime(combined['year'], errors='coerce')
        )
        print("Dropping rows with invalid 'year' values...")
        combined = combined.dropna(subset=['year'])
        print("Creating 'Month' column from 'year' column...")
        # Derived after the NaT rows are gone so the column stays integer.
        combined = combined.assign(Month=combined['year'].dt.month)
        print("Sample of transformed data:")
        print(combined[['year', 'Month']].head())

    return combined


Streamlit App

In [40]:
import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import folium
from folium.plugins import MarkerCluster
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Define a function for data preprocessing
def data_handling(data_file):
    """Load one CSV file and return a cleaned DataFrame.

    Cleaning steps, in order: forward-fill missing values, drop duplicate
    rows and, when a 'year' column exists, parse it as datetime, discard rows
    whose value cannot be parsed and add an integer 'Month' column.

    Parameters
    ----------
    data_file : path or file-like object
        Passed straight to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The cleaned data. Rows removed during cleaning leave gaps in the
        index (it is not renumbered).

    Notes
    -----
    NOTE(review): ``pd.to_datetime`` interprets purely numeric values as an
    offset from the epoch, so a 'year' column holding integers such as 2013
    would not be read as calendar years - confirm the expected input format.
    """
    # DataFrame.ffill() replaces the deprecated fillna(method='ffill').
    cleaned = pd.read_csv(data_file).ffill().drop_duplicates()

    # Feature engineering: Create 'Month' from 'year'
    if 'year' in cleaned.columns:
        cleaned = cleaned.assign(
            year=pd.to_datetime(cleaned['year'], errors='coerce')
        )
        # Drop unparseable dates *before* deriving the month; otherwise a
        # single NaT forces the whole 'Month' column to float.
        cleaned = cleaned.dropna(subset=['year'])
        cleaned = cleaned.assign(Month=cleaned['year'].dt.month)

    return cleaned

# Main function for Streamlit
def main():
    """Render the Streamlit air-quality dashboard.

    The page asks for one or more CSV uploads in the sidebar, cleans each
    file with ``data_handling`` and concatenates the results. It then always
    shows a dataset overview and the pollution map, and - depending on the
    sidebar checkboxes - a data preview, exploratory plots, and a regression
    model with optional evaluation metrics.

    Returns early (after a warning or error message) when nothing has been
    uploaded or the uploads cannot be parsed as CSV.
    """
    st.title("Air Quality Data Analysis System Using Streamlit")
    st.sidebar.title("Choose Options")

    # Upload CSV files
    uploaded_files = st.sidebar.file_uploader(
        "Upload CSV Files", type=["csv"], accept_multiple_files=True
    )
    if not uploaded_files:
        st.warning("Please upload at least one CSV file.")
        return

    # Process uploaded files and combine them into one dataset
    try:
        data = pd.concat(
            [data_handling(file) for file in uploaded_files], ignore_index=True
        )
    except (pd.errors.EmptyDataError, pd.errors.ParserError, UnicodeDecodeError) as exc:
        # A malformed upload should produce a message, not a traceback page.
        st.error(f"Could not read the uploaded CSV files: {exc}")
        return
    st.success("Datasets loaded and processed successfully!")

    # Sidebar options
    show_data = st.sidebar.checkbox("Show Data")
    summary_stats = st.sidebar.checkbox("Exploratory Data Analysis (EDA)")
    model_building = st.sidebar.checkbox("Machine Learning Model Building")
    model_evaluation = st.sidebar.checkbox("Model Evaluation")

    # Computed once, outside any checkbox branch: both the EDA and the model
    # section need it, and they can be enabled independently. 'number' also
    # covers int32/float32 columns (e.g. a month extracted from a datetime).
    numeric_columns = data.select_dtypes(include="number").columns.tolist()

    _render_overview(data, show_data)
    if summary_stats:
        _render_eda(data, numeric_columns)
    _render_pollution_map(data)
    if model_building:
        _render_model_section(data, numeric_columns, model_evaluation)


def _render_overview(data, show_data):
    """Show shape, column names, dtypes, missing counts and optionally a preview."""
    st.subheader("Dataset Overview")
    rows, columns = data.shape
    st.write(f"The dataset contains **{rows} rows** and **{columns} columns**.")
    st.write("The columns in the dataset are:", data.columns.tolist())

    # Show data types and missing values
    st.write("**Data Types:**")
    # dtype objects are rendered as plain strings so the table serialises cleanly.
    st.write(data.dtypes.astype(str))
    st.write("**Missing Values:**")
    st.write(data.isnull().sum())

    # Show raw data
    if show_data:
        st.subheader("Dataset Preview")
        st.dataframe(data.head())


def _render_eda(data, numeric_columns):
    """Show summary statistics plus a histogram and a box plot per numeric column."""
    st.subheader("Summary Statistics")
    st.write(data.describe())

    st.subheader("Visualizations")
    for column in numeric_columns:
        st.subheader(f"Visualization for {column}")

        # Histogram - an explicit figure is handed to Streamlit and closed
        # afterwards so figures do not pile up across reruns.
        fig, ax = plt.subplots(figsize=(10, 6))
        sns.histplot(data[column], kde=True, bins=30, color='skyblue', ax=ax)
        st.pyplot(fig)
        plt.close(fig)

        # Box Plot
        fig, ax = plt.subplots(figsize=(10, 6))
        sns.boxplot(data[column], color='lightgreen', ax=ax)
        st.pyplot(fig)
        plt.close(fig)


def _render_pollution_map(data):
    """Show a folium map centred on the row with the highest PM2.5 value."""
    from html import escape  # stdlib; local so the file's import block is untouched

    incomplete_message = "Map data (Latitude, Longitude, PM2.5) is incomplete in the dataset."
    required = ['latitude', 'longitude', 'PM2.5']
    if not all(name in data.columns for name in required):
        st.write(incomplete_message)
        return

    # Only rows with coordinates and a reading can be placed on the map;
    # picking the maximum among them avoids NaN locations.
    located = data.dropna(subset=required)
    if located.empty:
        st.write(incomplete_message)
        return

    st.subheader("Map: Highest Pollution Levels")
    highest_pollution_station = located.loc[located['PM2.5'].idxmax()]
    position = [
        highest_pollution_station['latitude'],
        highest_pollution_station['longitude'],
    ]
    # The station name comes from an uploaded file and ends up in popup HTML.
    station_name = escape(str(highest_pollution_station.get('Station', 'Unknown')))

    m = folium.Map(location=position, zoom_start=10)
    marker_cluster = MarkerCluster().add_to(m)
    folium.Marker(
        location=position,
        popup=f"Station: {station_name}<br>PM2.5: {highest_pollution_station['PM2.5']}",
        icon=folium.Icon(color='red')
    ).add_to(marker_cluster)
    st.components.v1.html(m._repr_html_(), height=500)


def _render_model_section(data, numeric_columns, model_evaluation):
    """Train the selected regressor on the chosen target and optionally report metrics."""
    st.subheader("Machine Learning Model Building")
    if not numeric_columns:
        st.warning("No numeric columns are available to use as a target variable.")
        return

    target_column = st.selectbox("Select Target Variable:", numeric_columns)
    if not target_column:
        return

    # Rows without a target value cannot be used for supervised training.
    labelled = data.dropna(subset=[target_column])

    # Feature and target separation
    features = labelled.drop(columns=['year', 'Station', target_column], errors='ignore')
    target = labelled[target_column]
    features = pd.get_dummies(features, drop_first=True)  # Encode categorical features

    if features.shape[1] == 0 or len(features) < 2:
        st.warning("Not enough data to train a model for the selected target.")
        return

    # Split first, then fit the imputer and scaler on the training part only,
    # so no statistics from the test rows leak into preprocessing.
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.2, random_state=42
    )
    imputer = SimpleImputer(strategy='mean')
    scaler = StandardScaler()
    X_train = scaler.fit_transform(imputer.fit_transform(X_train))
    X_test = scaler.transform(imputer.transform(X_test))

    # Model selection
    model_type = st.selectbox("Choose Model:", ["Linear Regression", "Random Forest"])
    if model_type == "Linear Regression":
        model = LinearRegression()
    else:
        model = RandomForestRegressor(random_state=42)

    # Train the model
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Model Evaluation
    if model_evaluation:
        st.subheader("Model Evaluation")
        st.write("Mean Absolute Error:", mean_absolute_error(y_test, y_pred))
        st.write("Mean Squared Error:", mean_squared_error(y_test, y_pred))
        st.write("R-Squared Score:", r2_score(y_test, y_pred))

# Run the Streamlit app
if __name__ == "__main__":
    # Build the app only when this code is executed as the top-level module,
    # not when it is imported from elsewhere.
    main()




In [30]:
# Entry-point guard: call main() only when this code runs as the top-level module.
# NOTE(review): the preceding cell already ends with the same guard, so a script
# exported from this notebook would presumably invoke main() twice - confirm
# whether this cell is still needed.
if __name__ == '__main__':
    main()

