In [267]:
data_16S = pd.read_csv('NCOG_21_16S_redo2_asv_count_tax.tsv', sep='\t')
data_18Sv4 = pd.read_csv('NCOG_18sV4_asv_count_tax.tsv', sep='\t')
data_18Sv9 = pd.read_csv('NCOG_18sV9_asv_count_tax_S.tsv', sep='\t')
data_meta = pd.read_csv('NCOG_sample_log_DNA_stvx_meta_2014-2020_mod.tsv', sep='\t')
data_meta['sampleid'] = data_meta['sampleid'].apply(lambda x: 'X' + x)

In [44]:
px.set_mapbox_access_token('pk.eyJ1IjoibWFzb3JlbnMiLCJhIjoiY2x0ZHhpY2JsMGJwajJ2c2JkY2pwNnZvYyJ9.0BfLEl_lfk7-rs-96XghMQ')


In [88]:
data_meta['sample_type'].value_counts()

Surf    772
DCM     702
515       7
170       5
Name: sample_type, dtype: int64

## Sample types:
### 'Surf' IQR: 10-10 m
### 'DCM' IQR: 30-75 m
### 515: 515 m
### 170: 170 m

In [273]:
station_id = data_meta['Sta_ID'].iloc[0]
sample_type = 'DCM'
dataset = '16S'

cols_show_in_sunburst = ['Phylum', 'Class', 'Order']
station_data = data_meta[(data_meta['Sta_ID'] == station_id) & (data_meta['sample_type'] == sample_type)]
station_samples = station_data['sampleid'].tolist()

taxa_col_names = ['Domain', 'Phylum', 'Class', 'Order', 'Family', 'Genus', 'Species']

# Merge with 16S dataframe to get taxonomy data for the samples
asv_cols = pd.Series(data_16S.columns).isin(station_samples).values

asv_cols[0] = True
station_asvs = pd.concat([data_16S.loc[:,asv_cols], data_16S['silva_Taxon']], axis=1)

# Get relative abundances
values = station_asvs.drop(['Feature.ID', 'silva_Taxon'], axis=1).sum(axis=1)
#values = values / values.sum()

# Count occurrences of each taxonomy category
taxonomies = station_asvs['silva_Taxon'].str.split('; ', expand=True)
taxonomies.columns = taxa_col_names
taxonomies = taxonomies.dropna(subset=cols_show_in_sunburst[0]).fillna('___Undetermined')[cols_show_in_sunburst]
  
# get rid of the silva d__, p__, etc prefixes
for col in taxonomies.columns:
    taxonomies[col] = taxonomies[col].apply(lambda x: x[3:])

# Get relative abundances
taxonomies['values'] = values
taxonomies = taxonomies[taxonomies['values'] != 0]

In [274]:
taxonomies

Unnamed: 0,Phylum,Class,Order,values
16,Proteobacteria,Gammaproteobacteria,HgCo23,66
54,Marinimicrobia_(SAR406_clade),Marinimicrobia_(SAR406_clade),Marinimicrobia_(SAR406_clade),9
100,Proteobacteria,Alphaproteobacteria,Puniceispirillales,78
121,Proteobacteria,Gammaproteobacteria,SAR86_clade,43
175,Thermoplasmatota,Thermoplasmata,Marine_Group_II,841
...,...,...,...,...
28815,Marinimicrobia_(SAR406_clade),Marinimicrobia_(SAR406_clade),Marinimicrobia_(SAR406_clade),596
28830,Verrucomicrobiota,Verrucomicrobiae,Opitutales,3
28834,Bacteroidota,Bacteroidia,Flavobacteriales,27
28838,Proteobacteria,Alphaproteobacteria,SAR11_clade,37


In [258]:
import dash
from dash import dcc, html
from dash.dependencies import Input, Output
import plotly.graph_objs as go
import pandas as pd

data_16S = pd.read_csv('NCOG_21_16S_redo2_asv_count_tax.tsv', sep='\t')
data_18Sv4 = pd.read_csv('NCOG_18sV4_asv_count_tax.tsv', sep='\t')
data_18Sv9 = pd.read_csv('NCOG_18sV9_asv_count_tax_S.tsv', sep='\t')
data_meta = pd.read_csv('NCOG_sample_log_DNA_stvx_meta_2014-2020_mod.tsv', sep='\t')
data_meta['sampleid'] = data_meta['sampleid'].apply(lambda x: 'X' + x)

cal_coast_center = dict(
    lat=np.mean([min(data_meta['Lat_Dec']), max(data_meta['Lat_Dec'])]),
    lon=np.mean([min(data_meta['Lon_Dec']), max(data_meta['Lon_Dec'])])
)
env_var_cols = ['T_degC', 'Salnty', 'O2ml_L', 'PO4ug', 'SiO3ug', 'NO3ug', 'NH3ug', 'ChlorA', 'IntC14', 'NCDepth']
sample_type_vals = data_meta['sample_type'].dropna().unique()

# dash app
# app = dash.Dash(__name__, external_stylesheets=[dbc.themes.BOOTSTRAP])


# app.layout = dbc.Container([
#     html.H1("CO2 Emissions Dashboard", className="mt-4 mb-4"),
#     dbc.Row([
#         dbc.Col([
#             html.Div([
#                 html.Label("Select Sector"),
#                 dcc.Dropdown(
#                     id='sector-dropdown',
#                     options=sector_options,
#                     value='Electric Power'
#                 ),
#                 html.Label("Select Year"),
#                 dcc.Slider(
#                     id='year-slider',
#                     min=co2_data['Year'].min(),
#                     max=co2_data['Year'].max(),
#                     value=co2_data['Year'].max(),
#                     marks={str(year): str(year) for year in co2_data['Year'].unique()},
#                     step=None
#                 ),
#             ], className="mb-4")
#         ], width=3),
#         dbc.Col([
#             dcc.Graph(id='co2-heatmap')
#         ], width=6),
#         dbc.Col([
#             dcc.Graph(id='state-lineplot')
#         ], width=3)
#     ])
# ], fluid=True)
MYSTAID = None

app = dash.Dash(__name__)
app.layout = html.Div([
    html.Div([
        html.H3('Sample Type:'),
        dcc.Dropdown(sample_type_vals, sample_type_vals[0], id='sample-type-dropdown'),
        html.H3('Environmental Variable (Color):'),
        dcc.Dropdown(env_var_cols, 'NCDepth', id='env-var-dropdown')
    ]),
    html.Div([
        dcc.Graph(id='map-graph')
    ]),
    html.Div([
        html.H3('Dataset:'),
        dcc.Dropdown(['16S', '18Sv4', '18Sv9'], '16S', id='dataset-dropdown')
    ]),
    html.Div([
        dcc.Graph(id='sunburst-graph')
    ]),
])

# precompute map figures for different sample types
map_figs = {sample_type: {} for sample_type in data_meta['sample_type'].unique()}
for sample_type in data_meta['sample_type'].unique():
    for env_var in env_var_cols:
        meta_subset = data_meta[data_meta['sample_type'] == sample_type]
        num_samples = meta_subset.groupby('Sta_ID')['sample_num'].count()
        meta_subset = meta_subset.groupby('Sta_ID').mean().reset_index()
        meta_subset['num_samples'] = num_samples
        hover_names = meta_subset['Sta_ID'].apply(lambda x: '<b>Station: </b>' + x)
        subset_fig = px.scatter_mapbox(meta_subset, lat='Lat_Dec', lon='Lon_Dec', center=cal_coast_center,
                                       color=env_var, hover_name=hover_names, #size="num_samples",
                                       color_continuous_scale='viridis', size_max=15, zoom=4.5, mapbox_style='outdoors',
                                       width=600, height=700, custom_data='Sta_ID')
        map_figs[sample_type][env_var] = subset_fig

# empty sunburst figure if the station has no data of the selected sample type
empty_sunburst_data = {'Phylum': [], 'Class': [], 'Order': []}
for parent in ['Undetermined_1', 'Undetermined_2', 'Undetermined_3']:
    for child1 in ['Undetermined_1', 'Undetermined_2', 'Undetermined_3']:
        for child2 in ['Undetermined_1', 'Undetermined_2', 'Undetermined_3']:
            empty_sunburst_data['Phylum'].append(parent)
            empty_sunburst_data['Class'].append(child1)
            empty_sunburst_data['Order'].append(child2)
empty_sunburst_fig = px.sunburst(empty_sunburst_data, path=['Phylum', 'Class', 'Order'])

# precompute sunburst figures
sunburst_figs = {station_id: {} for station_id in data_meta['Sta_ID'].unique()}
for station_id in data_meta['Sta_ID'].unique():
    for sample_type in data_meta['sample_type'].unique():
        sunburst_figs[station_id][sample_type] = {}
        for dataset in ['16S', '18Sv4', '18Sv9']:
            cols_show_in_sunburst = ['Phylum', 'Class', 'Order']
            station_data = data_meta[(data_meta['Sta_ID'] == station_id) & (data_meta['sample_type'] == sample_type)]
            station_samples = station_data['sampleid'].tolist()
            if dataset =='16S':
                taxa_col_names = ['Domain', 'Phylum', 'Class', 'Order', 'Family', 'Genus', 'Species']

                # Merge with 16S dataframe to get taxonomy data for the samples
                asv_cols = pd.Series(data_16S.columns).isin(station_samples).values
                asv_cols[0] = True
                station_asvs = pd.concat([data_16S.loc[:,asv_cols], data_16S['silva_Taxon']], axis=1)

                # Get relative abundances
                values = station_asvs.drop(['Feature.ID', 'silva_Taxon'], axis=1).sum(axis=0)
                #values = values / values.sum()

                # Count occurrences of each taxonomy category
                taxonomies = station_asvs['silva_Taxon'].str.split('; ', expand=True)
                taxonomies.columns = taxa_col_names
                taxonomies = taxonomies.dropna(subset=cols_show_in_sunburst[0]).fillna('___Undetermined')[cols_show_in_sunburst]

                # get rid of the silva d__, p__, etc prefixes
                for col in taxonomies.columns:
                    taxonomies[col] = taxonomies[col].apply(lambda x: x[3:])

                # Get relative abundances
                taxonomies['values'] = values

                # set title of plot
                title = '16S Silva Taxonomy, Station "' + station_id + '"'
            elif dropdown_dataset == '18Sv4':
                taxa_col_names = ['Kingdom', 'Domain', 'Phylum', 'Class', 'Order', 'Family', 'Genus', 'Species']
                # Merge with 18Sv4 dataframe to get taxonomy data for the samples
                asv_cols = pd.Series(data_18Sv4.columns).isin(station_samples).values
                asv_cols[0] = True
                station_asvs = pd.concat([data_18Sv4.loc[:,asv_cols], data_18Sv4['pr2_Taxon']], axis=1)

                # Get relative abundances
                values = station_asvs.drop(['Feature.ID', 'pr2_Taxon'], axis=1).sum(axis=0)
                #values = values / values.sum()

                # Count occurrences of each taxonomy category
                taxonomies = station_asvs['pr2_Taxon'].str.split(';', expand=True)
                taxonomies = taxonomies.iloc[:, :8]
                taxonomies.columns = taxa_col_names
                taxonomies = taxonomies.dropna(subset='Phylum').fillna('Undetermined')[cols_show_in_sunburst]

                # Add relative abundances
                taxonomies['values'] = values

                # set title of plot
                title = '18S v4 PR2 Taxonomy, Station "' + station_id + '"'

            elif dropdown_dataset == '18Sv9':
                taxa_col_names = ['Kingdom', 'Domain', 'Phylum', 'Class', 'Order', 'Family', 'Genus', 'Species']

                # Merge with 18Sv9 dataframe to get taxonomy data for the samples
                asv_cols = pd.Series(data_18Sv9.columns).isin(station_samples).values
                asv_cols[0] = True
                station_asvs = pd.concat([data_18Sv9.loc[:,asv_cols], data_18Sv9['pr2_Taxon']], axis=1)

                # Get relative abundances
                values = station_asvs.drop(['Feature.ID', 'pr2_Taxon'], axis=1).sum(axis=0)
                #values = values / values.sum()

                # Count occurrences of each taxonomy category
                taxonomies = station_asvs['pr2_Taxon'].str.split(';', expand=True)
                taxonomies = taxonomies.iloc[:, :8]
                taxonomies.columns = taxa_col_names
                taxonomies = taxonomies.dropna(subset='Phylum').fillna('Undetermined')[cols_show_in_sunburst]

                # Add relative abundances
                taxonomies['values'] = values

                # set title of plot
                title = '18S v9 PR2 Taxonomy, Station "' + station_id + '"'
                
            fig = px.sunburst(taxonomies, path=['Phylum', 'Class', 'Order'], values='values', title=title)
            sunburst_figs[station_id][sample_type][dataset] = fig
# Map graph dropdown callback
@app.callback(
    Output('map-graph', 'figure'),
    [Input('sample-type-dropdown', 'value'),
     Input('env-var-dropdown', 'value')]
)
def update_map(dropdown_sample_type, dropdown_env_var):
    return map_figs[dropdown_sample_type][dropdown_env_var]

# Map graph click data callback
@app.callback(
    Output('sunburst-graph', 'figure'),
    [Input('sample-type-dropdown', 'value'),
     Input('map-graph', 'clickData'),
     Input('dataset-dropdown', 'value')]
)
def update_sunburst(dropdown_sample_type, click_data, dropdown_dataset):
    if click_data is None:
        station_id = data_meta['Sta_ID'].iloc[0]
    # Get station ID from hover data
    else:
        if 'customdata' not in click_data['points'][0]:
            station_id = station_id = data_meta['Sta_ID'].iloc[0]
        else:
            station_id = click_data['points'][0]['customdata'][0]
    testing = (dropdown_sample_type, click_data, dropdown_dataset)
    cols_show_in_sunburst = ['Phylum', 'Class', 'Order']
    station_data = data_meta[(data_meta['Sta_ID'] == station_id) & (data_meta['sample_type'] == dropdown_sample_type)]
    station_samples = station_data['sampleid'].tolist()
    if dropdown_dataset == '16S':
        taxa_col_names = ['Domain', 'Phylum', 'Class', 'Order', 'Family', 'Genus', 'Species']

        # Merge with 16S dataframe to get taxonomy data for the samples
        asv_cols = pd.Series(data_16S.columns).isin(station_samples).values
        asv_cols[0] = True
        station_asvs = pd.concat([data_16S.loc[:,asv_cols], data_16S['silva_Taxon']], axis=1)

        # Get relative abundances
        values = station_asvs.drop(['Feature.ID', 'silva_Taxon'], axis=1).sum(axis=0)
        #values = values / values.sum()

        # Count occurrences of each taxonomy category
        taxonomies = station_asvs['silva_Taxon'].str.split('; ', expand=True)
        taxonomies.columns = taxa_col_names
        taxonomies = taxonomies.dropna(subset=cols_show_in_sunburst[0]).fillna('___Undetermined')[cols_show_in_sunburst]

        # get rid of the silva d__, p__, etc prefixes
        for col in taxonomies.columns:
            taxonomies[col] = taxonomies[col].apply(lambda x: x[3:])

        # Get relative abundances
        taxonomies['values'] = values

        # set title of plot
        title = '16S Silva Taxonomy, Station "' + station_id + '"'
    elif dropdown_dataset == '18Sv4':
        taxa_col_names = ['Kingdom', 'Domain', 'Phylum', 'Class', 'Order', 'Family', 'Genus', 'Species']
        # Merge with 18Sv4 dataframe to get taxonomy data for the samples
        asv_cols = pd.Series(data_18Sv4.columns).isin(station_samples).values
        asv_cols[0] = True
        station_asvs = pd.concat([data_18Sv4.loc[:,asv_cols], data_18Sv4['pr2_Taxon']], axis=1)
        
        # Get relative abundances
        values = station_asvs.drop(['Feature.ID', 'pr2_Taxon'], axis=1).sum(axis=0)
        #values = values / values.sum()
        
        # Count occurrences of each taxonomy category
        taxonomies = station_asvs['pr2_Taxon'].str.split(';', expand=True)
        taxonomies = taxonomies.iloc[:, :8]
        taxonomies.columns = taxa_col_names
        taxonomies = taxonomies.dropna(subset='Phylum').fillna('Undetermined')[cols_show_in_sunburst]
        
        # Add relative abundances
        taxonomies['values'] = values
        
        # set title of plot
        title = '18S v4 PR2 Taxonomy, Station "' + station_id + '"'
        
    elif dropdown_dataset == '18Sv9':
        taxa_col_names = ['Kingdom', 'Domain', 'Phylum', 'Class', 'Order', 'Family', 'Genus', 'Species']
        
        # Merge with 18Sv9 dataframe to get taxonomy data for the samples
        asv_cols = pd.Series(data_18Sv9.columns).isin(station_samples).values
        asv_cols[0] = True
        station_asvs = pd.concat([data_18Sv9.loc[:,asv_cols], data_18Sv9['pr2_Taxon']], axis=1)
        
        # Get relative abundances
        values = station_asvs.drop(['Feature.ID', 'pr2_Taxon'], axis=1).sum(axis=0)
        #values = values / values.sum()
        
        # Count occurrences of each taxonomy category
        taxonomies = station_asvs['pr2_Taxon'].str.split(';', expand=True)
        taxonomies = taxonomies.iloc[:, :8]
        taxonomies.columns = taxa_col_names
        taxonomies = taxonomies.dropna(subset='Phylum').fillna('Undetermined')[cols_show_in_sunburst]
        
        # Add relative abundances
        taxonomies['values'] = values
        
        # set title of plot
        title = '18S v9 PR2 Taxonomy, Station "' + station_id + '"'

    fig = px.sunburst(taxonomies, path=['Phylum', 'Class', 'Order'], values='values', title=title)
    return fig

app.run_server(debug=True)



SyntaxError: invalid syntax (<ipython-input-258-8ded30cd9efa>, line 182)

In [237]:
testing

('DCM',
 {'points': [{'curveNumber': 0,
    'pointNumber': 81,
    'pointIndex': 81,
    'lon': -122.919264,
    'lat': 30.1766,
    'cluster.color': 109.27527107600001,
    'hovertext': '<b>Station: </b>093.3 110.0',
    'marker.color': 109.27527107600001,
    'bbox': {'x0': 250.58165279109159,
     'x1': 252.58165279109159,
     'y0': 664.649583672675,
     'y1': 666.649583672675},
    'customdata': ['093.3 110.0']}]},
 '16S')

In [1]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import dash
from dash import dcc, html
from dash.dependencies import Input, Output
import json
import dash_bootstrap_components as dbc


px.set_mapbox_access_token('pk.eyJ1IjoibWFzb3JlbnMiLCJhIjoiY2x0ZHhpY2JsMGJwajJ2c2JkY2pwNnZvYyJ9.0BfLEl_lfk7-rs-96XghMQ')

# Step 2: Load the dataset, population data, and GeoJSON file
data_16S = pd.read_csv('NCOG_21_16S_redo2_asv_count_tax.tsv', sep='\t')
data_18Sv4 = pd.read_csv('NCOG_18sV4_asv_count_tax.tsv', sep='\t')
data_18Sv9 = pd.read_csv('NCOG_18sV9_asv_count_tax_S.tsv', sep='\t')
data_meta = pd.read_csv('NCOG_sample_log_DNA_stvx_meta_2014-2020_mod.tsv', sep='\t')

# Create map graph
# Create scattergeo trace for stations
trace = go.Scattergeo(
    lon=meta['Lon_Dec'],
    lat=meta['Lat_Dec'],
    text=meta['Sta_ID'],
    mode='markers',
    marker=dict(
        size=10,
        color='rgb(255, 0, 0)',
        line=dict(
            width=3,
            color='rgba(68, 68, 68, 0)'
        )
    ),
    hoverinfo='text'
)

    # Create layout for the map
    layout = go.Layout(
        title='Ocean Off the California Coast',
        geo=dict(
            scope='usa',
            projection=dict(type='albers usa'),
            showland=True,
            landcolor='rgb(217, 217, 217)',
            subunitwidth=1,
            countrywidth=1,
            subunitcolor='rgb(255,255,255)',
            countrycolor='rgb(255,255,255)'
        )
    )

    # Create the map figure
    fig = go.Figure(data=[trace], layout=layout)

    return fig
# Step 3: Create a Dash application
app = dash.Dash(__name__, external_stylesheets=[dbc.themes.BOOTSTRAP])

# Step 4: Create dropdown menu for sector selection
sector_options = [{'label': sector, 'value': sector} for sector in co2_data['Sector'].unique()]

# Step 5: Define layout
app.layout = dbc.Container([
    html.H1("NCOG Data", className="mt-4 mb-4"),
    dbc.Row([
        dbc.Col([
            html.Div([
                html.Label("Select Sector"),
                dcc.Dropdown(
                    id='sector-dropdown',
                    options=sector_options,
                    value='Electric Power'
                ),
                html.Label("Select Year"),
                dcc.Slider(
                    id='year-slider',
                    min=co2_data['Year'].min(),
                    max=co2_data['Year'].max(),
                    value=co2_data['Year'].max(),
                    marks={str(year): str(year) for year in co2_data['Year'].unique()},
                    step=None
                ),
            ], className="mb-4")
        ], width=3),
        dbc.Col([
            dcc.Graph(id='co2-heatmap')
        ], width=6),
        dbc.Col([
            dcc.Graph(id='state-lineplot')
        ], width=3)
    ])
], fluid=True)

# Step 6: Define callbacks to update the heatmap and line plot based on user selection
@app.callback(
    [Output('co2-heatmap', 'figure'),
     Output('state-lineplot', 'figure')],
    [Input('sector-dropdown', 'value'),
     Input('year-slider', 'value'),
     Input('co2-heatmap', 'hoverData')]
)
def update_visualizations(selected_sector, selected_year, hoverData):
    # California coast NCOG sample map
    def create_map_graph():
    # Create scattergeo trace for stations
    trace = go.Scattergeo(
        lon=meta['Lon_Dec'],
        lat=meta['Lat_Dec'],
        text=meta['Sta_ID'],
        mode='markers',
        marker=dict(
            size=10,
            color='rgb(255, 0, 0)',
            line=dict(
                width=3,
                color='rgba(68, 68, 68, 0)'
            )
        ),
        hoverinfo='text'
    )

    # Create layout for the map
    layout = go.Layout(
        title='Ocean Off the California Coast',
        geo=dict(
            scope='usa',
            projection=dict(type='albers usa'),
            showland=True,
            landcolor='rgb(217, 217, 217)',
            subunitwidth=1,
            countrywidth=1,
            subunitcolor='rgb(255,255,255)',
            countrycolor='rgb(255,255,255)'
        )
    )

    # Create the map figure
    fig = go.Figure(data=[trace], layout=layout)

    return fig
    # Update line plot
    if hoverData is not None:
        state_name = hoverData['points'][0]['location']
        state_data = filtered_data[filtered_data['State'] == state_name]
        fig_lineplot = px.line(state_data, x='Year', y='Value', title=f'CO2 Emissions over Time for {state_name} - {selected_sector}')
        # Calculate average emissions over time
        avg_emissions = filtered_data.groupby('Year')['Value'].mean().reset_index()
        fig_lineplot.add_scatter(x=avg_emissions['Year'], y=avg_emissions['Value'], mode='lines', name='Average All States', line=dict(color='black'))

        # Create scatter trace using Plotly Express
        scatter_trace = px.scatter(state_data, x='Year', y='Value', color='Sector', title=f'Sector-wise CO2 Emissions over Time for {state_name}')
        scatter_trace = scatter_trace.update_traces(marker=dict(size=8))

        # Convert Plotly Express figure to Plotly graph object
        scatter_trace_json = scatter_trace.to_plotly_json()

        # Append scatter trace to the line plot
        for trace in scatter_trace_json['data']:
            fig_lineplot.add_trace(go.Scatter(trace))

    else:
        fig_lineplot = px.line(title='Hover over a state on the heatmap to see its emissions over time')

    return fig_heatmap, fig_lineplot

if __name__ == '__main__':
    app.run_server(debug=True)


SyntaxError: invalid syntax (<ipython-input-1-c57538dfa3b0>, line 13)