In [1]:
# Import necessary libraries
import os
import pandas as pd
import plotly.express as px
from collections import Counter
from datetime import datetime
import dash
from dash import dcc, html
from dash.dependencies import Input, Output
import re

In [5]:
#AiO
import os
import pandas as pd
import re
from collections import Counter
import plotly.graph_objects as go

# Set directory paths
# Built with os.path.join instead of hard-coded '\\' so the notebook also runs
# outside Windows. The final '' component keeps the trailing path separator
# that the original string concatenation produced.
cwd = os.getcwd()
directory_path = os.path.join(cwd, 'in', '')
outputDir = os.path.join(cwd, 'out', '')

# List of specific terms you want to visualize
# (compared against lower-cased tokens, so keep the entries lower-case)
specific_terms = ['artensterben', 'bienensterben', 'insektensterben', 'höfesterben', 'waldsterben', 'fischsterben', 'baumsterben']

def preprocess_text(text):
    """Normalise a document to lower-case, space-separated word tokens.

    Args:
        text: The raw document. Non-string values are converted with ``str``.

    Returns:
        The lower-cased text in which every character outside the regex word
        class is replaced by a space and each whitespace run is collapsed to
        one space. Leading/trailing spaces are not stripped. Missing values
        (``None`` or a float NaN) yield ``''``.
    """
    # A missing cell would otherwise be stringified to 'none' / 'nan' and be
    # counted as a real word in the per-year totals.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = re.sub(r'\W', ' ', str(text).lower())
    text = re.sub(r'\s+', ' ', text)
    return text

def calculate_frequencies(df, specific_terms):
    """Compute per-year relative frequencies of the given terms.

    Args:
        df: DataFrame with a 'text_content' column (document text) and a
            'text_date' column parseable by ``pd.to_datetime``. The frame is
            not modified.
        specific_terms: Iterable of terms to look up. A term matches only
            tokens that are exactly equal to it after ``preprocess_text``
            (which lower-cases the text).

    Returns:
        dict mapping year (int) -> {term: occurrences per million tokens of
        that year's text}. Years whose text contains no tokens are omitted.
    """
    # Derive the helper columns as standalone Series instead of writing them
    # into the caller's DataFrame.
    clean_text = df['text_content'].apply(preprocess_text)
    years = pd.to_datetime(df['text_date']).dt.year

    term_frequencies = {}
    # One pass over the rows; sort=False keeps the years in order of first
    # appearance. Rows without a year (NaT dates) are left out by groupby.
    for year, texts in clean_text.groupby(years, sort=False):
        word_counts = Counter(' '.join(texts).split())
        total_words = sum(word_counts.values())
        if total_words > 0:  # Ensure there is text to process
            term_frequencies[int(year)] = {
                term: (word_counts.get(term, 0) / total_words) * 1e6
                for term in specific_terms
            }
    return term_frequencies

def prepare_data_for_plotly(term_frequencies, source_file):
    """Flatten the nested frequency dict into a long-format DataFrame.

    Args:
        term_frequencies: dict mapping year -> {term: frequency per million}.
        source_file: Value stored in the 'source' column of every row.

    Returns:
        DataFrame with the columns 'year', 'term', 'frequency_per_million'
        and 'source', one row per (year, term) pair. The columns are present
        even when ``term_frequencies`` is empty.
    """
    columns = ['year', 'term', 'frequency_per_million', 'source']
    plot_data = [
        {'year': year, 'term': term, 'frequency_per_million': frequency, 'source': source_file}
        for year, frequencies in term_frequencies.items()
        for term, frequency in frequencies.items()
    ]
    # Passing the columns explicitly keeps the schema stable for empty input,
    # so later column look-ups do not fail with a KeyError.
    return pd.DataFrame(plot_data, columns=columns)

# Read every .csv file (tab-separated) in the input directory and compute its
# per-year term frequencies; each file becomes one 'source' in the result.
source_frames = []

# sorted() makes the order of the sources (and therefore of the plot traces)
# reproducible; os.listdir returns entries in arbitrary order.
for file_name in sorted(os.listdir(directory_path)):
    if not file_name.endswith('.csv'):
        continue
    file_path = os.path.join(directory_path, file_name)
    df = pd.read_csv(file_path, delimiter='\t')
    term_frequencies = calculate_frequencies(df, specific_terms)
    plot_df = prepare_data_for_plotly(term_frequencies, file_name)
    source_frames.append(plot_df)

# Concatenate once at the end instead of growing the frame inside the loop.
# Without any input file the result is an empty DataFrame, as before.
all_data = pd.concat(source_frames, ignore_index=True) if source_frames else pd.DataFrame()

# Create a figure: one line per (term, source) pair plus a dropdown that
# restricts the view to the traces of a single term.
fig = go.Figure()

terms = all_data['term'].unique()
sources = all_data['source'].unique()

# Term of every trace, in the order the traces are added. The dropdown masks
# are built from this list rather than from trace.customdata[0], which would
# raise an IndexError for a trace that has no data points.
trace_terms = []

# Add traces for each term or source
for term in terms:
    filtered_df = all_data[all_data['term'] == term]
    for source in sources:
        df_by_source = filtered_df[filtered_df['source'] == source]
        df_by_source = df_by_source.sort_values('year')  # Sort by year to ensure correct line plotting
        fig.add_trace(go.Scatter(x=df_by_source['year'], y=df_by_source['frequency_per_million'],
                                 mode='lines+markers',
                                 name=source.split(".")[0],  # Use only source name
                                 customdata=[term]*len(df_by_source),  # Add custom data for filtering
                                 visible=True))
        trace_terms.append(term)

# Generate dropdown items for each term: selecting a term shows exactly the
# traces that belong to it and updates the title.
dropdown_items = [
    {'label': term,
     'method': 'update',
     'args': [{'visible': [trace_term == term for trace_term in trace_terms]},
              {'title': f'Term Frequencies: {term}'}]}
    for term in terms
]

# Add dropdown
fig.update_layout(
    updatemenus=[
        go.layout.Updatemenu(
            buttons=dropdown_items,
            direction="down",
            pad={"r": 10, "t": 10},
            showactive=True,
            x=0.1,
            xanchor="left",
            y=1.05,
            yanchor="top"
        ),
    ]
)

# Update layout
fig.update_layout(title_text="Term Frequencies Visualization",
                  title_x=0.5,  # Center the title
                  title_y=0.95,
                  xaxis=dict(title='Year'),
                  yaxis=dict(title='Frequency per Million Words'))

# Show figure
fig.show()

# Save figure as HTML
# os.path.join keeps the path valid on every platform, and the directory is
# created first so a fresh checkout does not fail on the write.
html_dir = os.path.join(os.getcwd(), 'out')
os.makedirs(html_dir, exist_ok=True)
fig.write_html(file=os.path.join(html_dir, 'timeline_term.html'), include_plotlyjs=True)

In [13]:
#Solo Generate Figure
# Rebuilds the term timeline from the existing 'all_data' frame without
# re-reading the input files.

# Create a figure
fig = go.Figure()

term_values = all_data['term'].unique()
source_values = all_data['source'].unique()

# Remember which term each trace belongs to (same order as fig.data) so the
# dropdown masks do not depend on trace.customdata[0], which fails with an
# IndexError when a trace has no points.
term_of_trace = []

# Add traces for each term or source
for term in term_values:
    filtered_df = all_data[all_data['term'] == term]
    for source in source_values:
        df_by_source = filtered_df[filtered_df['source'] == source]
        df_by_source = df_by_source.sort_values('year')  # Sort by year to ensure correct line plotting
        fig.add_trace(go.Scatter(x=df_by_source['year'], y=df_by_source['frequency_per_million'],
                                 mode='lines+markers',
                                 name=source.split(".")[0],  # Use only source name
                                 customdata=[term]*len(df_by_source),  # Add custom data for filtering
                                 visible=True,
                                 #line=dict(width=3)
                                 ))
        term_of_trace.append(term)

# Generate dropdown items for each term
dropdown_items = []
for term in term_values:
    visibility_mask = [owner == term for owner in term_of_trace]
    dropdown_items.append(
        {'label': term,
         'method': 'update',
         'args': [{'visible': visibility_mask},
                  {'title': f'Term Frequencies: {term}'}]})

# Add dropdown
fig.update_layout(
    updatemenus=[
        go.layout.Updatemenu(
            buttons=dropdown_items,
            direction="down",
            pad={"r": 10, "t": 10},
            showactive=True,
            x=0.1,
            xanchor="left",
            y=1.1,
            yanchor="top"
        )
    ],
    legend=dict(
        font=dict(
            size=16,  # Adjust font size for legend labels
            #family="Arial, bold"
        )
    )
)

# Update layout
fig.update_layout(title_text="Term Frequencies Visualization",
                  title_x=0.5,  # Center the title
                  title_y=0.95,
                  xaxis=dict(title='Year'),
                  yaxis=dict(title='Frequency per Million Words'))

# Show figure
fig.show()

# Save figure as HTML
# Portable path; the output directory is created if it does not exist yet.
timeline_dir = os.path.join(os.getcwd(), 'out')
os.makedirs(timeline_dir, exist_ok=True)
fig.write_html(file=os.path.join(timeline_dir, 'timeline_term.html'), include_plotlyjs=True)

In [1]:
import plotly.graph_objects as go
import os

# Assuming 'all_data' is your prepared DataFrame (built by the loading cell;
# running this cell on a fresh kernel raises NameError for 'all_data').

# Define the sources to be preselected
# Sources not listed here start as 'legendonly': hidden in the plot, but
# still clickable in the legend.
preselected_sources = ['FAZ', 'taz', 'PP']

# Create a figure
fig = go.Figure()

# Per-trace bookkeeping, in the order the traces are added to the figure.
# Trace names contain only the source, so the term has to be tracked here.
trace_terms = []
trace_visibility = []

# Add traces for each term or source
for term in all_data['term'].unique():
    filtered_df = all_data[all_data['term'] == term]
    for source in all_data['source'].unique():
        df_by_source = filtered_df[filtered_df['source'] == source]
        df_by_source = df_by_source.sort_values('year')  # Sort by year to ensure correct line plotting
        source_name = source.split(".")[0]
        visible = True if source_name in preselected_sources else 'legendonly'
        fig.add_trace(go.Scatter(x=df_by_source['year'], y=df_by_source['frequency_per_million'],
                                 mode='lines+markers',
                                 name=source_name,
                                 visible=visible))
        trace_terms.append(term)
        trace_visibility.append(visible)

# Generate dropdown items for each term
# The previous mask tested whether the trace name ends with the term, which
# never matches a source-only name, so every button hid all traces. Traces of
# the chosen term now get their preselection state; all others are hidden.
dropdown_items = []
for term in all_data['term'].unique():
    dropdown_items.append(
        {'label': term,
         'method': 'update',
         'args': [{'visible': [state if trace_term == term else False
                               for trace_term, state in zip(trace_terms, trace_visibility)]},
                  {'title': f'Term Frequencies: {term}'}]})

# Add dropdown
fig.update_layout(
    updatemenus=[
        go.layout.Updatemenu(
            buttons=dropdown_items,
            direction="down",
            pad={"r": 10, "t": 10},
            showactive=True,
            x=0.1,
            xanchor="left",
            y=1.05,
            yanchor="top"
        ),
    ]
)

# Update layout
fig.update_layout(title_text="Term Frequencies Visualization",
                  title_x=0.5,  # Center the title
                  title_y=0.95,
                  xaxis=dict(title='Year'),
                  yaxis=dict(title='Frequency per Million Words'))

# Show figure
fig.show()

# Save figure as HTML
# Portable path; the output directory is created if it does not exist yet.
preselect_out_dir = os.path.join(os.getcwd(), 'out')
os.makedirs(preselect_out_dir, exist_ok=True)
fig.write_html(file=os.path.join(preselect_out_dir, 'timeline_term.html'), include_plotlyjs=True)

NameError: name 'all_data' is not defined

In [6]:
import plotly.graph_objects as go
import os

# Assuming 'all_data' is your prepared DataFrame

# Create a figure: one line per (source, term) pair plus a dropdown that
# restricts the view to the traces of a single source.
fig = go.Figure()

# Source file of every trace, in the order the traces are added. The dropdown
# masks compare against this list; matching on the end of the trace name
# would also select sources whose name merely ends with another source's name.
trace_sources = []

# Add traces for each source or term
for source in all_data['source'].unique():
    filtered_df = all_data[all_data['source'] == source]
    source_label = source.split(".")[0]
    for term in filtered_df['term'].unique():
        df_by_term = filtered_df[filtered_df['term'] == term]
        df_by_term = df_by_term.sort_values('year')  # Sort by year to ensure correct line plotting
        fig.add_trace(go.Scatter(x=df_by_term['year'], y=df_by_term['frequency_per_million'],
                                 mode='lines+markers',
                                 name=f'{term} - {source_label}',
                                 visible=True))
        trace_sources.append(source)

# Generate dropdown items for each source
dropdown_items = []
for source in all_data['source'].unique():
    dropdown_items.append(
        {'label': source.split(".")[0],  # Remove file extension from the label
         'method': 'update',
         'args': [{'visible': [trace_source == source for trace_source in trace_sources]},
                  {'title': f'Source Frequencies: {source.split(".")[0]}'}]})

# Add dropdown
fig.update_layout(
    updatemenus=[
        go.layout.Updatemenu(
            buttons=dropdown_items,
            direction="down",
            pad={"r": 10, "t": 10},
            showactive=True,
            x=0.1,
            xanchor="left",
            y=1.05,
            yanchor="top"
        ),
    ]
)

# Update layout
fig.update_layout(title_text="Source Frequencies Visualization",
                  title_x=0.5,  # Center the title
                  title_y=0.95,
                  xaxis=dict(title='Year'),
                  yaxis=dict(title='Frequency per Million Words'))

# Show figure
fig.show()

# Save figure as HTML
# The path is derived from os.getcwd() so this cell does not depend on the
# 'cwd' variable of another cell, and it is joined portably; the output
# directory is created if it does not exist yet.
interactive_out_dir = os.path.join(os.getcwd(), 'out')
os.makedirs(interactive_out_dir, exist_ok=True)
fig.write_html(file=os.path.join(interactive_out_dir, 'interactive_timeline.html'), include_plotlyjs=True)
# To save the figure to an HTML file, use:
# fig.write_html('path/to/save/interactive_plot.html')


In [None]:
app = dash.Dash(__name__)

# Sources offered by the filter; every one of them is selected initially.
available_sources = all_data['source'].unique().tolist()
source_options = [{'label': name, 'value': name} for name in available_sources]

# App layout: heading, multi-select source filter, and the graph it controls
source_filter = dcc.Dropdown(
    id='source-dropdown',
    options=source_options,
    value=available_sources,  # Default to all sources
    multi=True
)
frequency_graph = dcc.Graph(id='frequency-graph')

app.layout = html.Div([
    html.H1("Term Frequencies Visualization"),
    source_filter,
    frequency_graph
])

# Callback to update graph based on filters
@app.callback(
    Output('frequency-graph', 'figure'),
    [Input('source-dropdown', 'value')]
)
def update_figure(selected_sources):
    """Redraw the frequency graph for the sources chosen in the dropdown.

    Args:
        selected_sources: Value of the 'source-dropdown' component. A list of
            source names is expected; ``None`` is treated as "no source" and
            a single string as a one-element selection.

    Returns:
        A plotly figure with one line per (term, source) pair, coloured by
        term, limited to the selected sources.
    """
    # Series.isin only accepts list-likes; normalise the other shapes so the
    # callback does not raise a TypeError.
    if selected_sources is None:
        selected_sources = []
    elif isinstance(selected_sources, str):
        selected_sources = [selected_sources]

    filtered_df = all_data[all_data['source'].isin(selected_sources)]
    # Lines connect points in row order, so order the rows chronologically;
    # the stable sort keeps the original order of rows within a year.
    filtered_df = filtered_df.sort_values('year', kind='stable')
    fig = px.line(filtered_df, x='year', y='frequency_per_million', color='term', line_group='source', markers=True)
    fig.update_layout(transition_duration=500)
    return fig

if __name__ == '__main__':
    # Recent Dash releases replace app.run_server with app.run, while older
    # ones only provide run_server; use whichever this installation offers.
    start_server = app.run if hasattr(app, 'run') else app.run_server
    start_server(debug=True)