In [None]:
# Cumulative (Table)
import os
import re
import pandas as pd
from collections import Counter

# --- Input ------------------------------------------------------------------
# File name of the corpus; it is looked up inside the 'in' folder (see Paths)
# and read as a tab-separated table further down.
corpus = 'tp3p1.csv'

# --- Terms to visualize -----------------------------------------------------
# Only the uncommented entries are counted. The commented-out entries are
# alternative terms that can be switched on by removing the leading '#'.
specific_terms = [
    # 'artensterben',
    # 'bienensterben',
    # 'insektensterben',
    # 'waldsterben',
    # 'waldschäden',
    # 'baumsterben',
    # 'fichtensterben',
    # 'tannensterben',
    # 'klimawandel',
    # 'klimawandel',
    # 'umweltschutz',
    # 'nachhaltigkeit',
    # 'skeptiker',
    'klimaskeptiker',
    'klimawandelskeptiker',
    # 'leugner',
    'klimaleugner',
    'klimawandelleugner',
]

# --- Filters ----------------------------------------------------------------
# Maps a column name to the list of values a row may have in that column.
filters = {
    'text_text_type': ['zeitung', 'wochenzeitung'],
    # 'another_column': ['value1', 'value2'],
}

# --- Paths ------------------------------------------------------------------
# Everything is resolved relative to the current working directory.
cwd = os.getcwd()
directory_path = os.path.join(cwd, 'in')    # folder holding the corpus file
outputDir = os.path.join(cwd, 'out/csv')    # folder receiving the result table

# Preprocess text
def preprocess_text(text):
    """Lowercase ``text`` and reduce it to word tokens separated by single spaces.

    Every run of non-word characters (punctuation, whitespace, ...) is replaced
    by one space, so the result can be tokenized with ``str.split()``.

    Args:
        text: The raw cell value. Non-string values are converted with ``str()``.

    Returns:
        The normalized string. Missing values (``None`` or a float NaN) yield
        an empty string.
    """
    # A missing cell would otherwise be stringified to the literal word
    # "none"/"nan" and be counted as one token in the word totals.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    # `\W` also matches whitespace, so a single `\W+` pass is equivalent to
    # replacing each non-word character and then collapsing whitespace runs.
    return re.sub(r'\W+', ' ', str(text).lower())

# Apply filters to the chunk
def apply_filters(chunk, filters):
    """Keep only the rows of ``chunk`` that satisfy every applicable filter.

    Args:
        chunk: DataFrame to filter.
        filters: Mapping of column name -> collection of accepted values.
            Entries whose column does not exist in ``chunk`` are ignored.

    Returns:
        The filtered DataFrame; ``chunk`` itself when no filter applies.
    """
    # Row filtering never changes the set of columns, so the applicable
    # filters can be determined once up front.
    applicable = [
        (name, accepted)
        for name, accepted in filters.items()
        if name in chunk.columns
    ]

    filtered = chunk
    for name, accepted in applicable:
        keep_row = filtered[name].isin(accepted)
        filtered = filtered[keep_row]
    return filtered

# Calculate absolute and relative frequencies for a chunk
def calculate_frequencies(chunk, specific_terms):
    """Count the given terms and the total number of words per year in a chunk.

    The text of each row is normalized with ``preprocess_text`` and split on
    whitespace. The year of a row is taken from the first four characters of
    its ``text_date`` value.

    Args:
        chunk: DataFrame with the columns ``text_content`` and ``text_date``.
            It is not modified.
        specific_terms: Iterable of (already lowercased) terms to count.

    Returns:
        A tuple ``(term_frequencies, total_word_counts)``:
        ``term_frequencies`` maps year (int) -> ``Counter`` of term -> count,
        with an entry (possibly 0) for every requested term;
        ``total_word_counts`` maps year (int) -> number of tokens in that year.
        Rows whose ``text_date`` does not start with a number are skipped.
    """
    # Work on derived Series instead of adding helper columns to `chunk`:
    # the caller passes a filtered slice, and assigning columns to it mutates
    # the caller's data and triggers pandas' SettingWithCopyWarning.
    clean_text = chunk['text_content'].apply(preprocess_text)

    # `astype(str)` lets the slice work for non-string date columns too;
    # anything that is not numeric (e.g. a missing date) becomes NaN instead
    # of aborting the whole run.
    years = pd.to_numeric(chunk['text_date'].astype(str).str[:4], errors='coerce')
    has_year = years.notna().to_numpy()
    year_keys = years[has_year].astype(int).to_numpy()

    term_frequencies = {}
    total_word_counts = {}
    # sort=False keeps the years in order of first appearance.
    for year, texts in clean_text[has_year].groupby(year_keys, sort=False):
        # Count row by row rather than joining a whole year into one string.
        word_counts = Counter()
        for text in texts:
            word_counts.update(text.split())

        year = int(year)
        total_word_counts[year] = sum(word_counts.values())
        term_frequencies[year] = Counter(
            {term: word_counts.get(term, 0) for term in specific_terms}
        )
    return term_frequencies, total_word_counts

# Prepare data for saving
def prepare_data_for_saving(term_frequencies, total_word_counts):
    """Flatten the per-year counts into a long-format table.

    Args:
        term_frequencies: Mapping year -> mapping term -> absolute count.
        total_word_counts: Mapping year -> total number of words in that year.
            Must contain every year present in ``term_frequencies``.

    Returns:
        DataFrame with one row per (year, term) and the columns ``year``,
        ``term``, ``absolute_frequency`` and ``relative_frequency_per_million``.
        The columns are present even when there is no data.

    Raises:
        KeyError: If a year of ``term_frequencies`` is missing from
            ``total_word_counts``.
    """
    columns = ['year', 'term', 'absolute_frequency', 'relative_frequency_per_million']

    rows = []
    for year, frequencies in term_frequencies.items():
        total_words = total_word_counts[year]
        for term, frequency in frequencies.items():
            # Occurrences per one million words; 0 for a year without words
            # to avoid a division by zero.
            relative_frequency = (frequency / total_words) * 1e6 if total_words > 0 else 0
            rows.append({
                'year': year,
                'term': term,
                'absolute_frequency': frequency,
                'relative_frequency_per_million': relative_frequency,
            })

    # Passing `columns` guarantees the schema for an empty result, so callers
    # can access e.g. df['year'] without a KeyError.
    return pd.DataFrame(rows, columns=columns)

# Outfile named after specific_terms
terms_concatenated = '_'.join(specific_terms)

# Cumulative term frequencies and total word counts across the file,
# both keyed by year
cumulative_frequencies = {}
cumulative_word_counts = {}

# Number of rows per chunk when reading the CSV file
chunk_size = 100000

file_path = os.path.join(directory_path, corpus)
print(f"Input: {file_path}")

# Read the (tab-separated) file in chunks so it never has to fit in memory at once
for chunk in pd.read_csv(file_path, delimiter='\t', chunksize=chunk_size):
    # Apply filters to the chunk
    chunk = apply_filters(chunk, filters)

    if chunk.empty:
        continue

    term_frequencies, total_word_counts = calculate_frequencies(chunk, specific_terms)

    # Add this chunk's per-year counts to the running totals
    for year, frequencies in term_frequencies.items():
        if year not in cumulative_frequencies:
            cumulative_frequencies[year] = Counter()
            cumulative_word_counts[year] = 0
        cumulative_frequencies[year].update(frequencies)
        cumulative_word_counts[year] += total_word_counts[year]

# Convert cumulative frequencies to DataFrame
print("Converting cumulative frequencies to DataFrame")
cumulative_plot_df = prepare_data_for_saving(cumulative_frequencies, cumulative_word_counts)
# When no row survived the filters there may be no 'year' column to convert.
if not cumulative_plot_df.empty:
    cumulative_plot_df['year'] = cumulative_plot_df['year'].astype(int)
    # Sorting by term as well makes the row order within a year deterministic.
    cumulative_plot_df = cumulative_plot_df.sort_values(by=['year', 'term'])

# Save the results into table (TSV).
# The '.tsv' extension matters: the plotting cell only picks up files whose
# name ends in 'Occurrences.tsv'.
os.makedirs(outputDir, exist_ok=True)
occurrences_output_filename = f'{terms_concatenated}_Occurrences.tsv'
occurrences_output_path = os.path.join(outputDir, occurrences_output_filename)
cumulative_plot_df.to_csv(occurrences_output_path, sep='\t', index=False)
print(f"Table saved to {occurrences_output_path}")


Input: e:\Frequency_Timelines\in\tp3p1.csv
Converting cumulative frequencies to DataFrame
Table saved to e:\Frequency_Timelines\out/csv\klimaskeptiker_klimawandelskeptiker_klimaleugner_klimawandelleugner_Occurrences.tsv


In [2]:
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
import os

# Paths
cwd = os.getcwd()
input_dir = os.path.join(cwd, 'out/csv')
html_output_dir = os.path.join(cwd, 'out/html/')
# write_html does not create missing folders
os.makedirs(html_output_dir, exist_ok=True)

# Inclusive range of years shown in the plots
YEAR_MIN, YEAR_MAX = 1990, 2020

# Trace & marker dimensions
LINE_WIDTH = 2
ENDPOINT_MARKER_SIZE = 8

# Suffix identifying the tables written by the counting cell
OCCURRENCES_SUFFIX = 'Occurrences.tsv'


def build_frequency_figure(plot_df, y_column, y_label, color_sequence):
    """Build a line chart with one line per term and a label at each line's end.

    Args:
        plot_df: Long-format DataFrame with the columns 'year', 'term' and
            ``y_column``.
        y_column: Name of the column plotted on the y axis.
        y_label: Title shown on the y axis.
        color_sequence: List of colors; terms cycle through it in order of
            first appearance.

    Returns:
        The assembled ``plotly.graph_objects.Figure``.
    """
    fig = go.Figure()
    annotations = []

    for i, term in enumerate(plot_df['term'].unique()):
        # Sort by year so that "first" and "last" really are the end points.
        term_data = plot_df[plot_df['term'] == term].sort_values('year')
        x_data = term_data['year']
        y_data = term_data[y_column]
        color = color_sequence[i % len(color_sequence)]
        label = term.capitalize()

        # Line trace for the term
        fig.add_trace(go.Scatter(
            x=x_data, y=y_data, mode='lines',
            name=label,
            line=dict(color=color, width=LINE_WIDTH),
            connectgaps=True,
        ))

        # Markers for the two end points of the line
        fig.add_trace(go.Scatter(
            x=[x_data.iloc[0], x_data.iloc[-1]],
            y=[y_data.iloc[0], y_data.iloc[-1]],
            mode='markers',
            name=label,
            marker=dict(color=color, size=ENDPOINT_MARKER_SIZE),
        ))

        # Text label just right of the plot area, at the height of the last
        # value; it replaces the (hidden) legend.
        annotations.append(dict(
            xref='paper', x=1.01, y=y_data.iloc[-1],
            xanchor='left', yanchor='middle',
            text=label,
            font=dict(family='Arial', size=16, color=color),
            showarrow=False,
        ))

    # Custom layout styling
    fig.update_layout(
        width=1100,
        xaxis=dict(
            showline=True,
            showgrid=True,
            gridcolor='rgba(200, 200, 200, 0.3)',
            showticklabels=True,
            linecolor='rgb(204, 204, 204)',
            linewidth=2,
            ticks='outside',
            tickmode='array',  # explicit ticks: every second year
            tickvals=list(range(YEAR_MIN, YEAR_MAX + 1, 2)),
            tickfont=dict(
                family='Arial',
                size=12,
                color='rgb(82, 82, 82)',
            ),
        ),
        yaxis=dict(
            # Without a title the absolute and relative plots look alike.
            title=dict(text=y_label),
            showgrid=True,
            gridcolor='rgba(200, 200, 200, 0.3)',
            zeroline=False,
            showline=False,
            showticklabels=True,
            tickfont=dict(family='Arial', size=12, color='rgb(82, 82, 82)'),
        ),
        autosize=False,
        margin=dict(
            autoexpand=False,
            l=100,
            r=200,  # room for the right-side term labels
            t=110,
        ),
        plot_bgcolor='white',
        showlegend=False,  # the annotations act as the legend
        annotations=annotations,
    )
    return fig


# Collect all tables in input_dir whose name ends with "Occurrences.tsv"
# (sorted for a reproducible processing order).
if os.path.isdir(input_dir):
    files = sorted(f for f in os.listdir(input_dir) if f.endswith(OCCURRENCES_SUFFIX))
else:
    files = []

if files:
    # Use a Plotly built-in color scheme
    color_sequence = px.colors.qualitative.D3

    for occurrences_output_filename in files:
        print(f"Processing file: {occurrences_output_filename}")
        cumulative_plot_df = pd.read_csv(os.path.join(input_dir, occurrences_output_filename), sep='\t')

        # Restrict the data to the plotted year range
        cumulative_plot_df = cumulative_plot_df[
            cumulative_plot_df['year'].between(YEAR_MIN, YEAR_MAX)
        ]

        if cumulative_plot_df.empty:
            print(f"No data between {YEAR_MIN} and {YEAR_MAX} in {occurrences_output_filename}; skipped.")
            continue

        # One plot per frequency type: (column in the table, human-readable label)
        for y_column, y_label in [
            ('absolute_frequency', 'Absolute Frequency'),
            ('relative_frequency_per_million', 'Relative Frequency per Million'),
        ]:
            fig = build_frequency_figure(cumulative_plot_df, y_column, y_label, color_sequence)

            # Save the plot as a self-contained HTML file
            output_filename = occurrences_output_filename.replace(
                OCCURRENCES_SUFFIX, f'{y_column}_Cumulative.html'
            )
            output_path = os.path.join(html_output_dir, output_filename)
            fig.write_html(file=output_path, include_plotlyjs=True)
            print(f"{y_label} Visualization saved to {output_path}")

else:
    print("No 'Occurrences.tsv' files found in the specified directory.")


Processing file: klimaskeptiker_klimawandelskeptiker_klimaleugner_klimawandelleugner_Occurrences.tsv
Absolute Frequency Visualization saved to e:\Frequency_Timelines\out/html/klimaskeptiker_klimawandelskeptiker_klimaleugner_klimawandelleugner_absolute_frequency_Cumulative.html
Relative Frequency per Million Visualization saved to e:\Frequency_Timelines\out/html/klimaskeptiker_klimawandelskeptiker_klimaleugner_klimawandelleugner_relative_frequency_per_million_Cumulative.html
Processing file: klimawandel_umweltschutz_nachhaltigkeit_Occurrences.tsv
Absolute Frequency Visualization saved to e:\Frequency_Timelines\out/html/klimawandel_umweltschutz_nachhaltigkeit_absolute_frequency_Cumulative.html
Relative Frequency per Million Visualization saved to e:\Frequency_Timelines\out/html/klimawandel_umweltschutz_nachhaltigkeit_relative_frequency_per_million_Cumulative.html
Processing file: skeptiker_klimaskeptiker_klimawandelskeptiker_leugner_klimaleugner_klimawandelleugner_Occurrences.tsv
Absolut