In [3]:
# Import necessary libraries
import os
import pandas as pd
import plotly.express as px
from collections import Counter
from datetime import datetime
import re

In [9]:
cwd = os.getcwd()

# Input and output directories, resolved relative to the current working
# directory. os.path.join uses the platform's own separator (the previous
# hard-coded '\\' only formed a real path on Windows); the trailing ''
# keeps a trailing separator so that plain string concatenation with a
# file name still yields a valid path.
directory_path = os.path.join(cwd, 'timeline', 'in', '')
outputDir = os.path.join(cwd, 'timeline', 'out', '')

# List of specific terms you want to visualize. They are written in lower
# case because the text is lower-cased before the words are counted.
specific_terms = ['artensterben', 'bienensterben', 'insektensterben', 'höfesterben', 'waldsterben', 'fischsterben', 'baumsterben']

# Function to preprocess text
def preprocess_text(text):
    """Normalise a raw text value for whitespace-based word counting.

    The value is lower-cased, every regex non-word character is replaced
    by a space, and each run of whitespace is collapsed to one space.
    Leading and trailing spaces are not stripped.

    Args:
        text: Value to clean. Non-string values are converted with ``str``.

    Returns:
        The cleaned string, or ``''`` when ``text`` is ``None`` or a float
        NaN (a missing cell).
    """
    # A missing cell must not become the literal token "none"/"nan":
    # that would add a bogus word to the counts. NaN is the only float
    # that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = re.sub(r'\W', ' ', str(text).lower())
    return re.sub(r'\s+', ' ', text)

# Function to calculate frequencies per million words
def calculate_frequencies(df, specific_terms):
    """Compute, per year, how often each term occurs per million words.

    Side effect: adds (or overwrites) the columns ``clean_text`` and
    ``year`` on ``df`` in place.

    Args:
        df: DataFrame with a ``text_content`` column (the raw text) and a
            ``text_date`` column (parsed with ``pd.to_datetime``).
        specific_terms: Iterable of terms to look up. A term is counted
            only when it equals a whole whitespace-separated word of the
            cleaned text.

    Returns:
        Dict mapping each year (``int``) to a dict
        ``{term: occurrences per million words}``. Every requested term is
        present for every reported year, with ``0.0`` when it does not
        occur. Rows without a date and years whose texts contain no words
        at all are left out.

    Raises:
        Whatever ``pd.to_datetime`` raises for values it cannot parse.
    """
    df['clean_text'] = df['text_content'].apply(preprocess_text)
    df['year'] = pd.to_datetime(df['text_date']).dt.year
    term_frequencies = {}
    # One pass over the frame instead of re-filtering it for every year.
    # groupby leaves out rows whose year is NaN (missing date); sort=False
    # keeps the years in order of first appearance.
    for year, texts in df.groupby('year', sort=False)['clean_text']:
        word_counts = Counter()
        for text in texts:
            word_counts.update(text.split())
        total_words = sum(word_counts.values())
        if total_words == 0:
            # No words at all for this year: a relative frequency is
            # undefined, so skip the year instead of dividing by zero.
            continue
        frequencies_per_million = {term: (word_counts.get(term, 0) / total_words) * 1e6 for term in specific_terms}
        if frequencies_per_million:  # Empty only when no terms were requested
            # The year column is float when any date is missing; report
            # plain ints either way.
            term_frequencies[int(year)] = frequencies_per_million
    return term_frequencies

# Function to prepare data for Plotly
def prepare_data_for_plotly(term_frequencies, source_file):
    """Flatten the nested per-year frequencies into a long-format DataFrame.

    Args:
        term_frequencies: Dict mapping a year to a dict
            ``{term: frequency per million words}``.
        source_file: Value stored in the ``source`` column of every row.

    Returns:
        DataFrame with one row per (year, term) pair and the columns
        ``year``, ``term``, ``frequency_per_million`` and ``source``.
        The columns are present even when ``term_frequencies`` is empty.
    """
    columns = ['year', 'term', 'frequency_per_million', 'source']
    plot_data = [
        {'year': year, 'term': term, 'frequency_per_million': frequency, 'source': source_file}
        for year, frequencies in term_frequencies.items()
        for term, frequency in frequencies.items()
    ]
    # Passing the columns explicitly keeps the schema stable for empty
    # input; without it an empty list gives a frame with no columns and
    # any later column access fails with a KeyError.
    return pd.DataFrame(plot_data, columns=columns)

# Disabled helper: the definition below sits inside a bare triple-quoted
# string, so it is evaluated as a string expression and never defines
# save_frequencies_to_file.
'''
# Function to save frequencies to individual files
def save_frequencies_to_file(plot_df, output_directory, source_file):
    base_filename = os.path.splitext(source_file)[0]  # Removes the extension
    output_filename = f"{base_filename}_frequencies.csv"
    output_path = os.path.join(output_directory, output_filename)
    plot_df.to_csv(output_path, index=False)
    print(f"Saved frequencies to {output_path}")
'''

def create_and_save_visualization(plot_df, source_file):
    """Plot the term frequencies as a line chart and write it to an HTML file.

    One line with markers is drawn per value of the ``term`` column, with
    ``year`` on the x axis and ``frequency_per_million`` on the y axis.

    Args:
        plot_df: DataFrame with the columns ``year``,
            ``frequency_per_million`` and ``term``.
        source_file: Name of the source file; shown in the chart title and
            used, with its extension replaced by ``.html``, as the output
            file name.

    The file is written to the module-level ``outputDir`` and the path is
    printed afterwards.
    """
    chart_title = f'Term Frequencies Per Million Words Over Years ({source_file})'
    axis_labels = {'frequency_per_million': 'Frequency per Million Words', 'year': 'Year'}
    figure = px.line(
        plot_df,
        x='year',
        y='frequency_per_million',
        color='term',
        line_group='term',
        markers=True,
        title=chart_title,
        labels=axis_labels,
    )
    figure.update_layout(
        xaxis_title='Year',
        yaxis_title='Frequency per Million Words',
        xaxis={'tickmode': 'linear'},
    )
    # Output file: same base name as the source file, extension .html
    stem, _ = os.path.splitext(source_file)
    target_path = outputDir + (stem + '.html')
    figure.write_html(file=target_path, include_plotlyjs=True)
    print(f"Visualization saved to {target_path}")

# Loop through each CSV file in the directory
# os.listdir returns entries in arbitrary order; sorting makes the
# processing order (and the printed log) deterministic.
for file_name in sorted(os.listdir(directory_path)):
    if not file_name.endswith('.csv'):
        continue
    file_path = os.path.join(directory_path, file_name)
    # The input files are read as tab-separated despite the .csv extension
    df = pd.read_csv(file_path, delimiter='\t')
    term_frequencies = calculate_frequencies(df, specific_terms)
    plot_df = prepare_data_for_plotly(term_frequencies, source_file=file_name)
    if plot_df.empty:
        # Nothing to plot (e.g. a file without rows); accessing the 'year'
        # column below would otherwise fail.
        print(f"No data to plot for {file_name}; skipped")
        continue
    plot_df['year'] = plot_df['year'].astype(int)
    plot_df = plot_df.sort_values(by=['year'])
    # Create and save visualization for each source text
    create_and_save_visualization(plot_df, file_name)

Visualization saved to e:\Workspaces\KoDi\timeline\out\BILD.html
Visualization saved to e:\Workspaces\KoDi\timeline\out\FAZ.html
Visualization saved to e:\Workspaces\KoDi\timeline\out\PP.html
Visualization saved to e:\Workspaces\KoDi\timeline\out\SPIEGEL.html
Visualization saved to e:\Workspaces\KoDi\timeline\out\SZ.html
Visualization saved to e:\Workspaces\KoDi\timeline\out\TZ.html
Visualization saved to e:\Workspaces\KoDi\timeline\out\ZEIT.html
Visualization saved to e:\Workspaces\KoDi\timeline\out\ZEIT_Online.html
