In [1]:
import pandas as pd
import plotly.express as px
from collections import Counter
import os
import re

In [9]:
# Working directory, captured once; input and output locations are derived from it.
cwd = os.getcwd()

# Directory containing the input CSV files. Built with os.path.join so the path
# is valid on every platform; the trailing '' keeps the trailing separator that
# the earlier string-concatenated path carried.
directory_path = os.path.join(cwd, 'timeline', 'in', '')

# Terms to visualize. They are written in lowercase because the text they are
# counted in is lowercased before counting.
specific_terms = ['artensterben', 'bienensterben', 'insektensterben', 'höfesterben', 'waldsterben', 'fischsterben', 'baumsterben']

# Function to preprocess text
def preprocess_text(text):
    """Lowercase ``text`` and reduce it to word tokens separated by single spaces.

    Every character that is not a word character (letter, digit or underscore)
    is replaced by a space, then runs of whitespace are collapsed to one space.
    The result is not stripped, so a single leading or trailing space can remain.

    Parameters
    ----------
    text : object
        Value to clean. Non-string values are converted with ``str()``.

    Returns
    -------
    str
        The cleaned text, or ``''`` when ``text`` is missing (``None`` or NaN).
    """
    # Missing cells arrive from pandas as None or float NaN; str() would turn
    # them into the literal words "none"/"nan" and inflate the word counts.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    lowered = str(text).lower()
    spaced = re.sub(r'\W', ' ', lowered)
    return re.sub(r'\s+', ' ', spaced)

# Function to calculate frequencies per million words
def calculate_frequencies(df, specific_terms):
    """Compute, per calendar year, how often each term occurs per million words.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide a ``text_content`` column and a ``text_date`` column that
        ``pd.to_datetime`` can parse. As a side effect, the helper columns
        ``clean_text`` and ``year`` are written to ``df``.
    specific_terms : iterable of str
        Terms to look up. Each term is compared against whole
        whitespace-separated tokens of the cleaned text.

    Returns
    -------
    dict
        ``{year: {term: frequency_per_million}}`` with ``year`` as ``int``, in
        order of first appearance in ``df``. Rows whose date is missing are
        skipped. A year that contains no words at all reports ``0.0`` for
        every term.
    """
    df['clean_text'] = df['text_content'].apply(preprocess_text)
    df['year'] = pd.to_datetime(df['text_date']).dt.year

    term_frequencies = {}
    # groupby drops NaN keys, so rows with a missing date no longer create an
    # empty "nan" year (which used to end in a division by zero). It also walks
    # the frame once instead of re-filtering it for every year.
    for year, texts in df.groupby('year', sort=False)['clean_text']:
        word_counts = Counter()
        for text in texts:
            word_counts.update(text.split())
        total_words = sum(word_counts.values())

        if total_words == 0:
            # Nothing to normalise against; report zero rather than dividing by zero.
            frequencies_per_million = {term: 0.0 for term in specific_terms}
        else:
            frequencies_per_million = {
                term: (word_counts.get(term, 0) / total_words) * 1e6
                for term in specific_terms
            }
        # The year column is float when any date is missing; normalise the key to int.
        term_frequencies[int(year)] = frequencies_per_million
    return term_frequencies

# Function to prepare data for Plotly
def prepare_data_for_plotly(term_frequencies, source_file):
    """Flatten nested per-year term frequencies into a long-format DataFrame.

    Parameters
    ----------
    term_frequencies : dict
        Mapping ``{year: {term: frequency}}``.
    source_file : str
        File name the frequencies came from; its name without the final
        extension is stored in the ``source`` column.

    Returns
    -------
    pandas.DataFrame
        One row per (year, term) pair with the columns ``year``, ``term``,
        ``frequency_per_million`` and ``source``.
    """
    records = [
        {
            'year': yr,
            'term': word,
            'frequency_per_million': value,
            'source': os.path.splitext(source_file)[0],
        }
        for yr, per_term in term_frequencies.items()
        for word, value in per_term.items()
    ]
    return pd.DataFrame(records)

# Function to create and save visualizations for each term
def create_and_save_visualization_per_term(all_data, terms, output_dir=None):
    """Write one interactive line chart per term as a standalone HTML file.

    Parameters
    ----------
    all_data : pandas.DataFrame
        Long-format data with the columns ``year``, ``term``,
        ``frequency_per_million`` and ``source``.
    terms : iterable of str
        Terms to plot; each produces a file named ``<term>.html``.
    output_dir : str, optional
        Directory to write into. Defaults to ``timeline/out`` below the
        module-level ``cwd``. The directory is created if it does not exist.
    """
    if output_dir is None:
        # os.path.join keeps the path valid on every platform, unlike
        # concatenating backslash-separated fragments.
        output_dir = os.path.join(cwd, 'timeline', 'out')
    # write_html fails when the target directory is missing, so create it up front.
    os.makedirs(output_dir, exist_ok=True)

    for term in terms:
        df_filtered = all_data[all_data['term'] == term].copy()
        fig = px.line(df_filtered, x='year', y='frequency_per_million', color='source',
                      line_group='source', markers=True,
                      title=f'Frequency of "{term}" Per Million Words Over Years Across Sources',
                      labels={'frequency_per_million': 'Frequency per Million Words', 'year': 'Year', 'source': 'Text Source'})
        # tickmode='linear' places x-axis ticks at a fixed interval.
        fig.update_layout(xaxis_title='Year', yaxis_title='Frequency per Million Words', xaxis=dict(tickmode='linear'))
        output_filename = f'{term}.html'  # Save each term's chart as an HTML file
        fig.write_html(file=os.path.join(output_dir, output_filename), include_plotlyjs=True)
        print(f"Visualization for term '{term}' saved to {output_filename}")

# Collect one long-format frame per CSV file and combine them with a single
# concat; growing a DataFrame inside the loop copies all earlier rows each time.
frames = []

# Loop through each CSV file in the directory (sorted for a reproducible order)
for file_name in sorted(os.listdir(directory_path)):
    if not file_name.endswith('.csv'):
        continue
    file_path = os.path.join(directory_path, file_name)
    # The files carry a .csv extension but are read as tab-separated.
    df = pd.read_csv(file_path, delimiter='\t')
    term_frequencies = calculate_frequencies(df, specific_terms)
    plot_df = prepare_data_for_plotly(term_frequencies, source_file=file_name)
    # A file without usable rows yields a frame with no columns; skip it.
    if not plot_df.empty:
        frames.append(plot_df)

# Fail with a clear message instead of a KeyError on the missing 'year' column.
if not frames:
    raise FileNotFoundError(f"No .csv files with usable data found in {directory_path}")

all_data = pd.concat(frames, ignore_index=True)
all_data['year'] = all_data['year'].astype(int)
all_data = all_data.sort_values(by=['year', 'source', 'term'])

# Create and save a visualization for each term
create_and_save_visualization_per_term(all_data, specific_terms)

Visualization for term 'artensterben' saved to artensterben.html
Visualization for term 'bienensterben' saved to bienensterben.html
Visualization for term 'insektensterben' saved to insektensterben.html
Visualization for term 'höfesterben' saved to höfesterben.html
Visualization for term 'waldsterben' saved to waldsterben.html
Visualization for term 'fischsterben' saved to fischsterben.html
Visualization for term 'baumsterben' saved to baumsterben.html
