In [1]:
# Import necessary libraries
import os
import pandas as pd
import plotly.express as px
from collections import Counter
from datetime import datetime
import re

In [43]:
# --- Configuration ---------------------------------------------------------
# All paths are resolved relative to the directory the notebook is run from.
cwd = os.getcwd()

# Build the directories with os.path.join so they work on every OS and do not
# depend on backslash escapes inside string literals ("\o" and "\s" are
# invalid escape sequences). The final "" component keeps a trailing path
# separator, so plain string concatenation with a file name still yields a
# valid path.
inputDir = os.path.join(cwd, "out", "split", "")
outputDir = os.path.join(cwd, "out", "timeline", "")
inputFilePre = "text_source_"
inputFile = "faz"
fileExtension = ".csv"

# --- Load data -------------------------------------------------------------
# The file is read as tab-separated despite its .csv extension.
df = pd.read_csv(inputDir + inputFilePre + inputFile + fileExtension, sep='\t')

# Parse 'text_date' as datetime and derive the calendar year of each row
df['text_date'] = pd.to_datetime(df['text_date'])
df['year'] = df['text_date'].dt.year

In [44]:
# Preprocess 'text_content' to clean text
def preprocess_text(text):
    """Normalise a raw document for whitespace-based tokenisation.

    The value is converted to ``str`` and lower-cased, then every run of
    non-word characters (anything matched by ``\\W``, which includes all
    whitespace) is replaced by a single space.

    Args:
        text: Raw cell value. Missing values (``None`` or float NaN, which is
            what pandas produces for empty CSV cells) are accepted.

    Returns:
        str: The cleaned text; an empty string for missing values.
    """
    # Without this guard str() would turn a missing value into the literal
    # token "nan"/"none" and inflate the word counts.
    # `text != text` is only true for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    # One pass is enough: whitespace is itself matched by \W, so collapsing
    # runs of \W also collapses runs of spaces.
    return re.sub(r'\W+', ' ', str(text).lower())

# Clean every document once and keep the result in its own column
df['clean_text'] = df['text_content'].map(preprocess_text)

In [46]:
# Terms whose yearly frequency is visualized (written in lowercase)
terms = [
    'artensterben',
    'bienensterben',
    'insektensterben',
    'höfesterben',
    'waldsterben',
    'fischsterben',
    'baumsterben',
]

# Calculate term frequencies per year
def get_term_frequencies(df, terms):
    """Compute, per year, how often each term occurs per million words.

    Args:
        df: DataFrame with a 'year' column and a 'clean_text' column of
            strings; tokens are obtained by splitting on whitespace.
        terms: Iterable of single-token terms; a term is counted only when a
            token equals it exactly.

    Returns:
        dict: ``{year: {term: frequency}}`` where frequency is
        ``count / total_words_in_year * 1e6``. Years appear in ascending
        order. Rows with a missing year are ignored, and a year that
        contains no words gets 0.0 for every term.
    """
    term_frequencies = {}
    # groupby partitions the frame in a single pass, yields the years sorted
    # and skips missing (NaN) year keys, which could never match an equality
    # filter anyway.
    for year, year_texts in df.groupby('year')['clean_text']:
        word_counts = Counter(' '.join(year_texts).split())
        total_words = sum(word_counts.values())
        term_frequencies[year] = {
            # Guard against a year whose texts are all empty (division by zero)
            term: (word_counts[term] / total_words) * 1e6 if total_words else 0.0
            for term in terms
        }
    return term_frequencies

# Per-year frequencies for the selected terms
term_frequencies = get_term_frequencies(df=df, terms=terms)

# Convert term frequencies to a format suitable for Plotly
def prepare_data_for_plotly(term_frequencies):
    """Flatten nested year/term frequencies into a long-format DataFrame.

    Args:
        term_frequencies: dict of the form ``{year: {term: frequency}}``.

    Returns:
        pandas.DataFrame with the columns 'year', 'term' and 'frequency',
        one row per (year, term) pair, ordered by ascending year with a
        fresh 0..n-1 index. An empty input yields an empty frame that still
        has the three columns.
    """
    records = [
        {'year': year, 'term': term, 'frequency': count}
        for year, frequencies in term_frequencies.items()
        for term, count in frequencies.items()
    ]
    # Naming the columns explicitly keeps the schema stable for empty input.
    plot_data = pd.DataFrame(records, columns=['year', 'term', 'frequency'])
    # Line charts connect points in row order, so the rows must be
    # chronological; the stable sort keeps the term order within each year.
    return plot_data.sort_values('year', kind='mergesort').reset_index(drop=True)

# Long-format frame that feeds the Plotly figure
plot_df = prepare_data_for_plotly(term_frequencies=term_frequencies)

In [47]:
# Create an interactive timeline visualization with Plotly.
# The keys of `labels` must be column names of plot_df to have any effect
# (they drive the axis and hover labels).
fig = px.line(plot_df, x='year', y='frequency', color='term',
              line_group='term', markers=True,
              title=inputFile + ': Term Frequencies Over Years',
              labels={'frequency': 'Frequency', 'year': 'Year'})

fig.update_layout(xaxis_title='Year', yaxis_title='Frequency', xaxis=dict(tickmode='linear'))

# write_html does not create missing directories, so make sure the target exists
os.makedirs(outputDir, exist_ok=True)
# os.path.join works whether or not outputDir ends with a path separator
fig.write_html(os.path.join(outputDir, inputFile + "_timeline" + ".html"), include_plotlyjs=True)
fig.show()