In [15]:
import os
import json
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from collections import defaultdict

# Path to the corpus
corpus_path = 'corpus_finished'

# Load the data
corpus = []
genders = []
categories = []
for filename in os.listdir(corpus_path):
    with open(os.path.join(corpus_path, filename), 'r') as f:
        try:
            poem = json.load(f)
            lines = poem.get('poem', {})
            text = ' '.join(line['text'] for line in lines.values())
            gender = poem.get('gender')
            category = poem.get('categories', [None])[0]  # Take the first category, if it exists
            if text and gender and category:  # Ensure all fields exist
                corpus.append(text)
                genders.append(gender)
                categories.append(category)
        except json.JSONDecodeError:
            print(f"Couldn't decode JSON from file: {filename}")

# Create the TF-IDF matrix
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(corpus)

# Create the Count matrix
count_vectorizer = CountVectorizer()
X_count = count_vectorizer.fit_transform(corpus)

# Get the feature names (words) from the vectorizer
feature_names = tfidf_vectorizer.get_feature_names_out()

# Separate the data based on gender and category
documents_by_category = defaultdict(lambda: defaultdict(list))
for doc, gender, category in zip(X_tfidf, genders, categories):
    documents_by_category[category][gender].append(doc)

# Calculate the average TF-IDF scores and word frequencies for each gender within each category
average_scores_by_category = {}
word_frequencies_by_category = {}
for category, docs_by_gender in documents_by_category.items():
    average_scores_by_category[category] = {}
    word_frequencies_by_category[category] = {}
    for gender, docs in docs_by_gender.items():
        # Convert the list of sparse matrices to a single dense matrix
        docs = np.vstack([doc.toarray() for doc in docs])
        average_scores = np.mean(docs, axis=0)
        word_frequencies = np.sum(docs, axis=0)
        average_scores_by_category[category][gender] = average_scores
        word_frequencies_by_category[category][gender] = word_frequencies

# Print the results
for category, scores_by_gender in average_scores_by_category.items():
    print("Category:", category)
    for gender, scores in scores_by_gender.items():
        # Get the indices of the top 10 scores
        top_indices = np.argsort(scores)[::-1][:10]
        # Get the corresponding words and their frequencies
        top_words_and_frequencies = [(feature_names[i], word_frequencies_by_category[category][gender][i]) for i in top_indices]
        print("Top words and their frequencies for {} authors: {}".format(gender, top_words_and_frequencies))


Category: Leben & Beziehungen
Top words and their frequencies for m authors: [('ich', 5.599531032203162), ('die', 4.290911554681375), ('und', 3.857597133711369), ('in', 2.8565699320638935), ('du', 2.764360362059125), ('wir', 2.642308961012641), ('der', 2.4935751885021844), ('sie', 2.330984454608119), ('nicht', 2.239624621666245), ('den', 1.9375295671117436)]
Top words and their frequencies for w authors: [('die', 2.718079379451495), ('ich', 2.606431776808473), ('du', 2.230537810921422), ('und', 1.8185665026192557), ('der', 1.7066945460218266), ('wir', 1.6966232059055462), ('das', 1.5815613115183738), ('nicht', 1.5610204863971895), ('in', 1.3404664199976473), ('ein', 1.2090209809450512)]
Category: Natur
Top words and their frequencies for w authors: [('wir', 1.5780395548300532), ('den', 1.4085572745737045), ('die', 1.2040332759486474), ('der', 0.961244622773085), ('zwischen', 0.8381701803418332), ('in', 0.7292027766708999), ('auf', 0.6947865475208665), ('müde', 0.6809905738167101), ('sc