In [6]:
import json
import os
import sys
from typing import Any

sys.path.append(os.path.abspath(".."))

from eda.models import Generation
from eda.parsing import Conversations, Participants
from eda.utils import FOLDER_DIR

DATA_PATH = FOLDER_DIR / "data"

def export_json_data(name: str, data: dict[str, Any]):
    """Write ``data`` as 4-space indented JSON to ``DATA_PATH / name``.

    Args:
        name: File name, resolved relative to ``DATA_PATH``; an existing file
            is overwritten.
        data: JSON-serialisable payload.

    Non-ASCII characters are written as-is instead of being escaped.
    """
    # NOTE(review): the file is opened with the platform default encoding;
    # consider an explicit encoding together with the ``read_text()`` readers.
    target = DATA_PATH.joinpath(name)
    with target.open("w") as out_file:
        json.dump(data, out_file, indent=4, ensure_ascii=False)


def create_json_data() -> dict[str, Any]:
    """Return a fresh export skeleton with empty ``metadata`` and ``data`` sections."""
    skeleton: dict[str, Any] = dict(metadata={}, data={})
    return skeleton

# Shared corpus handles: every later cell reads from these two module-level objects.
participants = Participants()
conversations = Conversations(participants)

### Question 1

In [6]:
# Read all conversations, requesting sentiment data (the next cell reads `line.sentiments`).
conversations.read_all(parallel=True, load_sentiments=True)

In [None]:
from collections import Counter
from collections.abc import Iterable

from eda.models import ConversationLine
from eda.sentiments import SentimentType
from eda.utils import round_precise


def sentiments_from_lines(
    lines: Iterable[ConversationLine], exclude_true_neutrals: bool = False
) -> tuple[list[str], list[float]]:
    """Aggregate the sentiment counts of ``lines`` and express them as percentages.

    Lines whose sentiments have no loaded scores, or no scores at all, are
    skipped.

    Args:
        lines: Conversation lines whose ``sentiments`` attribute is inspected.
        exclude_true_neutrals: When true, also skip lines whose neutral score
            is exactly ``1.0``.

    Returns:
        Two parallel lists: the sentiment display names and each sentiment's
        share of the aggregated total in percent, passed through
        ``round_precise(..., 2)``. Both lists are empty when no line
        contributes any sentiment.
    """
    totals = Counter()

    for line in lines:
        line_sentiments = line.sentiments
        if not line_sentiments.has_loaded_scores():
            continue
        if not line_sentiments.has_scores():
            continue
        if exclude_true_neutrals and line_sentiments.neutral == 1.0:
            continue
        totals += line_sentiments.score_counts

    total_sum = sum(totals.values())
    if not total_sum:
        # Nothing was aggregated: unpacking ``zip()`` of zero pairs would raise
        # ValueError, so report "no sentiments" explicitly.
        return [], []

    names = [SentimentType(sentiment_name).display_name for sentiment_name in totals]
    scores = [round_precise(score / total_sum * 100, 2) for score in totals.values()]
    return names, scores


def sentiment_percentages() -> dict[str, dict[str, Any]]:
    """Compute the sentiment distribution of each generation.

    Every line of every conversation is grouped by the generation of its
    participant. Per generation the sentiment shares come from
    ``sentiments_from_lines`` (lines with a neutral score of exactly 1.0 are
    excluded) and are then re-normalised so that they sum to 100.

    Returns:
        Mapping of generation name to ``{sentiment display name: percentage}``.
    """
    lines_by_generation = Generation.create_mapping()
    for conversation in conversations:
        for line in conversation.lines:
            lines_by_generation[line.participant.generation].append(line)

    data = {}
    for generation, lines in lines_by_generation.items():
        sentiment_names, scores = sentiments_from_lines(
            lines, exclude_true_neutrals=True
        )

        # The scores were rounded individually, so their sum may drift from 100.
        total = sum(scores)
        data[generation.name] = {
            sentiment_name: score / total * 100
            for sentiment_name, score in zip(sentiment_names, scores)
        }

    return data

# Export the per-generation sentiment shares to data/sentiment_percentages.json.
data = create_json_data()
data["metadata"]["title"] = "Sentiment percentages per generation"
data["data"] = sentiment_percentages()
export_json_data("sentiment_percentages.json", data)

In [10]:
# Read all conversations, requesting prosodic data (the next cell reads the `*_phrases` attributes).
conversations.read_all(parallel=True, load_prosodic=True)

In [17]:

from collections import defaultdict
from collections.abc import Generator

from eda.models import ConversationLine, Participant


def human_name_from_snake_case(name: str) -> str:
    """Turn a ``snake_case`` identifier into a sentence-cased label.

    Underscores become spaces; the first character is upper-cased and every
    other character lower-cased (``str.capitalize`` semantics).
    """
    spaced = name.replace("_", " ")
    return spaced.capitalize()


def filter_prosodic_attributes(
    line: ConversationLine,
) -> Generator[tuple[str, str]]:
    """Yield ``(attribute name, value)`` pairs for the prosodic attributes of ``line``.

    An instance attribute qualifies when its name ends in ``"phrases"``;
    ``overlapping_phrases`` is explicitly left out.
    """
    for attribute, content in vars(line).items():
        if attribute == "overlapping_phrases":
            continue
        if attribute.endswith("phrases"):
            yield attribute, content


def prosodic_frequencies(participant: Participant) -> dict[str, float]:
    """Return how often each prosodic feature occurs per 100 lines of ``participant``.

    For every line of the participant the lengths of the attributes yielded by
    ``filter_prosodic_attributes`` are summed per attribute name; each sum is
    then divided by the number of lines and multiplied by 100.
    """
    feature_totals: dict[str, int] = {}
    line_count = 0
    for line_count, line in enumerate(
        conversations.participant_lines(participant), start=1
    ):
        for feature, phrases in filter_prosodic_attributes(line):
            feature_totals[feature] = feature_totals.get(feature, 0) + len(phrases)

    return {
        feature: total / line_count * 100
        for feature, total in feature_totals.items()
    }


def prosodic_counts_by_generation(data: dict[Generation, Any]) -> dict[str, dict[str, float]]:
    """Average the per-participant prosodic frequencies within each generation.

    Args:
        data: Mapping of generation to a list; the frequencies of every
            participant are appended to the list of their generation, so the
            argument is modified in place.

    Returns:
        Mapping of generation name to ``{feature label: average frequency}``.
        Labels are the humanised attribute names without the `` phrases``
        suffix; averages are passed through ``round_precise``.
    """
    for participant in participants:
        data[participant.generation].append(prosodic_frequencies(participant))

    averages_by_generation = {}
    for generation, frequency_dicts in data.items():
        summed = Counter()
        for frequencies in frequency_dicts:
            # Counter addition discards totals that are not positive.
            summed += Counter(frequencies)

        n_participants = len(frequency_dicts)
        averages = {}
        for feature, total in summed.items():
            label = human_name_from_snake_case(feature).replace(" phrases", "")
            averages[label] = round_precise(total / n_participants)
        averages_by_generation[generation.name] = averages

    return averages_by_generation


# Average the per-participant prosodic frequencies per generation and export
# them to data/prosodic_features.json. A fresh mapping is created on every run
# because prosodic_counts_by_generation appends to the mapping it receives.
generation_map = Generation.create_mapping()
counts = prosodic_counts_by_generation(generation_map)
data = create_json_data()
data["metadata"]["title"] = "Average prosodic feature frequency per line by generation"
data["data"] = counts
export_json_data("prosodic_features.json", data)

### Question 2

In [10]:
# Read all conversations, requesting tagged text (the next cell reads `line.tagged`).
conversations.read_all(parallel=True, load_tagged=True)

In [None]:
import itertools
from dataclasses import dataclass
from typing import Optional

from eda.language import TaggedText

# Number of highest-ranked lemmas kept per generation by important_lemmas.
TOP_N_LEMMAS = 10
# Minimum occurrence threshold reported in the top_lemmas.json metadata
# (equal to the default `min_word_occurences` of top_lemmas_by_generation).
MIN_WORD_OCCURRENCES = 3
# POS values forwarded to top_lemmas_by_generation; None disables POS filtering.
ALLOWED_POS_VALUES = None
# Lemma counts are scaled to "per this many words" in important_lemmas.
PER_WORDS = 2500

@dataclass(frozen=True)
class PosTaggedLemma:
    """Immutable (lemma, part-of-speech name) pair, used as a dict/Counter key."""

    lemma: str  # filled from the `lemma` attribute of a tagged word
    pos_name: str  # filled from the `pos_name` attribute of a tagged word


@dataclass(frozen=True)
class GenerationLemmaInfo:
    """Lemma statistics of one generation, as built by ``top_lemmas_by_generation``."""

    pt_lemmas: list[PosTaggedLemma]  # lemmas in `Counter.most_common` order (highest count first)
    lemma_counts: list[int]  # counts parallel to `pt_lemmas`
    top_counter: Counter  # count of every lemma that passed the filters
    n_total_words: int  # number of words left after the minimum-occurrence filter


def top_lemmas_by_generation(
    conversations,
    *,
    top_n: Optional[int] = None,
    min_lemma_length: int = 3,
    min_word_occurences: int = 3,
    allowed_pos_values: Optional[set] = None,
) -> dict[str, GenerationLemmaInfo]:
    """Rank the lemmas used by each generation.

    Args:
        conversations: Iterable of conversations; each conversation yields
            lines exposing ``participant.generation.name`` and ``tagged``.
        top_n: Number of lemmas to keep in the ranked lists; ``None`` keeps all.
        min_lemma_length: Lemmas shorter than this are ignored.
        min_word_occurences: A tagged word must occur at least this often in a
            generation to be considered.
        allowed_pos_values: When given, only words whose ``pos`` is in this set
            are considered.

    Returns:
        Mapping of generation name to its ``GenerationLemmaInfo``. A generation
        without any qualifying lemma gets empty ranked lists.
    """
    text_by_generation: dict[str, list[TaggedText]] = {
        key.name: value for key, value in Generation.create_mapping().items()
    }

    for conversation in conversations:
        for line in conversation:
            generation_name: str = line.participant.generation.name
            text_by_generation[generation_name].extend(line.tagged)

    lemmas_by_generation = {}
    for generation, tagged_words in text_by_generation.items():
        word_counts = Counter(tagged_words)
        # Distinct words that occur often enough, in first-seen order. Taking
        # them from the Counter keys (instead of a set) keeps the order, and
        # therefore the ranking of equally frequent lemmas, reproducible.
        # NOTE(review): because the words are de-duplicated here, each lemma
        # count below is the number of distinct tagged words sharing the lemma,
        # not the number of occurrences - confirm this is intended.
        words = [
            word
            for word, occurrences in word_counts.items()
            if occurrences >= min_word_occurences
        ]

        counts = Counter()
        for word in words:
            if len(word.lemma) < min_lemma_length:
                continue
            if allowed_pos_values is not None and word.pos not in allowed_pos_values:
                continue
            counts[PosTaggedLemma(word.lemma, word.pos_name)] += 1

        # Split the ranking without `zip(*...)`, which cannot be unpacked into
        # two names when no lemma qualified.
        ranked = counts.most_common(top_n)
        lemmas_by_generation[generation] = GenerationLemmaInfo(
            [pt_lemma for pt_lemma, _ in ranked],
            [count for _, count in ranked],
            counts,
            len(words),
        )

    return lemmas_by_generation

def important_lemmas() -> dict[str, Any]:
    """Return the frequency of the most important lemmas for every generation.

    The ``TOP_N_LEMMAS`` highest-ranked lemmas of each generation are merged
    into one shared set, and every generation reports its own frequency
    (scaled to ``PER_WORDS`` words) for every lemma of that set, so the
    generations can be compared lemma by lemma.

    Returns:
        Mapping of generation name to ``{lemma: frequency}``, sorted by
        descending frequency. A lemma a generation never uses is reported
        with a frequency of ``0.0``.
    """
    top_lemmas = top_lemmas_by_generation(
        conversations,
        top_n=None,
        # Keep the filter in sync with the threshold exported as metadata.
        min_word_occurences=MIN_WORD_OCCURRENCES,
        allowed_pos_values=ALLOWED_POS_VALUES,
    )

    all_frequencies: dict[str, dict[str, float]] = {
        key.name: {} for key in Generation.create_mapping()
    }
    for generation_name, info in top_lemmas.items():
        for ptl, count in zip(info.pt_lemmas, info.lemma_counts):
            # NOTE(review): frequencies are keyed by lemma text only, so a lemma
            # that occurs with several POS names keeps the value written last
            # (the lowest-ranked variant) - confirm this is intended.
            all_frequencies[generation_name][ptl.lemma] = round_precise(
                count / info.n_total_words * PER_WORDS
            )

    # Union of every generation's top lemmas, in first-seen order.
    shared_lemmas = dict.fromkeys(
        ptl.lemma
        for info in top_lemmas.values()
        for ptl, _ in info.top_counter.most_common(TOP_N_LEMMAS)
    )

    result = {}
    for generation_name in top_lemmas:
        own_frequencies = all_frequencies[generation_name]
        frequencies = {
            # A top lemma of another generation may be absent here.
            lemma: own_frequencies.get(lemma, 0.0)
            for lemma in shared_lemmas
        }
        result[generation_name] = dict(
            sorted(frequencies.items(), key=lambda pair: pair[1], reverse=True)
        )

    return result


# Export the lemma frequencies, together with the parameters that produced
# them, to data/top_lemmas.json (read back by the themes cell below).
data = create_json_data()
data["metadata"]["title"] = "Top lemmas by generation"
data["metadata"]["top_n_lemmas"] = TOP_N_LEMMAS
data["metadata"]["per_n_words"] = PER_WORDS
data["metadata"]["min_word_occurrences"] = MIN_WORD_OCCURRENCES
data["data"] = important_lemmas()
export_json_data("top_lemmas.json", data)

In [55]:
from collections import Counter
from functools import partial

# generation name -> theme -> {"lemmas": [...], "match": ..., "filtered_lemmas": [...]}
themes_by_generation = defaultdict(partial(defaultdict, partial(defaultdict, list)))
themes_by_code = json.loads(DATA_PATH.joinpath("ml_gen_themes_by_code.json").read_text())
theme_counts = Counter()  # (generation name, theme) -> number of values tagged with the theme
totals = Counter()  # generation name -> number of values seen

for conversation_data in themes_by_code.values():
    generation_values = conversation_data["values"]
    generation_name = conversation_data["generation"]

    for value in generation_values:
        themes = value["themes"]
        existing_lemmas = value["lemmas"]

        for theme in themes:
            entry = themes_by_generation[generation_name][theme]
            entry["lemmas"].extend(existing_lemmas)
            theme_counts[(generation_name, theme)] += 1

        totals[generation_name] += 1

# generation name -> {lemma: frequency}, as exported to top_lemmas.json.
top_lemmas = json.loads(DATA_PATH.joinpath("top_lemmas.json").read_text())["data"]

for generation_name, generation_values in themes_by_generation.items():
    generation_top_lemmas = top_lemmas[generation_name]
    for theme, theme_data in generation_values.items():
        unique_lemmas = frozenset(theme_data["lemmas"])
        # Share of the generation's values that carry this theme, in percent.
        theme_data["match"] = round_precise(
            theme_counts[(generation_name, theme)] / totals[generation_name] * 100
        )
        # Top lemmas of the generation that also belong to this theme, in
        # top-lemma order. Filtering the top lemmas against themselves would
        # return the complete top-lemma list for every theme.
        theme_data["filtered_lemmas"] = [
            lemma for lemma in generation_top_lemmas if lemma in unique_lemmas
        ]
        theme_data["lemmas"] = sorted(unique_lemmas)

data = create_json_data()
data["metadata"]["title"] = "Themes by generation"
data["data"] = themes_by_generation
export_json_data("themes_by_generation.json", data)

### Question 3

In [12]:

from collections.abc import Callable, Generator
from functools import cache

from eda.language import AttributedWord
from eda.models import MacroRegion, ParticipantLines


def participant_macro_region(participant: Participant) -> MacroRegion:
    """Return the macro region of the conversation ``participant`` belongs to.

    The conversation is looked up through ``participant.conversation_code``.
    """
    return conversations.conversation(participant.conversation_code).macro_region


def generate_participant_words(lines: ParticipantLines) -> Generator[AttributedWord]:
    """Yield, line by line, the normalised words whose ``is_linguistic`` is truthy."""
    for line in lines:
        for word in line.normalised_words:
            if word.is_linguistic:
                yield word

@cache
def participants_dialect_percentages(
    *, rounder: Callable[[int | float, int], int | float] = round_precise
) -> list[int | float]:
    """Return the percentage of dialect words spoken by each participant.

    The list follows the iteration order of ``participants``. Only linguistic
    words are counted, and a word is dialect when ``is_dialect(strict=False)``
    says so. A participant without any linguistic word gets ``rounder(0.0, 2)``
    instead of raising ``ZeroDivisionError``.

    The result is memoised per ``rounder`` by ``functools.cache``: callers
    share the same list object and must not mutate it, and the values are not
    recomputed when the conversations are read again (use
    ``participants_dialect_percentages.cache_clear()`` to refresh them).

    Args:
        rounder: Called as ``rounder(percentage, 2)`` on every percentage.
    """
    percentages = []

    for participant in participants:
        participant_lines = conversations.participant_lines(participant)
        dialect_words = total_words = 0
        for word in generate_participant_words(participant_lines):
            dialect_words += word.is_dialect(strict=False)
            total_words += 1

        dialect_share = dialect_words / total_words * 100 if total_words else 0.0
        percentages.append(rounder(dialect_share, 2))

    return percentages

def labeled_percentages() -> dict[str, list[dict[str, Any]]]:
    """Group the dialect percentage of every participant by generation.

    Returns:
        Mapping of generation name to one record per participant, holding the
        participant's ``region`` (geographic origin), ``dialect_percentage``
        and lower-cased ``macro_region`` name.

    Raises:
        ValueError: If the number of percentages differs from the number of
            participants.
    """
    result = {
        key.name: value
        for key, value in Generation.create_mapping().items()
    }

    # The percentages are produced in the iteration order of `participants`.
    for participant, percentage in zip(
        participants, participants_dialect_percentages(), strict=True
    ):
        result[participant.generation.name].append(
            {
                "region": participant.geographic_origin,
                "dialect_percentage": percentage,
                "macro_region": participant.macro_region.name.lower(),
            }
        )
    return result

# Export the per-participant dialect percentages to data/dialect_percentages.json.
data = create_json_data()
data["metadata"]["title"] = "Dialect word percentages"
data["data"] = labeled_percentages()
export_json_data("dialect_percentages.json", data)

In [13]:
from functools import partial
from typing import Optional, cast

import numpy as np
import pandas as pd


def generational_dialect_percentages(top_n: Optional[int] = None) -> pd.DataFrame:
    """Build a generation x region table of mean dialect percentages.

    Args:
        top_n: Keep only the ``top_n`` regions with the highest summed dialect
            percentage; participants of every other region are relabelled
            ``"other"``. ``None`` keeps all regions.

    Returns:
        DataFrame indexed by generation name (sorted) with one column per
        region; combinations without participants hold ``0.0``.
    """
    records = []
    totals_by_region = defaultdict(float)

    percentage_iter = iter(participants_dialect_percentages())
    for participant in participants:
        origin = participant.geographic_origin
        share = next(percentage_iter)
        totals_by_region[origin] += share
        records.append(
            {
                "generation": participant.generation.name,
                "dialect_percentage": share,
                "region": origin,
            }
        )

    ranked_regions = sorted(
        totals_by_region, key=totals_by_region.__getitem__, reverse=True
    )
    kept_regions = frozenset(ranked_regions[:top_n])

    for record in records:
        if record["region"] not in kept_regions:
            record["region"] = "other"

    mean_by_generation = (
        pd.DataFrame(records)
        .groupby(["generation", "region"])["dialect_percentage"]
        .mean()
        .unstack()
    )
    return mean_by_generation.sort_values(by="generation").replace(np.nan, 0.0)

def get_region_dialects_df() -> pd.DataFrame:
    """Return the generation x region dialect table, rounded and filtered.

    Values are rounded to two decimals, and only regions with a non-zero value
    in at least two generations are kept.
    """
    percentages = generational_dialect_percentages()
    rounded = pd.DataFrame(percentages.apply(partial(round, ndigits=2), axis=1))
    nonzero_generations = (rounded != 0).sum()
    return rounded.loc[:, nonzero_generations >= 2]

def generation_grouped_percentages(region_dialects_df: pd.DataFrame) -> dict[str, Any]:
    """Convert the generation x region table into plain nested dictionaries.

    Every generation starts with an empty mapping; each row of
    ``region_dialects_df`` then replaces the entry stored under its index
    label with ``{column: float value}``.
    """
    percentages = {generation.name: {} for generation in Generation.create_mapping()}

    for generation_name, row in region_dialects_df.iterrows():
        row_values = {}
        for region, value in dict(row).items():
            row_values[region] = float(cast(np.float64, value))
        percentages[cast(str, generation_name)] = row_values

    return percentages

def calculate_regional_deltas(region_dialects_df: pd.DataFrame) -> dict[str, float]:
    """Compute, per region, the change in dialect percentage across the rows.

    Args:
        region_dialects_df: Table with one row per generation and one column
            per region, as returned by ``get_region_dialects_df``.

    Returns:
        Mapping of region to the summed differences between consecutive
        non-zero values of its column, passed through ``round_precise(..., 2)``.
        A region with fewer than two non-zero values has a delta of 0.
    """
    def percentage_delta(percentages: list[float]) -> float:
        # Without any value there is no first element to start from.
        if not percentages:
            return 0
        delta = 0
        prev = percentages[0]
        for percentage in percentages[1:]:
            delta += percentage - prev
            prev = percentage
        return delta

    percentages_per_region = defaultdict(list)
    for _, row in region_dialects_df.iterrows():
        for region, percentage in dict(row).items():
            percentages_per_region[region].append(float(percentage))  # type: ignore

    result = {}
    for region, values in percentages_per_region.items():
        # Zero entries are skipped, so the delta spans only the rows in which
        # the region has a non-zero percentage.
        nonzero_values = [value for value in values if value]
        result[region] = round_precise(percentage_delta(nonzero_values), 2)

    return result

# Export the per-region change in dialect usage to data/dialect_delta_percentages.json.
data = create_json_data()
data["metadata"]["title"] = "Changes of percentage of dialect words spoken over generations"
data["data"] = calculate_regional_deltas(get_region_dialects_df())
export_json_data("dialect_delta_percentages.json", data)

In [17]:
from eda.models import Conversation, Participant
from eda.utils import round_precise

# Degree value that marks an unknown education; participants whose degree equals
# this value are ignored by conversation_educational_background.
UNKNOWN_EDUCATION = "N/A"
# Degree labels ordered from lowest to highest: conversation_educational_background
# treats the label with the smallest index as the "lowest" background.
EDUCATION_RANKINGS = [
    "elem", "dip_tec_prof", "dip_lic", "laurea in corso", "laurea", "med", "phd"
]
# Sort order of the generations, used by conversation_generation for its lower median.
GENERATION_ORDER = [
    Generation.BOOMERS, Generation.X, Generation.Y, Generation.Z
]

def approximate_participant_age(participant: Participant) -> int | float:
    """Estimate the age of ``participant`` from their age range.

    Returns the midpoint of the range, except when the range reports
    ``is_oldest()``; in that case its ``oldest_age`` is returned unchanged.
    """
    age_range = participant.age_range
    if not age_range.is_oldest():
        return (age_range.youngest_age + age_range.oldest_age) / 2
    return age_range.oldest_age

def conversation_generation(conversation: Conversation) -> Generation:
    """Pick the generation that represents ``conversation``.

    A generation shared by more than half of the participants wins outright.
    Otherwise the participants' generations are ordered by ``GENERATION_ORDER``
    and the lower median is returned.
    """
    generations = [member.generation for member in conversation.participants]
    dominant, occurrences = Counter(generations).most_common(1)[0]
    if 2 * occurrences > len(generations):
        return dominant

    # Lower median
    ranked = sorted(generations, key=GENERATION_ORDER.index)
    return ranked[(len(ranked) - 1) // 2]

def conversation_average_age(conversation: Conversation) -> int | float:
    """Return the mean approximate age of the participants, via ``round_precise``."""
    ages = [
        approximate_participant_age(member) for member in conversation.participants
    ]
    return round_precise(sum(ages) / len(ages))
    
def conversation_dialect_percentage(conversation: Conversation) -> float:
    """Return the percentage of linguistic words in ``conversation`` that are dialect.

    Only words whose ``is_linguistic`` flag is truthy are counted, and a word
    is dialect when ``is_dialect(strict=False)`` says so. A conversation
    without any linguistic word yields ``round_precise(0.0)`` instead of
    raising ``ZeroDivisionError``.
    """
    total_words = dialect_words = 0
    for line in conversation:
        for word in line.normalised_words:
            if not word.is_linguistic:
                continue
            dialect_words += word.is_dialect(strict=False)
            total_words += 1

    dialect_share = dialect_words / total_words * 100 if total_words else 0.0
    return round_precise(dialect_share)

def conversation_educational_background(conversation: Conversation) -> str:
    """Pick the educational background that represents ``conversation``.

    Participants whose degree is ``UNKNOWN_EDUCATION`` are ignored. A degree
    held by more than half of the remaining participants wins outright;
    otherwise the lowest degree according to ``EDUCATION_RANKINGS`` is used.

    Returns:
        The representative degree label, or ``UNKNOWN_EDUCATION`` when no
        participant has a known degree.

    Raises:
        ValueError: If the lowest degree has to be determined and a degree is
            missing from ``EDUCATION_RANKINGS``.
    """
    backgrounds = [
        participant.degree
        for participant in conversation.participants
        if participant.degree != UNKNOWN_EDUCATION
    ]
    if not backgrounds:
        # Nothing to rank: `most_common(1)[0]` would raise IndexError.
        return UNKNOWN_EDUCATION

    most_common, count = Counter(backgrounds).most_common(1)[0]
    if count > len(backgrounds) / 2:
        return most_common

    # Lowest background
    return min(backgrounds, key=EDUCATION_RANKINGS.index)
    
def conversation_participant_data(conversation: Conversation) -> dict[str, Any]:
    """Summarise ``conversation`` for the dialect comparison export.

    Returns:
        The participant count, dialect percentage and average approximate age,
        plus a ``sort`` sub-dictionary with the representative generation name,
        the lower-cased macro region name and the educational background.
    """
    return {
        "n_participants": len(conversation.participants),
        "dialect_percentage": conversation_dialect_percentage(conversation),
        "average_approximate_age": conversation_average_age(conversation),
        "sort": {
            "generation": conversation_generation(conversation).name,
            "macro_region": conversation.macro_region.name.lower(),
            "educational_background": conversation_educational_background(conversation),
        },
    }

# (Re)load the conversations with read_all's default options before computing
# the per-conversation statistics.
conversations.read_all()

# One entry per conversation, ordered by conversation code, exported to
# data/dialect_comparisons.json.
data = create_json_data()
data["metadata"]["title"] = "Dialect percentages based on other participant statistics"
data["data"] = {
    conversation.code: conversation_participant_data(conversation) 
    for conversation in sorted(conversations, key=lambda c: c.code)
}
export_json_data("dialect_comparisons.json", data)