In [1]:
# Teacher Discourse Analysis: In-Service vs Pre-Service Teachers
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
import umap
from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import spacy
from sentence_transformers import SentenceTransformer
# For now I'll simulate a small sample structure:
df = pd.read_csv('data/corpus_data_combined.csv')  # Adjust path if needed

print("Corpus loaded with shape:", df.shape)
df[['text', 'pre_or_in_service']].head()

# --- 2. Preprocessing (French Tokenization, Lemmatization) ---

nlp = spacy.load('fr_core_news_md')

  from .autonotebook import tqdm as notebook_tqdm


Corpus loaded with shape: (8751, 38)


In [2]:
# --- 0. Preprocessing Enhancements ---

print("Applying improved text cleaning...")

# Drop short or empty texts
df = df[df['clean_text_nostop'].str.split().apply(len) > 5]
df = df.dropna(subset=['clean_text_nostop'])

print(f"Corpus size after cleaning: {df.shape}")

# Normalize texts (optional)
df['clean_text_nostop'] = df['clean_text_nostop'].str.lower()

# (Optional) normalize accents: you can add if needed

# --- 1. Speech Acts Analysis (with framing comment) ---

# Following Tong et al. (2024) and Jensen et al. (2021), we analyze discourse structure through speech acts.

import spacy
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import ttest_ind, chi2_contingency

nlp = spacy.load('fr_core_news_md')

def detect_speech_acts(text):
    doc = nlp(text)
    sentences = list(doc.sents)
    counts = {'question': 0, 'command': 0, 'affirmation': 0}
    
    for sent in sentences:
        if sent.text.strip().endswith('?'):
            counts['question'] += 1
        elif any(token.tag_ == 'VERB' and token.morph.get('Mood') == ['Imp'] for token in sent):
            counts['command'] += 1
        else:
            counts['affirmation'] += 1
    return pd.Series(counts)

print("Detecting speech acts...")
speech_acts = df['clean_text_nostop'].apply(detect_speech_acts)
df = pd.concat([df, speech_acts], axis=1)

# --- Plot speech acts per group ---
speech_summary = df.groupby('pre_or_in_service')[['question', 'command', 'affirmation']].mean()

speech_summary.plot(kind='bar', figsize=(10,6), yerr=df.groupby('pre_or_in_service')[['question', 'command', 'affirmation']].std())
plt.title('Average Number of Speech Acts per Text by Teacher Group')
plt.ylabel('Average number per text')
plt.xticks(rotation=45)
plt.show()

# --- T-test for questions ---
print("Statistical Testing (Questions):")
q_inservice = df[df['pre_or_in_service'] == 'In-service teacher']['question']
q_preservice = df[df['pre_or_in_service'] == 'Pre-service teacher']['question']
t_stat, p_val = ttest_ind(q_inservice, q_preservice)
print(f"T-test: t={t_stat:.3f}, p={p_val:.3f}")


Applying improved text cleaning...


KeyError: 'clean_text_nostop'