In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import re

# Load the raw comment data from the CSV file (read as UTF-8)
df = pd.read_csv('official_row_data.csv', encoding='utf-8')

# Map every distinct value of the 'model' column to the list of its
# non-null 'comment' entries. Keys follow the order in which the models
# first appear in the file.
comments = {}
for model in df['model'].unique():
    model_comments = df.loc[df['model'] == model, 'comment']
    comments[model] = model_comments.dropna().tolist()

# Preprocessing function for text
def preprocess_text(text):
    """Normalize a comment string before vectorization.

    The text is lower-cased, every character that is neither a word
    character nor whitespace is replaced by a space, and runs of
    whitespace are collapsed to a single space with the ends trimmed.

    Args:
        text: The raw comment string.

    Returns:
        The normalized string.
    """
    lowered = text.lower()
    # Replace punctuation with a space (not nothing) so that words joined
    # by punctuation, e.g. "well-written", are split rather than fused.
    without_punctuation = re.sub(r'[^\w\s]', ' ', lowered)
    collapsed = re.sub(r'\s+', ' ', without_punctuation)
    return collapsed.strip()

# Custom stopwords: terms passed to TfidfVectorizer(stop_words=...) so they
# are excluded from the vocabulary. Must stay a list of lower-case strings,
# since the comments are lower-cased before vectorization.
custom_stopwords = [
    # Articles and prepositions
    'the', 'a', 'an',
    'in', 'on', 'of', 'to', 'with', 'at', 'by', 'for', 'from',
    # Forms of "to be"
    'is', 'are', 'was', 'were', 'be', 'being', 'been',
    # Demonstratives and conjunctions
    'that', 'this', 'these', 'those',
    'and', 'but', 'or', 'if', 'then', 'so',
    'as', 'because', 'while', 'although', 'though', 'yet',
    # Miscellaneous function words
    'also', 'just', 'it', 'its',
    # NOTE(review): presumably corpus-specific terms ('fs', 'ai',
    # 'narrative', 'story') that appear in most comments - confirm.
    'fs', 'about', 'ai', 'narrative', 'story', 'through',
    # Hedging / degree adverbs and remaining connectives
    'slightly', 'somewhat', 'effectively', 'well', 'occasionally',
    'into', 'where', 'between',
]

# Main TF-IDF analysis
# For each model, fit a separate TF-IDF vectorizer on that model's comments
# and record the top_n terms ranked by their mean TF-IDF weight across those
# comments. Results accumulate in `ranking` as [model, rank, word] rows.
top_n = 10  # number of terms kept per model; loop-invariant, so set once
ranking = []
for model, model_comments in comments.items():
    processed_comments = [preprocess_text(text) for text in model_comments]

    # Vectorization: a fresh vectorizer per model, so the vocabulary and IDF
    # weights are computed from this model's comments only.
    vectorizer = TfidfVectorizer(stop_words=custom_stopwords)
    try:
        tfidf_matrix = vectorizer.fit_transform(processed_comments)
    except ValueError as err:
        # scikit-learn raises ValueError when no term survives tokenization
        # and stop-word removal (e.g. a model with no comments at all).
        # Skip that model instead of aborting the whole analysis.
        print(f"Skipping {model}: {err}")
        continue

    # Score calculation: mean TF-IDF weight of each term over all comments.
    # np.asarray(...).ravel() flattens the 1 x n_terms result whether the
    # sparse container returns an np.matrix or a plain ndarray, whereas
    # `.A1` only exists on np.matrix.
    feature_names = vectorizer.get_feature_names_out()
    mean_scores = np.asarray(tfidf_matrix.mean(axis=0)).ravel()
    tfidf_scores = dict(zip(feature_names, mean_scores))

    # Top-N words, highest mean score first (rank is 1-based)
    top_words = sorted(tfidf_scores.items(), key=lambda x: x[1], reverse=True)[:top_n]
    for rank, (word, score) in enumerate(top_words, start=1):
        ranking.append([model, rank, word])

# Turn the long list of [model, rank, word] rows into a wide table with one
# row per rank and one column per model.
ranking_columns = ['model', 'rank', 'word']
df_ranking = pd.DataFrame(ranking, columns=ranking_columns)
pivot_df = df_ranking.pivot(values='word', columns='model', index='rank')

# Report the ranked-term table
print("\nTable E-1: Top 10 TF-IDF Words per Model")
print(pivot_df)

# Report how many (non-null) comments each model contributed
# NOTE(review): this heading reuses the "Table E-1" label of the table
# above - confirm whether a different table number was intended.
print("\nTable E-1: Number of Comments per Model")
for model, comment_list in comments.items():
    print(f"{model}: {len(comment_list)} comments")



Table E-1: Top 10 TF-IDF Words per Model
model   ChatGPT4.5   Gemini-2.5      Grok3    Notebook LM    OpenAI-o3  \
rank                                                                     
1        emotional        human  emotional          human    emotional   
2        resonance      concept   literary  consciousness   exposition   
3         cohesion   unsettling    concept       resonant        prose   
4            depth     humanity  resonance  psychological  originality   
5      originality     identity   original       identity   structural   
6      exploration     original      prose        mystery    character   
7         literary  exploration    quality       humanity       pacing   
8            human    emotional     strong     efficiency    resonance   
9           strong   compelling  aesthetic        concept      overall   
10     existential     profound  structure       original      concept   

model        Sonnet4  
rank                  
1          emotional  
