In [18]:
import pandas as pd
import json
from pathlib import Path

# Absolute path of the directory one level above the current working directory.
# The data and training file paths in this notebook are built from it.
BASE_DIR = Path("..").resolve()

## ASAP Dataset

The ASAP dataset contains essays written in response to 8 different prompts, each with scores assigned by human raters.

In [19]:
# Load the ASAP training file: tab-separated text, decoded as latin-1.
asap_path = BASE_DIR.joinpath("asap-aes", "training_set_rel3.tsv")
asap_df = pd.read_csv(
    asap_path,
    sep="\t",
    encoding="latin-1",
)

# Report the frame dimensions, then display the first five rows as the cell result.
print(f"Shape: {asap_df.shape}")
asap_df.head(n=5)

Shape: (12976, 28)


Unnamed: 0,essay_id,essay_set,essay,rater1_domain1,rater2_domain1,rater3_domain1,domain1_score,rater1_domain2,rater2_domain2,domain2_score,...,rater2_trait3,rater2_trait4,rater2_trait5,rater2_trait6,rater3_trait1,rater3_trait2,rater3_trait3,rater3_trait4,rater3_trait5,rater3_trait6
0,1,1,"Dear local newspaper, I think effects computer...",4,4,,8,,,,...,,,,,,,,,,
1,2,1,"Dear @CAPS1 @CAPS2, I believe that using compu...",5,4,,9,,,,...,,,,,,,,,,
2,3,1,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",4,3,,7,,,,...,,,,,,,,,,
3,4,1,"Dear Local Newspaper, @CAPS1 I have found that...",5,5,,10,,,,...,,,,,,,,,,
4,5,1,"Dear @LOCATION1, I know having computers has a...",4,4,,8,,,,...,,,,,,,,,,


In [20]:
# Structural overview of the ASAP frame: column names, number of essays per
# essay set, and the columns that contain missing values.
print("Columns:")
print(asap_df.columns.tolist())

print("\nEssay set distribution:")
print(asap_df["essay_set"].value_counts().sort_index())

# Count nulls once and reuse the result for both the filter mask and the
# selection, instead of scanning the whole frame twice.
null_counts = asap_df.isnull().sum()
print("\nMissing values:")
print(null_counts[null_counts > 0])

Columns:
['essay_id', 'essay_set', 'essay', 'rater1_domain1', 'rater2_domain1', 'rater3_domain1', 'domain1_score', 'rater1_domain2', 'rater2_domain2', 'domain2_score', 'rater1_trait1', 'rater1_trait2', 'rater1_trait3', 'rater1_trait4', 'rater1_trait5', 'rater1_trait6', 'rater2_trait1', 'rater2_trait2', 'rater2_trait3', 'rater2_trait4', 'rater2_trait5', 'rater2_trait6', 'rater3_trait1', 'rater3_trait2', 'rater3_trait3', 'rater3_trait4', 'rater3_trait5', 'rater3_trait6']

Essay set distribution:
essay_set
1    1783
2    1800
3    1726
4    1770
5    1805
6    1800
7    1569
8     723
Name: count, dtype: int64

Missing values:
rater3_domain1    12848
rater1_domain2    11176
rater2_domain2    11176
domain2_score     11176
rater1_trait1     10684
rater1_trait2     10684
rater1_trait3     10684
rater1_trait4     10684
rater1_trait5     12253
rater1_trait6     12253
rater2_trait1     10684
rater2_trait2     10684
rater2_trait3     10684
rater2_trait4     10684
rater2_trait5     12253
rater2_t

In [21]:
# Whitespace-delimited token count per essay, stored as a new column on asap_df.
essay_tokens = asap_df["essay"].str.split()
asap_df["word_count"] = essay_tokens.str.len()

# Per-essay-set summary statistics of the word counts, rounded to one decimal.
print("Essay length statistics (word count):")
word_count_summary = asap_df.groupby("essay_set")["word_count"].describe()
print(word_count_summary.round(1))

# Full text of the essay at row label 0.
print(f"\nSample essay (set 1):")
print(asap_df.loc[0, "essay"])

Essay length statistics (word count):
            count   mean    std   min    25%    50%    75%     max
essay_set                                                         
1          1783.0  365.7  119.6   8.0  286.5  365.0  441.0   785.0
2          1800.0  380.7  156.2  31.0  278.8  368.0  470.2  1064.0
3          1726.0  108.7   53.3  10.0   67.0  100.5  146.0   375.0
4          1770.0   94.5   51.9   2.0   54.0   87.0  127.0   357.0
5          1805.0  122.1   57.3   4.0   80.0  119.0  158.0   416.0
6          1800.0  153.3   55.8   3.0  117.0  153.0  188.0   454.0
7          1569.0  168.2   85.3   5.0  105.0  154.0  215.0   592.0
8           723.0  604.9  202.0   4.0  465.5  626.0  790.0   856.0

Sample essay (set 1):
Dear local newspaper, I think effects computers have on people are great learning skills/affects because they give us time to chat with friends/new people, helps us learn about the globe(astronomy) and keeps us out of troble! Thing about! Dont you think so? How would y

## ASAP++ Dataset

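The ASAP++ dataset provides trait-level scores for essays from prompts 1-6. Different prompts use different scoring criteria: prompts 1-2 are scored on Content, Organization, Word Choice, Sentence Fluency, and Conventions, while prompts 3-6 are scored on Content, Prompt Adherence, Language, and Narrativity.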

In [22]:
# Read the six per-prompt ASAP++ CSV files into a dict keyed by prompt number,
# printing each file's row count as it is loaded.
asap_pp_dir = BASE_DIR / "asap++"
asap_pp_dfs = {}

for prompt_no in range(1, 7):
    prompt_frame = pd.read_csv(asap_pp_dir.joinpath(f"Prompt-{prompt_no}.csv"))
    asap_pp_dfs[prompt_no] = prompt_frame
    print(f"Prompt {prompt_no}: {len(prompt_frame)} rows")

Prompt 1: 1783 rows
Prompt 2: 1800 rows
Prompt 3: 1726 rows
Prompt 4: 1772 rows
Prompt 5: 1805 rows
Prompt 6: 1800 rows


In [23]:
# Show the column layout of one representative file per rubric
# (prompt 1 for the first heading, prompt 3 for the second).
column_listings = (
    ("Prompts 1-2 columns (Rubric 1):", 1),
    ("\nPrompts 3-6 columns (Rubric 2):", 3),
)
for heading, prompt_no in column_listings:
    print(heading)
    print(asap_pp_dfs[prompt_no].columns.tolist())

Prompts 1-2 columns (Rubric 1):
['EssayID', 'Content', 'Organization', 'Word Choice', 'Sentence Fluency', 'Conventions']

Prompts 3-6 columns (Rubric 2):
['Essay ID', 'Content', 'Prompt Adherence', 'Language', 'Narrativity']


In [24]:
# Per-trait score histograms for one representative prompt of each rubric.
# Each entry: (heading to print, prompt number, trait columns to tabulate).
score_sections = (
    (
        "Score distributions for Prompt 1 (Rubric 1):",
        1,
        ["Content", "Organization", "Word Choice", "Sentence Fluency", "Conventions"],
    ),
    (
        "\nScore distributions for Prompt 3 (Rubric 2):",
        3,
        ["Content", "Prompt Adherence", "Language", "Narrativity"],
    ),
)

for heading, prompt_no, traits in score_sections:
    print(heading)
    for trait in traits:
        # Traits missing from the file are skipped silently.
        if trait not in asap_pp_dfs[prompt_no].columns:
            continue
        print(f"\n{trait}:")
        print(asap_pp_dfs[prompt_no][trait].value_counts().sort_index())

Score distributions for Prompt 1 (Rubric 1):

Content:
Content
1     23
2     91
3    543
4    684
5    363
6     79
Name: count, dtype: int64

Organization:
Organization
1     28
2    102
3    586
4    702
5    324
6     41
Name: count, dtype: int64

Word Choice:
Word Choice
1     25
2    110
3    674
4    633
5    285
6     56
Name: count, dtype: int64

Sentence Fluency:
Sentence Fluency
1     27
2    104
3    578
4    676
5    352
6     46
Name: count, dtype: int64

Conventions:
Conventions
1     27
2    109
3    574
4    702
5    337
6     34
Name: count, dtype: int64

Score distributions for Prompt 3 (Rubric 2):

Content:
Content
0    258
1    597
2    734
3    137
Name: count, dtype: int64

Prompt Adherence:
Prompt Adherence
0    263
1    558
2    721
3    184
Name: count, dtype: int64

Language:
Language
0    233
1    615
2    708
3    170
Name: count, dtype: int64

Narrativity:
Narrativity
0    268
1    637
2    613
3    208
Name: count, dtype: int64


## Dataset Overlap

In [25]:
# Compare essay IDs between ASAP (essay sets 1-6) and the ASAP++ trait files.
asap_sets_1_6 = asap_df[asap_df["essay_set"] <= 6]
asap_ids = set(asap_sets_1_6["essay_id"])

asap_pp_ids = set()
asap_pp_row_count = 0
for prompt_num, df in asap_pp_dfs.items():
    # The per-prompt files name their ID column either "EssayID" or "Essay ID".
    id_col = "EssayID" if "EssayID" in df.columns else "Essay ID"
    asap_pp_ids.update(df[id_col].tolist())
    # Track raw rows as well: collapsing into a set hides repeated IDs.
    asap_pp_row_count += len(df)

overlap = asap_ids & asap_pp_ids
only_in_asap = asap_ids - asap_pp_ids
only_in_asap_pp = asap_pp_ids - asap_ids

# Guard the ratio so an empty ASAP selection reports 0% instead of raising
# ZeroDivisionError.
match_rate = len(overlap) / len(asap_ids) * 100 if asap_ids else 0.0

print(f"ASAP essays (sets 1-6): {len(asap_ids)}")
print(f"ASAP++ essays: {len(asap_pp_ids)}")
print(f"Overlapping IDs: {len(overlap)}")
print(f"Match rate: {match_rate:.1f}%")

# The one-decimal match rate can round up to 100.0% while a few IDs are still
# unmatched, so report the leftovers on each side explicitly (capped so a
# large mismatch does not flood the output).
MAX_IDS_SHOWN = 10
print(f"IDs only in ASAP: {len(only_in_asap)} {list(only_in_asap)[:MAX_IDS_SHOWN]}")
print(f"IDs only in ASAP++: {len(only_in_asap_pp)} {list(only_in_asap_pp)[:MAX_IDS_SHOWN]}")
print(f"Repeated IDs across ASAP++ rows: {asap_pp_row_count - len(asap_pp_ids)}")

ASAP essays (sets 1-6): 10684
ASAP++ essays: 10685
Overlapping IDs: 10683
Match rate: 100.0%


In [26]:
# Load the two rubric definitions and list each criterion with its score range.
rubric_1_path = BASE_DIR / "training" / "rubrics" / "1.json"
rubric_2_path = BASE_DIR / "training" / "rubrics" / "2.json"

# Decode the JSON text as UTF-8 explicitly; without `encoding`, open() uses the
# platform's locale encoding, which is not UTF-8 everywhere.
with open(rubric_1_path, encoding="utf-8") as f:
    rubric_1 = json.load(f)
with open(rubric_2_path, encoding="utf-8") as f:
    rubric_2 = json.load(f)

# One loop serves both rubrics; the headings are the only difference.
rubric_sections = (
    ("Rubric 1 (Prompts 1-2) criteria:", rubric_1),
    ("\nRubric 2 (Prompts 3-6) criteria:", rubric_2),
)
for heading, rubric in rubric_sections:
    print(heading)
    for c in rubric["criteria"]:
        # Score range = lowest and highest "score" among the criterion's levels.
        levels = [level["score"] for level in c["levels"]]
        print(f"* {c['name']} (id={c['id']}): scores {min(levels)}-{max(levels)}")

Rubric 1 (Prompts 1-2) criteria:
* Content (id=1): scores 1-6
* Organization (id=2): scores 1-6
* Word Choice (id=3): scores 1-6
* Sentence Fluency (id=4): scores 1-6
* Conventions (id=5): scores 1-6

Rubric 2 (Prompts 3-6) criteria:
* Content (id=1): scores 0-3
* Prompt Adherence (id=2): scores 0-3
* Language (id=3): scores 0-3
* Narrativity (id=4): scores 0-3


## Output Format

In [27]:
# Load the sample output file and show its schema plus the first record.
sample_path = BASE_DIR / "training" / "sample.json"
# Decode the JSON text as UTF-8 explicitly; without `encoding`, open() uses the
# platform's locale encoding, which is not UTF-8 everywhere.
with open(sample_path, encoding="utf-8") as f:
    sample = json.load(f)

print(f"Sample contains {len(sample)} essays\n")
print("Schema: id, prompt_id, rubric_id, essay, scores")
print("* scores: [{criteria_id, level_id}, ...]\n")

# Walk through the fields of the first record.
example = sample[0]
print(f"Example essay (id={example['id']}):")
print(f"* prompt_id: {example['prompt_id']}")
print(f"* rubric_id: {example['rubric_id']}")
print(f"* essay: {example['essay']}")
print(f"* scores: {example['scores']}")

Sample contains 3 essays

Schema: id, prompt_id, rubric_id, essay, scores
* scores: [{criteria_id, level_id}, ...]

Example essay (id=1):
* prompt_id: 1
* rubric_id: 1
* essay: Dear local newspaper, I think effects computers have on people are great learning skills/affects because they give us time to chat with friends/new people, helps us learn about the globe(astronomy) and keeps us out of troble! Thing about! Dont you think so? How would you feel if your teenager is always on the phone with friends! Do you ever time to chat with your friends or buisness partner about things. Well now - there's a new way to chat the computer, theirs plenty of sites on the internet to do so: @ORGANIZATION1, @ORGANIZATION2, @CAPS1, facebook, myspace ect. Just think now while your setting up meeting with your boss on the computer, your teenager is having fun on the phone not rushing to get off cause you want to use it. How did you learn about other countrys/states outside of yours? Well I have by comput

In [28]:
# Load the merged essay records and summarise how they are distributed over
# prompts and rubrics.
essays_path = BASE_DIR / "training" / "essays.json"
# Decode the JSON text as UTF-8 explicitly; without `encoding`, open() uses the
# platform's locale encoding, which is not UTF-8 everywhere.
with open(essays_path, encoding="utf-8") as f:
    essays = json.load(f)

essays_df = pd.DataFrame(essays)

print(f"Total merged essays: {len(essays_df)}")
print("\nDistribution by prompt_id:")
print(essays_df["prompt_id"].value_counts().sort_index())
print("\nDistribution by rubric_id:")
print(essays_df["rubric_id"].value_counts().sort_index())

Total merged essays: 10683

Distribution by prompt_id:
prompt_id
1    1783
2    1799
3    1726
4    1770
5    1805
6    1800
Name: count, dtype: int64

Distribution by rubric_id:
rubric_id
1    3582
2    7101
Name: count, dtype: int64
