In [1]:
import pickle
import pprint
from datetime import datetime
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import polars as pl
import pytz
import seaborn as sns
from omegaconf import OmegaConf
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder

from src.config import cfg
from src.dir import create_dir
from src.seed import seed_everything

# Label this run with the name of the directory the notebook is executed from.
experiment_dir = Path().resolve()
cfg.exp_number = experiment_dir.name

# Print the configuration as YAML with interpolations resolved.
config_yaml = OmegaConf.to_yaml(cfg, resolve=True)
print(config_yaml)

# seed_everything is a project helper; presumably it seeds the RNGs in use — verify in src.seed.
seed_everything(cfg.seed)


exp_number: '000'
run_time: base
data:
  input_root: ../../data/input
  train_path: ../../data/input/train.csv
  test_path: ../../data/input/test.csv
  sample_submission_path: ../../data/input/sample_submission.csv
  mapping_path: ../../data/input/misconception_mapping.csv
  output_root: ../../data/output
  results_root: ../../results
  results_path: ../../results/000/base
seed: 42



### データの読み込み

In [2]:
# Load the four input CSVs from the configured paths, in the same order as
# the names on the left-hand side. Every file is read with identical options.
train_df, test_df, sample_submission_df, mapping_df = (
    pl.read_csv(csv_path, try_parse_dates=True)
    for csv_path in (
        cfg.data.train_path,
        cfg.data.test_path,
        cfg.data.sample_submission_path,
        cfg.data.mapping_path,
    )
)

# Combining train and test is currently disabled:
# train_test_df = pl.concat([train_df, test_df], how="diagonal")


### train

In [3]:
# Raise the displayed string length to 1000 characters so long text columns are shown in full.
pl.Config.set_fmt_str_lengths(1000)
# Preview the first three training rows.
train_df.head(3)


QuestionId,ConstructId,ConstructName,SubjectId,SubjectName,CorrectAnswer,QuestionText,AnswerAText,AnswerBText,AnswerCText,AnswerDText,MisconceptionAId,MisconceptionBId,MisconceptionCId,MisconceptionDId
i64,i64,str,i64,str,str,str,str,str,str,str,f64,f64,f64,f64
0,856,"""Use the order of operations to carry out calculations involving powers""",33,"""BIDMAS""","""A""","""\[ 3 \times 2+4-5 \] Where do the brackets need to go to make the answer equal \( 13 \) ?""","""\( 3 \times(2+4)-5 \)""","""\( 3 \times 2+(4-5) \)""","""\( 3 \times(2+4-5) \)""","""Does not need brackets""",,,,1672.0
1,1612,"""Simplify an algebraic fraction by factorising the numerator""",1077,"""Simplifying Algebraic Fractions""","""D""","""Simplify the following, if possible: \( \frac{m^{2}+2 m-3}{m-3} \)""","""\( m+1 \)""","""\( m+2 \)""","""\( m-1 \)""","""Does not simplify""",2142.0,143.0,2142.0,
2,2774,"""Calculate the range from a list of data""",339,"""Range and Interquartile Range from a List of Data""","""B""","""Tom and Katie are discussing the \( 5 \) plants with these heights: \( 24 \mathrm{~cm}, 17 \mathrm{~cm}, 42 \mathrm{~cm}, 26 \mathrm{~cm}, 13 \mathrm{~cm} \) Tom says if all the plants were cut in half, the range wouldn't change. Katie says if all the plants grew by \( 3 \mathrm{~cm} \) each, the range wouldn't change. Who do you agree with?""","""Only Tom""","""Only Katie""","""Both Tom and Katie""","""Neither is correct""",1287.0,,1287.0,1073.0


In [4]:
# Dimensions of the training frame as (rows, columns).
train_df.shape


(1869, 15)

In [5]:
# Column names and dtypes of the training frame.
train_df.schema


Schema([('QuestionId', Int64),
        ('ConstructId', Int64),
        ('ConstructName', String),
        ('SubjectId', Int64),
        ('SubjectName', String),
        ('CorrectAnswer', String),
        ('QuestionText', String),
        ('AnswerAText', String),
        ('AnswerBText', String),
        ('AnswerCText', String),
        ('AnswerDText', String),
        ('MisconceptionAId', Float64),
        ('MisconceptionBId', Float64),
        ('MisconceptionCId', Float64),
        ('MisconceptionDId', Float64)])

In [6]:
# Per-column summary statistics (count, null_count, mean, std, min, quartiles, max).
train_df.describe()


statistic,QuestionId,ConstructId,ConstructName,SubjectId,SubjectName,CorrectAnswer,QuestionText,AnswerAText,AnswerBText,AnswerCText,AnswerDText,MisconceptionAId,MisconceptionBId,MisconceptionCId,MisconceptionDId
str,f64,f64,str,f64,str,str,str,str,str,str,str,f64,f64,f64,f64
"""count""",1869.0,1869.0,"""1869""",1869.0,"""1869""","""1869""","""1869""","""1869""","""1869""","""1869""","""1869""",1135.0,1118.0,1080.0,1037.0
"""null_count""",0.0,0.0,"""0""",0.0,"""0""","""0""","""0""","""0""","""0""","""0""","""0""",734.0,751.0,789.0,832.0
"""mean""",934.0,1613.261637,,225.370787,,,,,,,,1308.599119,1308.029517,1285.300926,1264.568949
"""std""",539.678145,1060.591804,,238.536233,,,,,,,,744.51837,766.487351,742.212198,759.818341
"""min""",0.0,4.0,"""Add algebraic fractions where one denominator is a multiple of the other""",33.0,"""2D Names and Properties of Shapes-Others""","""A""","""![2 arrows pointing to the right. + 7 in the first arrow. divide by 2 in the second arrow]() The rule above is used to generate a term-to-term sequence. If the third term is \( 9 \), what is the first term?""","""![ Long multiplication for 72 multiplied by 36 with incorrect working and incorrect final answer. First row of working is incorrect: 4 2 2. Second row of working is incorrect: 2 7. Final answer is incorrect: 4 4 9.]()""","""![ Long multiplication for 72 multiplied by 36 with incorrect working and incorrect final answer. First row of working is correct: 4 3 2. Second row of working is incorrect: 2 1 6. Final answer is incorrect: 6 4 8.]()""","""![ Long multiplication for 72 multiplied by 36 with correct working and incorrect final answer. First row of working is correct: 4 3 2. Second row of working is correct: 2 1 6 0. Final answer is incorrect: 2 7 0 0.]()""","""![ Long multiplication for 72 multiplied by 36 with correct working and correct final answer. First row of working is correct: 4 3 2. Second row of working is correct: 2 1 6 0. Final answer is correct: 2 5 9 2.]()""",1.0,1.0,2.0,0.0
"""25%""",467.0,575.0,,92.0,,,,,,,,686.0,625.0,655.0,578.0
"""50%""",934.0,1470.0,,203.0,,,,,,,,1336.0,1379.0,1295.0,1282.0
"""75%""",1401.0,2637.0,,238.0,,,,,,,,1954.0,1970.0,1911.0,1897.0
"""max""",1868.0,3526.0,"""Write the next term of an ascending integer linear sequence""",1984.0,"""Written Subtraction""","""D""","""input \(\Rightarrow-3 \Rightarrow \times 3 \Rightarrow 9a-21 \) What is the input of this function machine?""","""£6""","""£4.80""","""£5.80""","""ССССССССС""",2585.0,2586.0,2585.0,2583.0


In [7]:
# Number of distinct values in each column of the training frame.
train_df.select(pl.all().n_unique())


QuestionId,ConstructId,ConstructName,SubjectId,SubjectName,CorrectAnswer,QuestionText,AnswerAText,AnswerBText,AnswerCText,AnswerDText,MisconceptionAId,MisconceptionBId,MisconceptionCId,MisconceptionDId
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
1869,757,757,163,163,4,1857,1219,1230,1222,1184,741,728,696,674


### misconception_mappingにtrainの情報を追加
- SubjectNameの情報を追加

In [8]:
def _subject_names_by_misconception(df: pl.DataFrame, option: str) -> pl.DataFrame:
    """Collect the distinct subject names seen with each misconception ID of one answer option.

    Args:
        df: Frame containing a ``SubjectName`` column and a
            ``Misconception{option}Id`` column.
        option: Answer option letter (``"A"``, ``"B"``, ``"C"`` or ``"D"``).

    Returns:
        A frame sorted by misconception ID with two columns:
        ``MisconceptionId`` (cast to Int64) and ``SubjectName_{option}``
        (the unique subject names aggregated for that ID).
    """
    id_col = f"Misconception{option}Id"
    return (
        # NOTE(review): rows whose ID is null are not filtered out here, so they
        # presumably end up in a null-key group — confirm this is harmless for
        # the later join on MisconceptionId.
        df.group_by(id_col)
        .agg(pl.col("SubjectName").unique().alias(f"SubjectName_{option}"))
        .sort(id_col)
        .rename({id_col: "MisconceptionId"})
        # The ID columns are Float64 in train_df; cast so the key matches the
        # integer MisconceptionId used by the mapping table.
        .with_columns(pl.col("MisconceptionId").cast(pl.Int64))
    )


# One lookup table per answer option; the names are consumed by the join in the next cell.
sub_a_df = _subject_names_by_misconception(train_df, "A")
sub_b_df = _subject_names_by_misconception(train_df, "B")
sub_c_df = _subject_names_by_misconception(train_df, "C")
sub_d_df = _subject_names_by_misconception(train_df, "D")


In [9]:
# Spine of the lookup: one row per misconception ID in the mapping table.
# (Despite the name, select() returns a one-column DataFrame, not a Series.)
mis_id_series = mapping_df.select("MisconceptionId")

mis_id_and_subject_name_df = (
    # Left joins keep every misconception ID, including those never seen in train.
    mis_id_series.join(sub_a_df, on="MisconceptionId", how="left")
    .join(sub_b_df, on="MisconceptionId", how="left")
    .join(sub_c_df, on="MisconceptionId", how="left")
    .join(sub_d_df, on="MisconceptionId", how="left")
    # IDs missing from an option's table come back as null; turn them into
    # empty lists so the concatenation below yields a list for every row.
    .with_columns(pl.all().exclude("MisconceptionId").fill_null([]))
    .select(
        "MisconceptionId",
        # Merge the four per-option lists into one and drop duplicates.
        # NOTE(review): list.unique() is called without maintain_order, so the
        # element order may not be stable between runs — confirm whether
        # downstream consumers need a fixed order.
        pl.concat_list("SubjectName_A", "SubjectName_B", "SubjectName_C", "SubjectName_D")
        .list.unique()
        .alias("SubjectNames"),
    )
)


In [10]:
# Attach the collected subject names to the misconception mapping (left join on MisconceptionId).
mapping_meta_df = mapping_df.join(mis_id_and_subject_name_df, on="MisconceptionId", how="left")

# Display the enriched mapping table.
mapping_meta_df


MisconceptionId,MisconceptionName,SubjectNames
i64,str,list[str]
0,"""Does not know that angles in a triangle sum to 180 degrees""","[""Angles in Triangles""]"
1,"""Uses dividing fractions method for multiplying fractions""","[""Multiplying Fractions"", ""Multiplying and Dividing Negative Numbers""]"
2,"""Believes there are 100 degrees in a full turn""","[""Types, Naming and Estimating"", ""Measuring Angles""]"
3,"""Thinks a quadratic without a non variable term, can not be factorised""","[""Factorising into a Single Bracket""]"
4,"""Believes addition of terms and powers of terms are equivalent e.g. a + c = a^c""","[""Simplifying Expressions by Collecting Like Terms""]"
…,…,…
2582,"""When multiplying numbers with the same base, multiplies the powers """,[]
2583,"""Does not know what a cube number is""","[""Squares, Cubes, etc"", ""Square Roots, Cube Roots, etc""]"
2584,"""Believes that any percentage of a larger number will be greater than any percentage of a smaller number""",[]
2585,"""Believes a cubic expression should have three terms""","[""Expanding Triple Brackets and more""]"


In [11]:
# Inspect the subject list of a single mapping row (index 5).
subject_names_column = mapping_meta_df.get_column("SubjectNames")
tmp = subject_names_column.to_numpy()[5]
print(tmp)

# Walk the list element by element; nothing is printed when it is empty.
for subject_name in tmp:
    print(subject_name)

# Idea: with a conditional in a Jinja2 template, the prompt could be changed when the list is [].


[]


In [12]:
# TODO: In the same way, add information on which ConstructName values correspond to each misconception.
