# 01_data_ingestion
Research notebook to read and explore the ingested data from the Reasona project.

In [1]:
from pathlib import Path
import pandas as pd
from Reasona.utils.logger import setup_logger

# Research log destination for this notebook.
logger = setup_logger('../logs/research/research.log')

# Combined parquet file read by this notebook (path relative to the notebook).
COMBINED_FILE = Path('../artifacts/data_ingestion/combined/combined_data.parquet')

# Fail fast: every later cell uses `df`, so continuing without the file would
# only surface further down as a confusing NameError.
if not COMBINED_FILE.exists():
    logger.error(f"File not found: {COMBINED_FILE}")
    raise FileNotFoundError(f"File not found: {COMBINED_FILE}")

df = pd.read_parquet(COMBINED_FILE)
logger.info(f"Data loaded. Shape: {df.shape}")

# The preview must be the cell's last top-level expression to be rendered;
# nested inside an `else:` branch its value was silently discarded.
df.head()

[2025-12-09 16:54:06,421] Data loaded. Shape: (100, 14)


## Data Exploration

In [2]:
# Structural overview: column names, non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 14 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   synth_id             100 non-null    object
 1   language             100 non-null    object
 2   exercise             100 non-null    object
 3   model                100 non-null    object
 4   query                100 non-null    object
 5   query_seed_url       99 non-null     object
 6   query_seed_text      100 non-null    object
 7   additional_seed_url  100 non-null    object
 8   seed_license         100 non-null    object
 9   constraints          100 non-null    object
 10  script               100 non-null    object
 11  synthetic_reasoning  100 non-null    object
 12  synthetic_answer     100 non-null    object
 13  words                100 non-null    Int64 
dtypes: Int64(1), object(13)
memory usage: 11.2+ KB


In [3]:
# Per-column dtypes as a Series (same dtype information that df.info() prints).
df.dtypes

synth_id               object
language               object
exercise               object
model                  object
query                  object
query_seed_url         object
query_seed_text        object
additional_seed_url    object
seed_license           object
constraints            object
script                 object
synthetic_reasoning    object
synthetic_answer       object
words                   Int64
dtype: object

In [4]:
# Summary statistics; by default describe() covers numeric columns only.
df.describe()

Unnamed: 0,words
count,100.0
mean,507.09
std,178.497664
min,151.0
25%,411.75
50%,521.5
75%,589.75
max,1464.0


In [5]:
# Check key uniqueness on the first identifier column that is present. This
# dataset's identifier column is `synth_id`, so testing only for 'id' never
# matched and the key-based check was skipped.
id_col = next((col for col in ('id', 'synth_id') if col in df.columns), None)

if id_col is not None:
    dup_count = df.duplicated(subset=[id_col]).sum()
    print(f"Duplicate rows based on '{id_col}': {dup_count}")
else:
    # No known identifier column: fall back to comparing entire rows.
    dup_count = df.duplicated().sum()
    print(f"Full duplicate rows: {dup_count}")

Full duplicate rows: 0


In [6]:

# Print the ten most frequent values of every object-dtype column.
object_columns = df.select_dtypes(include='object').columns
for column_name in object_columns:
    print(f"\nValue counts for {column_name}:")
    top_values = df[column_name].value_counts().head(10)
    print(top_values)


Value counts for synth_id:
synth_id
recipes_2608                          1
memorization_88_77220                 1
mcq_math_4_240373                     1
memorization_specialized_3_93_3003    1
memorization_specialized_6_83_2646    1
memorization_spanish_3_138177         1
memorization_spanish_1_126231         1
memorization_latin_5_181503           1
mcq_math_4_84343                      1
memorization_german_8_203144          1
Name: count, dtype: int64

Value counts for language:
language
en    77
es     6
pl     5
fr     4
de     3
it     3
la     2
Name: count, dtype: int64

Value counts for exercise:
exercise
memorization        88
cooking              3
math mcq             3
creative writing     3
mcq                  2
math exercise        1
Name: count, dtype: int64

Value counts for model:
model
qwen-3-8b-memorization                              88
qwen-3-8b-memorization+seed rewriting with Qwen3     3
qwen-3-8b-mcq-math+deepseek-prover-8b-solving        3
qwen-3-8b-crea