In [15]:
# Import relevant libraries
import pandas as pd

In [16]:
# Read the master dataset from disk into the working frame `df`
MASTER_CSV = '../data/new/master_final.csv'
df = pd.read_csv(MASTER_CSV)

# Last expression of the cell: summary statistics shown as the cell output
df.describe()

Unnamed: 0,answer_id,user_id,school_id,user_level,question_id,difficulty,topic_id,subject_id,axis_id,guide_id,...,student_subject_attempts,student_subject_correct,student_axis_attempts,student_axis_correct,question_attempts_count,question_success_count,question_success_ratio,avg_question_time,user_level_percentile,student_age
count,196083.0,196083.0,167553.0,196083.0,196083.0,196083.0,196083.0,196083.0,196083.0,196083.0,...,196083.0,196083.0,196083.0,196083.0,196083.0,196083.0,196083.0,196083.0,196083.0,166972.0
mean,1461729.0,37926.608044,97.705717,-0.422881,22661.937501,-1.290956,525.486717,145.025836,23.74111,394082.718288,...,20.169571,15.066956,61.117415,46.437794,34.567994,24.812166,0.698035,57.094943,49.948445,15.576923
std,80772.67,15973.799295,64.290289,1.594154,4073.089612,1.161867,68.331395,32.878704,0.945592,14126.80479,...,25.020991,19.622534,73.8405,60.747589,36.697974,28.59641,0.238587,42.779977,29.042479,1.872362
min,1321329.0,2315.0,1.0,-3.0,15654.0,-3.0,409.0,113.0,23.0,371032.0,...,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,10.0
25%,1392200.0,22057.0,57.0,-1.662527,19264.0,-2.067897,462.0,116.0,23.0,381112.0,...,5.0,3.0,15.0,10.0,9.0,5.0,0.555556,31.13865,25.0,14.0
50%,1462145.0,50099.0,91.0,-0.5188,23449.0,-1.410794,540.0,128.0,23.0,393880.0,...,12.0,8.0,38.0,27.0,23.0,15.0,0.741379,49.055804,50.0,16.0
75%,1530326.0,51893.0,105.0,0.648684,26200.0,-0.932703,572.0,181.0,24.0,406650.0,...,26.0,19.0,80.0,60.0,47.0,33.0,0.883528,72.066864,75.0,17.0
max,1603613.0,53189.0,238.0,3.0,29256.0,2.994811,667.0,199.0,26.0,419681.0,...,313.0,221.0,817.0,768.0,274.0,225.0,1.0,596.291865,97.0,18.0


In [17]:
# Keep only answers that belong to a (user_id, topic_id) pair with at least
# MIN_ANSWERS_PER_PAIR rows, printing the row count before and after.
MIN_ANSWERS_PER_PAIR = 10

# Print initial number of rows
print(f"Initial number of rows: {len(df)}")

# Size of each row's (user_id, topic_id) group, aligned to df's own index.
# Filtering with this mask replaces the count -> filter -> merge round-trip,
# so the surviving rows always keep their original relative order and no join
# is involved.
pair_sizes = df.groupby(['user_id', 'topic_id'])['user_id'].transform('size')

# Rebinds `df` to the filtered frame with a fresh 0..n-1 index; the unfiltered
# data has to be reloaded from disk if it is needed again.
df = df[pair_sizes >= MIN_ANSWERS_PER_PAIR].reset_index(drop=True)

# Print final number of rows
print(f"Final number of rows: {len(df)}")



Initial number of rows: 196083
Final number of rows: 93588


In [20]:
# Build a stratified sample of unique (user_id, topic_id) rows, print sanity
# checks on it, and save it to CSV.
ROWS_PER_STRATUM = 25   # max rows drawn from each (axis_id, is_correct) group
MAX_SAMPLE_ROWS = 200   # overall cap applied after stratification
SAMPLE_SEED = 42        # fixed seed so the draw is reproducible

# First get unique user-topic combinations (the first row of each pair is kept)
unique_pairs = df.drop_duplicates(subset=['user_id', 'topic_id'])

# Stratify by axis_id and is_correct. Each group is sampled directly instead of
# through groupby(...).apply(lambda ...): that form operates on the grouping
# columns, which pandas has deprecated, and would drop axis_id / is_correct
# from the result once the deprecation is enforced. Every group is drawn with
# the same seed, so the selected rows match the apply-based version.
strata = [
    stratum.sample(n=min(ROWS_PER_STRATUM, len(stratum)), random_state=SAMPLE_SEED)
    for _, stratum in unique_pairs.groupby(['axis_id', 'is_correct'])
]

# Strata are concatenated in sorted key order. pd.concat rejects an empty
# list, so fall back to an empty frame with the same columns in that case.
if strata:
    sample_df = pd.concat(strata, ignore_index=True)
else:
    sample_df = unique_pairs.iloc[0:0].reset_index(drop=True)

# Cap the total size. NOTE: if the strata ever add up to more than
# MAX_SAMPLE_ROWS, this truncates the last strata instead of sampling evenly.
sample_df = sample_df.head(MAX_SAMPLE_ROWS)

# Verify distributions
print("\nAxis ID distribution:")
print(sample_df['axis_id'].value_counts(normalize=True))

print("\nCorrect answer distribution:")
print(sample_df['is_correct'].value_counts(normalize=True))

print("\nUser level distribution:")
print(sample_df['user_level'].describe())

# Verify uniqueness
print("\nUnique values:")
print(f"Users: {sample_df['user_id'].nunique()}")
print(f"Topics: {sample_df['topic_id'].nunique()}")

# Each (user_id, topic_id) pair must appear at most once in the sample
assert not sample_df.duplicated(subset=['user_id', 'topic_id']).any(), \
    "sample contains a repeated (user_id, topic_id) pair"

# Save sample to CSV
# NOTE(review): the file name says "50" but the sample holds up to
# MAX_SAMPLE_ROWS rows; the path is left unchanged in case something else
# reads it - confirm and rename if safe.
sample_df.to_csv('../data/new/sample_50.csv', index=False)



Axis ID distribution:
axis_id
23    0.25
24    0.25
25    0.25
26    0.25
Name: proportion, dtype: float64

Correct answer distribution:
is_correct
False    0.5
True     0.5
Name: proportion, dtype: float64

User level distribution:
count    200.000000
mean      -0.293022
std        1.516375
min       -3.000000
25%       -1.472716
50%       -0.199829
75%        0.680723
max        3.000000
Name: user_level, dtype: float64

Unique values:
Users: 159
Topics: 86


  .apply(lambda x: x.sample(n=min(25, len(x)), random_state=42))


In [21]:
# Source column -> exported (camelCase) column name.
# The insertion order of this mapping is also the column order of the export.
columns_mapping = {
    # answer outcome
    "answer_id": "answerId",
    "is_correct": "isCorrect",
    # question and the two answer letters
    "question_title": "questionTitle",
    "correct_option": "correctOptionLetter",
    "student_answer": "studentAnswerLetter",
    # student attributes
    "student_age": "studentAge",
    "user_level": "userLevel",
    "user_level_percentile": "userLevelPercentile",
    # names of the topic / subject / axis
    "topic_name": "topicName",
    "subject_name": "subjectName",
    "axis_name": "axisName",
    # answer options
    "option_a": "optionA",
    "option_b": "optionB",
    "option_c": "optionC",
    "option_d": "optionD",
    "option_e": "optionE",
}

# Keep only the mapped columns (raises KeyError if any is missing from
# sample_df), then apply the new names.
export_df = (
    sample_df
    .loc[:, list(columns_mapping)]
    .rename(columns=columns_mapping)
)

# Write the renamed sample without the index column
export_df.to_csv('../data/new/sample_200_renamed.csv', index=False)

