# PISA 2022 Amazon SageMaker Linear Learner

More info on SageMaker Immersion Day: [Workshop Link](https://catalog.us-east-1.prod.workshops.aws/workshops/63069e26-921c-4ce1-9cc7-dd882ff62575/en-US/lab2-model-training/pro-code)


### ***Change country name below!***

In [1]:
country_name = 'United_States'

In [2]:
country_name_edited = country_name.replace("_", "-")

In [3]:
# cell 02
import sagemaker
bucket=sagemaker.Session().default_bucket()
prefix = 'sagemaker/factorization-mick-'+country_name_edited
 
# Define IAM role
import boto3
import re
from sagemaker import get_execution_role

role = get_execution_role()

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


Now let's bring in the Python libraries that we'll use throughout the analysis

In [4]:
# cell 03
import numpy as np                                # For matrix operations and numerical processing
import pandas as pd                               # For munging tabular data
import matplotlib.pyplot as plt                   # For charts and visualizations
from IPython.display import Image                 # For displaying images in the notebook
from IPython.display import display               # For displaying outputs in the notebook
from time import gmtime, strftime                 # For labeling SageMaker models, endpoints, etc.
import sys                                        # For writing outputs to notebook
import math                                       # For ceiling function
import json                                       # For parsing hosting outputs
import os                                         # For manipulating filepath names
import sagemaker 
import zipfile     # Amazon SageMaker's Python SDK provides many helper functions

#### Download PISA 2022 Prepared Dataset

This is our dataset output from our cleaned notebook [here](https://7z4vtvpqcoxouiu.studio.us-west-2.sagemaker.aws/jupyterlab/default/lab/tree/RTC%3Amids-capstone/notebooks/eda/Data_merging.ipynb)


In [5]:
%%time 

# cell 06

# Define local file path
local_file_path = "../eda/with-wle-latent/new_PISA_cleaned_dataset.csv"  # Change as needed

# Define S3 details
bucket_name = "sagemaker-us-west-2-986030204467"
file_key = "capstone/testfiles/new_PISA_cleaned_dataset.csv"

# Check if the file exists locally
if os.path.exists(local_file_path):
    print("📂 Loading data from local file...")
    data = pd.read_csv(local_file_path, usecols=None)
    
else:
    print("☁️ Downloading data from S3...")
    
    # Create S3 client
    s3_client = boto3.client("s3")

    # Download the file from S3
    response = s3_client.get_object(Bucket=bucket_name, Key=file_key)

    # Read the file into pandas DataFrame
    data = pd.read_csv(response["Body"], usecols=None)

    # Save a local copy for future use
    data.to_csv(local_file_path, index=False)
    print(f"✅ File saved locally as {local_file_path}")

# Display first few rows
#data.head()

pd.set_option('display.max_columns', 500)     # Make sure we can see all of the columns
pd.set_option('display.max_rows', 20)         # Keep the output on one page
data

📂 Loading data from local file...
CPU times: user 52 s, sys: 5.51 s, total: 57.5 s
Wall time: 1min 1s


Unnamed: 0,CNT,CNTSCHID,CNTSTUID,MATH_Proficient,SISCO,ST250Q01JA,ST250Q02JA,ST250Q03JA,ST250Q04JA,ST250Q05JA,ST251Q01JA,ST251Q02JA,ST251Q03JA,ST251Q04JA,ST251Q06JA,ST251Q07JA,ST253Q01JA,ST254Q01JA,ST254Q02JA,ST254Q03JA,ST254Q04JA,ST254Q05JA,ST254Q06JA,ST255Q01JA,ST256Q01JA,ST256Q02JA,ST256Q03JA,ST256Q06JA,ST256Q07JA,ST256Q08JA,ST256Q09JA,ST256Q10JA,ST267Q01JA,ST267Q02JA,ST267Q03JA,ST267Q04JA,ST267Q05JA,ST267Q06JA,ST267Q07JA,ST267Q08JA,ST034Q01TA,ST034Q02TA,ST034Q03TA,ST034Q04TA,ST034Q05TA,ST034Q06TA,ST038Q03NA,ST038Q04NA,ST038Q05NA,ST038Q06NA,ST038Q07NA,ST038Q08NA,ST038Q09JA,ST038Q10JA,ST038Q11JA,ST265Q01JA,ST265Q02JA,ST265Q03JA,ST265Q04JA,ST266Q01JA,ST266Q02JA,ST266Q03JA,ST266Q04JA,ST266Q05JA,ST307Q01JA,ST307Q02JA,ST307Q03JA,ST307Q04JA,ST307Q05JA,ST307Q06JA,ST307Q07JA,ST307Q08JA,ST307Q09JA,ST307Q10JA,ST301Q01JA,ST301Q02JA,ST301Q03JA,ST301Q04JA,ST301Q05JA,ST301Q06JA,ST301Q07JA,ST301Q08JA,ST301Q09JA,ST301Q10JA,ST343Q01JA,ST343Q02JA,ST343Q03JA,ST343Q04JA,ST343Q05JA,ST343Q06JA,ST343Q07JA,ST343Q08JA,ST343Q09JA,ST343Q10JA,ST311Q01JA,ST311Q02JA,ST311Q03JA,ST311Q04JA,ST311Q05JA,ST311Q06JA,ST311Q07JA,ST311Q08JA,ST311Q09JA,ST311Q10JA,ST305Q01JA,ST305Q02JA,ST305Q03JA,ST305Q04JA,ST305Q05JA,ST305Q06JA,ST305Q07JA,ST305Q08JA,ST305Q09JA,ST305Q10JA,ST345Q01JA,ST345Q02JA,ST345Q03JA,ST345Q04JA,ST345Q05JA,ST345Q06JA,ST345Q07JA,ST345Q08JA,ST345Q09JA,ST345Q10JA,ST313Q01JA,ST313Q02JA,ST313Q03JA,ST313Q04JA,ST313Q05JA,ST313Q06JA,ST313Q07JA,ST313Q08JA,ST313Q09JA,ST313Q10JA,ST263Q02JA,ST263Q04JA,ST263Q06JA,ST263Q08JA,ST273Q01JA,ST273Q02JA,ST273Q03JA,ST273Q04JA,ST273Q05JA,ST273Q06JA,ST273Q07JA,ST270Q01JA,ST270Q02JA,ST270Q03JA,ST270Q04JA,ST285Q01JA,ST285Q02JA,ST285Q03JA,ST285Q04JA,ST285Q05JA,ST285Q06JA,ST285Q07JA,ST285Q08JA,ST285Q09JA,ST283Q01JA,ST283Q02JA,ST283Q03JA,ST283Q04JA,ST283Q05JA,ST283Q06JA,ST283Q07JA,ST283Q08JA,ST283Q09JA,ST275Q01WA,ST275Q02WA,ST275Q03WA,ST275Q04WA,ST275Q05WA,ST275Q06WA,ST275Q07WA,ST275Q08WA,ST275Q09WA,ST276Q01JA,ST276Q02JA,ST276Q03JA,ST276Q04JA,ST276Q05JA,ST276Q06JA,ST276Q07JA,ST276Q08JA,ST276Q09JA,ST276Q10JA,ST290Q01WA,ST290Q02WA,ST290Q03WA,ST290Q04WA,ST290Q05WA,ST290Q06WA,ST290Q07WA,ST290Q08WA,ST290Q09WA,ST291Q01JA,ST291Q02JA,ST291Q03JA,ST291Q04JA,ST291Q05JA,ST291Q06JA,ST291Q07JA,ST291Q08JA,ST291Q09JA,ST291Q10JA,ST289Q01WA,ST289Q02JA,ST289Q04JA,ST289Q05WA,ST289Q06JA,ST289Q07JA,ST289Q08WA,ST289Q09WA,ST289Q10WA,ST289Q14JA,ST293Q01JA,ST293Q02JA,ST293Q03JA,ST293Q05JA,ST293Q06JA,ST293Q07JA,ST293Q08JA,ST293Q09JA,ST292Q01JA,ST292Q02JA,ST292Q03JA,ST292Q04JA,ST292Q05JA,ST292Q06JA,ST334Q01JA,ST334Q02JA,ST334Q03JA,ST334Q04JA,ST334Q05JA,ST334Q06JA,ST334Q07JA,ST334Q08JA,ST334Q09JA,ST334Q10JA,ST335Q01JA,ST335Q02JA,ST335Q03JA,ST335Q05JA,ST335Q06JA,ST335Q07JA,ST336Q01JA,ST336Q03JA,ST336Q04JA,ST336Q05JA,ST336Q06JA,...,ST349Q01JA_2,ST349Q01JA_3,ST349Q01JA_4,ST349Q01JA_0,LANGN_105,LANGN_108,LANGN_118,LANGN_140,LANGN_148,LANGN_150,LANGN_156,LANGN_200,LANGN_204,LANGN_232,LANGN_273,LANGN_313,LANGN_316,LANGN_322,LANGN_329,LANGN_344,LANGN_351,LANGN_415,LANGN_463,LANGN_493,LANGN_496,LANGN_500,LANGN_520,LANGN_531,LANGN_602,LANGN_606,LANGN_615,LANGN_621,LANGN_625,LANGN_640,LANGN_641,LANGN_663,LANGN_669,LANGN_670,LANGN_800,LANGN_801,LANGN_802,LANGN_804,LANGN_805,LANGN_806,LANGN_807,LANGN_808,LANGN_865,LANGN_892,LANGN_895,LANGN_917,SC177Q01JA_1,SC177Q01JA_2,SC177Q01JA_3,SC177Q02JA_1,SC177Q02JA_2,SC177Q02JA_3,SC177Q03JA_1,SC177Q03JA_2,SC177Q03JA_3,MATHEXC_0,MATHEXC_1,MATHEXC_2,MATHEXC_3,SCHLTYPE_1,SCHLTYPE_2,SCHLTYPE_3,LANGN_121,LANGN_130,LANGN_137,LANGN_170,LANGN_244,LANGN_258,LANGN_263,LANGN_264,LANGN_266,LANGN_317,LANGN_340,LANGN_369,LANGN_381,LANGN_404,LANGN_420,LANGN_449,LANGN_467,LANGN_494,LANGN_495,LANGN_514,LANGN_523,LANGN_529,LANGN_540,LANGN_547,LANGN_600,LANGN_607,LANGN_618,LANGN_619,LANGN_630,LANGN_635,LANGN_650,LANGN_661,LANGN_673,LANGN_674,LANGN_809,LANGN_810,LANGN_811,LANGN_812,LANGN_813,LANGN_814,LANGN_815,LANGN_816,LANGN_818,LANGN_832,LANGN_868,LANGN_870,LANGN_920,LANGN_921,LANGN_113,LANGN_147,LANGN_275,LANGN_286,LANGN_363,LANGN_422,LANGN_434,LANGN_442,LANGN_471,LANGN_611,LANGN_614,LANGN_624,LANGN_642,LANGN_675,LANGN_676,LANGN_677,LANGN_678,LANGN_817,LANGN_819,LANGN_821,LANGN_823,LANGN_824,LANGN_825,LANGN_826,LANGN_827,LANGN_828,LANGN_885,LANGN_896,LANGN_916,LANGN_112,LANGN_154,LANGN_202,LANGN_246,LANGN_254,LANGN_272,LANGN_301,LANGN_325,LANGN_338,LANGN_358,LANGN_371,LANGN_375,LANGN_383,LANGN_409,LANGN_428,LANGN_465,LANGN_517,LANGN_527,LANGN_561,LANGN_562,LANGN_563,LANGN_565,LANGN_566,LANGN_567,LANGN_601,LANGN_622,LANGN_623,LANGN_628,LANGN_631,LANGN_831,LANGN_833,LANGN_836,LANGN_837,LANGN_838,LANGN_839,LANGN_840,LANGN_841,LANGN_845,LANGN_872,LANGN_873,LANGN_881,LANGN_890,LANGN_897,LANGN_898,LANGN_899,LANGN_900,LANGN_901,LANGN_902,LANGN_903,LANGN_904,LANGN_905,LANGN_906,LANGN_907,LANGN_908,LANGN_909,LANGN_910,LANGN_911,LANGN_912,LANGN_913,LANGN_914,LANGN_918,LANGN_919,LANGN_160,LANGN_327,LANGN_451,LANGN_474,LANGN_503,LANGN_608,LANGN_627,LANGN_639,LANGN_668,LANGN_842,LANGN_843,LANGN_844,LANGN_846,LANGN_849,LANGN_850,LANGN_851,LANGN_852,LANGN_861,LANGN_879,LANGN_133,LANGN_195,LANGN_237,LANGN_379,LANGN_382,LANGN_472,LANGN_492,LANGN_555,LANGN_605,LANGN_616,LANGN_626,LANGN_634,LANGN_648,LANGN_662,LANGN_665,LANGN_666,LANGN_667,LANGN_829,LANGN_854,LANGN_855,LANGN_857,LANGN_859,LANGN_860,LANGN_866,LANGN_877,LANGN_922
0,Albania,800282.0,800001.0,0.0,,,1.0,,,,,,,,4.0,,8.0,,,,,4.0,,4.0,,3.0,3.0,,,3.0,,,4.0,4.0,,1.0,3.0,3.0,,,3.0,1.0,3.0,3.0,2.0,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,2.0,2.0,2.0,2.0,,,,,,,,,,,,,1.0,5.0,5.0,5.0,,1.0,,,,1.0,5.0,,1.0,,1.0,5.0,,,1.0,,5.0,,,,1.0,5.0,,5.0,,,2.0,,2.0,4.0,2.0,,,3.0,,5.0,3.0,3.0,,,5.0,,,3.0,5.0,,,,5.0,,,3.0,5.0,3.0,4.0,3.0,3.0,4.0,,4.0,,4.0,4.0,3.0,3.0,1.0,1.0,1.0,1.0,,,5.0,,,1.0,5.0,5.0,5.0,5.0,5.0,5.0,,,,,5.0,5.0,,4.0,4.0,2.0,,2.0,,2.0,,,,1.0,,1.0,,1.0,,1.0,1.0,2.0,,,2.0,2.0,,,2.0,2.0,,,3.0,,3.0,,,,,,,,,,,,,,,,,,2.0,4.0,5.0,,5.0,,1.0,3.0,3.0,3.0,1.0,,,4.0,,4.0,4.0,4.0,,,4.0,,1.0,,3.0,2.0,4.0,4.0,3.0,3.0,3.0,4.0,4.0,...,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,Albania,800115.0,800002.0,0.0,,2.0,2.0,2.0,1.0,2.0,1.0,2.0,1.0,1.0,1.0,1.0,2.0,2.0,1.0,1.0,1.0,1.0,1.0,2.0,,,,2.0,,,,,,,,1.0,3.0,3.0,3.0,1.0,4.0,2.0,,3.0,1.0,4.0,1.0,4.0,,3.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,Albania,800242.0,800003.0,0.0,,1.0,1.0,1.0,1.0,1.0,2.0,3.0,3.0,3.0,2.0,2.0,8.0,2.0,2.0,1.0,4.0,2.0,,4.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,,,,,,,,,,,,,,,,,,,,,,,,1.0,1.0,1.0,,2.0,2.0,2.0,2.0,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,4.0,,1.0,4.0,1.0,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,4.0,,,,,,,,,,,,4.0,,,,,,,,,,,,,,,,,5.0,,,,,,,4.0,,,,,,,,,,4.0,,,,,,4.0,,4.0,,,,...,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,Albania,800245.0,800005.0,0.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,2.0,2.0,1.0,1.0,1.0,1.0,2.0,2.0,1.0,1.0,1.0,2.0,1.0,2.0,2.0,4.0,,,1.0,,4.0,4.0,1.0,4.0,2.0,2.0,,2.0,4.0,3.0,1.0,1.0,1.0,1.0,3.0,1.0,3.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,5.0,,5.0,,,3.0,3.0,,5.0,,2.0,3.0,,3.0,2.0,,,,,5.0,5.0,3.0,,1.0,,,3.0,,5.0,,4.0,,4.0,,1.0,4.0,,5.0,,,4.0,5.0,,,4.0,,,,4.0,4.0,,4.0,4.0,,,,4.0,4.0,,4.0,4.0,,,4.0,,4.0,4.0,4.0,,,3.0,3.0,2.0,4.0,1.0,4.0,4.0,,4.0,,4.0,1.0,1.0,1.0,1.0,1.0,,,1.0,,2.0,4.0,,3.0,2.0,3.0,2.0,3.0,,,,2.0,,3.0,,2.0,,3.0,,2.0,3.0,,,2.0,,2.0,2.0,2.0,,,,2.0,,2.0,,2.0,,,2.0,2.0,2.0,,,4.0,4.0,4.0,4.0,4.0,,,,,,4.0,4.0,,3.0,,,4.0,,3.0,3.0,,,,3.0,,4.0,,1.0,2.0,3.0,2.0,3.0,,2.0,,3.0,,,2.0,3.0,2.0,,2.0,3.0,3.0,,2.0,3.0,2.0,3.0,2.0,,2.0,...,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,Albania,800285.0,800006.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,3.0,1.0,2.0,3.0,1.0,4.0,7.0,2.0,2.0,2.0,1.0,1.0,3.0,2.0,1.0,2.0,1.0,2.0,2.0,1.0,2.0,2.0,4.0,,3.0,1.0,4.0,,,1.0,4.0,,2.0,4.0,2.0,4.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,2.0,2.0,2.0,2.0,,4.0,4.0,,4.0,,,,5.0,1.0,,5.0,3.0,,,,2.0,2.0,5.0,,4.0,,,4.0,,,2.0,5.0,5.0,,,4.0,4.0,4.0,,,,4.0,4.0,,3.0,,3.0,3.0,3.0,,,,5.0,,4.0,3.0,,3.0,4.0,,,,4.0,,5.0,,4.0,,3.0,,,,4.0,4.0,3.0,3.0,3.0,3.0,4.0,,3.0,3.0,,3.0,4.0,1.0,1.0,2.0,3.0,,,3.0,3.0,3.0,,,4.0,4.0,4.0,3.0,,3.0,,,,4.0,5.0,4.0,3.0,,,,3.0,3.0,,3.0,3.0,,,,,3.0,,3.0,3.0,4.0,1.0,,,,2.0,3.0,4.0,3.0,,3.0,,2.0,3.0,,,4.0,,4.0,,,5.0,4.0,4.0,,,,4.0,4.0,,,,3.0,3.0,3.0,,4.0,4.0,2.0,2.0,,3.0,3.0,4.0,,1.0,2.0,2.0,,,,4.0,3.0,,3.0,2.0,2.0,3.0,4.0,,2.0,2.0,2.0,3.0,,...,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
591852,Uzbekistan,86000120.0,86007488.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,1.0,2.0,2.0,1.0,2.0,4.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,,,,,,,,,4.0,,4.0,3.0,,2.0,2.0,,3.0,2.0,4.0,2.0,4.0,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,4.0,4.0,4.0,,,2.0,,,5.0,,,4.0,4.0,,,4.0,,2.0,4.0,,,,4.0,2.0,,,2.0,4.0,,2.0,2.0,,4.0,4.0,,,,2.0,,2.0,,2.0,2.0,,,4.0,2.0,,4.0,,2.0,,,2.0,2.0,,2.0,2.0,,,4.0,4.0,,,4.0,5.0,,,5.0,,4.0,2.0,2.0,2.0,4.0,,4.0,,4.0,4.0,4.0,1.0,1.0,1.0,1.0,,5.0,,,5.0,5.0,5.0,,5.0,5.0,5.0,5.0,,5.0,,,,5.0,2.0,,2.0,2.0,2.0,,2.0,,,4.0,,,4.0,4.0,,4.0,,4.0,,,,4.0,4.0,4.0,4.0,,,4.0,4.0,4.0,,,4.0,4.0,,,,4.0,,,5.0,5.0,,5.0,5.0,,,,5.0,,5.0,5.0,,5.0,,5.0,2.0,2.0,2.0,2.0,2.0,,,4.0,,4.0,,4.0,4.0,4.0,,,3.0,3.0,3.0,3.0,,3.0,3.0,,3.0,3.0,3.0,...,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
591853,Uzbekistan,86000140.0,86007489.0,0.0,,1.0,2.0,1.0,1.0,2.0,2.0,1.0,2.0,3.0,1.0,1.0,1.0,,,,,,,2.0,4.0,1.0,1.0,2.0,1.0,1.0,2.0,2.0,1.0,,,3.0,3.0,3.0,,,3.0,2.0,2.0,3.0,2.0,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,4.0,,,2.0,2.0,,1.0,2.0,,,,,,4.0,4.0,4.0,,,4.0,4.0,,2.0,4.0,,,4.0,2.0,,,2.0,2.0,2.0,,3.0,,,,2.0,,2.0,2.0,,,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
591854,Uzbekistan,86000024.0,86007490.0,0.0,1.0,1.0,2.0,1.0,2.0,2.0,1.0,2.0,2.0,2.0,1.0,4.0,5.0,3.0,1.0,1.0,1.0,2.0,2.0,3.0,2.0,2.0,2.0,3.0,1.0,2.0,2.0,4.0,,,,2.0,3.0,3.0,3.0,1.0,3.0,2.0,2.0,,2.0,3.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,,2.0,4.0,2.0,,,2.0,4.0,,,4.0,,2.0,4.0,,4.0,,,,4.0,4.0,,,4.0,,4.0,2.0,,,4.0,4.0,4.0,4.0,,,2.0,,,2.0,,,2.0,,2.0,2.0,,,,4.0,2.0,4.0,,,,4.0,4.0,2.0,,,2.0,4.0,4.0,,2.0,4.0,,,,2.0,,3.0,2.0,3.0,3.0,,3.0,,2.0,4.0,4.0,4.0,1.0,1.0,2.0,1.0,1.0,,4.0,5.0,,5.0,,,5.0,4.0,,,,4.0,4.0,,4.0,4.0,,4.0,4.0,,1.0,2.0,,,1.0,1.0,2.0,,,,3.0,,1.0,1.0,,,3.0,3.0,,4.0,,3.0,,4.0,3.0,,3.0,3.0,3.0,,,4.0,,,,4.0,,4.0,,,4.0,,,4.0,2.0,,,5.0,4.0,,,4.0,,3.0,3.0,3.0,3.0,3.0,,,,3.0,3.0,3.0,,,2.0,3.0,3.0,3.0,3.0,3.0,3.0,,3.0,3.0,3.0,2.0,,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
591855,Uzbekistan,86000174.0,86007491.0,0.0,,1.0,1.0,1.0,2.0,1.0,4.0,1.0,3.0,2.0,1.0,1.0,8.0,2.0,2.0,2.0,1.0,1.0,2.0,4.0,,3.0,3.0,2.0,1.0,2.0,4.0,4.0,4.0,4.0,,,4.0,4.0,,1.0,4.0,1.0,3.0,,2.0,4.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,2.0,2.0,2.0,2.0,5.0,,,5.0,5.0,,2.0,,,1.0,,,1.0,,5.0,5.0,5.0,,,5.0,5.0,,5.0,,,5.0,1.0,4.0,,,1.0,1.0,,,,4.0,1.0,,5.0,,5.0,,5.0,,,,4.0,,5.0,5.0,1.0,,1.0,1.0,,,,5.0,1.0,,,1.0,1.0,,,1.0,,1.0,,1.0,1.0,2.0,1.0,1.0,4.0,,4.0,,4.0,4.0,4.0,1.0,1.0,1.0,1.0,1.0,,5.0,5.0,,,,5.0,5.0,,,,,5.0,5.0,5.0,3.0,5.0,,3.0,2.0,,,2.0,1.0,2.0,,2.0,2.0,,,,2.0,,2.0,,2.0,3.0,3.0,,3.0,3.0,,3.0,,,4.0,4.0,4.0,,,4.0,,4.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


#### Download dictionary for the variable names

In [6]:
# Download the file from S3
s3_client = boto3.client("s3")
dictionary_file = s3_client.get_object(Bucket=bucket_name, Key="capstone/testfiles/all_vars.csv")

# Read the file into pandas DataFrame
dictionary = pd.read_csv(dictionary_file["Body"], usecols=None)

#### Subset the data to a specific COUNTRY

In [7]:
model_data = data[data['CNT'] == country_name]
print(model_data.shape)
model_data.head()

(4552, 1121)


Unnamed: 0,CNT,CNTSCHID,CNTSTUID,MATH_Proficient,SISCO,ST250Q01JA,ST250Q02JA,ST250Q03JA,ST250Q04JA,ST250Q05JA,ST251Q01JA,ST251Q02JA,ST251Q03JA,ST251Q04JA,ST251Q06JA,ST251Q07JA,ST253Q01JA,ST254Q01JA,ST254Q02JA,ST254Q03JA,ST254Q04JA,ST254Q05JA,ST254Q06JA,ST255Q01JA,ST256Q01JA,ST256Q02JA,ST256Q03JA,ST256Q06JA,ST256Q07JA,ST256Q08JA,ST256Q09JA,ST256Q10JA,ST267Q01JA,ST267Q02JA,ST267Q03JA,ST267Q04JA,ST267Q05JA,ST267Q06JA,ST267Q07JA,ST267Q08JA,ST034Q01TA,ST034Q02TA,ST034Q03TA,ST034Q04TA,ST034Q05TA,ST034Q06TA,ST038Q03NA,ST038Q04NA,ST038Q05NA,ST038Q06NA,ST038Q07NA,ST038Q08NA,ST038Q09JA,ST038Q10JA,ST038Q11JA,ST265Q01JA,ST265Q02JA,ST265Q03JA,ST265Q04JA,ST266Q01JA,ST266Q02JA,ST266Q03JA,ST266Q04JA,ST266Q05JA,ST307Q01JA,ST307Q02JA,ST307Q03JA,ST307Q04JA,ST307Q05JA,ST307Q06JA,ST307Q07JA,ST307Q08JA,ST307Q09JA,ST307Q10JA,ST301Q01JA,ST301Q02JA,ST301Q03JA,ST301Q04JA,ST301Q05JA,ST301Q06JA,ST301Q07JA,ST301Q08JA,ST301Q09JA,ST301Q10JA,ST343Q01JA,ST343Q02JA,ST343Q03JA,ST343Q04JA,ST343Q05JA,ST343Q06JA,ST343Q07JA,ST343Q08JA,ST343Q09JA,ST343Q10JA,ST311Q01JA,ST311Q02JA,ST311Q03JA,ST311Q04JA,ST311Q05JA,ST311Q06JA,ST311Q07JA,ST311Q08JA,ST311Q09JA,ST311Q10JA,ST305Q01JA,ST305Q02JA,ST305Q03JA,ST305Q04JA,ST305Q05JA,ST305Q06JA,ST305Q07JA,ST305Q08JA,ST305Q09JA,ST305Q10JA,ST345Q01JA,ST345Q02JA,ST345Q03JA,ST345Q04JA,ST345Q05JA,ST345Q06JA,ST345Q07JA,ST345Q08JA,ST345Q09JA,ST345Q10JA,ST313Q01JA,ST313Q02JA,ST313Q03JA,ST313Q04JA,ST313Q05JA,ST313Q06JA,ST313Q07JA,ST313Q08JA,ST313Q09JA,ST313Q10JA,ST263Q02JA,ST263Q04JA,ST263Q06JA,ST263Q08JA,ST273Q01JA,ST273Q02JA,ST273Q03JA,ST273Q04JA,ST273Q05JA,ST273Q06JA,ST273Q07JA,ST270Q01JA,ST270Q02JA,ST270Q03JA,ST270Q04JA,ST285Q01JA,ST285Q02JA,ST285Q03JA,ST285Q04JA,ST285Q05JA,ST285Q06JA,ST285Q07JA,ST285Q08JA,ST285Q09JA,ST283Q01JA,ST283Q02JA,ST283Q03JA,ST283Q04JA,ST283Q05JA,ST283Q06JA,ST283Q07JA,ST283Q08JA,ST283Q09JA,ST275Q01WA,ST275Q02WA,ST275Q03WA,ST275Q04WA,ST275Q05WA,ST275Q06WA,ST275Q07WA,ST275Q08WA,ST275Q09WA,ST276Q01JA,ST276Q02JA,ST276Q03JA,ST276Q04JA,ST276Q05JA,ST276Q06JA,ST276Q07JA,ST276Q08JA,ST276Q09JA,ST276Q10JA,ST290Q01WA,ST290Q02WA,ST290Q03WA,ST290Q04WA,ST290Q05WA,ST290Q06WA,ST290Q07WA,ST290Q08WA,ST290Q09WA,ST291Q01JA,ST291Q02JA,ST291Q03JA,ST291Q04JA,ST291Q05JA,ST291Q06JA,ST291Q07JA,ST291Q08JA,ST291Q09JA,ST291Q10JA,ST289Q01WA,ST289Q02JA,ST289Q04JA,ST289Q05WA,ST289Q06JA,ST289Q07JA,ST289Q08WA,ST289Q09WA,ST289Q10WA,ST289Q14JA,ST293Q01JA,ST293Q02JA,ST293Q03JA,ST293Q05JA,ST293Q06JA,ST293Q07JA,ST293Q08JA,ST293Q09JA,ST292Q01JA,ST292Q02JA,ST292Q03JA,ST292Q04JA,ST292Q05JA,ST292Q06JA,ST334Q01JA,ST334Q02JA,ST334Q03JA,ST334Q04JA,ST334Q05JA,ST334Q06JA,ST334Q07JA,ST334Q08JA,ST334Q09JA,ST334Q10JA,ST335Q01JA,ST335Q02JA,ST335Q03JA,ST335Q05JA,ST335Q06JA,ST335Q07JA,ST336Q01JA,ST336Q03JA,ST336Q04JA,ST336Q05JA,ST336Q06JA,...,ST349Q01JA_2,ST349Q01JA_3,ST349Q01JA_4,ST349Q01JA_0,LANGN_105,LANGN_108,LANGN_118,LANGN_140,LANGN_148,LANGN_150,LANGN_156,LANGN_200,LANGN_204,LANGN_232,LANGN_273,LANGN_313,LANGN_316,LANGN_322,LANGN_329,LANGN_344,LANGN_351,LANGN_415,LANGN_463,LANGN_493,LANGN_496,LANGN_500,LANGN_520,LANGN_531,LANGN_602,LANGN_606,LANGN_615,LANGN_621,LANGN_625,LANGN_640,LANGN_641,LANGN_663,LANGN_669,LANGN_670,LANGN_800,LANGN_801,LANGN_802,LANGN_804,LANGN_805,LANGN_806,LANGN_807,LANGN_808,LANGN_865,LANGN_892,LANGN_895,LANGN_917,SC177Q01JA_1,SC177Q01JA_2,SC177Q01JA_3,SC177Q02JA_1,SC177Q02JA_2,SC177Q02JA_3,SC177Q03JA_1,SC177Q03JA_2,SC177Q03JA_3,MATHEXC_0,MATHEXC_1,MATHEXC_2,MATHEXC_3,SCHLTYPE_1,SCHLTYPE_2,SCHLTYPE_3,LANGN_121,LANGN_130,LANGN_137,LANGN_170,LANGN_244,LANGN_258,LANGN_263,LANGN_264,LANGN_266,LANGN_317,LANGN_340,LANGN_369,LANGN_381,LANGN_404,LANGN_420,LANGN_449,LANGN_467,LANGN_494,LANGN_495,LANGN_514,LANGN_523,LANGN_529,LANGN_540,LANGN_547,LANGN_600,LANGN_607,LANGN_618,LANGN_619,LANGN_630,LANGN_635,LANGN_650,LANGN_661,LANGN_673,LANGN_674,LANGN_809,LANGN_810,LANGN_811,LANGN_812,LANGN_813,LANGN_814,LANGN_815,LANGN_816,LANGN_818,LANGN_832,LANGN_868,LANGN_870,LANGN_920,LANGN_921,LANGN_113,LANGN_147,LANGN_275,LANGN_286,LANGN_363,LANGN_422,LANGN_434,LANGN_442,LANGN_471,LANGN_611,LANGN_614,LANGN_624,LANGN_642,LANGN_675,LANGN_676,LANGN_677,LANGN_678,LANGN_817,LANGN_819,LANGN_821,LANGN_823,LANGN_824,LANGN_825,LANGN_826,LANGN_827,LANGN_828,LANGN_885,LANGN_896,LANGN_916,LANGN_112,LANGN_154,LANGN_202,LANGN_246,LANGN_254,LANGN_272,LANGN_301,LANGN_325,LANGN_338,LANGN_358,LANGN_371,LANGN_375,LANGN_383,LANGN_409,LANGN_428,LANGN_465,LANGN_517,LANGN_527,LANGN_561,LANGN_562,LANGN_563,LANGN_565,LANGN_566,LANGN_567,LANGN_601,LANGN_622,LANGN_623,LANGN_628,LANGN_631,LANGN_831,LANGN_833,LANGN_836,LANGN_837,LANGN_838,LANGN_839,LANGN_840,LANGN_841,LANGN_845,LANGN_872,LANGN_873,LANGN_881,LANGN_890,LANGN_897,LANGN_898,LANGN_899,LANGN_900,LANGN_901,LANGN_902,LANGN_903,LANGN_904,LANGN_905,LANGN_906,LANGN_907,LANGN_908,LANGN_909,LANGN_910,LANGN_911,LANGN_912,LANGN_913,LANGN_914,LANGN_918,LANGN_919,LANGN_160,LANGN_327,LANGN_451,LANGN_474,LANGN_503,LANGN_608,LANGN_627,LANGN_639,LANGN_668,LANGN_842,LANGN_843,LANGN_844,LANGN_846,LANGN_849,LANGN_850,LANGN_851,LANGN_852,LANGN_861,LANGN_879,LANGN_133,LANGN_195,LANGN_237,LANGN_379,LANGN_382,LANGN_472,LANGN_492,LANGN_555,LANGN_605,LANGN_616,LANGN_626,LANGN_634,LANGN_648,LANGN_662,LANGN_665,LANGN_666,LANGN_667,LANGN_829,LANGN_854,LANGN_855,LANGN_857,LANGN_859,LANGN_860,LANGN_866,LANGN_877,LANGN_922
573394,United_States,84000060.0,84000002.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,4.0,1.0,4.0,4.0,4.0,,8.0,2.0,2.0,3.0,2.0,2.0,3.0,5.0,3.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,,,3.0,2.0,3.0,3.0,3.0,,3.0,,2.0,3.0,2.0,3.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,3.0,3.0,,3.0,3.0,,3.0,,4.0,4.0,1.0,1.0,1.0,2.0,,3.0,,,4.0,2.0,2.0,,4.0,1.0,,4.0,,4.0,,,1.0,1.0,,,3.0,,4.0,,4.0,4.0,4.0,2.0,,4.0,4.0,,,,4.0,4.0,,,3.0,3.0,,3.0,,3.0,,3.0,,3.0,,3.0,,3.0,3.0,,,3.0,4.0,4.0,,4.0,,4.0,,,,5.0,4.0,4.0,4.0,,4.0,,4.0,,3.0,1.0,3.0,3.0,3.0,,,,,,,,,,,,,,,,,,,,,,,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
573395,United_States,84000055.0,84000003.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,3.0,1.0,4.0,4.0,3.0,,8.0,4.0,2.0,2.0,2.0,1.0,3.0,4.0,,,,,,,,,3.0,,3.0,,3.0,3.0,,2.0,4.0,2.0,2.0,4.0,2.0,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,4.0,4.0,4.0,4.0,2.0,2.0,2.0,2.0,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2.0,3.0,2.0,,3.0,4.0,4.0,3.0,4.0,,,2.0,1.0,1.0,2.0,,,1.0,1.0,1.0,,3.0,2.0,,,,,2.0,2.0,1.0,1.0,,1.0,4.0,4.0,,,2.0,,2.0,4.0,,1.0,,4.0,4.0,,,,4.0,,1.0,3.0,3.0,3.0,,,,3.0,,3.0,3.0,,,,3.0,3.0,3.0,,3.0,,,2.0,4.0,,,,2.0,,,,,,4.0,2.0,,,3.0,4.0,3.0,3.0,3.0,3.0,3.0,,,,,,,,,,,,,,,,,,,,,,,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
573396,United_States,84000121.0,84000004.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,4.0,1.0,4.0,4.0,2.0,,8.0,4.0,2.0,3.0,2.0,1.0,3.0,6.0,2.0,2.0,3.0,3.0,2.0,2.0,1.0,2.0,3.0,,3.0,1.0,3.0,3.0,,,3.0,2.0,2.0,3.0,,3.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,3.0,2.0,,,4.0,4.0,,3.0,3.0,3.0,2.0,2.0,1.0,2.0,3.0,4.0,,4.0,4.0,2.0,,,,2.0,,5.0,,,2.0,4.0,,4.0,,3.0,2.0,,1.0,,1.0,4.0,,,,,2.0,2.0,,,4.0,2.0,1.0,2.0,4.0,,4.0,,,4.0,2.0,,,,4.0,3.0,3.0,,4.0,,4.0,,5.0,5.0,,5.0,,,,,,5.0,,5.0,4.0,4.0,,,3.0,,,3.0,3.0,4.0,3.0,4.0,,,,,,,,,,,,,,,,,,,,,,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
573397,United_States,84000013.0,84000005.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0,3.0,1.0,3.0,3.0,1.0,,6.0,2.0,2.0,2.0,2.0,1.0,3.0,2.0,1.0,1.0,1.0,2.0,2.0,1.0,1.0,1.0,3.0,,2.0,1.0,,,3.0,2.0,3.0,2.0,3.0,,2.0,3.0,1.0,2.0,2.0,1.0,2.0,2.0,1.0,1.0,1.0,2.0,2.0,2.0,2.0,1.0,1.0,2.0,1.0,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,3.0,3.0,3.0,,1.0,,1.0,1.0,,1.0,1.0,3.0,3.0,3.0,3.0,2.0,3.0,3.0,3.0,,,2.0,,,,,3.0,,3.0,3.0,2.0,,2.0,3.0,,3.0,3.0,,,,3.0,,,2.0,3.0,2.0,,3.0,,,,3.0,,2.0,,,,1.0,2.0,2.0,3.0,,3.0,,3.0,,1.0,,3.0,,1.0,1.0,3.0,,,3.0,3.0,,,,1.0,2.0,4.0,4.0,,,,,3.0,1.0,1.0,1.0,,1.0,1.0,,,,,,,,,,,,,,,,,,,,,,...,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
573398,United_States,84000010.0,84000006.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,1.0,2.0,2.0,1.0,,8.0,3.0,3.0,4.0,2.0,1.0,4.0,2.0,2.0,,,,,,,2.0,,3.0,3.0,,3.0,4.0,4.0,,4.0,1.0,1.0,,1.0,4.0,1.0,3.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,2.0,2.0,2.0,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2.0,2.0,2.0,,,,4.0,4.0,4.0,4.0,4.0,2.0,1.0,1.0,2.0,,5.0,,,2.0,4.0,,2.0,5.0,,1.0,2.0,4.0,,1.0,,1.0,,,4.0,4.0,,3.0,,1.0,,1.0,1.0,,4.0,3.0,1.0,,,,,1.0,,,3.0,3.0,1.0,4.0,1.0,,,,2.0,2.0,,3.0,,3.0,1.0,,,2.0,4.0,,,,,,,5.0,,,5.0,5.0,5.0,,,5.0,5.0,3.0,3.0,,3.0,3.0,3.0,,,,,,,,,,,,,,,,,,,,,,...,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


#### Take out additional variables

In [8]:
# Define the list of columns to drop
columns_to_remove = ["CNT", "CNTSCHID", "CNTSTUID", "OECD",
    "HOMEPOS", "RELATST", "BELONG", "BULLIED", "FEELSAFE", "SCHRISK", "PERSEVAGR", "CURIOAGR", 
    "COOPAGR", "EMPATAGR", "ASSERAGR", "STRESAGR", "EMOCOAGR", "GROSAGR", "INFOSEEK", "FAMSUP", 
    "DISCLIM", "TEACHSUP", "COGACRCO", "COGACMCO", "EXPOFA", "EXPO21ST", "MATHEFF", "MATHEF21", 
    "FAMCON", "ANXMAT", "MATHPERS", "CREATEFF", "CREATSCH", "CREATFAM", "CREATAS", "CREATOOS", 
    "CREATOP", "OPENART", "IMAGINE", "SCHSUST", "LEARRES", "PROBSELF", "FAMSUPSL", "FEELLAH", 
    "SDLEFF", "ICTRES", "FLSCHOOL", "FLMULTSB", "FLFAMILY", "ACCESSFP", "FLCONFIN", "FLCONICT", 
    "ACCESSFA", "ATTCONFM", "FRINFLFM", "ICTSCH", "ICTHOME", "ICTQUAL", "ICTSUBJ", "ICTENQ", 
    "ICTFEED", "ICTOUT", "ICTWKDY", "ICTWKEND", "ICTREG", "ICTINFO", "ICTEFFIC", "BODYIMA", 
    "SOCONPA", "LIFESAT", "PSYCHSYM", "SOCCON", "EXPWB", "CURSUPP", "PQMIMP", "PQMCAR", 
    "PARINVOL", "PQSCHOOL", "PASCHPOL", "ATTIMMP", "CREATHME", "CREATACT", "CREATOPN", 
    "CREATOR", "SCHAUTO", "TCHPART", "EDULEAD", "INSTLEAD", "ENCOURPG", "DIGDVPOL", "TEAFDBK", 
    "MTTRAIN", "DMCVIEWS", "NEGSCLIM", "STAFFSHORT", "EDUSHORT", "STUBEHA", "TEACHBEHA", 
    "STDTEST", "TDTEST", "ALLACTIV", "BCREATSC", "CREENVSC", "ACTCRESC", "OPENCUL", 
    "PROBSCRI", "SCPREPBP", "SCPREPAP", "DIGPREP", 
    "ESCS", "BMMJ1", "BFMJ2", "EFFORT1", "EFFORT2", "Option_UH", "SC209Q04JA", "SC209Q05JA", "SC209Q06JA"
]

# Drop the columns above
model_data = model_data.drop(columns=columns_to_remove, errors='ignore')  # `errors='ignore'` prevents errors if a column isn't found


In [9]:
print(model_data.shape)
model_data.head()

(4552, 1083)


Unnamed: 0,MATH_Proficient,SISCO,ST250Q01JA,ST250Q02JA,ST250Q03JA,ST250Q04JA,ST250Q05JA,ST251Q01JA,ST251Q02JA,ST251Q03JA,ST251Q04JA,ST251Q06JA,ST251Q07JA,ST253Q01JA,ST254Q01JA,ST254Q02JA,ST254Q03JA,ST254Q04JA,ST254Q05JA,ST254Q06JA,ST255Q01JA,ST256Q01JA,ST256Q02JA,ST256Q03JA,ST256Q06JA,ST256Q07JA,ST256Q08JA,ST256Q09JA,ST256Q10JA,ST267Q01JA,ST267Q02JA,ST267Q03JA,ST267Q04JA,ST267Q05JA,ST267Q06JA,ST267Q07JA,ST267Q08JA,ST034Q01TA,ST034Q02TA,ST034Q03TA,ST034Q04TA,ST034Q05TA,ST034Q06TA,ST038Q03NA,ST038Q04NA,ST038Q05NA,ST038Q06NA,ST038Q07NA,ST038Q08NA,ST038Q09JA,ST038Q10JA,ST038Q11JA,ST265Q01JA,ST265Q02JA,ST265Q03JA,ST265Q04JA,ST266Q01JA,ST266Q02JA,ST266Q03JA,ST266Q04JA,ST266Q05JA,ST307Q01JA,ST307Q02JA,ST307Q03JA,ST307Q04JA,ST307Q05JA,ST307Q06JA,ST307Q07JA,ST307Q08JA,ST307Q09JA,ST307Q10JA,ST301Q01JA,ST301Q02JA,ST301Q03JA,ST301Q04JA,ST301Q05JA,ST301Q06JA,ST301Q07JA,ST301Q08JA,ST301Q09JA,ST301Q10JA,ST343Q01JA,ST343Q02JA,ST343Q03JA,ST343Q04JA,ST343Q05JA,ST343Q06JA,ST343Q07JA,ST343Q08JA,ST343Q09JA,ST343Q10JA,ST311Q01JA,ST311Q02JA,ST311Q03JA,ST311Q04JA,ST311Q05JA,ST311Q06JA,ST311Q07JA,ST311Q08JA,ST311Q09JA,ST311Q10JA,ST305Q01JA,ST305Q02JA,ST305Q03JA,ST305Q04JA,ST305Q05JA,ST305Q06JA,ST305Q07JA,ST305Q08JA,ST305Q09JA,ST305Q10JA,ST345Q01JA,ST345Q02JA,ST345Q03JA,ST345Q04JA,ST345Q05JA,ST345Q06JA,ST345Q07JA,ST345Q08JA,ST345Q09JA,ST345Q10JA,ST313Q01JA,ST313Q02JA,ST313Q03JA,ST313Q04JA,ST313Q05JA,ST313Q06JA,ST313Q07JA,ST313Q08JA,ST313Q09JA,ST313Q10JA,ST263Q02JA,ST263Q04JA,ST263Q06JA,ST263Q08JA,ST273Q01JA,ST273Q02JA,ST273Q03JA,ST273Q04JA,ST273Q05JA,ST273Q06JA,ST273Q07JA,ST270Q01JA,ST270Q02JA,ST270Q03JA,ST270Q04JA,ST285Q01JA,ST285Q02JA,ST285Q03JA,ST285Q04JA,ST285Q05JA,ST285Q06JA,ST285Q07JA,ST285Q08JA,ST285Q09JA,ST283Q01JA,ST283Q02JA,ST283Q03JA,ST283Q04JA,ST283Q05JA,ST283Q06JA,ST283Q07JA,ST283Q08JA,ST283Q09JA,ST275Q01WA,ST275Q02WA,ST275Q03WA,ST275Q04WA,ST275Q05WA,ST275Q06WA,ST275Q07WA,ST275Q08WA,ST275Q09WA,ST276Q01JA,ST276Q02JA,ST276Q03JA,ST276Q04JA,ST276Q05JA,ST276Q06JA,ST276Q07JA,ST276Q08JA,ST276Q09JA,ST276Q10JA,ST290Q01WA,ST290Q02WA,ST290Q03WA,ST290Q04WA,ST290Q05WA,ST290Q06WA,ST290Q07WA,ST290Q08WA,ST290Q09WA,ST291Q01JA,ST291Q02JA,ST291Q03JA,ST291Q04JA,ST291Q05JA,ST291Q06JA,ST291Q07JA,ST291Q08JA,ST291Q09JA,ST291Q10JA,ST289Q01WA,ST289Q02JA,ST289Q04JA,ST289Q05WA,ST289Q06JA,ST289Q07JA,ST289Q08WA,ST289Q09WA,ST289Q10WA,ST289Q14JA,ST293Q01JA,ST293Q02JA,ST293Q03JA,ST293Q05JA,ST293Q06JA,ST293Q07JA,ST293Q08JA,ST293Q09JA,ST292Q01JA,ST292Q02JA,ST292Q03JA,ST292Q04JA,ST292Q05JA,ST292Q06JA,ST334Q01JA,ST334Q02JA,ST334Q03JA,ST334Q04JA,ST334Q05JA,ST334Q06JA,ST334Q07JA,ST334Q08JA,ST334Q09JA,ST334Q10JA,ST335Q01JA,ST335Q02JA,ST335Q03JA,ST335Q05JA,ST335Q06JA,ST335Q07JA,ST336Q01JA,ST336Q03JA,ST336Q04JA,ST336Q05JA,ST336Q06JA,ST336Q07JA,ST337Q01JA,ST337Q02JA,...,ST349Q01JA_2,ST349Q01JA_3,ST349Q01JA_4,ST349Q01JA_0,LANGN_105,LANGN_108,LANGN_118,LANGN_140,LANGN_148,LANGN_150,LANGN_156,LANGN_200,LANGN_204,LANGN_232,LANGN_273,LANGN_313,LANGN_316,LANGN_322,LANGN_329,LANGN_344,LANGN_351,LANGN_415,LANGN_463,LANGN_493,LANGN_496,LANGN_500,LANGN_520,LANGN_531,LANGN_602,LANGN_606,LANGN_615,LANGN_621,LANGN_625,LANGN_640,LANGN_641,LANGN_663,LANGN_669,LANGN_670,LANGN_800,LANGN_801,LANGN_802,LANGN_804,LANGN_805,LANGN_806,LANGN_807,LANGN_808,LANGN_865,LANGN_892,LANGN_895,LANGN_917,SC177Q01JA_1,SC177Q01JA_2,SC177Q01JA_3,SC177Q02JA_1,SC177Q02JA_2,SC177Q02JA_3,SC177Q03JA_1,SC177Q03JA_2,SC177Q03JA_3,MATHEXC_0,MATHEXC_1,MATHEXC_2,MATHEXC_3,SCHLTYPE_1,SCHLTYPE_2,SCHLTYPE_3,LANGN_121,LANGN_130,LANGN_137,LANGN_170,LANGN_244,LANGN_258,LANGN_263,LANGN_264,LANGN_266,LANGN_317,LANGN_340,LANGN_369,LANGN_381,LANGN_404,LANGN_420,LANGN_449,LANGN_467,LANGN_494,LANGN_495,LANGN_514,LANGN_523,LANGN_529,LANGN_540,LANGN_547,LANGN_600,LANGN_607,LANGN_618,LANGN_619,LANGN_630,LANGN_635,LANGN_650,LANGN_661,LANGN_673,LANGN_674,LANGN_809,LANGN_810,LANGN_811,LANGN_812,LANGN_813,LANGN_814,LANGN_815,LANGN_816,LANGN_818,LANGN_832,LANGN_868,LANGN_870,LANGN_920,LANGN_921,LANGN_113,LANGN_147,LANGN_275,LANGN_286,LANGN_363,LANGN_422,LANGN_434,LANGN_442,LANGN_471,LANGN_611,LANGN_614,LANGN_624,LANGN_642,LANGN_675,LANGN_676,LANGN_677,LANGN_678,LANGN_817,LANGN_819,LANGN_821,LANGN_823,LANGN_824,LANGN_825,LANGN_826,LANGN_827,LANGN_828,LANGN_885,LANGN_896,LANGN_916,LANGN_112,LANGN_154,LANGN_202,LANGN_246,LANGN_254,LANGN_272,LANGN_301,LANGN_325,LANGN_338,LANGN_358,LANGN_371,LANGN_375,LANGN_383,LANGN_409,LANGN_428,LANGN_465,LANGN_517,LANGN_527,LANGN_561,LANGN_562,LANGN_563,LANGN_565,LANGN_566,LANGN_567,LANGN_601,LANGN_622,LANGN_623,LANGN_628,LANGN_631,LANGN_831,LANGN_833,LANGN_836,LANGN_837,LANGN_838,LANGN_839,LANGN_840,LANGN_841,LANGN_845,LANGN_872,LANGN_873,LANGN_881,LANGN_890,LANGN_897,LANGN_898,LANGN_899,LANGN_900,LANGN_901,LANGN_902,LANGN_903,LANGN_904,LANGN_905,LANGN_906,LANGN_907,LANGN_908,LANGN_909,LANGN_910,LANGN_911,LANGN_912,LANGN_913,LANGN_914,LANGN_918,LANGN_919,LANGN_160,LANGN_327,LANGN_451,LANGN_474,LANGN_503,LANGN_608,LANGN_627,LANGN_639,LANGN_668,LANGN_842,LANGN_843,LANGN_844,LANGN_846,LANGN_849,LANGN_850,LANGN_851,LANGN_852,LANGN_861,LANGN_879,LANGN_133,LANGN_195,LANGN_237,LANGN_379,LANGN_382,LANGN_472,LANGN_492,LANGN_555,LANGN_605,LANGN_616,LANGN_626,LANGN_634,LANGN_648,LANGN_662,LANGN_665,LANGN_666,LANGN_667,LANGN_829,LANGN_854,LANGN_855,LANGN_857,LANGN_859,LANGN_860,LANGN_866,LANGN_877,LANGN_922
573394,1.0,1.0,1.0,1.0,1.0,1.0,1.0,4.0,1.0,4.0,4.0,4.0,,8.0,2.0,2.0,3.0,2.0,2.0,3.0,5.0,3.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,,,3.0,2.0,3.0,3.0,3.0,,3.0,,2.0,3.0,2.0,3.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,3.0,3.0,,3.0,3.0,,3.0,,4.0,4.0,1.0,1.0,1.0,2.0,,3.0,,,4.0,2.0,2.0,,4.0,1.0,,4.0,,4.0,,,1.0,1.0,,,3.0,,4.0,,4.0,4.0,4.0,2.0,,4.0,4.0,,,,4.0,4.0,,,3.0,3.0,,3.0,,3.0,,3.0,,3.0,,3.0,,3.0,3.0,,,3.0,4.0,4.0,,4.0,,4.0,,,,5.0,4.0,4.0,4.0,,4.0,,4.0,,3.0,1.0,3.0,3.0,3.0,,,,,,,,,,,,,,,,,,,,,,,,,,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
573395,1.0,0.0,1.0,1.0,1.0,1.0,1.0,3.0,1.0,4.0,4.0,3.0,,8.0,4.0,2.0,2.0,2.0,1.0,3.0,4.0,,,,,,,,,3.0,,3.0,,3.0,3.0,,2.0,4.0,2.0,2.0,4.0,2.0,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,4.0,4.0,4.0,4.0,2.0,2.0,2.0,2.0,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2.0,3.0,2.0,,3.0,4.0,4.0,3.0,4.0,,,2.0,1.0,1.0,2.0,,,1.0,1.0,1.0,,3.0,2.0,,,,,2.0,2.0,1.0,1.0,,1.0,4.0,4.0,,,2.0,,2.0,4.0,,1.0,,4.0,4.0,,,,4.0,,1.0,3.0,3.0,3.0,,,,3.0,,3.0,3.0,,,,3.0,3.0,3.0,,3.0,,,2.0,4.0,,,,2.0,,,,,,4.0,2.0,,,3.0,4.0,3.0,3.0,3.0,3.0,3.0,,,,,,,,,,,,,,,,,,,,,,,,,,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
573396,1.0,1.0,1.0,1.0,1.0,1.0,1.0,4.0,1.0,4.0,4.0,2.0,,8.0,4.0,2.0,3.0,2.0,1.0,3.0,6.0,2.0,2.0,3.0,3.0,2.0,2.0,1.0,2.0,3.0,,3.0,1.0,3.0,3.0,,,3.0,2.0,2.0,3.0,,3.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,3.0,2.0,,,4.0,4.0,,3.0,3.0,3.0,2.0,2.0,1.0,2.0,3.0,4.0,,4.0,4.0,2.0,,,,2.0,,5.0,,,2.0,4.0,,4.0,,3.0,2.0,,1.0,,1.0,4.0,,,,,2.0,2.0,,,4.0,2.0,1.0,2.0,4.0,,4.0,,,4.0,2.0,,,,4.0,3.0,3.0,,4.0,,4.0,,5.0,5.0,,5.0,,,,,,5.0,,5.0,4.0,4.0,,,3.0,,,3.0,3.0,4.0,3.0,4.0,,,,,,,,,,,,,,,,,,,,,,,,,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
573397,1.0,1.0,1.0,1.0,2.0,1.0,1.0,3.0,1.0,3.0,3.0,1.0,,6.0,2.0,2.0,2.0,2.0,1.0,3.0,2.0,1.0,1.0,1.0,2.0,2.0,1.0,1.0,1.0,3.0,,2.0,1.0,,,3.0,2.0,3.0,2.0,3.0,,2.0,3.0,1.0,2.0,2.0,1.0,2.0,2.0,1.0,1.0,1.0,2.0,2.0,2.0,2.0,1.0,1.0,2.0,1.0,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,3.0,3.0,3.0,,1.0,,1.0,1.0,,1.0,1.0,3.0,3.0,3.0,3.0,2.0,3.0,3.0,3.0,,,2.0,,,,,3.0,,3.0,3.0,2.0,,2.0,3.0,,3.0,3.0,,,,3.0,,,2.0,3.0,2.0,,3.0,,,,3.0,,2.0,,,,1.0,2.0,2.0,3.0,,3.0,,3.0,,1.0,,3.0,,1.0,1.0,3.0,,,3.0,3.0,,,,1.0,2.0,4.0,4.0,,,,,3.0,1.0,1.0,1.0,,1.0,1.0,,,,,,,,,,,,,,,,,,,,,,,,,...,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
573398,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,1.0,2.0,2.0,1.0,,8.0,3.0,3.0,4.0,2.0,1.0,4.0,2.0,2.0,,,,,,,2.0,,3.0,3.0,,3.0,4.0,4.0,,4.0,1.0,1.0,,1.0,4.0,1.0,3.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,2.0,2.0,2.0,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2.0,2.0,2.0,,,,4.0,4.0,4.0,4.0,4.0,2.0,1.0,1.0,2.0,,5.0,,,2.0,4.0,,2.0,5.0,,1.0,2.0,4.0,,1.0,,1.0,,,4.0,4.0,,3.0,,1.0,,1.0,1.0,,4.0,3.0,1.0,,,,,1.0,,,3.0,3.0,1.0,4.0,1.0,,,,2.0,2.0,,3.0,,3.0,1.0,,,2.0,4.0,,,,,,,5.0,,,5.0,5.0,5.0,,,5.0,5.0,3.0,3.0,,3.0,3.0,3.0,,,,,,,,,,,,,,,,,,,,,,,,,...,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


Amazon SageMaker's XGBoost container expects data in the libSVM or CSV data format.  **Note that the first column must be the target variable and the CSV should not include headers.**  Although repetitive, it's easiest to do this after the train|validation|test split rather than before.  This avoids any misalignment issues due to random reordering.
* `MATH_Proficient`: Is the student falling behind in Math? (Average of 10 Math plausible values < 420.07)

In [10]:
# Get percent of students not proficient in Math
proficient_n = (model_data['MATH_Proficient'] == 1).sum()
not_proficient_n = (model_data['MATH_Proficient'] == 0).sum()
not_proficient_p = round( not_proficient_n / (not_proficient_n + proficient_n) * 100, 1)
print("Students who are NOT proficient in Math: ", not_proficient_n, "(", not_proficient_p, "%)")

Students who are NOT proficient in Math:  1607 ( 35.3 %)


In [11]:
# Get imbalance ratio 
not_proficient_pp = not_proficient_n / (not_proficient_n + proficient_n)

if not_proficient_pp < 0.5:
    imbalance_ratio = (1 - not_proficient_pp) / not_proficient_pp
else:
    imbalance_ratio = not_proficient_pp / (1 - not_proficient_pp)
    
print("Imbalance ratio:", round(imbalance_ratio,1))

Imbalance ratio: 1.8


In [12]:
# Reorder columns to bring 'MATH_Proficient' first
new_order = ['MATH_Proficient'] + [col for col in model_data.columns if col != 'MATH_Proficient']
model_data = model_data[new_order]

# Get number of features
n_features_original = model_data.shape[1]-1

# Check the shape after dropping
print(model_data.shape)

model_data.head()

(4552, 1083)


Unnamed: 0,MATH_Proficient,SISCO,ST250Q01JA,ST250Q02JA,ST250Q03JA,ST250Q04JA,ST250Q05JA,ST251Q01JA,ST251Q02JA,ST251Q03JA,ST251Q04JA,ST251Q06JA,ST251Q07JA,ST253Q01JA,ST254Q01JA,ST254Q02JA,ST254Q03JA,ST254Q04JA,ST254Q05JA,ST254Q06JA,ST255Q01JA,ST256Q01JA,ST256Q02JA,ST256Q03JA,ST256Q06JA,ST256Q07JA,ST256Q08JA,ST256Q09JA,ST256Q10JA,ST267Q01JA,ST267Q02JA,ST267Q03JA,ST267Q04JA,ST267Q05JA,ST267Q06JA,ST267Q07JA,ST267Q08JA,ST034Q01TA,ST034Q02TA,ST034Q03TA,ST034Q04TA,ST034Q05TA,ST034Q06TA,ST038Q03NA,ST038Q04NA,ST038Q05NA,ST038Q06NA,ST038Q07NA,ST038Q08NA,ST038Q09JA,ST038Q10JA,ST038Q11JA,ST265Q01JA,ST265Q02JA,ST265Q03JA,ST265Q04JA,ST266Q01JA,ST266Q02JA,ST266Q03JA,ST266Q04JA,ST266Q05JA,ST307Q01JA,ST307Q02JA,ST307Q03JA,ST307Q04JA,ST307Q05JA,ST307Q06JA,ST307Q07JA,ST307Q08JA,ST307Q09JA,ST307Q10JA,ST301Q01JA,ST301Q02JA,ST301Q03JA,ST301Q04JA,ST301Q05JA,ST301Q06JA,ST301Q07JA,ST301Q08JA,ST301Q09JA,ST301Q10JA,ST343Q01JA,ST343Q02JA,ST343Q03JA,ST343Q04JA,ST343Q05JA,ST343Q06JA,ST343Q07JA,ST343Q08JA,ST343Q09JA,ST343Q10JA,ST311Q01JA,ST311Q02JA,ST311Q03JA,ST311Q04JA,ST311Q05JA,ST311Q06JA,ST311Q07JA,ST311Q08JA,ST311Q09JA,ST311Q10JA,ST305Q01JA,ST305Q02JA,ST305Q03JA,ST305Q04JA,ST305Q05JA,ST305Q06JA,ST305Q07JA,ST305Q08JA,ST305Q09JA,ST305Q10JA,ST345Q01JA,ST345Q02JA,ST345Q03JA,ST345Q04JA,ST345Q05JA,ST345Q06JA,ST345Q07JA,ST345Q08JA,ST345Q09JA,ST345Q10JA,ST313Q01JA,ST313Q02JA,ST313Q03JA,ST313Q04JA,ST313Q05JA,ST313Q06JA,ST313Q07JA,ST313Q08JA,ST313Q09JA,ST313Q10JA,ST263Q02JA,ST263Q04JA,ST263Q06JA,ST263Q08JA,ST273Q01JA,ST273Q02JA,ST273Q03JA,ST273Q04JA,ST273Q05JA,ST273Q06JA,ST273Q07JA,ST270Q01JA,ST270Q02JA,ST270Q03JA,ST270Q04JA,ST285Q01JA,ST285Q02JA,ST285Q03JA,ST285Q04JA,ST285Q05JA,ST285Q06JA,ST285Q07JA,ST285Q08JA,ST285Q09JA,ST283Q01JA,ST283Q02JA,ST283Q03JA,ST283Q04JA,ST283Q05JA,ST283Q06JA,ST283Q07JA,ST283Q08JA,ST283Q09JA,ST275Q01WA,ST275Q02WA,ST275Q03WA,ST275Q04WA,ST275Q05WA,ST275Q06WA,ST275Q07WA,ST275Q08WA,ST275Q09WA,ST276Q01JA,ST276Q02JA,ST276Q03JA,ST276Q04JA,ST276Q05JA,ST276Q06JA,ST276Q07JA,ST276Q08JA,ST276Q09JA,ST276Q10JA,ST290Q01WA,ST290Q02WA,ST290Q03WA,ST290Q04WA,ST290Q05WA,ST290Q06WA,ST290Q07WA,ST290Q08WA,ST290Q09WA,ST291Q01JA,ST291Q02JA,ST291Q03JA,ST291Q04JA,ST291Q05JA,ST291Q06JA,ST291Q07JA,ST291Q08JA,ST291Q09JA,ST291Q10JA,ST289Q01WA,ST289Q02JA,ST289Q04JA,ST289Q05WA,ST289Q06JA,ST289Q07JA,ST289Q08WA,ST289Q09WA,ST289Q10WA,ST289Q14JA,ST293Q01JA,ST293Q02JA,ST293Q03JA,ST293Q05JA,ST293Q06JA,ST293Q07JA,ST293Q08JA,ST293Q09JA,ST292Q01JA,ST292Q02JA,ST292Q03JA,ST292Q04JA,ST292Q05JA,ST292Q06JA,ST334Q01JA,ST334Q02JA,ST334Q03JA,ST334Q04JA,ST334Q05JA,ST334Q06JA,ST334Q07JA,ST334Q08JA,ST334Q09JA,ST334Q10JA,ST335Q01JA,ST335Q02JA,ST335Q03JA,ST335Q05JA,ST335Q06JA,ST335Q07JA,ST336Q01JA,ST336Q03JA,ST336Q04JA,ST336Q05JA,ST336Q06JA,ST336Q07JA,ST337Q01JA,ST337Q02JA,...,ST349Q01JA_2,ST349Q01JA_3,ST349Q01JA_4,ST349Q01JA_0,LANGN_105,LANGN_108,LANGN_118,LANGN_140,LANGN_148,LANGN_150,LANGN_156,LANGN_200,LANGN_204,LANGN_232,LANGN_273,LANGN_313,LANGN_316,LANGN_322,LANGN_329,LANGN_344,LANGN_351,LANGN_415,LANGN_463,LANGN_493,LANGN_496,LANGN_500,LANGN_520,LANGN_531,LANGN_602,LANGN_606,LANGN_615,LANGN_621,LANGN_625,LANGN_640,LANGN_641,LANGN_663,LANGN_669,LANGN_670,LANGN_800,LANGN_801,LANGN_802,LANGN_804,LANGN_805,LANGN_806,LANGN_807,LANGN_808,LANGN_865,LANGN_892,LANGN_895,LANGN_917,SC177Q01JA_1,SC177Q01JA_2,SC177Q01JA_3,SC177Q02JA_1,SC177Q02JA_2,SC177Q02JA_3,SC177Q03JA_1,SC177Q03JA_2,SC177Q03JA_3,MATHEXC_0,MATHEXC_1,MATHEXC_2,MATHEXC_3,SCHLTYPE_1,SCHLTYPE_2,SCHLTYPE_3,LANGN_121,LANGN_130,LANGN_137,LANGN_170,LANGN_244,LANGN_258,LANGN_263,LANGN_264,LANGN_266,LANGN_317,LANGN_340,LANGN_369,LANGN_381,LANGN_404,LANGN_420,LANGN_449,LANGN_467,LANGN_494,LANGN_495,LANGN_514,LANGN_523,LANGN_529,LANGN_540,LANGN_547,LANGN_600,LANGN_607,LANGN_618,LANGN_619,LANGN_630,LANGN_635,LANGN_650,LANGN_661,LANGN_673,LANGN_674,LANGN_809,LANGN_810,LANGN_811,LANGN_812,LANGN_813,LANGN_814,LANGN_815,LANGN_816,LANGN_818,LANGN_832,LANGN_868,LANGN_870,LANGN_920,LANGN_921,LANGN_113,LANGN_147,LANGN_275,LANGN_286,LANGN_363,LANGN_422,LANGN_434,LANGN_442,LANGN_471,LANGN_611,LANGN_614,LANGN_624,LANGN_642,LANGN_675,LANGN_676,LANGN_677,LANGN_678,LANGN_817,LANGN_819,LANGN_821,LANGN_823,LANGN_824,LANGN_825,LANGN_826,LANGN_827,LANGN_828,LANGN_885,LANGN_896,LANGN_916,LANGN_112,LANGN_154,LANGN_202,LANGN_246,LANGN_254,LANGN_272,LANGN_301,LANGN_325,LANGN_338,LANGN_358,LANGN_371,LANGN_375,LANGN_383,LANGN_409,LANGN_428,LANGN_465,LANGN_517,LANGN_527,LANGN_561,LANGN_562,LANGN_563,LANGN_565,LANGN_566,LANGN_567,LANGN_601,LANGN_622,LANGN_623,LANGN_628,LANGN_631,LANGN_831,LANGN_833,LANGN_836,LANGN_837,LANGN_838,LANGN_839,LANGN_840,LANGN_841,LANGN_845,LANGN_872,LANGN_873,LANGN_881,LANGN_890,LANGN_897,LANGN_898,LANGN_899,LANGN_900,LANGN_901,LANGN_902,LANGN_903,LANGN_904,LANGN_905,LANGN_906,LANGN_907,LANGN_908,LANGN_909,LANGN_910,LANGN_911,LANGN_912,LANGN_913,LANGN_914,LANGN_918,LANGN_919,LANGN_160,LANGN_327,LANGN_451,LANGN_474,LANGN_503,LANGN_608,LANGN_627,LANGN_639,LANGN_668,LANGN_842,LANGN_843,LANGN_844,LANGN_846,LANGN_849,LANGN_850,LANGN_851,LANGN_852,LANGN_861,LANGN_879,LANGN_133,LANGN_195,LANGN_237,LANGN_379,LANGN_382,LANGN_472,LANGN_492,LANGN_555,LANGN_605,LANGN_616,LANGN_626,LANGN_634,LANGN_648,LANGN_662,LANGN_665,LANGN_666,LANGN_667,LANGN_829,LANGN_854,LANGN_855,LANGN_857,LANGN_859,LANGN_860,LANGN_866,LANGN_877,LANGN_922
573394,1.0,1.0,1.0,1.0,1.0,1.0,1.0,4.0,1.0,4.0,4.0,4.0,,8.0,2.0,2.0,3.0,2.0,2.0,3.0,5.0,3.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,,,3.0,2.0,3.0,3.0,3.0,,3.0,,2.0,3.0,2.0,3.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,3.0,3.0,,3.0,3.0,,3.0,,4.0,4.0,1.0,1.0,1.0,2.0,,3.0,,,4.0,2.0,2.0,,4.0,1.0,,4.0,,4.0,,,1.0,1.0,,,3.0,,4.0,,4.0,4.0,4.0,2.0,,4.0,4.0,,,,4.0,4.0,,,3.0,3.0,,3.0,,3.0,,3.0,,3.0,,3.0,,3.0,3.0,,,3.0,4.0,4.0,,4.0,,4.0,,,,5.0,4.0,4.0,4.0,,4.0,,4.0,,3.0,1.0,3.0,3.0,3.0,,,,,,,,,,,,,,,,,,,,,,,,,,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
573395,1.0,0.0,1.0,1.0,1.0,1.0,1.0,3.0,1.0,4.0,4.0,3.0,,8.0,4.0,2.0,2.0,2.0,1.0,3.0,4.0,,,,,,,,,3.0,,3.0,,3.0,3.0,,2.0,4.0,2.0,2.0,4.0,2.0,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,4.0,4.0,4.0,4.0,2.0,2.0,2.0,2.0,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2.0,3.0,2.0,,3.0,4.0,4.0,3.0,4.0,,,2.0,1.0,1.0,2.0,,,1.0,1.0,1.0,,3.0,2.0,,,,,2.0,2.0,1.0,1.0,,1.0,4.0,4.0,,,2.0,,2.0,4.0,,1.0,,4.0,4.0,,,,4.0,,1.0,3.0,3.0,3.0,,,,3.0,,3.0,3.0,,,,3.0,3.0,3.0,,3.0,,,2.0,4.0,,,,2.0,,,,,,4.0,2.0,,,3.0,4.0,3.0,3.0,3.0,3.0,3.0,,,,,,,,,,,,,,,,,,,,,,,,,,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
573396,1.0,1.0,1.0,1.0,1.0,1.0,1.0,4.0,1.0,4.0,4.0,2.0,,8.0,4.0,2.0,3.0,2.0,1.0,3.0,6.0,2.0,2.0,3.0,3.0,2.0,2.0,1.0,2.0,3.0,,3.0,1.0,3.0,3.0,,,3.0,2.0,2.0,3.0,,3.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,3.0,2.0,,,4.0,4.0,,3.0,3.0,3.0,2.0,2.0,1.0,2.0,3.0,4.0,,4.0,4.0,2.0,,,,2.0,,5.0,,,2.0,4.0,,4.0,,3.0,2.0,,1.0,,1.0,4.0,,,,,2.0,2.0,,,4.0,2.0,1.0,2.0,4.0,,4.0,,,4.0,2.0,,,,4.0,3.0,3.0,,4.0,,4.0,,5.0,5.0,,5.0,,,,,,5.0,,5.0,4.0,4.0,,,3.0,,,3.0,3.0,4.0,3.0,4.0,,,,,,,,,,,,,,,,,,,,,,,,,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
573397,1.0,1.0,1.0,1.0,2.0,1.0,1.0,3.0,1.0,3.0,3.0,1.0,,6.0,2.0,2.0,2.0,2.0,1.0,3.0,2.0,1.0,1.0,1.0,2.0,2.0,1.0,1.0,1.0,3.0,,2.0,1.0,,,3.0,2.0,3.0,2.0,3.0,,2.0,3.0,1.0,2.0,2.0,1.0,2.0,2.0,1.0,1.0,1.0,2.0,2.0,2.0,2.0,1.0,1.0,2.0,1.0,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,3.0,3.0,3.0,,1.0,,1.0,1.0,,1.0,1.0,3.0,3.0,3.0,3.0,2.0,3.0,3.0,3.0,,,2.0,,,,,3.0,,3.0,3.0,2.0,,2.0,3.0,,3.0,3.0,,,,3.0,,,2.0,3.0,2.0,,3.0,,,,3.0,,2.0,,,,1.0,2.0,2.0,3.0,,3.0,,3.0,,1.0,,3.0,,1.0,1.0,3.0,,,3.0,3.0,,,,1.0,2.0,4.0,4.0,,,,,3.0,1.0,1.0,1.0,,1.0,1.0,,,,,,,,,,,,,,,,,,,,,,,,,...,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
573398,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,1.0,2.0,2.0,1.0,,8.0,3.0,3.0,4.0,2.0,1.0,4.0,2.0,2.0,,,,,,,2.0,,3.0,3.0,,3.0,4.0,4.0,,4.0,1.0,1.0,,1.0,4.0,1.0,3.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,2.0,2.0,2.0,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2.0,2.0,2.0,,,,4.0,4.0,4.0,4.0,4.0,2.0,1.0,1.0,2.0,,5.0,,,2.0,4.0,,2.0,5.0,,1.0,2.0,4.0,,1.0,,1.0,,,4.0,4.0,,3.0,,1.0,,1.0,1.0,,4.0,3.0,1.0,,,,,1.0,,,3.0,3.0,1.0,4.0,1.0,,,,2.0,2.0,,3.0,,3.0,1.0,,,2.0,4.0,,,,,,,5.0,,,5.0,5.0,5.0,,,5.0,5.0,3.0,3.0,,3.0,3.0,3.0,,,,,,,,,,,,,,,,,,,,,,,,,...,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


#### Drop columns with more than 20% missing values

In [13]:
model_data.dropna(thresh=int(0.8 * len(model_data)), axis=1, inplace=True)
print(model_data.shape)

(4552, 558)


In [14]:
n_features_final = model_data.shape[1]-1
print("Number of features (before dropping features with more than 20% missing):", n_features_original)
print("Number of features (after dropping features with more than 20% missing):", n_features_final)
print("Number of features with more than 20% missing:", n_features_original - n_features_final)

Number of features (before dropping features with more than 20% missing): 1082
Number of features (after dropping features with more than 20% missing): 557
Number of features with more than 20% missing: 525


#### For columns with less than 20% missing values, fill missing values with the median value of the column

In [15]:
model_data.fillna(model_data.median(), inplace=True)

We'll randomly split the data into 3 uneven groups.  **The model will be trained on 70% of data, it will then be evaluated on 15% of data to give us an estimate of the accuracy we hope to have on "new" data, and 15% will be held back as a final testing dataset which will be used later on.**

A seed is included in the code so the splits can be replicated!

In [16]:
# cell 12
# Randomly sort the data then split out first 70%, second 15%, and last 15%
train_data, validation_data, test_data = np.split(model_data.sample(frac=1, random_state=1729), [int(0.7 * len(model_data)), int(0.85 * len(model_data))])   

  return bound(*args, **kwds)


In [17]:
print("Number of rows in FULL dataset:", model_data.shape[0])

train_data_percent = round(train_data.shape[0]/model_data.shape[0] * 100, 0)
print("Number of rows in TRAINING dataset:", train_data.shape[0], "(", train_data_percent, "% )")

validation_data_percent = round(validation_data.shape[0]/model_data.shape[0] * 100, 0)
print("Number of rows in VALIDATION dataset:", validation_data.shape[0], "(", validation_data_percent, "% )")

test_data_percent = round(test_data.shape[0]/model_data.shape[0] * 100, 0)
print("Number of rows in TEST dataset:", test_data.shape[0], "(", test_data_percent, "% )")

Number of rows in FULL dataset: 4552
Number of rows in TRAINING dataset: 3186 ( 70.0 % )
Number of rows in VALIDATION dataset: 683 ( 15.0 % )
Number of rows in TEST dataset: 683 ( 15.0 % )


In [18]:
# Save train dataset 
train_data.to_csv('train.csv', index=False, header=False)

# Save validation dataset 
validation_data.to_csv('validation.csv', index=False, header=False)


In [19]:
# Training data - Saved later to S3 as CSV
print(train_data.shape)
train_data.head()

(3186, 558)


Unnamed: 0,MATH_Proficient,SISCO,ST250Q01JA,ST250Q02JA,ST250Q03JA,ST250Q04JA,ST250Q05JA,ST251Q01JA,ST251Q02JA,ST251Q03JA,ST251Q04JA,ST251Q06JA,ST253Q01JA,ST254Q01JA,ST254Q02JA,ST254Q03JA,ST254Q04JA,ST254Q05JA,ST254Q06JA,ST255Q01JA,ST038Q03NA,ST038Q04NA,ST038Q05NA,ST038Q06NA,ST038Q07NA,ST038Q08NA,ST038Q09JA,ST038Q10JA,ST038Q11JA,ST265Q01JA,ST265Q02JA,ST265Q03JA,ST265Q04JA,ST266Q01JA,ST266Q02JA,ST266Q03JA,ST266Q04JA,ST266Q05JA,ST263Q02JA,ST263Q04JA,ST263Q06JA,ST270Q01JA,ST270Q02JA,ST270Q03JA,ST270Q04JA,FL166Q01HA,FL166Q02HA,FL166Q03HA,FL166Q05HA,FL166Q06HA,FL166Q07HA,FL174Q01JA,FL174Q02JA,FL174Q03JA,FL174Q04JA,FL174Q05JA,FL174Q06JA,FL174Q07JA,FL167Q01HA,FL167Q02HA,FL167Q06JA,FL167Q03HA,FL167Q04HA,FL167Q05HA,FL167Q07JA,FL170Q01JA,FL170Q02JA,FL170Q03JA,FL170Q04JA,FL170Q05JA,FL170Q06JA,FL170Q07JA,FL162Q01HA,FL162Q02HA,FL162Q03HA,FL162Q04HA,FL162Q05HA,FL162Q06HA,FL163Q01HA,FL163Q02HA,FL163Q03HA,FL163Q04HA,FL163Q05HA,FL171Q01JA,FL171Q02JA,FL171Q03JA,FL171Q04JA,FL171Q05JA,FL171Q07JA,FL171Q08JA,FL171Q09JA,FL171Q10JA,FL171Q11JA,FL171Q12JA,FL169Q01HA,FL169Q05JA,FL169Q02HA,FL169Q04HA,FL169Q08JA,FL169Q10JA,FL169Q11JA,FL172Q01JA,FL172Q03JA,FL172Q05JA,FL172Q06JA,IC170Q01JA,IC170Q02JA,IC170Q03JA,IC170Q04JA,IC170Q05JA,IC170Q06JA,IC170Q07JA,IC171Q01JA,IC171Q02JA,IC171Q03JA,IC171Q04JA,IC171Q05JA,IC171Q06JA,IC172Q01JA,IC172Q02JA,IC172Q03JA,IC172Q04JA,IC172Q05JA,IC172Q06JA,IC172Q07JA,IC172Q08JA,IC172Q09JA,IC173Q01JA,IC173Q02JA,IC173Q03JA,IC173Q04JA,IC174Q01JA,IC174Q02JA,IC174Q03JA,IC174Q04JA,IC174Q05JA,IC174Q06JA,IC174Q07JA,IC174Q08JA,IC174Q09JA,IC174Q10JA,IC175Q01JA,IC175Q02JA,IC175Q03JA,IC175Q05JA,IC176Q01JA,IC176Q02JA,IC176Q03JA,IC176Q04JA,IC176Q05JA,IC176Q06JA,IC176Q07JA,IC176Q08JA,IC177Q01JA,IC177Q02JA,IC177Q03JA,IC177Q04JA,IC177Q05JA,IC177Q06JA,IC177Q07JA,IC178Q01JA,IC178Q02JA,IC178Q03JA,IC178Q04JA,IC178Q05JA,IC178Q06JA,IC178Q07JA,IC179Q01JA,IC179Q02JA,IC179Q03JA,IC179Q04JA,IC179Q05JA,IC179Q06JA,IC180Q02JA,IC180Q03JA,IC180Q04JA,IC180Q05JA,IC180Q06JA,IC180Q07JA,IC183Q01JA,IC183Q02JA,IC183Q03JA,IC183Q04JA,IC183Q05JA,IC183Q07JA,IC183Q08JA,IC183Q09JA,IC183Q10JA,IC183Q12JA,IC183Q13JA,IC183Q14JA,IC183Q15JA,IC183Q16JA,ST347Q01JA,ST347Q02JA,ST259Q01JA,ST004D01T,GRADE,REPEAT,EXPECEDU,ICTAVSCH,ICTAVHOM,IMMIG,TARDYSD,ST226Q01JA,MISSSC,PAREDINT,ST230Q01JA,SKIPPING,IC180Q01JA,IC180Q08JA,ST059Q02JA,ST296Q04JA,STUDYHMW,IC184Q01JA,IC184Q02JA,ST059Q01TA,ST296Q01JA,ST268Q01JA,ST268Q04JA,ST268Q07JA,ST297Q01JA,ST297Q03JA,ST297Q05JA,ST297Q06JA,ST297Q07JA,ST297Q09JA,ST258Q01JA,ST294Q01JA,ST295Q01JA,EXERPRAC,WORKPAY,WORKHOME,SC001Q01TA,SC211Q01JA,SC211Q02JA,SC211Q03JA,SC211Q04JA,SC211Q05JA,SC211Q06JA,SC037Q11JA,SC183Q02JA,SC183Q03JA,SC183Q04JA,SC175Q01JA,SC188Q01JA,SC188Q02JA,SC188Q03JA,SC188Q04JA,SC188Q05JA,...,ST349Q01JA_2,ST349Q01JA_3,ST349Q01JA_4,ST349Q01JA_0,LANGN_105,LANGN_108,LANGN_118,LANGN_140,LANGN_148,LANGN_150,LANGN_156,LANGN_200,LANGN_204,LANGN_232,LANGN_273,LANGN_313,LANGN_316,LANGN_322,LANGN_329,LANGN_344,LANGN_351,LANGN_415,LANGN_463,LANGN_493,LANGN_496,LANGN_500,LANGN_520,LANGN_531,LANGN_602,LANGN_606,LANGN_615,LANGN_621,LANGN_625,LANGN_640,LANGN_641,LANGN_663,LANGN_669,LANGN_670,LANGN_800,LANGN_801,LANGN_802,LANGN_804,LANGN_805,LANGN_806,LANGN_807,LANGN_808,LANGN_865,LANGN_892,LANGN_895,LANGN_917,SC177Q01JA_1,SC177Q01JA_2,SC177Q01JA_3,SC177Q02JA_1,SC177Q02JA_2,SC177Q02JA_3,SC177Q03JA_1,SC177Q03JA_2,SC177Q03JA_3,MATHEXC_0,MATHEXC_1,MATHEXC_2,MATHEXC_3,SCHLTYPE_1,SCHLTYPE_2,SCHLTYPE_3,LANGN_121,LANGN_130,LANGN_137,LANGN_170,LANGN_244,LANGN_258,LANGN_263,LANGN_264,LANGN_266,LANGN_317,LANGN_340,LANGN_369,LANGN_381,LANGN_404,LANGN_420,LANGN_449,LANGN_467,LANGN_494,LANGN_495,LANGN_514,LANGN_523,LANGN_529,LANGN_540,LANGN_547,LANGN_600,LANGN_607,LANGN_618,LANGN_619,LANGN_630,LANGN_635,LANGN_650,LANGN_661,LANGN_673,LANGN_674,LANGN_809,LANGN_810,LANGN_811,LANGN_812,LANGN_813,LANGN_814,LANGN_815,LANGN_816,LANGN_818,LANGN_832,LANGN_868,LANGN_870,LANGN_920,LANGN_921,LANGN_113,LANGN_147,LANGN_275,LANGN_286,LANGN_363,LANGN_422,LANGN_434,LANGN_442,LANGN_471,LANGN_611,LANGN_614,LANGN_624,LANGN_642,LANGN_675,LANGN_676,LANGN_677,LANGN_678,LANGN_817,LANGN_819,LANGN_821,LANGN_823,LANGN_824,LANGN_825,LANGN_826,LANGN_827,LANGN_828,LANGN_885,LANGN_896,LANGN_916,LANGN_112,LANGN_154,LANGN_202,LANGN_246,LANGN_254,LANGN_272,LANGN_301,LANGN_325,LANGN_338,LANGN_358,LANGN_371,LANGN_375,LANGN_383,LANGN_409,LANGN_428,LANGN_465,LANGN_517,LANGN_527,LANGN_561,LANGN_562,LANGN_563,LANGN_565,LANGN_566,LANGN_567,LANGN_601,LANGN_622,LANGN_623,LANGN_628,LANGN_631,LANGN_831,LANGN_833,LANGN_836,LANGN_837,LANGN_838,LANGN_839,LANGN_840,LANGN_841,LANGN_845,LANGN_872,LANGN_873,LANGN_881,LANGN_890,LANGN_897,LANGN_898,LANGN_899,LANGN_900,LANGN_901,LANGN_902,LANGN_903,LANGN_904,LANGN_905,LANGN_906,LANGN_907,LANGN_908,LANGN_909,LANGN_910,LANGN_911,LANGN_912,LANGN_913,LANGN_914,LANGN_918,LANGN_919,LANGN_160,LANGN_327,LANGN_451,LANGN_474,LANGN_503,LANGN_608,LANGN_627,LANGN_639,LANGN_668,LANGN_842,LANGN_843,LANGN_844,LANGN_846,LANGN_849,LANGN_850,LANGN_851,LANGN_852,LANGN_861,LANGN_879,LANGN_133,LANGN_195,LANGN_237,LANGN_379,LANGN_382,LANGN_472,LANGN_492,LANGN_555,LANGN_605,LANGN_616,LANGN_626,LANGN_634,LANGN_648,LANGN_662,LANGN_665,LANGN_666,LANGN_667,LANGN_829,LANGN_854,LANGN_855,LANGN_857,LANGN_859,LANGN_860,LANGN_866,LANGN_877,LANGN_922
575474,0.0,0.0,1.0,1.0,1.0,1.0,1.0,2.0,1.0,2.0,2.0,1.0,7.0,3.0,2.0,2.0,1.0,2.0,3.0,2.0,2.0,1.0,1.0,1.0,2.0,2.0,1.0,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,4.0,4.0,4.0,1.0,1.0,3.0,4.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,1.0,2.0,3.0,1.0,4.0,1.0,3.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,4.0,4.0,1.0,4.0,1.0,5.0,1.0,1.0,1.0,1.0,4.0,1.0,3.0,1.0,1.0,1.0,1.0,3.0,1.0,1.0,1.0,5.0,5.0,1.0,5.0,5.0,5.0,5.0,5.0,5.0,1.0,5.0,1.0,1.0,4.0,4.0,1.0,3.0,3.0,3.0,3.0,3.0,3.0,4.0,4.0,4.0,1.0,3.0,4.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0,5.0,5.0,5.0,1.0,5.0,4.0,5.0,5.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,2.0,1.0,1.0,1.0,4.0,1.0,1.0,1.0,3.0,4.0,2.0,4.0,1.0,1.0,4.0,4.0,4.0,4.0,3.0,4.0,2.0,3.0,2.0,3.0,4.0,1.0,2.0,1.0,4.0,1.0,5.0,1.0,0.0,0.0,7.0,7.0,6.0,3.0,0.0,3.0,0.0,14.5,4.0,0.0,3.0,1.0,8.0,1.0,0.0,5.0,5.0,3.0,1.0,1.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,6.0,0.0,1.0,1.0,4.0,49.0,12.0,53.0,19.0,37.0,2.0,1.0,1.0,1.0,1.0,60.0,4.0,4.0,3.0,3.0,3.0,...,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
574994,1.0,1.0,2.0,1.0,1.0,1.0,1.0,3.0,2.0,2.0,2.0,4.0,7.0,3.0,1.0,2.0,2.0,1.0,3.0,3.0,1.0,1.0,1.0,1.0,1.0,2.0,1.0,2.0,1.0,2.0,2.0,3.0,3.0,1.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,1.0,1.0,2.0,2.0,1.0,1.0,1.0,1.0,3.0,1.0,1.0,4.0,4.0,1.0,4.0,1.0,1.0,1.0,1.0,1.0,2.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0,3.0,1.0,1.0,1.0,2.0,1.0,2.0,3.0,2.0,2.0,4.0,3.0,3.0,4.0,3.0,4.0,4.0,4.0,3.0,4.0,1.0,1.0,1.0,5.0,2.0,3.0,3.0,2.0,3.0,3.0,3.0,3.0,2.0,1.0,3.0,5.0,5.0,1.0,5.0,4.0,2.0,5.0,4.0,5.0,1.0,4.0,1.0,3.0,3.0,3.0,3.0,3.0,3.0,2.0,3.0,3.0,3.0,5.0,6.0,6.0,6.0,3.0,4.0,4.0,1.0,1.0,1.0,3.0,4.0,2.0,4.0,4.0,1.0,4.0,3.0,5.0,4.0,2.0,3.0,4.0,4.0,2.0,3.0,2.0,6.0,5.0,2.0,5.0,2.0,1.0,3.0,6.0,6.0,2.0,6.0,2.0,1.0,1.0,1.0,3.0,1.0,1.0,2.0,3.0,3.0,3.0,2.0,2.0,1.0,3.0,3.0,3.0,2.0,2.0,3.0,3.0,3.0,1.0,3.0,3.0,1.0,1.0,1.0,4.0,2.0,3.0,1.0,1.0,0.0,7.0,7.0,6.0,1.0,2.0,2.0,0.0,12.0,3.0,1.0,3.0,3.0,8.0,1.0,1.0,4.0,4.0,4.0,1.0,1.0,3.0,4.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,4.0,2.0,1.0,0.0,0.0,3.0,13.0,13.0,41.0,12.0,30.0,0.0,1.0,2.0,2.0,2.0,90.0,4.0,4.0,2.0,4.0,2.0,...,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
574164,0.0,1.0,1.0,2.0,2.0,1.0,1.0,3.0,1.0,2.0,1.0,2.0,8.0,3.0,1.0,2.0,2.0,1.0,4.0,3.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,4.0,2.0,2.0,1.0,2.0,1.0,2.0,3.0,3.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,3.0,3.0,1.0,1.0,1.0,4.0,5.0,3.0,3.0,2.0,2.0,4.0,4.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,2.0,3.0,2.0,3.0,2.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,2.0,3.0,3.0,5.0,5.0,5.0,5.0,2.0,5.0,5.0,5.0,5.0,4.0,5.0,4.0,5.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,4.0,4.0,5.0,4.0,3.0,2.0,3.0,2.0,3.0,3.0,2.0,3.0,2.0,3.0,2.0,3.0,2.0,3.0,2.0,3.0,4.0,3.0,2.0,4.0,4.0,4.0,2.0,3.0,3.0,4.0,4.0,4.0,4.0,2.0,3.0,2.0,3.0,4.0,4.0,4.0,1.0,3.0,2.0,3.0,2.0,3.0,3.0,3.0,2.0,2.0,3.0,3.0,3.0,3.0,3.0,2.0,3.0,2.0,3.0,2.0,3.0,3.0,2.0,3.0,3.0,3.0,3.0,1.0,6.0,1.0,0.0,0.0,9.0,7.0,6.0,1.0,1.0,3.0,0.0,16.0,4.0,0.0,3.0,3.0,1.0,4.0,4.0,3.0,3.0,1.0,1.0,1.0,1.0,3.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,2.0,6.0,9.0,0.0,6.0,4.0,9.0,16.0,83.0,4.0,9.0,0.0,1.0,1.0,1.0,1.0,50.0,3.0,3.0,2.0,1.0,2.0,...,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
576638,0.0,0.0,1.0,1.0,1.0,1.0,1.0,2.0,1.0,3.0,4.0,1.0,7.0,2.0,1.0,2.0,2.0,1.0,3.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,3.0,2.0,2.0,2.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,1.0,1.0,1.0,2.0,1.0,3.0,2.0,2.0,2.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,4.0,5.0,1.0,3.0,4.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,2.0,2.0,2.0,2.0,3.0,3.0,1.0,3.0,3.0,4.0,3.0,4.0,5.0,4.0,4.0,4.0,3.0,1.0,2.0,3.0,3.0,3.0,2.0,1.0,2.0,3.0,3.0,3.0,4.0,2.0,5.0,4.0,3.0,2.0,1.0,2.0,3.0,4.0,3.0,5.0,5.0,4.0,3.0,2.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,6.0,5.0,4.0,3.0,2.0,1.0,2.0,2.0,3.0,2.0,2.0,3.0,3.0,3.0,1.0,2.0,3.0,2.0,3.0,3.0,3.0,2.0,3.0,4.0,2.0,2.0,5.0,5.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,1.0,7.0,1.0,0.0,0.0,8.0,6.0,5.0,1.0,1.0,3.0,0.0,16.0,2.0,0.0,2.0,2.0,20.0,3.0,6.0,4.0,4.0,3.0,1.0,2.0,2.0,4.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,4.0,6.0,6.0,0.0,10.0,3.0,65.0,6.0,1.0,65.0,80.0,0.0,1.0,1.0,1.0,1.0,90.0,4.0,4.0,2.0,1.0,4.0,...,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
577351,1.0,1.0,1.0,1.0,1.0,1.0,1.0,4.0,1.0,3.0,3.0,1.0,8.0,3.0,1.0,3.0,2.0,1.0,3.0,3.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,1.0,1.0,1.0,1.0,2.0,2.0,1.0,1.0,1.0,2.0,4.0,4.0,4.0,2.0,2.0,2.0,3.0,2.0,3.0,4.0,3.0,1.0,4.0,1.0,4.0,1.0,3.0,1.0,2.0,1.0,2.0,2.0,2.0,2.0,4.0,3.0,3.0,3.0,3.0,2.0,3.0,3.0,2.0,3.0,1.0,3.0,2.0,5.0,2.0,3.0,2.0,5.0,3.0,3.0,3.0,4.0,3.0,3.0,4.0,1.0,1.0,1.0,3.0,5.0,1.0,1.0,5.0,5.0,1.0,5.0,5.0,5.0,3.0,5.0,2.0,1.0,4.0,4.0,4.0,4.0,4.0,3.0,3.0,3.0,4.0,4.0,3.0,4.0,6.0,4.0,5.0,2.0,2.0,1.0,1.0,3.0,5.0,3.0,4.0,4.0,3.0,1.0,1.0,5.0,5.0,5.0,5.0,5.0,4.0,4.0,4.0,1.0,4.0,3.0,2.0,2.0,2.0,1.0,1.0,5.0,3.0,3.0,4.0,2.0,1.0,1.0,1.0,4.0,3.0,3.0,2.0,2.0,2.0,2.0,3.0,3.0,2.0,4.0,3.0,4.0,4.0,4.0,4.0,4.0,4.0,1.0,4.0,4.0,1.0,1.0,3.0,1.0,1.0,6.0,1.0,0.0,0.0,7.0,7.0,6.0,1.0,1.0,3.0,0.0,16.0,4.0,0.0,3.0,1.0,20.0,4.0,4.0,5.0,3.0,5.0,2.0,4.0,4.0,4.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,4.0,5.0,0.0,0.0,0.0,3.0,20.0,2.0,22.0,3.0,1.0,0.0,1.0,1.0,1.0,1.0,90.0,4.0,4.0,4.0,4.0,4.0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [20]:
# Validation data - Saved later to S3 as CSV
print(validation_data.shape)
validation_data.head()

(683, 558)


Unnamed: 0,MATH_Proficient,SISCO,ST250Q01JA,ST250Q02JA,ST250Q03JA,ST250Q04JA,ST250Q05JA,ST251Q01JA,ST251Q02JA,ST251Q03JA,ST251Q04JA,ST251Q06JA,ST253Q01JA,ST254Q01JA,ST254Q02JA,ST254Q03JA,ST254Q04JA,ST254Q05JA,ST254Q06JA,ST255Q01JA,ST038Q03NA,ST038Q04NA,ST038Q05NA,ST038Q06NA,ST038Q07NA,ST038Q08NA,ST038Q09JA,ST038Q10JA,ST038Q11JA,ST265Q01JA,ST265Q02JA,ST265Q03JA,ST265Q04JA,ST266Q01JA,ST266Q02JA,ST266Q03JA,ST266Q04JA,ST266Q05JA,ST263Q02JA,ST263Q04JA,ST263Q06JA,ST270Q01JA,ST270Q02JA,ST270Q03JA,ST270Q04JA,FL166Q01HA,FL166Q02HA,FL166Q03HA,FL166Q05HA,FL166Q06HA,FL166Q07HA,FL174Q01JA,FL174Q02JA,FL174Q03JA,FL174Q04JA,FL174Q05JA,FL174Q06JA,FL174Q07JA,FL167Q01HA,FL167Q02HA,FL167Q06JA,FL167Q03HA,FL167Q04HA,FL167Q05HA,FL167Q07JA,FL170Q01JA,FL170Q02JA,FL170Q03JA,FL170Q04JA,FL170Q05JA,FL170Q06JA,FL170Q07JA,FL162Q01HA,FL162Q02HA,FL162Q03HA,FL162Q04HA,FL162Q05HA,FL162Q06HA,FL163Q01HA,FL163Q02HA,FL163Q03HA,FL163Q04HA,FL163Q05HA,FL171Q01JA,FL171Q02JA,FL171Q03JA,FL171Q04JA,FL171Q05JA,FL171Q07JA,FL171Q08JA,FL171Q09JA,FL171Q10JA,FL171Q11JA,FL171Q12JA,FL169Q01HA,FL169Q05JA,FL169Q02HA,FL169Q04HA,FL169Q08JA,FL169Q10JA,FL169Q11JA,FL172Q01JA,FL172Q03JA,FL172Q05JA,FL172Q06JA,IC170Q01JA,IC170Q02JA,IC170Q03JA,IC170Q04JA,IC170Q05JA,IC170Q06JA,IC170Q07JA,IC171Q01JA,IC171Q02JA,IC171Q03JA,IC171Q04JA,IC171Q05JA,IC171Q06JA,IC172Q01JA,IC172Q02JA,IC172Q03JA,IC172Q04JA,IC172Q05JA,IC172Q06JA,IC172Q07JA,IC172Q08JA,IC172Q09JA,IC173Q01JA,IC173Q02JA,IC173Q03JA,IC173Q04JA,IC174Q01JA,IC174Q02JA,IC174Q03JA,IC174Q04JA,IC174Q05JA,IC174Q06JA,IC174Q07JA,IC174Q08JA,IC174Q09JA,IC174Q10JA,IC175Q01JA,IC175Q02JA,IC175Q03JA,IC175Q05JA,IC176Q01JA,IC176Q02JA,IC176Q03JA,IC176Q04JA,IC176Q05JA,IC176Q06JA,IC176Q07JA,IC176Q08JA,IC177Q01JA,IC177Q02JA,IC177Q03JA,IC177Q04JA,IC177Q05JA,IC177Q06JA,IC177Q07JA,IC178Q01JA,IC178Q02JA,IC178Q03JA,IC178Q04JA,IC178Q05JA,IC178Q06JA,IC178Q07JA,IC179Q01JA,IC179Q02JA,IC179Q03JA,IC179Q04JA,IC179Q05JA,IC179Q06JA,IC180Q02JA,IC180Q03JA,IC180Q04JA,IC180Q05JA,IC180Q06JA,IC180Q07JA,IC183Q01JA,IC183Q02JA,IC183Q03JA,IC183Q04JA,IC183Q05JA,IC183Q07JA,IC183Q08JA,IC183Q09JA,IC183Q10JA,IC183Q12JA,IC183Q13JA,IC183Q14JA,IC183Q15JA,IC183Q16JA,ST347Q01JA,ST347Q02JA,ST259Q01JA,ST004D01T,GRADE,REPEAT,EXPECEDU,ICTAVSCH,ICTAVHOM,IMMIG,TARDYSD,ST226Q01JA,MISSSC,PAREDINT,ST230Q01JA,SKIPPING,IC180Q01JA,IC180Q08JA,ST059Q02JA,ST296Q04JA,STUDYHMW,IC184Q01JA,IC184Q02JA,ST059Q01TA,ST296Q01JA,ST268Q01JA,ST268Q04JA,ST268Q07JA,ST297Q01JA,ST297Q03JA,ST297Q05JA,ST297Q06JA,ST297Q07JA,ST297Q09JA,ST258Q01JA,ST294Q01JA,ST295Q01JA,EXERPRAC,WORKPAY,WORKHOME,SC001Q01TA,SC211Q01JA,SC211Q02JA,SC211Q03JA,SC211Q04JA,SC211Q05JA,SC211Q06JA,SC037Q11JA,SC183Q02JA,SC183Q03JA,SC183Q04JA,SC175Q01JA,SC188Q01JA,SC188Q02JA,SC188Q03JA,SC188Q04JA,SC188Q05JA,...,ST349Q01JA_2,ST349Q01JA_3,ST349Q01JA_4,ST349Q01JA_0,LANGN_105,LANGN_108,LANGN_118,LANGN_140,LANGN_148,LANGN_150,LANGN_156,LANGN_200,LANGN_204,LANGN_232,LANGN_273,LANGN_313,LANGN_316,LANGN_322,LANGN_329,LANGN_344,LANGN_351,LANGN_415,LANGN_463,LANGN_493,LANGN_496,LANGN_500,LANGN_520,LANGN_531,LANGN_602,LANGN_606,LANGN_615,LANGN_621,LANGN_625,LANGN_640,LANGN_641,LANGN_663,LANGN_669,LANGN_670,LANGN_800,LANGN_801,LANGN_802,LANGN_804,LANGN_805,LANGN_806,LANGN_807,LANGN_808,LANGN_865,LANGN_892,LANGN_895,LANGN_917,SC177Q01JA_1,SC177Q01JA_2,SC177Q01JA_3,SC177Q02JA_1,SC177Q02JA_2,SC177Q02JA_3,SC177Q03JA_1,SC177Q03JA_2,SC177Q03JA_3,MATHEXC_0,MATHEXC_1,MATHEXC_2,MATHEXC_3,SCHLTYPE_1,SCHLTYPE_2,SCHLTYPE_3,LANGN_121,LANGN_130,LANGN_137,LANGN_170,LANGN_244,LANGN_258,LANGN_263,LANGN_264,LANGN_266,LANGN_317,LANGN_340,LANGN_369,LANGN_381,LANGN_404,LANGN_420,LANGN_449,LANGN_467,LANGN_494,LANGN_495,LANGN_514,LANGN_523,LANGN_529,LANGN_540,LANGN_547,LANGN_600,LANGN_607,LANGN_618,LANGN_619,LANGN_630,LANGN_635,LANGN_650,LANGN_661,LANGN_673,LANGN_674,LANGN_809,LANGN_810,LANGN_811,LANGN_812,LANGN_813,LANGN_814,LANGN_815,LANGN_816,LANGN_818,LANGN_832,LANGN_868,LANGN_870,LANGN_920,LANGN_921,LANGN_113,LANGN_147,LANGN_275,LANGN_286,LANGN_363,LANGN_422,LANGN_434,LANGN_442,LANGN_471,LANGN_611,LANGN_614,LANGN_624,LANGN_642,LANGN_675,LANGN_676,LANGN_677,LANGN_678,LANGN_817,LANGN_819,LANGN_821,LANGN_823,LANGN_824,LANGN_825,LANGN_826,LANGN_827,LANGN_828,LANGN_885,LANGN_896,LANGN_916,LANGN_112,LANGN_154,LANGN_202,LANGN_246,LANGN_254,LANGN_272,LANGN_301,LANGN_325,LANGN_338,LANGN_358,LANGN_371,LANGN_375,LANGN_383,LANGN_409,LANGN_428,LANGN_465,LANGN_517,LANGN_527,LANGN_561,LANGN_562,LANGN_563,LANGN_565,LANGN_566,LANGN_567,LANGN_601,LANGN_622,LANGN_623,LANGN_628,LANGN_631,LANGN_831,LANGN_833,LANGN_836,LANGN_837,LANGN_838,LANGN_839,LANGN_840,LANGN_841,LANGN_845,LANGN_872,LANGN_873,LANGN_881,LANGN_890,LANGN_897,LANGN_898,LANGN_899,LANGN_900,LANGN_901,LANGN_902,LANGN_903,LANGN_904,LANGN_905,LANGN_906,LANGN_907,LANGN_908,LANGN_909,LANGN_910,LANGN_911,LANGN_912,LANGN_913,LANGN_914,LANGN_918,LANGN_919,LANGN_160,LANGN_327,LANGN_451,LANGN_474,LANGN_503,LANGN_608,LANGN_627,LANGN_639,LANGN_668,LANGN_842,LANGN_843,LANGN_844,LANGN_846,LANGN_849,LANGN_850,LANGN_851,LANGN_852,LANGN_861,LANGN_879,LANGN_133,LANGN_195,LANGN_237,LANGN_379,LANGN_382,LANGN_472,LANGN_492,LANGN_555,LANGN_605,LANGN_616,LANGN_626,LANGN_634,LANGN_648,LANGN_662,LANGN_665,LANGN_666,LANGN_667,LANGN_829,LANGN_854,LANGN_855,LANGN_857,LANGN_859,LANGN_860,LANGN_866,LANGN_877,LANGN_922
574478,1.0,1.0,2.0,1.0,1.0,1.0,1.0,3.0,1.0,4.0,4.0,4.0,8.0,3.0,2.0,3.0,2.0,1.0,4.0,4.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,2.0,2.0,2.0,2.0,1.0,1.0,2.0,2.0,2.0,3.0,2.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0,3.0,2.0,4.0,2.0,4.0,2.0,1.0,1.0,4.0,4.0,3.0,4.0,4.0,3.0,4.0,3.0,5.0,1.0,1.0,4.0,3.0,1.0,2.0,2.0,2.0,3.0,4.0,4.0,2.0,4.0,4.0,4.0,4.0,5.0,2.0,3.0,5.0,5.0,5.0,5.0,4.0,5.0,5.0,5.0,3.0,3.0,2.0,4.0,3.0,4.0,2.0,2.0,1.0,1.0,1.0,5.0,5.0,3.0,5.0,5.0,5.0,5.0,5.0,5.0,3.0,5.0,5.0,5.0,3.0,3.0,2.0,2.0,3.0,3.0,3.0,3.0,3.0,5.0,5.0,5.0,6.0,4.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,4.0,4.0,5.0,5.0,5.0,5.0,5.0,5.0,4.0,5.0,5.0,2.0,3.0,3.0,3.0,3.0,3.0,2.0,3.0,3.0,3.0,3.0,3.0,3.0,2.0,1.0,1.0,4.0,1.0,3.0,3.0,4.0,4.0,4.0,4.0,4.0,3.0,3.0,4.0,4.0,4.0,4.0,4.0,4.0,3.0,3.0,4.0,3.0,3.0,3.0,3.0,6.0,1.0,7.0,1.0,0.0,0.0,6.0,7.0,6.0,1.0,0.0,3.0,0.0,12.0,4.0,0.0,1.0,3.0,3.0,3.0,9.0,3.0,4.0,3.0,1.0,4.0,4.0,4.0,0.0,1.0,1.0,0.0,0.0,0.0,5.0,1.0,6.0,3.0,0.0,10.0,4.0,40.0,30.0,79.0,11.0,18.0,4.0,1.0,1.0,1.0,1.0,60.0,4.0,4.0,3.0,3.0,3.0,...,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
573678,1.0,1.0,1.0,1.0,1.0,1.0,1.0,4.0,2.0,3.0,3.0,4.0,5.0,3.0,2.0,2.0,1.0,1.0,3.0,3.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,2.0,3.0,3.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,4.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,1.0,1.0,1.0,2.0,1.0,2.0,2.0,2.0,2.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,4.0,2.0,1.0,3.0,4.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,2.0,2.0,2.0,2.0,5.0,5.0,1.0,5.0,4.0,3.0,5.0,4.0,5.0,2.0,5.0,3.0,4.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,5.0,4.0,4.0,5.0,3.0,4.0,3.0,3.0,3.0,3.0,3.0,4.0,3.0,3.0,4.0,3.0,3.0,3.0,5.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,3.0,3.0,3.0,2.0,3.0,2.0,2.0,3.0,3.0,3.0,2.0,3.0,2.0,2.0,2.0,2.0,3.0,2.0,2.0,2.0,3.0,3.0,3.0,3.0,3.0,2.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,3.0,4.0,4.0,2.0,3.0,3.0,4.0,1.0,9.0,2.0,0.0,0.0,7.0,7.0,6.0,1.0,1.0,3.0,0.0,16.0,3.0,1.0,2.0,2.0,7.0,1.0,0.0,4.0,4.0,7.0,1.0,2.0,3.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,6.0,0.0,5.0,0.0,3.0,1.0,13.0,22.0,0.0,4.0,0.0,2.0,2.0,1.0,1.0,50.0,4.0,3.0,2.0,4.0,2.0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
576927,0.0,1.0,1.0,2.0,1.0,1.0,1.0,3.0,1.0,3.0,3.0,1.0,5.0,4.0,1.0,1.0,1.0,1.0,4.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,2.0,2.0,3.0,3.0,3.0,4.0,2.0,3.0,3.0,2.0,2.0,2.0,1.0,1.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,3.0,4.0,2.0,1.0,4.0,1.0,4.0,2.0,1.0,1.0,1.0,1.0,2.0,1.0,3.0,2.0,2.0,2.0,4.0,4.0,3.0,3.0,2.0,4.0,4.0,1.0,3.0,5.0,3.0,3.0,3.0,4.0,2.0,1.0,3.0,4.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,2.0,3.0,3.0,4.0,5.0,3.0,5.0,4.0,3.0,5.0,4.0,5.0,2.0,5.0,3.0,4.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,4.0,5.0,4.0,4.0,5.0,3.0,4.0,3.0,3.0,3.0,3.0,3.0,4.0,3.0,3.0,4.0,3.0,3.0,3.0,5.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,3.0,3.0,3.0,2.0,3.0,2.0,2.0,3.0,3.0,3.0,2.0,3.0,2.0,2.0,2.0,2.0,3.0,2.0,2.0,2.0,3.0,3.0,3.0,3.0,3.0,2.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,3.0,4.0,4.0,2.0,3.0,3.0,4.0,4.0,5.0,1.0,0.0,0.0,9.0,4.0,6.0,1.0,1.0,4.0,1.0,16.0,4.0,0.0,2.0,2.0,4.0,1.0,4.0,4.0,4.0,1.0,1.0,1.0,1.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4.0,6.0,0.0,0.0,0.0,4.0,5.0,2.0,83.0,10.0,7.0,0.0,1.0,1.0,1.0,1.0,80.0,4.0,4.0,3.0,3.0,3.0,...,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
576321,1.0,0.0,1.0,1.0,1.0,1.0,1.0,4.0,1.0,4.0,4.0,4.0,8.0,3.0,4.0,4.0,4.0,1.0,4.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,1.0,2.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0,4.0,4.0,4.0,4.0,2.0,2.0,1.0,1.0,2.0,2.0,2.0,1.0,3.0,1.0,1.0,1.0,3.0,1.0,2.0,2.0,2.0,2.0,3.0,3.0,3.0,3.0,4.0,4.0,3.0,1.0,4.0,4.0,4.0,4.0,2.0,4.0,1.0,3.0,4.0,4.0,2.0,3.0,2.0,3.0,3.0,2.0,3.0,1.0,2.0,1.0,1.0,5.0,5.0,5.0,5.0,5.0,2.0,1.0,5.0,5.0,5.0,5.0,2.0,2.0,1.0,4.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,4.0,3.0,6.0,3.0,5.0,4.0,2.0,2.0,2.0,2.0,2.0,4.0,4.0,3.0,1.0,1.0,1.0,5.0,4.0,1.0,4.0,4.0,3.0,2.0,1.0,2.0,4.0,5.0,1.0,5.0,1.0,3.0,2.0,6.0,6.0,2.0,4.0,1.0,3.0,2.0,1.0,3.0,2.0,2.0,2.0,3.0,3.0,2.0,3.0,3.0,2.0,4.0,4.0,4.0,3.0,4.0,5.0,3.0,3.0,4.0,4.0,4.0,2.0,3.0,1.0,4.0,1.0,8.0,1.0,1.0,0.0,7.0,7.0,6.0,2.0,0.0,2.0,0.0,16.0,3.0,1.0,2.0,2.0,20.0,6.0,5.0,3.0,3.0,5.0,1.0,1.0,1.0,4.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,6.0,6.0,0.0,0.0,5.0,2.0,44.0,12.0,0.0,0.0,0.0,0.0,2.0,2.0,1.0,1.0,40.0,2.0,2.0,2.0,4.0,1.0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
576339,1.0,1.0,1.0,1.0,2.0,1.0,1.0,3.0,1.0,2.0,2.0,1.0,7.0,2.0,2.0,2.0,2.0,1.0,2.0,2.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,3.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,1.0,1.0,1.0,2.0,1.0,2.0,2.0,2.0,2.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,4.0,2.0,1.0,3.0,4.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,2.0,2.0,2.0,2.0,5.0,5.0,1.0,5.0,4.0,3.0,5.0,4.0,5.0,2.0,5.0,3.0,4.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,5.0,4.0,4.0,5.0,3.0,4.0,3.0,3.0,3.0,3.0,3.0,4.0,3.0,3.0,4.0,3.0,3.0,3.0,5.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,3.0,3.0,3.0,2.0,3.0,2.0,2.0,3.0,3.0,3.0,2.0,3.0,2.0,2.0,2.0,2.0,3.0,2.0,2.0,2.0,3.0,3.0,3.0,3.0,3.0,2.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,3.0,4.0,4.0,2.0,3.0,3.0,3.0,1.0,4.0,2.0,-1.0,0.0,4.0,7.0,6.0,1.0,0.0,1.0,0.0,12.0,4.0,0.0,2.0,2.0,7.0,5.0,2.0,4.0,4.0,7.0,1.0,4.0,4.0,4.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,6.0,0.0,0.0,4.0,2.0,0.0,18.0,30.0,0.0,0.0,0.0,1.0,2.0,2.0,2.0,55.0,2.0,4.0,2.0,3.0,4.0,...,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [21]:
# Test data - NOT SAVED TO S3
print(test_data.shape)
test_data.head()

(683, 558)


Unnamed: 0,MATH_Proficient,SISCO,ST250Q01JA,ST250Q02JA,ST250Q03JA,ST250Q04JA,ST250Q05JA,ST251Q01JA,ST251Q02JA,ST251Q03JA,ST251Q04JA,ST251Q06JA,ST253Q01JA,ST254Q01JA,ST254Q02JA,ST254Q03JA,ST254Q04JA,ST254Q05JA,ST254Q06JA,ST255Q01JA,ST038Q03NA,ST038Q04NA,ST038Q05NA,ST038Q06NA,ST038Q07NA,ST038Q08NA,ST038Q09JA,ST038Q10JA,ST038Q11JA,ST265Q01JA,ST265Q02JA,ST265Q03JA,ST265Q04JA,ST266Q01JA,ST266Q02JA,ST266Q03JA,ST266Q04JA,ST266Q05JA,ST263Q02JA,ST263Q04JA,ST263Q06JA,ST270Q01JA,ST270Q02JA,ST270Q03JA,ST270Q04JA,FL166Q01HA,FL166Q02HA,FL166Q03HA,FL166Q05HA,FL166Q06HA,FL166Q07HA,FL174Q01JA,FL174Q02JA,FL174Q03JA,FL174Q04JA,FL174Q05JA,FL174Q06JA,FL174Q07JA,FL167Q01HA,FL167Q02HA,FL167Q06JA,FL167Q03HA,FL167Q04HA,FL167Q05HA,FL167Q07JA,FL170Q01JA,FL170Q02JA,FL170Q03JA,FL170Q04JA,FL170Q05JA,FL170Q06JA,FL170Q07JA,FL162Q01HA,FL162Q02HA,FL162Q03HA,FL162Q04HA,FL162Q05HA,FL162Q06HA,FL163Q01HA,FL163Q02HA,FL163Q03HA,FL163Q04HA,FL163Q05HA,FL171Q01JA,FL171Q02JA,FL171Q03JA,FL171Q04JA,FL171Q05JA,FL171Q07JA,FL171Q08JA,FL171Q09JA,FL171Q10JA,FL171Q11JA,FL171Q12JA,FL169Q01HA,FL169Q05JA,FL169Q02HA,FL169Q04HA,FL169Q08JA,FL169Q10JA,FL169Q11JA,FL172Q01JA,FL172Q03JA,FL172Q05JA,FL172Q06JA,IC170Q01JA,IC170Q02JA,IC170Q03JA,IC170Q04JA,IC170Q05JA,IC170Q06JA,IC170Q07JA,IC171Q01JA,IC171Q02JA,IC171Q03JA,IC171Q04JA,IC171Q05JA,IC171Q06JA,IC172Q01JA,IC172Q02JA,IC172Q03JA,IC172Q04JA,IC172Q05JA,IC172Q06JA,IC172Q07JA,IC172Q08JA,IC172Q09JA,IC173Q01JA,IC173Q02JA,IC173Q03JA,IC173Q04JA,IC174Q01JA,IC174Q02JA,IC174Q03JA,IC174Q04JA,IC174Q05JA,IC174Q06JA,IC174Q07JA,IC174Q08JA,IC174Q09JA,IC174Q10JA,IC175Q01JA,IC175Q02JA,IC175Q03JA,IC175Q05JA,IC176Q01JA,IC176Q02JA,IC176Q03JA,IC176Q04JA,IC176Q05JA,IC176Q06JA,IC176Q07JA,IC176Q08JA,IC177Q01JA,IC177Q02JA,IC177Q03JA,IC177Q04JA,IC177Q05JA,IC177Q06JA,IC177Q07JA,IC178Q01JA,IC178Q02JA,IC178Q03JA,IC178Q04JA,IC178Q05JA,IC178Q06JA,IC178Q07JA,IC179Q01JA,IC179Q02JA,IC179Q03JA,IC179Q04JA,IC179Q05JA,IC179Q06JA,IC180Q02JA,IC180Q03JA,IC180Q04JA,IC180Q05JA,IC180Q06JA,IC180Q07JA,IC183Q01JA,IC183Q02JA,IC183Q03JA,IC183Q04JA,IC183Q05JA,IC183Q07JA,IC183Q08JA,IC183Q09JA,IC183Q10JA,IC183Q12JA,IC183Q13JA,IC183Q14JA,IC183Q15JA,IC183Q16JA,ST347Q01JA,ST347Q02JA,ST259Q01JA,ST004D01T,GRADE,REPEAT,EXPECEDU,ICTAVSCH,ICTAVHOM,IMMIG,TARDYSD,ST226Q01JA,MISSSC,PAREDINT,ST230Q01JA,SKIPPING,IC180Q01JA,IC180Q08JA,ST059Q02JA,ST296Q04JA,STUDYHMW,IC184Q01JA,IC184Q02JA,ST059Q01TA,ST296Q01JA,ST268Q01JA,ST268Q04JA,ST268Q07JA,ST297Q01JA,ST297Q03JA,ST297Q05JA,ST297Q06JA,ST297Q07JA,ST297Q09JA,ST258Q01JA,ST294Q01JA,ST295Q01JA,EXERPRAC,WORKPAY,WORKHOME,SC001Q01TA,SC211Q01JA,SC211Q02JA,SC211Q03JA,SC211Q04JA,SC211Q05JA,SC211Q06JA,SC037Q11JA,SC183Q02JA,SC183Q03JA,SC183Q04JA,SC175Q01JA,SC188Q01JA,SC188Q02JA,SC188Q03JA,SC188Q04JA,SC188Q05JA,...,ST349Q01JA_2,ST349Q01JA_3,ST349Q01JA_4,ST349Q01JA_0,LANGN_105,LANGN_108,LANGN_118,LANGN_140,LANGN_148,LANGN_150,LANGN_156,LANGN_200,LANGN_204,LANGN_232,LANGN_273,LANGN_313,LANGN_316,LANGN_322,LANGN_329,LANGN_344,LANGN_351,LANGN_415,LANGN_463,LANGN_493,LANGN_496,LANGN_500,LANGN_520,LANGN_531,LANGN_602,LANGN_606,LANGN_615,LANGN_621,LANGN_625,LANGN_640,LANGN_641,LANGN_663,LANGN_669,LANGN_670,LANGN_800,LANGN_801,LANGN_802,LANGN_804,LANGN_805,LANGN_806,LANGN_807,LANGN_808,LANGN_865,LANGN_892,LANGN_895,LANGN_917,SC177Q01JA_1,SC177Q01JA_2,SC177Q01JA_3,SC177Q02JA_1,SC177Q02JA_2,SC177Q02JA_3,SC177Q03JA_1,SC177Q03JA_2,SC177Q03JA_3,MATHEXC_0,MATHEXC_1,MATHEXC_2,MATHEXC_3,SCHLTYPE_1,SCHLTYPE_2,SCHLTYPE_3,LANGN_121,LANGN_130,LANGN_137,LANGN_170,LANGN_244,LANGN_258,LANGN_263,LANGN_264,LANGN_266,LANGN_317,LANGN_340,LANGN_369,LANGN_381,LANGN_404,LANGN_420,LANGN_449,LANGN_467,LANGN_494,LANGN_495,LANGN_514,LANGN_523,LANGN_529,LANGN_540,LANGN_547,LANGN_600,LANGN_607,LANGN_618,LANGN_619,LANGN_630,LANGN_635,LANGN_650,LANGN_661,LANGN_673,LANGN_674,LANGN_809,LANGN_810,LANGN_811,LANGN_812,LANGN_813,LANGN_814,LANGN_815,LANGN_816,LANGN_818,LANGN_832,LANGN_868,LANGN_870,LANGN_920,LANGN_921,LANGN_113,LANGN_147,LANGN_275,LANGN_286,LANGN_363,LANGN_422,LANGN_434,LANGN_442,LANGN_471,LANGN_611,LANGN_614,LANGN_624,LANGN_642,LANGN_675,LANGN_676,LANGN_677,LANGN_678,LANGN_817,LANGN_819,LANGN_821,LANGN_823,LANGN_824,LANGN_825,LANGN_826,LANGN_827,LANGN_828,LANGN_885,LANGN_896,LANGN_916,LANGN_112,LANGN_154,LANGN_202,LANGN_246,LANGN_254,LANGN_272,LANGN_301,LANGN_325,LANGN_338,LANGN_358,LANGN_371,LANGN_375,LANGN_383,LANGN_409,LANGN_428,LANGN_465,LANGN_517,LANGN_527,LANGN_561,LANGN_562,LANGN_563,LANGN_565,LANGN_566,LANGN_567,LANGN_601,LANGN_622,LANGN_623,LANGN_628,LANGN_631,LANGN_831,LANGN_833,LANGN_836,LANGN_837,LANGN_838,LANGN_839,LANGN_840,LANGN_841,LANGN_845,LANGN_872,LANGN_873,LANGN_881,LANGN_890,LANGN_897,LANGN_898,LANGN_899,LANGN_900,LANGN_901,LANGN_902,LANGN_903,LANGN_904,LANGN_905,LANGN_906,LANGN_907,LANGN_908,LANGN_909,LANGN_910,LANGN_911,LANGN_912,LANGN_913,LANGN_914,LANGN_918,LANGN_919,LANGN_160,LANGN_327,LANGN_451,LANGN_474,LANGN_503,LANGN_608,LANGN_627,LANGN_639,LANGN_668,LANGN_842,LANGN_843,LANGN_844,LANGN_846,LANGN_849,LANGN_850,LANGN_851,LANGN_852,LANGN_861,LANGN_879,LANGN_133,LANGN_195,LANGN_237,LANGN_379,LANGN_382,LANGN_472,LANGN_492,LANGN_555,LANGN_605,LANGN_616,LANGN_626,LANGN_634,LANGN_648,LANGN_662,LANGN_665,LANGN_666,LANGN_667,LANGN_829,LANGN_854,LANGN_855,LANGN_857,LANGN_859,LANGN_860,LANGN_866,LANGN_877,LANGN_922
577660,1.0,1.0,1.0,1.0,1.0,1.0,1.0,3.0,1.0,3.0,3.0,2.0,7.0,3.0,2.0,2.0,2.0,1.0,3.0,4.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,3.0,3.0,2.0,1.0,1.0,2.0,2.0,2.0,2.0,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,1.0,1.0,1.0,2.0,1.0,2.0,2.0,2.0,2.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,4.0,2.0,1.0,3.0,4.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,2.0,2.0,2.0,2.0,5.0,5.0,1.0,5.0,4.0,3.0,5.0,4.0,5.0,2.0,5.0,3.0,4.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,5.0,4.0,4.0,5.0,3.0,4.0,3.0,3.0,3.0,3.0,3.0,4.0,3.0,3.0,4.0,3.0,3.0,3.0,5.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,3.0,3.0,3.0,2.0,3.0,2.0,2.0,3.0,3.0,3.0,2.0,3.0,2.0,2.0,2.0,2.0,3.0,2.0,2.0,2.0,3.0,3.0,3.0,3.0,3.0,2.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,3.0,4.0,4.0,2.0,3.0,3.0,4.0,1.0,7.0,2.0,0.0,0.0,7.0,7.0,6.0,1.0,0.0,3.0,0.0,16.0,3.0,0.0,2.0,2.0,8.0,3.0,4.0,4.0,4.0,4.0,1.0,2.0,3.0,4.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,3.0,6.0,5.0,0.0,4.0,2.0,1.0,13.0,29.0,1.0,1.0,0.0,2.0,1.0,1.0,1.0,90.0,3.0,3.0,3.0,3.0,2.0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
577072,0.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,1.0,2.0,2.0,1.0,8.0,3.0,2.0,2.0,4.0,1.0,4.0,3.0,1.0,2.0,2.0,1.0,2.0,2.0,1.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,1.0,1.0,1.0,1.0,3.0,2.0,2.0,2.0,1.0,1.0,1.0,3.0,3.0,2.0,2.0,2.0,2.0,1.0,2.0,2.0,1.0,1.0,2.0,3.0,3.0,3.0,2.0,3.0,3.0,3.0,3.0,3.0,2.0,4.0,1.0,2.0,3.0,2.0,3.0,4.0,2.0,3.0,4.0,3.0,4.0,3.0,3.0,3.0,3.0,4.0,4.0,3.0,5.0,3.0,5.0,4.0,4.0,3.0,4.0,4.0,3.0,4.0,2.0,3.0,3.0,3.0,3.0,2.0,1.0,3.0,2.0,5.0,5.0,4.0,5.0,3.0,5.0,5.0,5.0,5.0,3.0,5.0,2.0,2.0,3.0,3.0,2.0,2.0,3.0,2.0,3.0,3.0,3.0,3.0,4.0,3.0,4.0,5.0,4.0,3.0,2.0,4.0,3.0,3.0,2.0,4.0,4.0,3.0,3.0,4.0,3.0,3.0,3.0,4.0,3.0,2.0,3.0,2.0,3.0,6.0,6.0,6.0,5.0,6.0,4.0,4.0,6.0,6.0,6.0,4.0,5.0,3.0,4.0,1.0,1.0,3.0,2.0,2.0,3.0,3.0,2.0,3.0,3.0,3.0,3.0,4.0,4.0,4.0,3.0,4.0,4.0,4.0,4.0,3.0,4.0,3.0,4.0,3.0,3.0,3.0,1.0,9.0,1.0,0.0,0.0,8.0,7.0,6.0,1.0,2.0,4.0,0.0,16.0,4.0,1.0,3.0,1.0,7.0,4.0,8.0,5.0,4.0,1.0,2.0,2.0,2.0,4.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,5.0,6.0,7.0,3.0,8.0,3.0,9.0,13.0,44.0,3.0,4.0,0.0,1.0,1.0,1.0,1.0,60.0,4.0,4.0,3.0,3.0,3.0,...,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
576813,0.0,1.0,1.0,1.0,1.0,1.0,1.0,3.0,1.0,4.0,4.0,1.0,7.0,3.0,3.0,2.0,3.0,1.0,3.0,5.0,2.0,3.0,1.0,1.0,1.0,3.0,1.0,1.0,1.0,2.0,2.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,4.0,4.0,3.0,2.0,2.0,3.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,4.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0,4.0,1.0,1.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,3.0,3.0,2.0,2.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,5.0,5.0,4.0,5.0,5.0,5.0,5.0,4.0,4.0,4.0,4.0,4.0,4.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,2.0,2.0,2.0,2.0,4.0,4.0,3.0,4.0,4.0,3.0,4.0,3.0,4.0,4.0,4.0,3.0,2.0,2.0,3.0,4.0,2.0,4.0,3.0,3.0,4.0,3.0,3.0,5.0,2.0,6.0,4.0,6.0,3.0,1.0,6.0,3.0,5.0,5.0,2.0,4.0,2.0,2.0,3.0,3.0,3.0,3.0,2.0,3.0,3.0,2.0,3.0,2.0,5.0,3.0,4.0,2.0,2.0,4.0,3.0,4.0,2.0,4.0,2.0,3.0,2.0,2.0,4.0,2.0,8.0,1.0,0.0,0.0,8.0,7.0,6.0,1.0,0.0,4.0,0.0,16.0,4.0,0.0,3.0,3.0,4.0,1.0,6.0,4.0,5.0,2.0,1.0,1.0,1.0,4.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,5.0,6.0,2.0,4.0,8.0,3.0,1.0,3.0,5.0,0.0,2.0,0.0,1.0,2.0,2.0,2.0,80.0,4.0,4.0,3.0,4.0,3.0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
577850,0.0,1.0,1.0,1.0,2.0,1.0,1.0,3.0,1.0,3.0,3.0,1.0,7.0,3.0,2.0,3.0,1.0,1.0,3.0,4.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,3.0,3.0,2.0,1.0,1.0,2.0,2.0,2.0,2.0,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,1.0,1.0,1.0,2.0,1.0,2.0,2.0,2.0,2.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,4.0,2.0,1.0,3.0,4.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,2.0,2.0,2.0,2.0,5.0,5.0,1.0,5.0,4.0,3.0,5.0,4.0,5.0,2.0,5.0,3.0,4.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,5.0,4.0,4.0,5.0,3.0,4.0,3.0,3.0,3.0,3.0,3.0,4.0,3.0,3.0,4.0,3.0,3.0,3.0,5.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,3.0,3.0,3.0,2.0,3.0,2.0,2.0,3.0,3.0,3.0,2.0,3.0,2.0,2.0,2.0,2.0,3.0,2.0,2.0,2.0,3.0,3.0,3.0,3.0,3.0,2.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,3.0,4.0,4.0,2.0,3.0,3.0,1.0,1.0,7.0,2.0,0.0,0.0,7.0,7.0,6.0,1.0,0.0,3.0,0.0,12.0,3.0,0.0,2.0,2.0,8.0,3.0,4.0,4.0,4.0,4.0,1.0,2.0,3.0,4.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,3.0,6.0,5.0,0.0,4.0,3.0,5.0,46.0,80.0,0.0,10.0,0.0,2.0,1.0,1.0,1.0,45.0,3.0,4.0,1.0,3.0,3.0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
573800,1.0,1.0,1.0,1.0,1.0,1.0,1.0,3.0,1.0,3.0,3.0,2.0,8.0,3.0,3.0,2.0,2.0,1.0,3.0,4.0,3.0,4.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,2.0,2.0,1.0,2.0,1.0,2.0,2.0,3.0,2.0,3.0,2.0,3.0,3.0,1.0,1.0,1.0,1.0,2.0,3.0,2.0,1.0,1.0,4.0,3.0,1.0,2.0,4.0,3.0,4.0,4.0,4.0,3.0,4.0,3.0,5.0,1.0,1.0,1.0,4.0,1.0,2.0,2.0,1.0,1.0,3.0,4.0,2.0,3.0,4.0,4.0,4.0,4.0,3.0,4.0,3.0,4.0,4.0,5.0,1.0,1.0,3.0,3.0,2.0,4.0,2.0,2.0,3.0,3.0,4.0,1.0,3.0,3.0,3.0,5.0,5.0,1.0,5.0,5.0,5.0,5.0,5.0,5.0,2.0,5.0,4.0,4.0,2.0,3.0,1.0,2.0,3.0,4.0,3.0,3.0,4.0,4.0,1.0,4.0,6.0,4.0,4.0,5.0,3.0,1.0,1.0,5.0,4.0,4.0,5.0,4.0,2.0,5.0,2.0,5.0,5.0,4.0,5.0,4.0,1.0,5.0,4.0,2.0,4.0,3.0,1.0,5.0,2.0,3.0,5.0,5.0,2.0,1.0,6.0,2.0,3.0,1.0,4.0,2.0,2.0,3.0,3.0,4.0,4.0,1.0,4.0,4.0,3.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,3.0,3.0,3.0,3.0,4.0,1.0,8.0,1.0,0.0,0.0,8.0,7.0,6.0,1.0,0.0,3.0,0.0,16.0,1.0,1.0,2.0,1.0,7.0,5.0,7.0,4.0,5.0,4.0,3.0,1.0,2.0,4.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,2.0,6.0,0.0,0.0,0.0,2.0,18.0,14.0,77.0,6.0,5.0,0.0,1.0,1.0,1.0,1.0,65.0,4.0,4.0,2.0,3.0,3.0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


Now we'll copy the file to S3 for Amazon SageMaker's managed training to pickup.

In [22]:
# cell 14
boto3.Session().resource('s3').Bucket(bucket).Object(os.path.join(prefix, 'train/train.csv')).upload_file('train.csv')
boto3.Session().resource('s3').Bucket(bucket).Object(os.path.join(prefix, 'validation/validation.csv')).upload_file('validation.csv')

## Training 

In [23]:
# cell 15
#container = sagemaker.image_uris.retrieve(region=boto3.Session().region_name, framework='xgboost', version='latest')
from sagemaker.amazon.amazon_estimator import get_image_uri
container = get_image_uri(boto3.Session().region_name, "factorization-machines")

The method get_image_uri has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


Then, because we're training with the CSV file format, we'll create `s3_input`s that our training function can use as a pointer to the files in S3, which also specify that the content type is CSV.

In [24]:
# cell 16
s3_input_train = sagemaker.inputs.TrainingInput(s3_data='s3://{}/{}/train'.format(bucket, prefix), content_type='text/csv')
s3_input_validation = sagemaker.inputs.TrainingInput(s3_data='s3://{}/{}/validation/'.format(bucket, prefix), content_type='text/csv')

In [25]:
# cell 17
sess = sagemaker.Session()

fm = sagemaker.estimator.Estimator(
    container,
    role,
    instance_count=1,
    instance_type="ml.c5.xlarge",
    output_path=f's3://{bucket}/{prefix}/output',
    sagemaker_session=sess,
)

In [26]:
# Set hyperparameters for logistic regression - Set to default values
fm.set_hyperparameters(
    feature_dim=557, predictor_type="binary_classifier", mini_batch_size=200, num_factors=10
)


#### Use auto-tuning to find best hyperparameters

In [27]:
from sagemaker.tuner import IntegerParameter, CategoricalParameter, ContinuousParameter, HyperparameterTuner

mini_batch_upper_limit = int(train_data.shape[0]*0.16)

# Define hyperparameter ranges
hyperparameter_ranges = {
    "factors_lr": ContinuousParameter(0.0001, 0.1),  # Learning rate for factorized interactions
    "linear_lr": ContinuousParameter(0.0001, 0.1),   # Learning rate for linear terms
    "bias_lr": ContinuousParameter(0.0001, 0.1),     # Learning rate for bias terms

    "factors_wd": ContinuousParameter(1e-6, 1e-2),  # Regularization for factors
    "linear_wd": ContinuousParameter(1e-6, 1e-2),   # Regularization for linear terms
    "bias_wd": ContinuousParameter(1e-6, 1e-2),     # Regularization for bias terms

    "epochs": IntegerParameter(5, 10),             # Number of training iterations
    "mini_batch_size": IntegerParameter(128, 10000),  # Batch size for training
}

In [28]:
# Tunable Parameters
# [factors_init_value, factors_init_scale, bias_init_sigma, factors_init_sigma, 
# linear_wd, linear_init_sigma, bias_init_value, linear_lr, linear_init_scale, 
# bias_wd, bias_lr, factors_wd, bias_init_scale, epochs, factors_lr, linear_init_value, mini_batch_size]

tuner = HyperparameterTuner(estimator=fm,
                            objective_metric_name='test:binary_classification_accuracy',
                            hyperparameter_ranges=hyperparameter_ranges,
                            max_jobs=50,  # should be 50
                            max_parallel_jobs=5)

# May need to adjust number of jobs depending on budget!

In [29]:
tuner.fit({'train': s3_input_train, 'test': s3_input_validation})

No finished training job found associated with this estimator. Please make sure this estimator is only used for building workflow config
No finished training job found associated with this estimator. Please make sure this estimator is only used for building workflow config


..................................................................................................................................................!


In [30]:
# cell 26
boto3.client('sagemaker').describe_hyper_parameter_tuning_job(
HyperParameterTuningJobName=tuner.latest_tuning_job.job_name)['HyperParameterTuningJobStatus']

'Completed'

In [31]:
# cell 27
# Return the best training job name
best_training_job = tuner.best_training_job()
print("Best training job:", best_training_job)

Best training job: factorization-machin-250304-0246-030-fb8d30b9


In [32]:
# Print out hyperparameters of BEST model

response = boto3.client('sagemaker').describe_training_job(TrainingJobName=best_training_job)
best_hyperparameters = response["HyperParameters"]

best_mini_batch_size = int(best_hyperparameters["mini_batch_size"])
best_learning_rate = float(best_hyperparameters["num_factors"])
best_epochs = int(best_hyperparameters["epochs"])

best_factors_lr = float(best_hyperparameters["factors_lr"])
best_linear_lr = float(best_hyperparameters["linear_lr"])
best_bias_lr = float(best_hyperparameters["bias_lr"])

best_factors_wd = float(best_hyperparameters["factors_wd"])
best_linear_wd = float(best_hyperparameters["linear_wd"])
best_bias_wd = float(best_hyperparameters["bias_wd"])


# Print hyperparameters with formatted output
print("BEST mini batch size:", best_mini_batch_size)
print("BEST epochs:", best_epochs)

print("BEST factors learning rate:", round(best_factors_lr, 3))
print("BEST linear learning rate:", round(best_linear_lr, 3))
print("BEST bias learning rate:", round(best_bias_lr, 3))

print("BEST factors weight decay:", round(best_factors_wd, 6))
print("BEST linear weight decay:", round(best_linear_wd, 6))
print("BEST bias weight decay:", round(best_bias_wd, 6))


BEST mini batch size: 169
BEST epochs: 7
BEST factors learning rate: 0.094
BEST linear learning rate: 0.0
BEST bias learning rate: 0.0
BEST factors weight decay: 0.004378
BEST linear weight decay: 2e-06
BEST bias weight decay: 0.000864


## Deploy the model (the best model identified by HyperparameterTuner)

In [33]:
# cell 28
fm_predictor = tuner.deploy(initial_instance_count=1,
                           instance_type='ml.m4.xlarge')


2025-03-04 02:55:30 Starting - Found matching resource for reuse
2025-03-04 02:55:30 Downloading - Downloading the training image
2025-03-04 02:55:30 Training - Training image download completed. Training in progress.
2025-03-04 02:55:30 Uploading - Uploading generated training model
2025-03-04 02:55:30 Completed - Resource reused by training job: factorization-machin-250304-0246-035-d236b57a
-------!

In [34]:
# cell 29
# Create a serializer
fm_predictor.serializer = sagemaker.serializers.CSVSerializer()

Finally, we can now validate the model for use. We can pass HTTP POST requests to the endpoint to get back predictions. To make this easier, we’ll again use the Amazon SageMaker Python SDK and specify how to serialize requests and deserialize responses that are specific to the algorithm.

Since factorization machines are so frequently used with sparse data, making inference requests with a CSV format (as is done in other algorithm examples) can be massively inefficient. Rather than waste space and time generating all of those zeros, to pad the row to the correct dimensionality, JSON can be used more efficiently. Since we trained the model using dense data, this is a bit of a moot point, as we’ll have to pass all the 0s in anyway.

Nevertheless, we’ll write our own small function to serialize our inference request in the JSON format that Amazon SageMaker Factorization Machines expects.

In [35]:
import json
from sagemaker.serializers import JSONSerializer
from sagemaker.deserializers import JSONDeserializer

# Custom Serializer for Factorization Machines
class FMSerializer(JSONSerializer):
    def serialize(self, data):
        js = {"instances": []}
        for row in data:
            js["instances"].append({"features": row.tolist()})  # Ensure proper JSON structure
        return json.dumps(js)

# Attach serializer & deserializer to the SageMaker predictor
fm_predictor.serializer = FMSerializer()
fm_predictor.deserializer = JSONDeserializer()

In [36]:
# Select a single record from test_data
single_record = test_data.drop(['MATH_Proficient'], axis=1).iloc[30:31].to_numpy()

# Send request to SageMaker Factorization Machines endpoint
result = fm_predictor.predict(single_record, initial_args={"ContentType": "application/json"})

# Print the result
print(result)

{'predictions': [{'score': 1.0, 'predicted_label': 1.0}]}


OK, a single prediction works. We see that for one record our endpoint returned some JSON which contains predictions, including the score and predicted_label. In this case, score will be a continuous value between [0, 1] representing the probability we think the digit is a 0 or not. predicted_label will take a value of either 0 or 1 where (somewhat counterintuitively) 1 denotes that we predict the image is a 0, while 0 denotes that we are predicting the image is not of a 0.

Let’s do a whole batch of images and evaluate our predictive accuracy.

In [37]:
import numpy as np

# Convert test data to NumPy array (drop the target column)
batch_data = test_data.drop(['MATH_Proficient'], axis=1).to_numpy()

# Store predictions
predictions = []

# Split data into smaller batches (to avoid exceeding request limits)
for batch in np.array_split(batch_data, 100):  # Split into 100 smaller requests
    result = fm_predictor.predict(batch, initial_args={"ContentType": "application/json"})
    
    # Extract the predicted scores from the response
    predictions += [r["score"] for r in result["predictions"]]

# Convert predictions to a NumPy array
predictions = np.array(predictions)

# Print first few predictions
print(predictions[:1000])


[1.0000000e+00 0.0000000e+00 0.0000000e+00 1.0000000e+00 1.0000000e+00
 0.0000000e+00 1.0000000e+00 1.0000000e+00 1.0000000e+00 0.0000000e+00
 1.0000000e+00 1.0000000e+00 1.0000000e+00 1.0000000e+00 1.0000000e+00
 1.0000000e+00 0.0000000e+00 1.0000000e+00 0.0000000e+00 0.0000000e+00
 0.0000000e+00 1.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00
 1.0000000e+00 1.0000000e+00 0.0000000e+00 1.0000000e+00 0.0000000e+00
 1.0000000e+00 1.0000000e+00 1.0000000e+00 1.0000000e+00 0.0000000e+00
 1.0000000e+00 0.0000000e+00 1.0000000e+00 1.0000000e+00 1.0000000e+00
 0.0000000e+00 0.0000000e+00 1.0000000e+00 0.0000000e+00 1.0000000e+00
 1.0000000e+00 0.0000000e+00 1.0000000e+00 1.0000000e+00 1.0000000e+00
 0.0000000e+00 1.0000000e+00 1.0000000e+00 0.0000000e+00 1.0000000e+00
 0.0000000e+00 0.0000000e+00 1.0000000e+00 1.0000000e+00 1.0000000e+00
 1.0000000e+00 0.0000000e+00 1.0000000e+00 1.0000000e+00 1.0000000e+00
 0.0000000e+00 1.0000000e+00 1.0000000e+00 1.0000000e+00 1.0000000e+00
 0.000

In [38]:
# Save the real values for the test set
real_values = test_data['MATH_Proficient']
real_values.to_csv('real_values.csv', index=False, header=False)

# Save the predicted values for the test set
predicted_values_full = predictions
predicted_values_full = pd.DataFrame(predicted_values_full, columns=['Predicted Values'])
predicted_values_full.to_csv('predicted_values_full.csv', index=False, header=False)

In [39]:
# Clean up
fm_predictor.delete_endpoint(delete_endpoint_config=True)

## Explain the trained model using Clarify

In [40]:
from datetime import datetime

session = sagemaker.Session()

model_name = "Clarify-{}-{}".format(country_name_edited, datetime.now().strftime("%d-%m-%Y-%H-%M-%S"))

best_model = sagemaker.estimator.Estimator.attach(best_training_job)  # Attach the best training job

model = best_model.create_model(name=model_name)  # Create a model from the best job

container_def = model.prepare_container_def()

session.create_model(model_name, role, container_def)


2025-03-04 02:55:30 Starting - Found matching resource for reuse
2025-03-04 02:55:30 Downloading - Downloading the training image
2025-03-04 02:55:30 Training - Training image download completed. Training in progress.
2025-03-04 02:55:30 Uploading - Uploading generated training model
2025-03-04 02:55:30 Completed - Resource reused by training job: factorization-machin-250304-0246-035-d236b57a


'Clarify-United-States-04-03-2025-03-03-30'

In [41]:
test_features = test_data.drop(["MATH_Proficient"], axis=1)
test_target = test_data["MATH_Proficient"]
test_features.to_csv("test_features.csv", index=False, header=False)

In [42]:
from sagemaker import clarify

clarify_processor = clarify.SageMakerClarifyProcessor(
    role=role, instance_count=1, instance_type="ml.m5.2xlarge", sagemaker_session=session
)

model_config = clarify.ModelConfig(
    model_name=model_name,
    instance_type="ml.m5.large",
    instance_count=1,
    accept_type="text/csv",
    content_type="text/csv",
)

In [43]:
from sagemaker.s3 import S3Downloader

# Download data from S3 to local instance
local_path = S3Downloader.download('s3://{}/{}/train'.format(bucket, prefix), './tmp/train_data')

In [44]:
# Load and sample
full_data = pd.read_csv('./tmp/train_data/train.csv', header=None)
n = min(3000, len(full_data))  
sampled_data = full_data.sample(n=n)  # If full_data has less than n, use the full sample

# Save sampled data back to S3
sampled_path = 'sampled_train_data.csv'
sampled_data.to_csv(sampled_path, index=False)

from sagemaker.s3 import S3Uploader
sampled_s3_uri = S3Uploader.upload(sampled_path, 's3://{}/{}/sampled_train'.format(bucket, prefix))

In [45]:
print(sampled_data.shape)
sampled_data.head()

(3000, 558)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,240,241,242,243,244,245,246,247,248,249,...,308,309,310,311,312,313,314,315,316,317,318,319,320,321,322,323,324,325,326,327,328,329,330,331,332,333,334,335,336,337,338,339,340,341,342,343,344,345,346,347,348,349,350,351,352,353,354,355,356,357,358,359,360,361,362,363,364,365,366,367,368,369,370,371,372,373,374,375,376,377,378,379,380,381,382,383,384,385,386,387,388,389,390,391,392,393,394,395,396,397,398,399,400,401,402,403,404,405,406,407,408,409,410,411,412,413,414,415,416,417,418,419,420,421,422,423,424,425,426,427,428,429,430,431,432,433,434,435,436,437,438,439,440,441,442,443,444,445,446,447,448,449,450,451,452,453,454,455,456,457,458,459,460,461,462,463,464,465,466,467,468,469,470,471,472,473,474,475,476,477,478,479,480,481,482,483,484,485,486,487,488,489,490,491,492,493,494,495,496,497,498,499,500,501,502,503,504,505,506,507,508,509,510,511,512,513,514,515,516,517,518,519,520,521,522,523,524,525,526,527,528,529,530,531,532,533,534,535,536,537,538,539,540,541,542,543,544,545,546,547,548,549,550,551,552,553,554,555,556,557
223,0.0,0.0,1.0,1.0,1.0,1.0,1.0,3.0,1.0,1.0,1.0,1.0,8.0,3.0,2.0,3.0,2.0,1.0,4.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,1.0,1.0,3.0,2.0,2.0,2.0,1.0,1.0,1.0,4.0,4.0,2.0,4.0,2.0,2.0,2.0,2.0,1.0,1.0,1.0,2.0,1.0,1.0,5.0,4.0,3.0,1.0,2.0,4.0,1.0,3.0,2.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,4.0,1.0,4.0,1.0,3.0,1.0,1.0,2.0,1.0,1.0,3.0,2.0,3.0,3.0,3.0,3.0,3.0,3.0,2.0,2.0,2.0,2.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,3.0,3.0,2.0,3.0,3.0,3.0,3.0,3.0,3.0,4.0,6.0,2.0,6.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,3.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,4.0,6.0,6.0,6.0,1.0,1.0,1.0,1.0,1.0,1.0,3.0,2.0,2.0,2.0,2.0,2.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,9.0,2.0,-1.0,1.0,4.0,7.0,6.0,2.0,1.0,4.0,0.0,3.0,4.0,0.0,2.0,2.0,5.0,1.0,3.0,4.0,4.0,5.0,1.0,1.0,3.0,3.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,6.0,6.0,9.0,0.0,1.0,3.0,24.0,11.0,92.0,4.0,4.0,0.0,2.0,1.0,1.0,1.0,55.0,4.0,4.0,1.0,3.0,3.0,...,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2480,1.0,1.0,1.0,1.0,1.0,1.0,1.0,3.0,1.0,3.0,3.0,1.0,7.0,2.0,2.0,2.0,2.0,2.0,3.0,5.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,1.0,1.0,1.0,2.0,2.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,1.0,1.0,2.0,1.0,2.0,3.0,1.0,1.0,1.0,1.0,2.0,1.0,2.0,2.0,2.0,2.0,3.0,2.0,3.0,3.0,3.0,3.0,4.0,5.0,1.0,3.0,4.0,1.0,1.0,3.0,2.0,1.0,5.0,4.0,2.0,2.0,2.0,3.0,3.0,3.0,2.0,2.0,2.0,2.0,2.0,1.0,4.0,4.0,4.0,3.0,3.0,4.0,3.0,4.0,4.0,4.0,3.0,3.0,3.0,3.0,2.0,3.0,3.0,3.0,3.0,3.0,3.0,6.0,5.0,5.0,6.0,3.0,3.0,1.0,1.0,1.0,4.0,3.0,3.0,3.0,3.0,3.0,4.0,1.0,4.0,5.0,3.0,4.0,1.0,5.0,4.0,4.0,4.0,3.0,1.0,4.0,2.0,1.0,2.0,2.0,3.0,1.0,4.0,2.0,1.0,1.0,1.0,2.0,2.0,2.0,3.0,2.0,2.0,3.0,3.0,3.0,3.0,3.0,3.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,1.0,4.0,1.0,1.0,3.0,1.0,1.0,1.0,6.0,2.0,0.0,0.0,8.0,7.0,6.0,1.0,0.0,3.0,0.0,16.0,1.0,0.0,3.0,3.0,20.0,5.0,4.0,4.0,4.0,5.0,3.0,3.0,2.0,4.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,6.0,6.0,5.0,0.0,1.0,3.0,13.0,13.0,41.0,12.0,30.0,0.0,1.0,2.0,2.0,2.0,90.0,4.0,4.0,2.0,4.0,2.0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
898,1.0,1.0,1.0,1.0,1.0,1.0,1.0,3.0,1.0,3.0,3.0,4.0,7.0,3.0,1.0,3.0,2.0,2.0,3.0,4.0,1.0,1.0,1.0,2.0,2.0,1.0,1.0,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,3.0,3.0,1.0,1.0,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,3.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,2.0,2.0,1.0,4.0,1.0,2.0,2.0,2.0,2.0,1.0,1.0,1.0,1.0,2.0,3.0,2.0,3.0,2.0,2.0,2.0,3.0,2.0,3.0,3.0,2.0,5.0,5.0,1.0,5.0,5.0,5.0,5.0,5.0,5.0,1.0,5.0,3.0,5.0,3.0,2.0,2.0,2.0,2.0,3.0,3.0,2.0,3.0,5.0,5.0,5.0,1.0,3.0,4.0,3.0,4.0,2.0,3.0,4.0,4.0,4.0,3.0,3.0,3.0,3.0,3.0,5.0,4.0,3.0,4.0,4.0,3.0,4.0,4.0,4.0,4.0,3.0,2.0,3.0,2.0,2.0,5.0,5.0,5.0,2.0,4.0,2.0,1.0,2.0,2.0,3.0,2.0,2.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,2.0,4.0,4.0,2.0,2.0,2.0,6.0,1.0,6.0,2.0,0.0,0.0,7.0,7.0,6.0,2.0,0.0,1.0,0.0,16.0,2.0,0.0,3.0,3.0,20.0,3.0,2.0,5.0,4.0,3.0,2.0,3.0,3.0,4.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,6.0,6.0,0.0,0.0,5.0,3.0,10.0,24.0,16.0,7.0,7.0,0.0,2.0,2.0,2.0,2.0,80.0,3.0,3.0,3.0,3.0,3.0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1928,1.0,1.0,1.0,1.0,1.0,1.0,1.0,3.0,1.0,3.0,4.0,2.0,6.0,3.0,1.0,4.0,1.0,1.0,2.0,3.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,2.0,2.0,2.0,2.0,1.0,2.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,4.0,4.0,4.0,4.0,4.0,4.0,2.0,3.0,3.0,2.0,3.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0,2.0,2.0,1.0,2.0,1.0,2.0,3.0,4.0,2.0,4.0,1.0,1.0,1.0,2.0,1.0,1.0,2.0,1.0,1.0,2.0,2.0,1.0,3.0,2.0,1.0,3.0,2.0,4.0,3.0,1.0,3.0,1.0,5.0,4.0,1.0,5.0,4.0,4.0,5.0,5.0,5.0,5.0,5.0,4.0,5.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,5.0,3.0,6.0,6.0,3.0,5.0,5.0,2.0,1.0,1.0,5.0,5.0,3.0,5.0,4.0,4.0,1.0,1.0,5.0,5.0,5.0,5.0,5.0,4.0,4.0,5.0,4.0,2.0,5.0,2.0,2.0,2.0,1.0,6.0,2.0,6.0,2.0,2.0,2.0,1.0,2.0,1.0,3.0,3.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,4.0,4.0,4.0,3.0,2.0,4.0,1.0,4.0,2.0,2.0,4.0,1.0,1.0,4.0,3.0,1.0,10.0,1.0,1.0,0.0,9.0,7.0,6.0,1.0,2.0,3.0,0.0,16.0,4.0,0.0,2.0,2.0,5.0,4.0,4.0,4.0,4.0,4.0,3.0,3.0,3.0,4.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,6.0,3.0,0.0,3.0,3.0,20.0,2.0,22.0,3.0,1.0,0.0,1.0,1.0,1.0,1.0,90.0,4.0,4.0,4.0,4.0,4.0,...,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1250,1.0,1.0,1.0,1.0,1.0,1.0,1.0,3.0,1.0,3.0,4.0,1.0,7.0,2.0,2.0,2.0,1.0,1.0,3.0,3.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,3.0,3.0,3.0,2.0,3.0,2.0,2.0,2.0,2.0,1.0,2.0,1.0,3.0,1.0,1.0,1.0,4.0,1.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,3.0,2.0,2.0,2.0,4.0,4.0,1.0,1.0,1.0,1.0,3.0,1.0,1.0,1.0,5.0,2.0,3.0,3.0,2.0,3.0,3.0,3.0,2.0,2.0,3.0,2.0,5.0,4.0,1.0,5.0,4.0,1.0,5.0,5.0,5.0,1.0,5.0,1.0,4.0,3.0,2.0,2.0,2.0,3.0,3.0,2.0,3.0,3.0,5.0,5.0,3.0,1.0,1.0,3.0,2.0,1.0,1.0,3.0,1.0,4.0,4.0,4.0,4.0,1.0,1.0,1.0,4.0,1.0,1.0,1.0,3.0,4.0,4.0,2.0,3.0,2.0,3.0,1.0,1.0,1.0,1.0,4.0,3.0,2.0,1.0,1.0,1.0,1.0,2.0,3.0,3.0,2.0,2.0,2.0,3.0,3.0,3.0,3.0,3.0,2.0,4.0,3.0,3.0,4.0,3.0,4.0,4.0,3.0,1.0,3.0,3.0,1.0,1.0,1.0,4.0,1.0,7.0,2.0,0.0,0.0,9.0,7.0,6.0,2.0,0.0,3.0,0.0,12.0,3.0,0.0,2.0,1.0,6.0,2.0,4.0,4.0,4.0,1.0,2.0,3.0,3.0,4.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,6.0,6.0,4.0,0.0,5.0,3.0,20.0,10.0,48.0,15.0,25.0,2.0,1.0,2.0,1.0,2.0,50.0,3.0,4.0,4.0,3.0,3.0,...,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [46]:
shap_config = clarify.SHAPConfig(
    baseline=[test_features.iloc[0].values.tolist()],
    num_samples=3000,  
    agg_method="mean_abs",
    save_local_shap_values=True
)

explainability_output_path = "s3://{}/{}/clarify-explainability".format(bucket, prefix)

explainability_data_config = clarify.DataConfig(
    #s3_data_input_path='s3://{}/{}/train'.format(bucket, prefix),
    s3_data_input_path=sampled_s3_uri,
    s3_output_path=explainability_output_path,
    label='MATH_Proficient',
    headers=train_data.columns.to_list(),
    dataset_type="text/csv",
)

In [47]:
# Set logging level for 'sagemaker.clarify' to WARNING (hides INFO messages)
import logging

logging.getLogger("sagemaker.clarify").setLevel(logging.WARNING)

clarify_processor.run_explainability(
    data_config=explainability_data_config,
    model_config=model_config,
    explainability_config=shap_config
)

INFO:sagemaker:Creating processing-job with name Clarify-Explainability-2025-03-04-03-03-37-739


.................[34msagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml[0m
[34msagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml[0m
[34mWe are not in a supported iso region, /bin/sh exiting gracefully with no changes.[0m
[34mINFO:sagemaker-clarify-processing:Starting SageMaker Clarify Processing job[0m
[34mINFO:analyzer.data_loading.data_loader_util:Analysis config path: /opt/ml/processing/input/config/analysis_config.json[0m
[34mINFO:analyzer.data_loading.data_loader_util:Analysis result path: /opt/ml/processing/output[0m
[34mINFO:analyzer.data_loading.data_loader_util:This host is algo-1.[0m
[34mINFO:analyzer.data_loading.data_loader_util:This host is the leader.[0m
[34mINFO:analyzer.data_loading.data_loader_util:Number of hosts in the cluster is 1.[0m
[34mINFO:sagemaker-clarify-processing:Running Python / Pandas based analyzer.[0m
[34mINFO:analyzer.data_loading.data_l

UnexpectedStatusException: Error for Processing job Clarify-Explainability-2025-03-04-03-03-37-739: Failed. Reason: ClientError: An error occurred (ModelError) when calling the InvokeEndpoint operation (reached max retries: 0): Received client error (415) from primary with message "content-type text/csv not supported". See https://us-west-2.console.aws.amazon.com/cloudwatch/home?region=us-west-2#logEventViewer:group=/aws/sagemaker/Endpoints/sm-clarify-Clarify-United-States-04-03-2025-03--1741057588-3c75 in account 986030204467 for more information., exit code: 1

## Train the model again with the top 20 predictors
#### Get the list of top 20 predictors

In [None]:
# Replace with your actual bucket name and prefix used in explainability_output_path
# bucket = "your-bucket-name"
# prefix = "your-prefix"  # e.g., the folder structure used in your explainability_output_path

# Construct the S3 key for the output file
key = f"{prefix}/clarify-explainability/analysis.json"

# Initialize boto3 client for S3 and download the JSON report
s3 = boto3.client("s3")
response = s3.get_object(Bucket=bucket, Key=key)
content = response["Body"].read().decode("utf-8")
report = json.loads(content)

# Navigate to the global SHAP values dictionary
global_shap = report["explanations"]["kernel_shap"]["label0"]["global_shap_values"]

# Sort the items by the SHAP value in descending order and take the top 20
top_20 = sorted(global_shap.items(), key=lambda item: item[1], reverse=True)[:20]

# Extract just the feature names
top_20_features = [feature for feature, value in top_20]

# Print
print("Top 20 features with the highest mean absolute SHAP values:")
for feature in top_20_features:
    print(feature)


In [None]:
# Make a subset of the training dataset (with only 20 predictors)
variables_to_keep = ["MATH_Proficient"] + top_20_features
train_data_small = train_data[variables_to_keep]
print(train_data_small.shape)
train_data_small.head()

In [None]:
# Save train dataset 
train_data_small.to_csv('train_small.csv', index=False, header=False)
boto3.Session().resource('s3').Bucket(bucket).Object(os.path.join(prefix, 'train_small/train_small.csv')).upload_file('train_small.csv')

In [None]:
# Make a subset of the validation dataset (with only 20 predictors)
validation_data_small = validation_data[variables_to_keep]
print(validation_data_small.shape)
validation_data_small.head()

In [None]:
# Save validation dataset 
validation_data_small.to_csv('validation_small.csv', index=False, header=False)
boto3.Session().resource('s3').Bucket(bucket).Object(os.path.join(prefix, 'validation_small/validation_small.csv')).upload_file('validation_small.csv')

#### Train the model using the hyperparameters from the best model

In [None]:
# cell 15
from sagemaker.amazon.amazon_estimator import get_image_uri
container = get_image_uri(boto3.Session().region_name, "factorization-machines")

In [None]:
# cell 16
s3_input_train_small = sagemaker.inputs.TrainingInput(s3_data='s3://{}/{}/train_small'.format(bucket, prefix), content_type='text/csv')
s3_input_validation_small = sagemaker.inputs.TrainingInput(s3_data='s3://{}/{}/validation_small/'.format(bucket, prefix), content_type='text/csv')

In [None]:
# cell 17
sess = sagemaker.Session()

fm = sagemaker.estimator.Estimator(
    container,
    role,
    instance_count=1,
    instance_type="ml.c5.xlarge",
    output_path=output_location,
    sagemaker_session=sess,
)


# Set hyperparameters for logistic regression - Set to default values
fm.set_hyperparameters(
    feature_dim=557,  # Keep this static if it's predefined
    predictor_type="binary_classifier",  # Keep this if it's a classification task
    mini_batch_size=best_mini_batch_size,
    epochs=best_epochs,
    num_factors=10,  # Ensure this is correct or replace with tuned value if applicable

    factors_lr=best_factors_lr,
    linear_lr=best_linear_lr,
    bias_lr=best_bias_lr,

    factors_wd=best_factors_wd,
    linear_wd=best_linear_wd,
    bias_wd=best_bias_wd
)


fm.fit({'train': s3_input_train_small, 'test': s3_input_validation_small}) 

## Deploy the model

In [None]:
test_data_small = test_data[variables_to_keep]

In [None]:
# cell 18
fm_small_predictor = fm.deploy(initial_instance_count=1,
                           instance_type='ml.m4.xlarge')

In [None]:
# cell 19
fm_small_predictor.serializer = sagemaker.serializers.CSVSerializer()

Now, we'll use a simple function to:
1. Loop over our test dataset
1. Split it into mini-batches of rows 
1. Convert those mini-batches to CSV string payloads (notice, we drop the target variable from our dataset first)
1. Retrieve mini-batch predictions by invoking the XGBoost endpoint
1. Collect predictions and convert from the CSV output our model provides into a NumPy array

In [None]:
# Get the raw prediction output
raw_predictions_small = fm_small_predictor.predict(test_data_small.drop(['MATH_Proficient'], axis=1).to_numpy())

# Decode and parse JSON
parsed_predictions_small = json.loads(raw_predictions_small.decode("utf-8"))

# Extract the scores
predictions_small = np.array([pred["score"] for pred in parsed_predictions_small["predictions"]])

In [None]:
# Save the predicted values for the test set
predicted_values_small = predictions_small
predicted_values_small = pd.DataFrame(predicted_values_small, columns=['Predicted Values'])
predicted_values_small.to_csv('predicted_values_small.csv', index=False, header=False)

In [None]:
# Clean up
fm_small_predictor.delete_endpoint(delete_endpoint_config=True)

## Summary

#### Number of students not proficient in Math

In [None]:
#print("Students who are proficient: ", proficient_n)
print("Students who are NOT proficient in Math: ", not_proficient_n, "(", not_proficient_p, "%)")

#### Model performance (model with all the predictors)

In [None]:
suggested_threshold = (100 - not_proficient_p)/100
print("Suggested threshold:", round(suggested_threshold, 2))

***Adjust the threhold for the FINAL PREDICTIONS if necessary!!*** 

The model will predict as Math_proficient if the probability is above this threhold. (If the threshold is above 0.5, it will reduce the number of students predicted as "Math proficient" for both students that are actually proficient and not proficient in Math.)

In [None]:
threshold = 0.68

print("Threshold:", threshold)

In [None]:
import pandas as pd
import numpy as np

# Read in the real values
real_values = pd.read_csv('real_values.csv', usecols=[0], header=None)
real_values = real_values.values.ravel()

# Read in the predicted values (using the full model)
predicted_values_full = pd.read_csv('predicted_values_full.csv', usecols=[0], header=None)
predicted_values_full = predicted_values_full.values.ravel()

In [None]:
cm = pd.crosstab(index=real_values, 
                 columns=np.round( (predicted_values_full >= threshold).astype(int) ), 
                 rownames=['actuals'], 
                 colnames=['predictions'])

TN = cm.loc[0.0, 0.0]
FP = cm.loc[0.0, 1.0]
FN = cm.loc[1.0, 0.0]
TP = cm.loc[1.0, 1.0]

accuracy = (TP + TN) / (TP + TN + FP + FN) * 100
precision = TP / (TP + FP) * 100 if (TP + FP) > 0 else 0
recall = TP / (TP + FN) * 100 if (TP + FN) > 0 else 0
f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
specificity = TN / (TN + FP) * 100 if (TN + FP) > 0 else 0

print("MODEL USING ALL FEATURES \n")
print(cm)

print("\nAccuracy: {:.1f}".format(accuracy))
print("F1 Score: {:.1f}".format(f1_score))
print("Precision: {:.1f}".format(precision))
print("Recall: {:.1f}".format(recall))
print("Specificity: {:.1f}".format(specificity))

### Model performance (model with 20 predictors)

In [None]:
# Read in the predicted values (using 20 predictors)
predicted_values_small = pd.read_csv('predicted_values_small.csv', usecols=[0], header=None)
predicted_values_small = predicted_values_small.values.ravel()

In [None]:
cm_small = pd.crosstab(index=real_values, 
                       columns=np.round( (predicted_values_small >= threshold).astype(int) ), 
                       rownames=['actuals'], 
                       colnames=['predictions'])

TN_small = cm_small.loc[0.0, 0.0]
FP_small = cm_small.loc[0.0, 1.0]
FN_small = cm_small.loc[1.0, 0.0]
TP_small = cm_small.loc[1.0, 1.0]

accuracy_small = (TP_small + TN_small) / (TP_small + TN_small + FP_small + FN_small) * 100
precision_small = TP_small / (TP_small + FP_small) * 100 if (TP_small + FP_small) > 0 else 0
recall_small = TP_small / (TP_small + FN_small) * 100 if (TP_small + FN_small) > 0 else 0
f1_score_small = 2 * (precision_small * recall_small) / (precision_small + recall_small) if (precision_small + recall_small) > 0 else 0
specificity_small = TN_small / (TN_small + FP_small) * 100 if (TN_small + FP_small) > 0 else 0

print("MODEL USING 20 FEATURES \n")
print(cm_small)

print("\nAccuracy: {:.1f}".format(accuracy_small))
print("F1 Score: {:.1f}".format(f1_score_small))
print("Precision: {:.1f}".format(precision_small))
print("Recall: {:.1f}".format(recall_small))
print("Specificity: {:.1f}".format(specificity_small))

#### Top 20 features

In [None]:
pd.set_option('display.max_colwidth', None)
from IPython.display import display, Markdown

# Filter the DataFrame to only include rows where Variable_name is in top_20_features
top_20_dictionary = dictionary[dictionary["Variable_name"].isin(top_20_features)]
top_20_table = top_20_dictionary.set_index("Variable_name").loc[top_20_features].reset_index()
display(Markdown(top_20_table.to_markdown()))