# US Education Preprocessing and Training

In this notebook we take our reduced dataframe, standardize the data (since the features are on different scales), and perform our train/test split.

In [1]:
import pandas as pd
from pathlib import Path
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

import requests
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Read the reduced state-level education table; the first CSV column is used
# as the DataFrame index (index_col=0).
reduced_csv = Path(r"C:\Users\book_\OneDrive\Desktop\Data_Storage\US_Education\clean_data\state_ed_03_19_reduced.csv")
us_ed = pd.read_csv(reduced_csv, index_col=0)
us_ed

Unnamed: 0,STATE,YEAR,STATE_ABBREVIATION,ENROLL,TOTAL_REVENUE,FEDERAL_REVENUE,STATE_REVENUE,LOCAL_REVENUE,TOTAL_EXPENDITURE,INSTRUCTION_EXPENDITURE,...,G08_HI_A_READING,G08_HI_A_MATHEMATICS,G08_AS_A_READING,G08_AS_A_MATHEMATICS,G08_AM_A_READING,G08_AM_A_MATHEMATICS,G08_HP_A_READING,G08_HP_A_MATHEMATICS,G08_TR_A_READING,G08_TR_A_MATHEMATICS
0,ALABAMA,2003,AL,727900.0,5196054.0,567704.0,2966981.0,1661369.0,5298932.0,2817111.0,...,,,,,,,,,,
1,ALABAMA,2004,AL,730418.0,5356113.0,625666.0,2971520.0,1758927.0,5450269.0,2875595.0,...,,,,,,,,,,
2,ALABAMA,2005,AL,729342.0,5889966.0,665924.0,3247562.0,1976480.0,5904664.0,3053380.0,...,,,,,,,,,,
3,ALABAMA,2006,AL,743265.0,6364028.0,730112.0,3540436.0,2093480.0,6591429.0,3326656.0,...,,,,,,,,,,
4,ALABAMA,2007,AL,743273.0,7069040.0,689072.0,4070949.0,2309019.0,7196459.0,3653466.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
811,WYOMING,2014,WY,92732.0,1772633.0,112709.0,965213.0,694711.0,1775999.0,865932.0,...,259.5,275.5,,,248.75,260.0,,,,
812,WYOMING,2015,WY,93867.0,1962874.0,120290.0,1116917.0,725667.0,1942406.0,895910.0,...,258.0,273.0,,,249.50,251.0,,,,
813,WYOMING,2016,WY,94511.0,2044669.0,123012.0,1175899.0,745758.0,2034229.0,921494.0,...,259.5,274.0,,,250.25,259.5,,,,
814,WYOMING,2017,WY,94511.0,2044669.0,123012.0,1175899.0,745758.0,2034229.0,921494.0,...,261.0,275.0,,,251.00,268.0,,,,


# Splitting Training and Testing Data

 Our data ranges from 2003 to 2019, which is 17 years. For this data set we want approximately 70% of our data for training and 30% for testing. 70% of 17 is 11.9, so we round that to 12 years. In other words, our training data will be 2003 - 2014, and our testing data will be 2015 - 2019.
 
 We also need to decide which variables are independent (x) and which is dependent (y):
 
- y = Scores, since this is what we want to improve
- x = Expenditure, Revenue, Enrollment

In [3]:
# Chronological split: rows with YEAR up to and including LAST_TRAIN_YEAR are
# used for training, and every later year is held out for testing.
# Both filters are derived from the same constant, so changing the cutoff can
# never make the two sets overlap or leave a year assigned to neither set.
# For whole-number years, `> 2014` selects exactly the same rows as `>= 2015`.
# Rows with a missing YEAR fail both comparisons and stay excluded from both
# sets.
LAST_TRAIN_YEAR = 2014

train = us_ed[us_ed['YEAR'] <= LAST_TRAIN_YEAR]
test = us_ed[us_ed['YEAR'] > LAST_TRAIN_YEAR]

# Show the available columns so the feature/target choice can be made.
train.columns

Index(['STATE', 'YEAR', 'STATE_ABBREVIATION', 'ENROLL', 'TOTAL_REVENUE',
       'FEDERAL_REVENUE', 'STATE_REVENUE', 'LOCAL_REVENUE',
       'TOTAL_EXPENDITURE', 'INSTRUCTION_EXPENDITURE',
       'SUPPORT_SERVICES_EXPENDITURE', 'OTHER_EXPENDITURE',
       'CAPITAL_OUTLAY_EXPENDITURE', 'A_A_A', 'G08_A_A', 'G08_AM_F',
       'G08_AM_M', 'G08_AS_F', 'G08_AS_M', 'G08_BL_F', 'G08_BL_M', 'G08_HI_F',
       'G08_HI_M', 'G08_HP_F', 'G08_HP_M', 'G08_TR_F', 'G08_TR_M', 'G08_WH_F',
       'G08_WH_M', 'G08_A_A_READING', 'G08_A_A_MATHEMATICS', 'G08_A_M_READING',
       'G08_A_M_MATHEMATICS', 'G08_A_F_READING', 'G08_A_F_MATHEMATICS',
       'G08_WH_A_READING', 'G08_WH_A_MATHEMATICS', 'G08_BL_A_READING',
       'G08_BL_A_MATHEMATICS', 'G08_HI_A_READING', 'G08_HI_A_MATHEMATICS',
       'G08_AS_A_READING', 'G08_AS_A_MATHEMATICS', 'G08_AM_A_READING',
       'G08_AM_A_MATHEMATICS', 'G08_HP_A_READING', 'G08_HP_A_MATHEMATICS',
       'G08_TR_A_READING', 'G08_TR_A_MATHEMATICS'],
      dtype='object')

In [4]:
# Dependent variable (y): the score column we want to explain.
TARGET_COLUMN = 'G08_A_A_MATHEMATICS'

# Independent variables (x): identifier columns plus the enrollment, revenue
# and expenditure columns. Defined once and reused for both splits so the
# train and test feature sets always have the same columns in the same order.
FEATURE_COLUMNS = [
    'STATE', 'YEAR', 'STATE_ABBREVIATION', 'ENROLL', 'TOTAL_REVENUE',
    'FEDERAL_REVENUE', 'STATE_REVENUE', 'LOCAL_REVENUE',
    'TOTAL_EXPENDITURE', 'INSTRUCTION_EXPENDITURE',
    'SUPPORT_SERVICES_EXPENDITURE', 'OTHER_EXPENDITURE',
    'CAPITAL_OUTLAY_EXPENDITURE',
]

y_train = train[TARGET_COLUMN]
x_train = train[FEATURE_COLUMNS]

y_test = test[TARGET_COLUMN]
x_test = test[FEATURE_COLUMNS]

In [5]:
# Persist each split to its own CSV file. The mapping keeps insertion order,
# so the files are written in the order listed here.
split_destinations = {
    "C:/Users/book_/OneDrive/Desktop/Data_Storage/US_Education/clean_data/y_train.csv": y_train,
    "C:/Users/book_/OneDrive/Desktop/Data_Storage/US_Education/clean_data/x_train.csv": x_train,
    "C:/Users/book_/OneDrive/Desktop/Data_Storage/US_Education/clean_data/y_test.csv": y_test,
    "C:/Users/book_/OneDrive/Desktop/Data_Storage/US_Education/clean_data/x_test.csv": x_test,
}
for destination, split in split_destinations.items():
    split.to_csv(destination)