# Data Analysis of California Schools Data
## 5/12/2022

In [1]:
# install psycopg2 in case don't have the library.
#!pip install psycopg2

# for generating cool reports
#!pip install https://github.com/pandas-profiling/pandas-profiling/archive/master.zip

In [2]:
# importing dependencies
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as plt
import psycopg2
from pandas_profiling import ProfileReport

In [3]:
# setting up connect to PostgreSQL
conn = psycopg2.connect(
    host="localhost",
    database="schools",
    user="postgres",
    password="6491")

#Creating a cursor object using the cursor() method
cursor = conn.cursor()

#Executing an MYSQL function using the execute() method
cursor.execute("select version()")

#Setting auto commit false
#conn.autocommit = True

#Creating a cursor object using the cursor() method
cursor = conn.cursor()

In [4]:
#Retrieving `frpm` data
cursor.execute('''select * from schools.public.frpm''')

#Fetching all rows from the table
frpm_raw = cursor.fetchall();

frpm = pd.DataFrame(list(frpm_raw), 
                  columns = ["CDSCode"  
                            ,"Academic Year" 
                            ,"County Code" 
                            ,"District Code" 
                            ,"School Code" 
                            ,"County Name" 
                            ,"District Name" 
                            ,"School Name" 
                            ,"District Type" 
                            ,"School Type" 
                            ,"Educational Option Type" 
                            ,"NSLP Provision Status" 
                            ,"Charter School (Y/N)" 
                            ,"Charter School Number" 
                            ,"Charter Funding Type" 
                            ,"IRC" 
                            ,"Low Grade" 
                            ,"High Grade" 
                            ,"Enrollment (K-12)" 
                            ,"Free Meal Count (K-12)" 
                            ,"Percent (%) Eligible Free (K-12)" 
                            ,"FRPM Count (K-12)" 
                            ,"Percent (%) Eligible FRPM (K-12)" 
                            ,"Enrollment (Ages 5-17)" 
                            ,"Free Meal Count (Ages 5-17)" 
                            ,"Percent (%) Eligible Free (Ages 5-17)" 
                            ,"FRPM Count (Ages 5-17)" 
                            ,"Percent (%) Eligible FRPM (Ages 5-17)" 
                            ,"2013-14 CALPADS Fall 1 Certification Status"
                            ])

# snapshot
frpm

Unnamed: 0,CDSCode,Academic Year,County Code,District Code,School Code,County Name,District Name,School Name,District Type,School Type,...,Free Meal Count (K-12),Percent (%) Eligible Free (K-12),FRPM Count (K-12),Percent (%) Eligible FRPM (K-12),Enrollment (Ages 5-17),Free Meal Count (Ages 5-17),Percent (%) Eligible Free (Ages 5-17),FRPM Count (Ages 5-17),Percent (%) Eligible FRPM (Ages 5-17),2013-14 CALPADS Fall 1 Certification Status
0,09618530000001,2014-2015,09,61853.0,0000001,El Dorado,El Dorado Union High,"Nonpublic, Nonsectarian Schools",High School District,,...,,,,,4.0,,,,,1.0
1,10623720000000,2014-2015,10,62372.0,0000000,Fresno,Pine Ridge Elementary,District Office,Elementary School District,,...,,,,,7.0,,,,,1.0
2,04614240000001,2014-2015,04,61424.0,0000001,Butte,Chico Unified,"Nonpublic, Nonsectarian Schools",Unified School District,,...,1.0,0.333333,1.0,0.333333,3.0,1.0,0.333333,1.0,0.333333,1.0
3,10621660000001,2014-2015,10,62166.0,0000001,Fresno,Fresno Unified,"Nonpublic, Nonsectarian Schools",Unified School District,,...,,,,,10.0,,,,,1.0
4,19642611930288,2014-2015,19,64261.0,1930288,Los Angeles,Arcadia Unified,Arcadia High,Unified School District,High Schools (Public),...,430.0,0.124493,654.0,0.189346,3387.0,421.0,0.124299,644.0,0.190139,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10390,58727516118806,2014-2015,58,72751.0,6118806,Yuba,Wheatland,Wheatland Charter Academy,Elementary School District,K-12 Schools (Public),...,19.0,0.215909,40.0,0.454545,88.0,19.0,0.215909,40.0,0.454545,1.0
10391,58727690123570,2014-2015,58,72769.0,0123570,Yuba,Wheatland Union High,Wheatland Community Day High,High School District,District Community Day Schools,...,,,,,3.0,,,,,1.0
10392,58727695838305,2014-2015,58,72769.0,5838305,Yuba,Wheatland Union High,Wheatland Union High,High School District,High Schools (Public),...,164.0,0.224044,227.0,0.310109,711.0,158.0,0.222222,220.0,0.309423,1.0
10393,,,,,,,,,,,...,,,,,,,,,,


In [5]:
# filtering for high schools
frpm = frpm[(frpm['Low Grade'] == '9') & (frpm['High Grade'] == '12')]

In [6]:
#Retrieving `frpm` data
cursor.execute('''select * from schools.public.sat_scores''')

#Fetching all rows from the table
sat_scores_raw = cursor.fetchall();

sat_scores = pd.DataFrame(list(sat_scores_raw), 
                  columns = ["CDSCode", 
                             "rtype", 
                             "sname", 
                             "dname", 
                             "cname",
                             "enroll12", 
                             "NumTstTakr", 
                             "AvgScrRead", 
                             "AvgScrMath",
                             "AvgScrWrite", 
                             "NumGE1500", 
                             "PctGE1500"
                             ])

# snapshot
sat_scores

Unnamed: 0,CDSCode,rtype,sname,dname,cname,enroll12,NumTstTakr,AvgScrRead,AvgScrMath,AvgScrWrite,NumGE1500,PctGE1500
0,00000000000000,X,,,,496901,210706,489.0,500.0,484.0,93334.0,44.30
1,01000000000000,C,,,Alameda,16978,8855,516.0,536.0,517.0,4900.0,55.34
2,01100170000000,D,,Alameda County Office of Education,Alameda,398,88,418.0,418.0,417.0,14.0,15.91
3,01100170109835,S,FAME Public Charter,Alameda County Office of Education,Alameda,62,17,503.0,546.0,505.0,9.0,52.94
4,01100170112607,S,Envision Academy for Arts & Technology,Alameda County Office of Education,Alameda,75,71,397.0,387.0,395.0,5.0,7.04
...,...,...,...,...,...,...,...,...,...,...,...,...
2326,58727365830054,S,Lincoln (Abraham) (Alternative),Marysville Joint Unified,Yuba,97,0,,,,,
2327,58727365830138,S,Marysville Charter Academy for the Arts,Marysville Joint Unified,Yuba,41,29,501.0,494.0,484.0,16.0,55.17
2328,58727365835202,S,Marysville High,Marysville Joint Unified,Yuba,197,53,489.0,513.0,487.0,24.0,45.28
2329,58727690000000,D,,Wheatland Union High,Yuba,160,54,480.0,475.0,463.0,21.0,38.89


In [7]:
# it looks like when the 'sname', 'dname' or 'cname' columns are empty
# then that row represents some aggregate average of the school, district
# or county averages.
# We only want raw scores per school.
# Lets remove rows where sname, dname or cname are missing.

# replacing empty strings with NaN
sat_scores = sat_scores.replace(r'^\s*$', np.nan, regex=True)

# filtering against na values in col subset
sat_scores = sat_scores.dropna(subset = ['sname', 'dname', 'cname'])

# reseting index
sat_scores.reset_index(inplace = True)

# drop index col
sat_scores.drop('index', axis=1, inplace=True)

# snapshot
sat_scores

Unnamed: 0,CDSCode,rtype,sname,dname,cname,enroll12,NumTstTakr,AvgScrRead,AvgScrMath,AvgScrWrite,NumGE1500,PctGE1500
0,01100170109835,S,FAME Public Charter,Alameda County Office of Education,Alameda,62,17,503.0,546.0,505.0,9.0,52.94
1,01100170112607,S,Envision Academy for Arts & Technology,Alameda County Office of Education,Alameda,75,71,397.0,387.0,395.0,5.0,7.04
2,01100170118489,S,Aspire California College Preparatory Academy,Alameda County Office of Education,Alameda,61,0,,,,,
3,01611190106401,S,Alameda Science and Technology Institute,Alameda Unified,Alameda,36,36,562.0,590.0,555.0,29.0,80.56
4,01611190119222,S,Nea Community Learning Center,Alameda Unified,Alameda,12,6,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
1748,58727365830013,S,Lindhurst High,Marysville Joint Unified,Yuba,221,65,428.0,450.0,423.0,13.0,20.00
1749,58727365830054,S,Lincoln (Abraham) (Alternative),Marysville Joint Unified,Yuba,97,0,,,,,
1750,58727365830138,S,Marysville Charter Academy for the Arts,Marysville Joint Unified,Yuba,41,29,501.0,494.0,484.0,16.0,55.17
1751,58727365835202,S,Marysville High,Marysville Joint Unified,Yuba,197,53,489.0,513.0,487.0,24.0,45.28


In [8]:
print(sum(sat_scores[['AvgScrMath', 'AvgScrRead', 'AvgScrWrite']].isnull().all(1)))

print(sum(sat_scores[['AvgScrMath']].isnull().all(1)))

print(sum(sat_scores[['AvgScrRead']].isnull().all(1)))

print(sum(sat_scores[['AvgScrWrite']].isnull().all(1)))

# Scores are either all present, or all missing
# Lets drop all rows where `AvgScrMath` is nan.
sat_scores = sat_scores.dropna(subset = ['AvgScrMath'])

# reset index
sat_scores.reset_index(inplace = True)

sat_scores = sat_scores.drop(columns = ['index'])

# snapshot
sat_scores

499
499
499
499


Unnamed: 0,CDSCode,rtype,sname,dname,cname,enroll12,NumTstTakr,AvgScrRead,AvgScrMath,AvgScrWrite,NumGE1500,PctGE1500
0,01100170109835,S,FAME Public Charter,Alameda County Office of Education,Alameda,62,17,503.0,546.0,505.0,9.0,52.94
1,01100170112607,S,Envision Academy for Arts & Technology,Alameda County Office of Education,Alameda,75,71,397.0,387.0,395.0,5.0,7.04
2,01611190106401,S,Alameda Science and Technology Institute,Alameda Unified,Alameda,36,36,562.0,590.0,555.0,29.0,80.56
3,01611190130229,S,Alameda High,Alameda Unified,Alameda,465,325,543.0,573.0,543.0,229.0,70.46
4,01611190130609,S,Alameda Community Learning Center,Alameda Unified,Alameda,26,20,575.0,593.0,581.0,16.0,80.00
...,...,...,...,...,...,...,...,...,...,...,...,...
1249,57727105738802,S,Woodland Senior High,Woodland Joint Unified,Yolo,277,118,477.0,469.0,468.0,44.0,37.29
1250,58727365830013,S,Lindhurst High,Marysville Joint Unified,Yuba,221,65,428.0,450.0,423.0,13.0,20.00
1251,58727365830138,S,Marysville Charter Academy for the Arts,Marysville Joint Unified,Yuba,41,29,501.0,494.0,484.0,16.0,55.17
1252,58727365835202,S,Marysville High,Marysville Joint Unified,Yuba,197,53,489.0,513.0,487.0,24.0,45.28


In [9]:
#Retrieving `frpm` data
cursor.execute('''select * from schools.public.schools''')

#Fetching all rows from the table
schools_raw = cursor.fetchall();

schools = pd.DataFrame(list(schools_raw), 
                  columns = ["CDSCode" 
                            ,"NCESDist"
                            ,"NCESSchool"
                            ,"StatusType"
                            ,"County"
                            ,"Distrinct"
                            ,"School"
                            ,"Street"
                            ,"StreetAbr"
                            ,"City"
                            ,"Zip"
                            ,"State"
                            ,"MailStreet"
                            ,"MailStrAbr"
                            ,"MailCity"
                            ,"MailZip"
                            ,"MailState"
                            ,"Phone"
                            ,"Ext"
                            ,"Website"
                            ,"OpenDate"
                            ,"ClosedDate"
                            ,"Charter"
                            ,"CharterNum"
                            ,"FundingType"
                            ,"DOC"
                            ,"DOCType"
                            ,"SOC"
                            ,"SOCType"
                            ,"EdOpsCode"
                            ,"EdOpsName"
                            ,"EILCode"
                            ,"EILName"
                            ,"GSoffered"
                            ,"GSserved"
                            ,"Virtual"
                            ,"Magnet"
                            ,"Latitude"
                            ,"Longitude"
                            ,"AdmFName1"
                            ,"AdmLName1"
                            ,"AdmEmail1"
                            ,"AdmFName2"
                            ,"AdmLName2"
                            ,"AdmEmail2"
                            ,"AdmFName3"
                            ,"AdmLName3"
                            ,"AdmEmail3"
                            ,"LastUpdate"
                            ,"district"])

# snapshot
schools

Unnamed: 0,CDSCode,NCESDist,NCESSchool,StatusType,County,Distrinct,School,Street,StreetAbr,City,...,AdmLName1,AdmEmail1,AdmFName2,AdmLName2,AdmEmail2,AdmFName3,AdmLName3,AdmEmail3,LastUpdate,district
0,01100170000000,0691051,,Active,Alameda,,,313 West Winton Avenue,313 West Winton Ave.,Hayward,...,Monroe,lkmonroe@acoe.org,,,,,,,2015-06-23,Alameda County Office of Education
1,01100170109835,0691051,10546,Closed,Alameda,,FAME Public Charter,"39899 Balentine Drive, Suite 335","39899 Balentine Dr., Ste. 335",Newark,...,,,,,,,,,2015-09-01,Alameda County Office of Education
2,01100170112607,0691051,10947,Active,Alameda,,Envision Academy for Arts & Technology,1515 Webster Street,1515 Webster St.,Oakland,...,Robell,laura@envisionacademy.org,,,,,,,2015-06-18,Alameda County Office of Education
3,01100170118489,0691051,12283,Closed,Alameda,,Aspire California College Preparatory Academy,2125 Jefferson Avenue,2125 Jefferson Ave.,Berkeley,...,,,,,,,,,2015-07-01,Alameda County Office of Education
4,01100170123968,0691051,12844,Active,Alameda,,Community School for Creative Education,2111 International Boulevard,2111 International Blvd.,Oakland,...,Thompson,cliffordt@communityschoolforcreativeeducation.org,,,,,,,2016-07-18,Alameda County Office of Education
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,19647330120360,0622710,12525,Active,Los Angeles,,Daniel Pearl Journalism & Communications Magnet,6649 Balboa Boulevard,6649 Balboa Blvd.,Van Nuys,...,Smith,debbie.smith@lausd.net,,,,,,,2016-03-03,Los Angeles Unified
4996,19647330120477,0622710,12447,Active,Los Angeles,,Aspire Titan Academy,6720 South Alameda Street,6720 South Alameda St.,Huntington Park,...,Lafaurie,leilani.lafaurie@aspirepublicschools.org,Delphine,Sherman,delphine.sherman@aspirepublicschools.org,,,,2016-09-22,Los Angeles Unified
4997,19647330120527,0622710,12491,Active,Los Angeles,,Watts Learning Center Charter Middle,8800 South San Pedro Street,8800 South San Pedro St.,Los Angeles,...,Windom,gwindom@wlccms.org,,,,,,,2016-10-07,Los Angeles Unified
4998,19647330120667,0622710,12516,Closed,Los Angeles,,Futuro College Preparatory Elementary,1314 Dacatah Street,1314 Dacatah St.,Los Angeles,...,,,,,,,,,2013-07-09,Los Angeles Unified


In [10]:
# cleaning schools zip code
# there are '-####' in the column, we'll remove them
schools_zip = []

for i in range(len(schools)):
    val = schools.Zip.str.split('-')[i][0]
    if val == '':
        val = np.nan
    else:
        val = round(int(val))
    schools_zip += [val]
    
schools['Zip'] = schools_zip

# drop Nan in `schools.Zip`
schools = schools.dropna(subset = ['Zip'])

# reset index
schools.reset_index(inplace = True)

schools = schools.drop(columns = ['index'])

# snapshot
schools

Unnamed: 0,CDSCode,NCESDist,NCESSchool,StatusType,County,Distrinct,School,Street,StreetAbr,City,...,AdmLName1,AdmEmail1,AdmFName2,AdmLName2,AdmEmail2,AdmFName3,AdmLName3,AdmEmail3,LastUpdate,district
0,01100170000000,0691051,,Active,Alameda,,,313 West Winton Avenue,313 West Winton Ave.,Hayward,...,Monroe,lkmonroe@acoe.org,,,,,,,2015-06-23,Alameda County Office of Education
1,01100170109835,0691051,10546,Closed,Alameda,,FAME Public Charter,"39899 Balentine Drive, Suite 335","39899 Balentine Dr., Ste. 335",Newark,...,,,,,,,,,2015-09-01,Alameda County Office of Education
2,01100170112607,0691051,10947,Active,Alameda,,Envision Academy for Arts & Technology,1515 Webster Street,1515 Webster St.,Oakland,...,Robell,laura@envisionacademy.org,,,,,,,2015-06-18,Alameda County Office of Education
3,01100170118489,0691051,12283,Closed,Alameda,,Aspire California College Preparatory Academy,2125 Jefferson Avenue,2125 Jefferson Ave.,Berkeley,...,,,,,,,,,2015-07-01,Alameda County Office of Education
4,01100170123968,0691051,12844,Active,Alameda,,Community School for Creative Education,2111 International Boulevard,2111 International Blvd.,Oakland,...,Thompson,cliffordt@communityschoolforcreativeeducation.org,,,,,,,2016-07-18,Alameda County Office of Education
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4901,19647330120360,0622710,12525,Active,Los Angeles,,Daniel Pearl Journalism & Communications Magnet,6649 Balboa Boulevard,6649 Balboa Blvd.,Van Nuys,...,Smith,debbie.smith@lausd.net,,,,,,,2016-03-03,Los Angeles Unified
4902,19647330120477,0622710,12447,Active,Los Angeles,,Aspire Titan Academy,6720 South Alameda Street,6720 South Alameda St.,Huntington Park,...,Lafaurie,leilani.lafaurie@aspirepublicschools.org,Delphine,Sherman,delphine.sherman@aspirepublicschools.org,,,,2016-09-22,Los Angeles Unified
4903,19647330120527,0622710,12491,Active,Los Angeles,,Watts Learning Center Charter Middle,8800 South San Pedro Street,8800 South San Pedro St.,Los Angeles,...,Windom,gwindom@wlccms.org,,,,,,,2016-10-07,Los Angeles Unified
4904,19647330120667,0622710,12516,Closed,Los Angeles,,Futuro College Preparatory Elementary,1314 Dacatah Street,1314 Dacatah St.,Los Angeles,...,,,,,,,,,2013-07-09,Los Angeles Unified


In [11]:
# Generating Reports
"""
from pandas_profiling import ProfileReport

frpm_profile = ProfileReport(frpm)
frpm_profile.to_file("Free_Reduced_Priced_Meals_Profile.html")

schools_profile = ProfileReport(schools)
schools_profile.to_file("Schools_Profile.html")

sat_scores_profile = ProfileReport(sat_scores)
sat_scores_profile.to_file("Sat_Scores_Profile.html")
"""

'\nfrom pandas_profiling import ProfileReport\n\nfrpm_profile = ProfileReport(frpm)\nfrpm_profile.to_file("Free_Reduced_Priced_Meals_Profile.html")\n\nschools_profile = ProfileReport(schools)\nschools_profile.to_file("Schools_Profile.html")\n\nsat_scores_profile = ProfileReport(sat_scores)\nsat_scores_profile.to_file("Sat_Scores_Profile.html")\n'

---
## Importing California Income Data

Source: [US Census Data 2014 - California](ACSST5Y2014.S1901_metadata)

I'm going to import the file here and filter for the only two columns we need for our project.

In [12]:
# reading csv
cali_14 = pd.read_csv("https://raw.githubusercontent.com/peterantonarosjr/MATH_290_FinalProject/main/Dataset/US%202014%20Census%20Data/ACSST5Y2014.S1901_data_with_overlays.csv")

cali_14

Unnamed: 0,GEO_ID,NAME,S1901_C01_001E,S1901_C01_001M,S1901_C01_002E,S1901_C01_002M,S1901_C01_003E,S1901_C01_003M,S1901_C01_004E,S1901_C01_004M,...,S1901_C04_012E,S1901_C04_012M,S1901_C04_013E,S1901_C04_013M,S1901_C04_014E,S1901_C04_014M,S1901_C04_015E,S1901_C04_015M,S1901_C04_016E,S1901_C04_016M
0,id,Geographic Area Name,Households!!Estimate!!Total,Households!!Margin of Error!!Total,"Households!!Estimate!!Less than $10,000","Households!!Margin of Error!!Less than $10,000","Households!!Estimate!!$10,000 to $14,999","Households!!Margin of Error!!$10,000 to $14,999","Households!!Estimate!!$15,000 to $24,999","Households!!Margin of Error!!$15,000 to $24,999",...,Nonfamily households!!Estimate!!Median income ...,Nonfamily households!!Margin of Error!!Median ...,Nonfamily households!!Estimate!!Mean income (d...,Nonfamily households!!Margin of Error!!Mean in...,Nonfamily households!!Estimate!!PERCENT IMPUTE...,Nonfamily households!!Margin of Error!!PERCENT...,Nonfamily households!!Estimate!!PERCENT IMPUTE...,Nonfamily households!!Margin of Error!!PERCENT...,Nonfamily households!!Estimate!!PERCENT IMPUTE...,Nonfamily households!!Margin of Error!!PERCENT...
1,8600000US89010,ZCTA5 89010,172,44,14.5,12.0,0.0,15.9,14.5,10.6,...,28250,16533,30674,10680,(X),(X),(X),(X),27.9,(X)
2,8600000US89019,ZCTA5 89019,914,269,0.0,3.2,7.5,7.7,27.8,15.2,...,21318,5780,29987,9446,(X),(X),(X),(X),31.9,(X)
3,8600000US89060,ZCTA5 89060,4076,353,6.4,2.7,8.4,3.7,17.1,5.1,...,24986,5129,41715,18821,(X),(X),(X),(X),28.9,(X)
4,8600000US89061,ZCTA5 89061,2123,235,2.5,2.4,5.7,3.9,7.2,3.5,...,41765,18185,57568,19172,(X),(X),(X),(X),20.3,(X)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1766,8600000US96150,ZCTA5 96150,11388,493,7.1,1.7,7.2,1.8,9.6,1.8,...,30209,2237,42126,3498,(X),(X),(X),(X),26.8,(X)
1767,8600000US96155,ZCTA5 96155,0,12,-,**,-,**,-,**,...,-,**,-,**,(X),(X),(X),(X),-,(X)
1768,8600000US96161,ZCTA5 96161,7071,356,4.5,2.0,1.7,1.3,6.5,2.9,...,50897,7138,82038,22815,(X),(X),(X),(X),34.6,(X)
1769,8600000US97635,ZCTA5 97635,151,76,0.0,17.9,7.9,12.8,18.5,19.9,...,26923,5491,24777,5116,(X),(X),(X),(X),0.0,(X)


In [13]:
# cleaning data

# renaming columns
cali_14 = cali_14.rename(columns = cali_14.iloc[0]).iloc[1:]

# dropping final row
cali_14 = cali_14.iloc[:-1]

# filtering for id, median and mean income columns
cali_14 = cali_14[['id',
                   'Households!!Estimate!!Median income (dollars)',
                   'Households!!Estimate!!Mean income (dollars)']]

# rename columns
cali_14 = cali_14.rename(columns = {'id' : 'Zip',
                                    'Households!!Estimate!!Median income (dollars)' : 'median_income',
                                    'Households!!Estimate!!Mean income (dollars)' : 'mean_income'})

# dropping rows that contain non-numeric characters
cali_14 = cali_14[(pd.to_numeric(cali_14['mean_income'], errors = 'coerce').notnull()) & 
                  (pd.to_numeric(cali_14['median_income'], errors = 'coerce').notnull())]

# reseting index
cali_14.reset_index(inplace = True)

# extracting zip codes for zip id column
cali_14['Zip'] = cali_14.Zip.str[9:]

# casting all columns as integers
cali_14 = cali_14[['Zip', 'median_income', 'mean_income']].astype('int')

# snapshot
cali_14

Unnamed: 0,Zip,median_income,mean_income
0,89010,35000,45226
1,89019,37632,49179
2,89060,40726,52416
3,89061,55096,70373
4,89439,59275,122256
...,...,...,...
1687,96146,53250,74270
1688,96148,52841,65040
1689,96150,47500,62684
1690,96161,75172,105535


In [14]:
# output to csv
#cali_14.to_csv(r"../Data/US 2014 Census Data/California_2014_Zip_Income.csv",
#               index=False)

---

## Joining tables

We will be testing the following hypotheses.

Is there a relationship between getting free or reduced lunch, the mean income of the Zip code of the school, and performance on the Math portion of the SAT.

Specifically,
- $y$ is Avg Math SAT Score 
- $x_0$ is the intercept
- $x_1$ is percentage of students recieving free lunch at the school
- $x_2$ is percentage of students recieving reduced lunch at the school
- $x_3$ is percentage of students paying full priced lunch at the school
- $x_4$ is the mean income of the zip code of the school


**Game Plan:**
We have two options.

- Let `schools` be the left table, and Left Join the three tables `frpm`, `sat_scores` and `cali_14` to `schools`.
- Let `sat_scores` be the left table, and Left Join two tables, `frpm` and `schools`, then Inner Join the three tables to `cali_14`.

Lads the three cells below are to test the difference in cardinality between two different approaches to joining the tables.

In [15]:
#test_1 = frpm[['Low Grade', 'High Grade']]

#test_1[(test_1['Low Grade'] == '9') & (test_1['High Grade'] == '12')]

#schools.merge(frpm, on = 'CDSCode', how = 'left')

In [16]:
#test_2 = schools.merge(frpm, on = 'CDSCode', how = 'left')

#test_2[(test_2['Low Grade'] == '9') & (test_2['High Grade'] == '12')]

In [17]:
#schools.merge(frpm, on = 'CDSCode', how = 'left')

In [18]:
################
# Attempt 1
# Let `schools` be the left table and left join everything to it
################

# schools_income_df
schools_income_df = schools.merge(cali_14, on = 'Zip', how = 'inner')

# schools_income_frpm_df
schools_income_frpm_df = schools_income_df.merge(frpm, on = 'CDSCode', how = 'left')

# schools_income_frpm_sat_scores_df
master_df = schools_income_frpm_df.merge(sat_scores, on = 'CDSCode', how = 'left')

# print cardinalities
print(len(schools))
print(len(schools_income_df))
print(len(schools_income_frpm_df))
print(len(master_df))

# snapshot
master_df

4906
4844
4844
4844


Unnamed: 0,CDSCode,NCESDist,NCESSchool,StatusType,County,Distrinct,School,Street,StreetAbr,City,...,sname,dname,cname,enroll12,NumTstTakr,AvgScrRead,AvgScrMath,AvgScrWrite,NumGE1500,PctGE1500
0,01100170000000,0691051,,Active,Alameda,,,313 West Winton Avenue,313 West Winton Ave.,Hayward,...,,,,,,,,,,
1,01100170130419,0691051,06830,Active,Alameda,,Alameda County Community,313 West Winton Avenue,313 West Winton Ave.,Hayward,...,,,,,,,,,,
2,01100170130427,0691051,09265,Closed,Alameda,,Alameda County Opportunity,313 West Winton Avenue,313 West Winton Ave.,Hayward,...,,,,,,,,,,
3,01100176106751,0691051,08672,Active,Alameda,,Alameda County Special Education,313 West Winton Avenue,313 West Winton Ave.,Hayward,...,,,,,,,,,,
4,01611920000000,0616740,,Active,Alameda,,,24411 Amador Street,24411 Amador St.,Hayward,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4839,19647330117762,0622710,12370,Active,Los Angeles,,School for the Visual Arts and Humanities,701 South Catalina Street,701 South Catalina St.,Los Angeles,...,RFK Community Schools- for the Visual Arts and...,Los Angeles Unified,Los Angeles,81.0,71.0,392.0,416.0,394.0,12.0,16.90
4840,19647330119693,0622710,12524,Active,Los Angeles,,UCLA Community K-12,700 South Mariposa Avenue,700 South Mariposa Ave.,Los Angeles,...,RFK Community Schools-UCLA Community K-12,Los Angeles Unified,Los Angeles,75.0,61.0,384.0,404.0,392.0,6.0,9.84
4841,19647330119735,0622710,12440,Active,Los Angeles,,Young Oak Kim Academy,615 South Shatto Place,615 South Shatto Pl.,Los Angeles,...,,,,,,,,,,
4842,19647330117846,0622710,12361,Active,Los Angeles,,Para Los Niños Middle,835 Stanford Avenue,835 Stanford Ave.,Los Angeles,...,,,,,,,,,,


In [19]:
################
# Attempt 2
# Let `sat_scores` be the left table and left join everything to it
################

# sat_scores_frpm_df
sat_scores_frpm_df = sat_scores.merge(frpm, on = 'CDSCode', how = 'left')

# sat_scores_frpm_schools_df
sat_scores_frpm_schools_df = sat_scores_frpm_df.merge(schools, on = 'CDSCode', how = 'left')

# schools_income_frpm_sat_scores_df
master_df = sat_scores_frpm_schools_df.merge(cali_14, on = 'Zip', how = 'inner')

# print cardinalities
print(len(sat_scores))
print(len(sat_scores_frpm_df))
print(len(sat_scores_frpm_schools_df))
print(len(master_df))

# snapshot
master_df

1254
1254
1254
359


Unnamed: 0,CDSCode,rtype,sname,dname,cname,enroll12,NumTstTakr,AvgScrRead,AvgScrMath,AvgScrWrite,...,AdmFName2,AdmLName2,AdmEmail2,AdmFName3,AdmLName3,AdmEmail3,LastUpdate,district,median_income,mean_income
0,01100170109835,S,FAME Public Charter,Alameda County Office of Education,Alameda,62,17,503.0,546.0,505.0,...,,,,,,,2015-09-01,Alameda County Office of Education,86521,98373
1,01612340130054,S,Newark Memorial High,Newark Unified,Alameda,444,234,486.0,498.0,487.0,...,,,,,,,2015-10-06,Newark Unified,86521,98373
2,01100170112607,S,Envision Academy for Arts & Technology,Alameda County Office of Education,Alameda,75,71,397.0,387.0,395.0,...,,,,,,,2015-06-18,Alameda County Office of Education,30879,50381
3,01612593030772,S,Oakland School for the Arts,Oakland Unified,Alameda,99,80,529.0,470.0,523.0,...,,,,,,,2016-04-19,Oakland Unified,30879,50381
4,01611190106401,S,Alameda Science and Technology Institute,Alameda Unified,Alameda,36,36,562.0,590.0,555.0,...,,,,,,,2015-06-18,Alameda Unified,70935,94830
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
354,19647330117911,S,New Millennium Secondary,Los Angeles Unified,Los Angeles,67,44,387.0,378.0,388.0,...,,,,,,,2016-09-29,Los Angeles Unified,51956,67133
355,19647330119651,S,Sun Valley High,Los Angeles Unified,Los Angeles,106,30,391.0,408.0,385.0,...,,,,,,,2016-10-12,Los Angeles Unified,49088,62920
356,19647330119685,S,RFK Community Schools-New Open World Academy K-12,Los Angeles Unified,Los Angeles,45,33,402.0,387.0,402.0,...,,,,,,,2016-10-05,Los Angeles Unified,47308,71900
357,19647330119727,S,Ramon C. Cortines School of Visual and Performing,Los Angeles Unified,Los Angeles,356,249,451.0,434.0,446.0,...,,,,,,,2016-03-03,Los Angeles Unified,29492,50578


In [20]:
# this is the number of NAN values.
# this is 3 off of the cardinality of the left join instead of the inner
sum(sat_scores_frpm_schools_df['Zip'].isna())

892

In [21]:
# There is a one to many relationship between zip codes and schools
# There can be many schools per zip code
master_df[master_df['Zip'] == 94560]['AvgScrMath']

0    546.0
1    498.0
Name: AvgScrMath, dtype: float64

In [22]:
# master_df column names
master_df.columns

Index(['CDSCode', 'rtype', 'sname', 'dname', 'cname', 'enroll12', 'NumTstTakr',
       'AvgScrRead', 'AvgScrMath', 'AvgScrWrite', 'NumGE1500', 'PctGE1500',
       'Academic Year', 'County Code', 'District Code', 'School Code',
       'County Name', 'District Name', 'School Name', 'District Type',
       'School Type', 'Educational Option Type', 'NSLP Provision Status',
       'Charter School (Y/N)', 'Charter School Number', 'Charter Funding Type',
       'IRC', 'Low Grade', 'High Grade', 'Enrollment (K-12)',
       'Free Meal Count (K-12)', 'Percent (%) Eligible Free (K-12)',
       'FRPM Count (K-12)', 'Percent (%) Eligible FRPM (K-12)',
       'Enrollment (Ages 5-17)', 'Free Meal Count (Ages 5-17)',
       'Percent (%) Eligible Free (Ages 5-17)', 'FRPM Count (Ages 5-17)',
       'Percent (%) Eligible FRPM (Ages 5-17)',
       '2013-14 CALPADS Fall 1 Certification Status', 'NCESDist', 'NCESSchool',
       'StatusType', 'County', 'Distrinct', 'School', 'Street', 'StreetAbr',
       'Ci

In [23]:
# filtering for relevant columns
clean_master_df = master_df[["CDSCode",
                            # cali income data
                            'Zip',
                            'median_income',
                            'mean_income',
                            # frpm
                            "County",
                            "Distrinct",
                            "School Name",
                            "District Type",
                            "School Type",
                            "Low Grade",
                            "High Grade",
                            "Enrollment (K-12)",
                            "Enrollment (K-12)", 
                            "Free Meal Count (K-12)", 
                            "Percent (%) Eligible Free (K-12)", 
                            "FRPM Count (K-12)", 
                            "Percent (%) Eligible FRPM (K-12)", 
                            # sat scores
                            "AvgScrRead", 
                            "AvgScrMath",
                            "AvgScrWrite", 
                              ]]

#snapshot
clean_master_df

Unnamed: 0,CDSCode,Zip,median_income,mean_income,County,Distrinct,School Name,District Type,School Type,Low Grade,High Grade,Enrollment (K-12),Enrollment (K-12).1,Free Meal Count (K-12),Percent (%) Eligible Free (K-12),FRPM Count (K-12),Percent (%) Eligible FRPM (K-12),AvgScrRead,AvgScrMath,AvgScrWrite
0,01100170109835,94560.0,86521,98373,Alameda,,,,,,,,,,,,,503.0,546.0,505.0
1,01612340130054,94560.0,86521,98373,Alameda,,Newark Memorial High,Unified School District,High Schools (Public),9,12,1850.0,1850.0,703.0,0.380000,838.0,0.452973,486.0,498.0,487.0
2,01100170112607,94612.0,30879,50381,Alameda,,Envision Academy for Arts & Technology,County Office of Education (COE),High Schools (Public),9,12,395.0,395.0,186.0,0.470886,186.0,0.470886,397.0,387.0,395.0
3,01612593030772,94612.0,30879,50381,Alameda,,,,,,,,,,,,,529.0,470.0,523.0
4,01611190106401,94501.0,70935,94830,Alameda,,Alameda Science and Technology Institute,Unified School District,Alternative Schools of Choice,9,12,170.0,170.0,39.0,0.229412,52.0,0.305882,562.0,590.0,555.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
354,19647330117911,90248.0,51956,67133,Los Angeles,,New Millennium Secondary,Unified School District,High Schools (Public),9,12,196.0,196.0,119.0,0.607143,142.0,0.724490,387.0,378.0,388.0
355,19647330119651,91352.0,49088,62920,Los Angeles,,Sun Valley High,Unified School District,High Schools (Public),9,12,568.0,568.0,397.0,0.698944,398.0,0.700704,391.0,408.0,385.0
356,19647330119685,90010.0,47308,71900,Los Angeles,,,,,,,,,,,,,402.0,387.0,402.0
357,19647330119727,90012.0,29492,50578,Los Angeles,,Ramon C. Cortines School of Visual and Perform...,Unified School District,High Schools (Public),9,12,1647.0,1647.0,892.0,0.541591,1015.0,0.616272,451.0,434.0,446.0


## Preparing for Hypothesis Test

We wish to see how the average Math SAT score at a school is a function of mean income and FRPM.

I propose to consider the proportions of the students at a school recieving free lunch, and reduced lunch, and financial assistance instead of the counts of students. Consider if a big school has a high proportion of students recieving any financial lunch aid, and if a small school has a low number of student recieving aid. The count of students are disproportionate. To this end, we'll create three new columns.


In [24]:
# Initializing new df
df = clean_master_df.copy()

# snapshot
#df

In [25]:
# Percent eligibility are either all present, or all missing
print(sum(df[['Percent (%) Eligible Free (K-12)', 'Percent (%) Eligible FRPM (K-12)']].isnull().all(1)))

print(sum(df['Percent (%) Eligible Free (K-12)'].isnull()))

print(sum(df['Percent (%) Eligible FRPM (K-12)'].isnull()))

# Lets drop all rows where `Percent (%) Eligible Free (K-12)` is nan.
df = df.dropna(subset = ['Percent (%) Eligible Free (K-12)'])

# reset index
df.reset_index(inplace = True)

df = df.drop(columns = ['index'])

# snapshot
df

47
47
47


Unnamed: 0,CDSCode,Zip,median_income,mean_income,County,Distrinct,School Name,District Type,School Type,Low Grade,High Grade,Enrollment (K-12),Enrollment (K-12).1,Free Meal Count (K-12),Percent (%) Eligible Free (K-12),FRPM Count (K-12),Percent (%) Eligible FRPM (K-12),AvgScrRead,AvgScrMath,AvgScrWrite
0,01612340130054,94560.0,86521,98373,Alameda,,Newark Memorial High,Unified School District,High Schools (Public),9,12,1850.0,1850.0,703.0,0.380000,838.0,0.452973,486.0,498.0,487.0
1,01100170112607,94612.0,30879,50381,Alameda,,Envision Academy for Arts & Technology,County Office of Education (COE),High Schools (Public),9,12,395.0,395.0,186.0,0.470886,186.0,0.470886,397.0,387.0,395.0
2,01611190106401,94501.0,70935,94830,Alameda,,Alameda Science and Technology Institute,Unified School District,Alternative Schools of Choice,9,12,170.0,170.0,39.0,0.229412,52.0,0.305882,562.0,590.0,555.0
3,01611190130229,94501.0,70935,94830,Alameda,,Alameda High,Unified School District,High Schools (Public),9,12,1746.0,1746.0,320.0,0.183276,396.0,0.226804,543.0,573.0,543.0
4,01611190132878,94501.0,70935,94830,Alameda,,Encinal High,Unified School District,High Schools (Public),9,12,1052.0,1052.0,371.0,0.352662,447.0,0.424905,483.0,504.0,476.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
307,19647330117762,90005.0,31214,45746,Los Angeles,,RFK Community Schools- for the Visual Arts and...,Unified School District,High Schools (Public),9,12,453.0,453.0,408.0,0.900662,410.0,0.905077,392.0,416.0,394.0
308,19647330117911,90248.0,51956,67133,Los Angeles,,New Millennium Secondary,Unified School District,High Schools (Public),9,12,196.0,196.0,119.0,0.607143,142.0,0.724490,387.0,378.0,388.0
309,19647330119651,91352.0,49088,62920,Los Angeles,,Sun Valley High,Unified School District,High Schools (Public),9,12,568.0,568.0,397.0,0.698944,398.0,0.700704,391.0,408.0,385.0
310,19647330119727,90012.0,29492,50578,Los Angeles,,Ramon C. Cortines School of Visual and Perform...,Unified School District,High Schools (Public),9,12,1647.0,1647.0,892.0,0.541591,1015.0,0.616272,451.0,434.0,446.0


In [26]:
# Creating new columns
df['FreeLunch'] = df['Percent (%) Eligible Free (K-12)']

df['ReducedLunch'] = df['Percent (%) Eligible FRPM (K-12)'] - df['Percent (%) Eligible Free (K-12)']

df['FullLunch'] = 1 - df['Percent (%) Eligible FRPM (K-12)']

# snapshot
df

Unnamed: 0,CDSCode,Zip,median_income,mean_income,County,Distrinct,School Name,District Type,School Type,Low Grade,...,Free Meal Count (K-12),Percent (%) Eligible Free (K-12),FRPM Count (K-12),Percent (%) Eligible FRPM (K-12),AvgScrRead,AvgScrMath,AvgScrWrite,FreeLunch,ReducedLunch,FullLunch
0,01612340130054,94560.0,86521,98373,Alameda,,Newark Memorial High,Unified School District,High Schools (Public),9,...,703.0,0.380000,838.0,0.452973,486.0,498.0,487.0,0.380000,0.072973,0.547027
1,01100170112607,94612.0,30879,50381,Alameda,,Envision Academy for Arts & Technology,County Office of Education (COE),High Schools (Public),9,...,186.0,0.470886,186.0,0.470886,397.0,387.0,395.0,0.470886,0.000000,0.529114
2,01611190106401,94501.0,70935,94830,Alameda,,Alameda Science and Technology Institute,Unified School District,Alternative Schools of Choice,9,...,39.0,0.229412,52.0,0.305882,562.0,590.0,555.0,0.229412,0.076471,0.694118
3,01611190130229,94501.0,70935,94830,Alameda,,Alameda High,Unified School District,High Schools (Public),9,...,320.0,0.183276,396.0,0.226804,543.0,573.0,543.0,0.183276,0.043528,0.773196
4,01611190132878,94501.0,70935,94830,Alameda,,Encinal High,Unified School District,High Schools (Public),9,...,371.0,0.352662,447.0,0.424905,483.0,504.0,476.0,0.352662,0.072243,0.575095
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
307,19647330117762,90005.0,31214,45746,Los Angeles,,RFK Community Schools- for the Visual Arts and...,Unified School District,High Schools (Public),9,...,408.0,0.900662,410.0,0.905077,392.0,416.0,394.0,0.900662,0.004415,0.094923
308,19647330117911,90248.0,51956,67133,Los Angeles,,New Millennium Secondary,Unified School District,High Schools (Public),9,...,119.0,0.607143,142.0,0.724490,387.0,378.0,388.0,0.607143,0.117347,0.275510
309,19647330119651,91352.0,49088,62920,Los Angeles,,Sun Valley High,Unified School District,High Schools (Public),9,...,397.0,0.698944,398.0,0.700704,391.0,408.0,385.0,0.698944,0.001761,0.299296
310,19647330119727,90012.0,29492,50578,Los Angeles,,Ramon C. Cortines School of Visual and Perform...,Unified School District,High Schools (Public),9,...,892.0,0.541591,1015.0,0.616272,451.0,434.0,446.0,0.541591,0.074681,0.383728


In [27]:
# Lets filter the dataframe for the columns we'll need at this point
math_df = df[['AvgScrMath', 'FreeLunch', 'ReducedLunch', 'FullLunch', 'mean_income']]

# snapshot
math_df.head()

Unnamed: 0,AvgScrMath,FreeLunch,ReducedLunch,FullLunch,mean_income
0,498.0,0.38,0.072973,0.547027,98373
1,387.0,0.470886,0.0,0.529114,50381
2,590.0,0.229412,0.076471,0.694118,94830
3,573.0,0.183276,0.043528,0.773196,94830
4,504.0,0.352662,0.072243,0.575095,94830


## Hypothesis Testing - `AvgScrMath`

We can finally get to hypothesis testing! We have a sample of 312 rows to work with. Consider the model again.

- $y$ is Avg Math SAT Score 
- $x_0$ is the intercept
- $x_1$ is percentage of students recieving free lunch at the school
- $x_2$ is percentage of students recieving reduced lunch at the school
- $x_3$ is percentage of students paying full priced lunch at the school
- $x_4$ is the mean income of the zip code of the school

The null hypothesis $H_0$ states that there is no relationship between the average math SAT score at a school and any of the predictors.

The alternative hypothesis $H_A$ states that there is relationship between the average math SAT score at a school and any of the predictors.

We consider the standard alpha, $\alpha = 0.05$.

Our outcome variable is continous, all predictor varaibles are continuous, and there is more than one predictor variable. Therefore we will use multiple linear regression.

In [28]:
import statsmodels.formula.api as smf

# fit model 1
model_1 = smf.ols('AvgScrMath ~ 0 + mean_income',
                data = math_df).fit()

# output model summary
model_1.summary()

0,1,2,3
Dep. Variable:,AvgScrMath,R-squared (uncentered):,0.899
Model:,OLS,Adj. R-squared (uncentered):,0.898
Method:,Least Squares,F-statistic:,2755.0
Date:,"Tue, 17 May 2022",Prob (F-statistic):,1.4100000000000002e-156
Time:,21:44:59,Log-Likelihood:,-2007.2
No. Observations:,312,AIC:,4016.0
Df Residuals:,311,BIC:,4020.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
mean_income,0.0057,0.000,52.485,0.000,0.005,0.006

0,1,2,3
Omnibus:,96.552,Durbin-Watson:,0.874
Prob(Omnibus):,0.0,Jarque-Bera (JB):,263.541
Skew:,-1.432,Prob(JB):,5.9300000000000005e-58
Kurtosis:,6.474,Cond. No.,1.0


In [29]:
# fit model 2
model_2 = smf.ols('AvgScrMath ~ 0 + FreeLunch',
                data = math_df).fit()

# output model summary
model_2.summary()

0,1,2,3
Dep. Variable:,AvgScrMath,R-squared (uncentered):,0.7
Model:,OLS,Adj. R-squared (uncentered):,0.699
Method:,Least Squares,F-statistic:,724.2
Date:,"Tue, 17 May 2022",Prob (F-statistic):,3.31e-83
Time:,21:44:59,Log-Likelihood:,-2176.6
No. Observations:,312,AIC:,4355.0
Df Residuals:,311,BIC:,4359.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
FreeLunch,688.6102,25.588,26.911,0.000,638.262,738.958

0,1,2,3
Omnibus:,33.162,Durbin-Watson:,0.835
Prob(Omnibus):,0.0,Jarque-Bera (JB):,15.469
Skew:,0.352,Prob(JB):,0.000437
Kurtosis:,2.167,Cond. No.,1.0


In [30]:
# fit model 3
model_3 = smf.ols('AvgScrMath ~ 0 + FreeLunch + mean_income',
                data = math_df).fit()

# output model summary
model_3.summary()

0,1,2,3
Dep. Variable:,AvgScrMath,R-squared (uncentered):,0.953
Model:,OLS,Adj. R-squared (uncentered):,0.953
Method:,Least Squares,F-statistic:,3161.0
Date:,"Tue, 17 May 2022",Prob (F-statistic):,6.479999999999999e-207
Time:,21:44:59,Log-Likelihood:,-1886.4
No. Observations:,312,AIC:,3777.0
Df Residuals:,310,BIC:,3784.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
FreeLunch,272.7165,14.319,19.046,0.000,244.542,300.891
mean_income,0.0043,0.000,41.015,0.000,0.004,0.004

0,1,2,3
Omnibus:,8.383,Durbin-Watson:,1.2
Prob(Omnibus):,0.015,Jarque-Bera (JB):,12.822
Skew:,-0.148,Prob(JB):,0.00164
Kurtosis:,3.948,Cond. No.,196000.0


In [31]:
# fit model 4
model_4 = smf.ols('AvgScrMath ~ 0 + FreeLunch + ReducedLunch + FullLunch + mean_income',
                data = math_df).fit()

# output model summary
model_4.summary()

0,1,2,3
Dep. Variable:,AvgScrMath,R-squared:,0.671
Model:,OLS,Adj. R-squared:,0.668
Method:,Least Squares,F-statistic:,209.7
Date:,"Tue, 17 May 2022",Prob (F-statistic):,4.4899999999999995e-74
Time:,21:45:20,Log-Likelihood:,-1588.4
No. Observations:,312,AIC:,3185.0
Df Residuals:,308,BIC:,3200.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
FreeLunch,351.3634,8.103,43.361,0.000,335.419,367.308
ReducedLunch,557.8248,40.829,13.662,0.000,477.485,638.165
FullLunch,550.4482,13.983,39.367,0.000,522.935,577.962
mean_income,0.0003,0.000,2.528,0.012,5.86e-05,0.000

0,1,2,3
Omnibus:,12.928,Durbin-Watson:,1.573
Prob(Omnibus):,0.002,Jarque-Bera (JB):,29.057
Skew:,0.048,Prob(JB):,4.9e-07
Kurtosis:,4.492,Cond. No.,1450000.0


We can reject the null hypothesis and state that there is a relationship between average math SAT scores at a school and the percentage of students recieving free lunch!