In [1]:
# Import Dependencies
import requests
from config import api_key
import censusdata
import pandas as pd  
from sqlalchemy import create_engine
from config import remote_db_endpoint, remote_db_port
from config import remote_db_name, remote_db_user, remote_db_pwd
import numpy as np
from sqlalchemy import func
from sqlalchemy import distinct
import json
import psycopg2
import tensorflow as tf
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
cloud_engine = create_engine(f"postgresql://{remote_db_user}:{remote_db_pwd}@{remote_db_endpoint}:{remote_db_port}/{remote_db_name}")
cloud_conn = cloud_engine.connect()

ca_school_data = pd.read_sql('select * from ml_master_clean', cloud_conn)

ca_school_data.head()

Unnamed: 0,Zipcode,Total Students Enrolled,Median Household Income,Median Income BLACK/AA,Median Income INDIGENOUS PEOPLE,Median Income ASIAN,Median Income HAW/PAC ISLANDER,Median Income OTHER RACE,Median Income MIXED RACE,Median Income WHITE/NOT LATINO,...,Substance Abuse Count,Violent Count,Miscellaneous Count,School Name,School Rating,School Address,City,Grades,Students Per Teacher,District
0,91901,244.0,79859.0,143917.0,53750.0,78625.0,0.0,48026.0,0.0,88039.0,...,317.0,50.0,52.0,The Heights Charter,7.0,"2710 Alpine Boulevard, Suite E",Alpine,K-8,15:1,Dehesa Elementary
1,91901,333.0,79859.0,143917.0,53750.0,78625.0,0.0,48026.0,0.0,88039.0,...,317.0,50.0,52.0,Boulder Oaks Elementary,6.0,2320 Tavern Road,Alpine,1-5,24:1,Alpine Union Elementary
2,91901,438.0,79859.0,143917.0,53750.0,78625.0,0.0,48026.0,0.0,88039.0,...,317.0,50.0,52.0,Joan MacQueen Middle,6.0,2001 Tavern Road,Alpine,6-8,20:1,Alpine Union Elementary
3,91901,387.0,79859.0,143917.0,53750.0,78625.0,0.0,48026.0,0.0,88039.0,...,317.0,50.0,52.0,Shadow Hills Elementary,6.0,8770 Harbison Canyon Road,Alpine,K-5,20:1,Alpine Union Elementary
4,91901,158.0,79859.0,143917.0,53750.0,78625.0,0.0,48026.0,0.0,88039.0,...,317.0,50.0,52.0,Alpine Elementary,4.0,1850 Alpine Boulevard,Alpine,1-5,20:1,Alpine Union Elementary


Four New Columns for Crime ratios are created which will standardize the count with the Zipcode population
The new columns created are 'Theft Count Ratio', 'Substance Abuse Count Ratio', 'Violent Count Ratio' and 'Miscellaneous Count Ratio'

In [3]:
ca_school_data['Theft Count Ratio'] = ca_school_data['Theft Count'] / ca_school_data['Population Total']
ca_school_data['Substance Abuse Count Ratio'] = ca_school_data['Substance Abuse Count'] / ca_school_data['Population Total']
ca_school_data['Violent Count Ratio'] = ca_school_data['Violent Count'] / ca_school_data['Population Total']
ca_school_data['Miscellaneous Count Ratio'] = ca_school_data['Miscellaneous Count'] / ca_school_data['Population Total']
ca_school_data.head(75)

Unnamed: 0,Zipcode,Total Students Enrolled,Median Household Income,Median Income BLACK/AA,Median Income INDIGENOUS PEOPLE,Median Income ASIAN,Median Income HAW/PAC ISLANDER,Median Income OTHER RACE,Median Income MIXED RACE,Median Income WHITE/NOT LATINO,...,School Rating,School Address,City,Grades,Students Per Teacher,District,Theft Count Ratio,Substance Abuse Count Ratio,Violent Count Ratio,Miscellaneous Count Ratio
0,91901,244.0,79859.0,143917.0,53750.0,78625.0,0.0,48026.0,0.0,88039.0,...,7.0,"2710 Alpine Boulevard, Suite E",Alpine,K-8,15:1,Dehesa Elementary,0.004269,0.017129,0.002702,0.002810
1,91901,333.0,79859.0,143917.0,53750.0,78625.0,0.0,48026.0,0.0,88039.0,...,6.0,2320 Tavern Road,Alpine,1-5,24:1,Alpine Union Elementary,0.004269,0.017129,0.002702,0.002810
2,91901,438.0,79859.0,143917.0,53750.0,78625.0,0.0,48026.0,0.0,88039.0,...,6.0,2001 Tavern Road,Alpine,6-8,20:1,Alpine Union Elementary,0.004269,0.017129,0.002702,0.002810
3,91901,387.0,79859.0,143917.0,53750.0,78625.0,0.0,48026.0,0.0,88039.0,...,6.0,8770 Harbison Canyon Road,Alpine,K-5,20:1,Alpine Union Elementary,0.004269,0.017129,0.002702,0.002810
4,91901,158.0,79859.0,143917.0,53750.0,78625.0,0.0,48026.0,0.0,88039.0,...,4.0,1850 Alpine Boulevard,Alpine,1-5,20:1,Alpine Union Elementary,0.004269,0.017129,0.002702,0.002810
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70,91915,332.0,101760.0,109750.0,0.0,101328.0,0.0,82159.0,121719.0,117525.0,...,4.0,1949 Discovery Falls Drive,Chula Vista,6-8,14:1,SBC - High Tech High,0.003226,0.000319,0.001489,0.002340
71,91916,211.0,69821.0,0.0,0.0,0.0,0.0,0.0,0.0,69955.0,...,6.0,24842 Viejas Boulevard,Descanso,K-6,21:1,Mountain Empire Unified,0.002483,0.000000,0.000000,0.000000
72,91932,896.0,46659.0,40521.0,74773.0,41701.0,0.0,37527.0,36649.0,57344.0,...,6.0,650 Imperial Beach Boulevard,Imperial Beach,K-8,20:1,South Bay Union,0.004632,0.006402,0.004406,0.003126
73,91932,1585.0,46659.0,40521.0,74773.0,41701.0,0.0,37527.0,36649.0,57344.0,...,4.0,505 Elm Avenue,Imperial Beach,9-12,25:1,Sweetwater Union High,0.004632,0.006402,0.004406,0.003126


Removing the Crime Actual Counts from the dataframe i.e. 
1. Theft Count
2. Substance Abuse Count
3. Violent Count
4. Miscellaneous Count

In [4]:
ca_school_data = ca_school_data.drop(labels=["Theft Count", "Substance Abuse Count","Violent Count","Miscellaneous Count"], axis=1)
ca_school_data.head(75)

Unnamed: 0,Zipcode,Total Students Enrolled,Median Household Income,Median Income BLACK/AA,Median Income INDIGENOUS PEOPLE,Median Income ASIAN,Median Income HAW/PAC ISLANDER,Median Income OTHER RACE,Median Income MIXED RACE,Median Income WHITE/NOT LATINO,...,School Rating,School Address,City,Grades,Students Per Teacher,District,Theft Count Ratio,Substance Abuse Count Ratio,Violent Count Ratio,Miscellaneous Count Ratio
0,91901,244.0,79859.0,143917.0,53750.0,78625.0,0.0,48026.0,0.0,88039.0,...,7.0,"2710 Alpine Boulevard, Suite E",Alpine,K-8,15:1,Dehesa Elementary,0.004269,0.017129,0.002702,0.002810
1,91901,333.0,79859.0,143917.0,53750.0,78625.0,0.0,48026.0,0.0,88039.0,...,6.0,2320 Tavern Road,Alpine,1-5,24:1,Alpine Union Elementary,0.004269,0.017129,0.002702,0.002810
2,91901,438.0,79859.0,143917.0,53750.0,78625.0,0.0,48026.0,0.0,88039.0,...,6.0,2001 Tavern Road,Alpine,6-8,20:1,Alpine Union Elementary,0.004269,0.017129,0.002702,0.002810
3,91901,387.0,79859.0,143917.0,53750.0,78625.0,0.0,48026.0,0.0,88039.0,...,6.0,8770 Harbison Canyon Road,Alpine,K-5,20:1,Alpine Union Elementary,0.004269,0.017129,0.002702,0.002810
4,91901,158.0,79859.0,143917.0,53750.0,78625.0,0.0,48026.0,0.0,88039.0,...,4.0,1850 Alpine Boulevard,Alpine,1-5,20:1,Alpine Union Elementary,0.004269,0.017129,0.002702,0.002810
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70,91915,332.0,101760.0,109750.0,0.0,101328.0,0.0,82159.0,121719.0,117525.0,...,4.0,1949 Discovery Falls Drive,Chula Vista,6-8,14:1,SBC - High Tech High,0.003226,0.000319,0.001489,0.002340
71,91916,211.0,69821.0,0.0,0.0,0.0,0.0,0.0,0.0,69955.0,...,6.0,24842 Viejas Boulevard,Descanso,K-6,21:1,Mountain Empire Unified,0.002483,0.000000,0.000000,0.000000
72,91932,896.0,46659.0,40521.0,74773.0,41701.0,0.0,37527.0,36649.0,57344.0,...,6.0,650 Imperial Beach Boulevard,Imperial Beach,K-8,20:1,South Bay Union,0.004632,0.006402,0.004406,0.003126
73,91932,1585.0,46659.0,40521.0,74773.0,41701.0,0.0,37527.0,36649.0,57344.0,...,4.0,505 Elm Avenue,Imperial Beach,9-12,25:1,Sweetwater Union High,0.004632,0.006402,0.004406,0.003126


Removing the following columns from the dataframe i.e. 
1. latitude
2. longitude
3. School Address
4. City
5. Grades


In [5]:
ca_school_drop = ca_school_data.drop(labels=["latitude", "longitude","School Name","School Address","City","Grades"], axis=1)
ca_school_drop.head(75)

Unnamed: 0,Zipcode,Total Students Enrolled,Median Household Income,Median Income BLACK/AA,Median Income INDIGENOUS PEOPLE,Median Income ASIAN,Median Income HAW/PAC ISLANDER,Median Income OTHER RACE,Median Income MIXED RACE,Median Income WHITE/NOT LATINO,...,Population other ratio,Population mixed ratio,Population hispanic/latino ratio,School Rating,Students Per Teacher,District,Theft Count Ratio,Substance Abuse Count Ratio,Violent Count Ratio,Miscellaneous Count Ratio
0,91901,244.0,79859.0,143917.0,53750.0,78625.0,0.0,48026.0,0.0,88039.0,...,0.006754,0.012212,0.168153,7.0,15:1,Dehesa Elementary,0.004269,0.017129,0.002702,0.002810
1,91901,333.0,79859.0,143917.0,53750.0,78625.0,0.0,48026.0,0.0,88039.0,...,0.006754,0.012212,0.168153,6.0,24:1,Alpine Union Elementary,0.004269,0.017129,0.002702,0.002810
2,91901,438.0,79859.0,143917.0,53750.0,78625.0,0.0,48026.0,0.0,88039.0,...,0.006754,0.012212,0.168153,6.0,20:1,Alpine Union Elementary,0.004269,0.017129,0.002702,0.002810
3,91901,387.0,79859.0,143917.0,53750.0,78625.0,0.0,48026.0,0.0,88039.0,...,0.006754,0.012212,0.168153,6.0,20:1,Alpine Union Elementary,0.004269,0.017129,0.002702,0.002810
4,91901,158.0,79859.0,143917.0,53750.0,78625.0,0.0,48026.0,0.0,88039.0,...,0.006754,0.012212,0.168153,4.0,20:1,Alpine Union Elementary,0.004269,0.017129,0.002702,0.002810
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70,91915,332.0,101760.0,109750.0,0.0,101328.0,0.0,82159.0,121719.0,117525.0,...,0.010777,0.033961,0.472544,4.0,14:1,SBC - High Tech High,0.003226,0.000319,0.001489,0.002340
71,91916,211.0,69821.0,0.0,0.0,0.0,0.0,0.0,0.0,69955.0,...,0.000000,0.015518,0.120422,6.0,21:1,Mountain Empire Unified,0.002483,0.000000,0.000000,0.000000
72,91932,896.0,46659.0,40521.0,74773.0,41701.0,0.0,37527.0,36649.0,57344.0,...,0.000226,0.039578,0.500998,6.0,20:1,South Bay Union,0.004632,0.006402,0.004406,0.003126
73,91932,1585.0,46659.0,40521.0,74773.0,41701.0,0.0,37527.0,36649.0,57344.0,...,0.000226,0.039578,0.500998,4.0,25:1,Sweetwater Union High,0.004632,0.006402,0.004406,0.003126


In [6]:
district_type = ca_school_drop["District"].value_counts()
district_type

San Diego Unified                             212
Chula Vista Elementary                         48
Poway Unified                                  37
Vista Unified                                  28
Cajon Valley Union                             28
Escondido Union                                26
Sweetwater Union High                          25
La Mesa-Spring Valley                          23
Oceanside Unified                              23
San Marcos Unified                             18
Temecula Valley Unified                        16
Grossmont Union High                           12
National Elementary                            11
South Bay Union                                11
Ramona City Unified                            11
Mountain Empire Unified                        11
San Dieguito Union High                        10
Encinitas Union Elementary                      9
Santee                                          9
Lakeside Union Elementary                       9


In [7]:
district_to_replace = list(district_type[district_type < 12].index)

# Replace in dataframe
for cls in district_to_replace:
    ca_school_drop['District'] = ca_school_drop['District'].replace(cls,"Other")
    
# Check to make sure binning was successful
ca_school_drop['District'].value_counts()

San Diego Unified          212
Other                      200
Chula Vista Elementary      48
Poway Unified               37
Vista Unified               28
Cajon Valley Union          28
Escondido Union             26
Sweetwater Union High       25
Oceanside Unified           23
La Mesa-Spring Valley       23
San Marcos Unified          18
Temecula Valley Unified     16
Grossmont Union High        12
Name: District, dtype: int64

In [8]:
# new data frame with split value columns
new = ca_school_drop["Students Per Teacher"].str.split(":", n = 1, expand = True)
  
# making separate first name column from new data frame
ca_school_drop["Students Per Teacher Count"]= new[0]
#ca_school_drop

ca_school_updated = ca_school_drop.drop(labels=["Students Per Teacher"], axis=1)
ca_school_updated

Unnamed: 0,Zipcode,Total Students Enrolled,Median Household Income,Median Income BLACK/AA,Median Income INDIGENOUS PEOPLE,Median Income ASIAN,Median Income HAW/PAC ISLANDER,Median Income OTHER RACE,Median Income MIXED RACE,Median Income WHITE/NOT LATINO,...,Population other ratio,Population mixed ratio,Population hispanic/latino ratio,School Rating,District,Theft Count Ratio,Substance Abuse Count Ratio,Violent Count Ratio,Miscellaneous Count Ratio,Students Per Teacher Count
0,91901,244.0,79859.0,143917.0,53750.0,78625.0,0.0,48026.0,0.0,88039.0,...,0.006754,0.012212,0.168153,7.0,Other,0.004269,0.017129,0.002702,0.00281,15
1,91901,333.0,79859.0,143917.0,53750.0,78625.0,0.0,48026.0,0.0,88039.0,...,0.006754,0.012212,0.168153,6.0,Other,0.004269,0.017129,0.002702,0.00281,24
2,91901,438.0,79859.0,143917.0,53750.0,78625.0,0.0,48026.0,0.0,88039.0,...,0.006754,0.012212,0.168153,6.0,Other,0.004269,0.017129,0.002702,0.00281,20
3,91901,387.0,79859.0,143917.0,53750.0,78625.0,0.0,48026.0,0.0,88039.0,...,0.006754,0.012212,0.168153,6.0,Other,0.004269,0.017129,0.002702,0.00281,20
4,91901,158.0,79859.0,143917.0,53750.0,78625.0,0.0,48026.0,0.0,88039.0,...,0.006754,0.012212,0.168153,4.0,Other,0.004269,0.017129,0.002702,0.00281,20
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
691,92672,422.0,77829.0,0.0,0.0,92614.0,0.0,56964.0,103520.0,80927.0,...,0.000764,0.025181,0.227365,9.0,Other,0.000000,0.000057,0.000000,0.00000,18
692,92672,575.0,77829.0,0.0,0.0,92614.0,0.0,56964.0,103520.0,80927.0,...,0.000764,0.025181,0.227365,7.0,Other,0.000000,0.000057,0.000000,0.00000,25
693,92672,929.0,77829.0,0.0,0.0,92614.0,0.0,56964.0,103520.0,80927.0,...,0.000764,0.025181,0.227365,7.0,Other,0.000000,0.000057,0.000000,0.00000,26
694,92672,889.0,77829.0,0.0,0.0,92614.0,0.0,56964.0,103520.0,80927.0,...,0.000764,0.025181,0.227365,5.0,Other,0.000000,0.000057,0.000000,0.00000,27


In [9]:
ca_school_updated.to_csv('sd_schools_tableau.csv',index=False)

In [10]:
# Using OneHotEncoder to create the features
# Generate our categorical variable lists
ca_school_cat = list(ca_school_updated.dtypes[ca_school_updated.dtypes == "object"].index)
ca_school_cat

['District', 'Students Per Teacher Count']

In [11]:
ca_school_updated["Students Per Teacher Count"] = ca_school_updated["Students Per Teacher Count"].astype(str).astype(int)
ca_school_updated

Unnamed: 0,Zipcode,Total Students Enrolled,Median Household Income,Median Income BLACK/AA,Median Income INDIGENOUS PEOPLE,Median Income ASIAN,Median Income HAW/PAC ISLANDER,Median Income OTHER RACE,Median Income MIXED RACE,Median Income WHITE/NOT LATINO,...,Population other ratio,Population mixed ratio,Population hispanic/latino ratio,School Rating,District,Theft Count Ratio,Substance Abuse Count Ratio,Violent Count Ratio,Miscellaneous Count Ratio,Students Per Teacher Count
0,91901,244.0,79859.0,143917.0,53750.0,78625.0,0.0,48026.0,0.0,88039.0,...,0.006754,0.012212,0.168153,7.0,Other,0.004269,0.017129,0.002702,0.00281,15
1,91901,333.0,79859.0,143917.0,53750.0,78625.0,0.0,48026.0,0.0,88039.0,...,0.006754,0.012212,0.168153,6.0,Other,0.004269,0.017129,0.002702,0.00281,24
2,91901,438.0,79859.0,143917.0,53750.0,78625.0,0.0,48026.0,0.0,88039.0,...,0.006754,0.012212,0.168153,6.0,Other,0.004269,0.017129,0.002702,0.00281,20
3,91901,387.0,79859.0,143917.0,53750.0,78625.0,0.0,48026.0,0.0,88039.0,...,0.006754,0.012212,0.168153,6.0,Other,0.004269,0.017129,0.002702,0.00281,20
4,91901,158.0,79859.0,143917.0,53750.0,78625.0,0.0,48026.0,0.0,88039.0,...,0.006754,0.012212,0.168153,4.0,Other,0.004269,0.017129,0.002702,0.00281,20
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
691,92672,422.0,77829.0,0.0,0.0,92614.0,0.0,56964.0,103520.0,80927.0,...,0.000764,0.025181,0.227365,9.0,Other,0.000000,0.000057,0.000000,0.00000,18
692,92672,575.0,77829.0,0.0,0.0,92614.0,0.0,56964.0,103520.0,80927.0,...,0.000764,0.025181,0.227365,7.0,Other,0.000000,0.000057,0.000000,0.00000,25
693,92672,929.0,77829.0,0.0,0.0,92614.0,0.0,56964.0,103520.0,80927.0,...,0.000764,0.025181,0.227365,7.0,Other,0.000000,0.000057,0.000000,0.00000,26
694,92672,889.0,77829.0,0.0,0.0,92614.0,0.0,56964.0,103520.0,80927.0,...,0.000764,0.025181,0.227365,5.0,Other,0.000000,0.000057,0.000000,0.00000,27


In [12]:
ca_school_cat = list(ca_school_updated.dtypes[ca_school_updated.dtypes == "object"].index)
ca_school_cat

['District']

In [13]:
# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using the categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(ca_school_updated[ca_school_cat]))

# Add the encoded variable names to the dataframe
encode_df.columns = enc.get_feature_names(ca_school_cat)
encode_df.head(75)

Unnamed: 0,District_Cajon Valley Union,District_Chula Vista Elementary,District_Escondido Union,District_Grossmont Union High,District_La Mesa-Spring Valley,District_Oceanside Unified,District_Other,District_Poway Unified,District_San Diego Unified,District_San Marcos Unified,District_Sweetwater Union High,District_Temecula Valley Unified,District_Vista Unified
0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
70,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
71,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
72,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
73,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [14]:
# Merge one-hot encoded features and drop the originals
ca_school_updated = ca_school_updated.merge(encode_df, left_index=True, right_index=True)
ca_school_updated = ca_school_updated.drop(labels=ca_school_cat, axis=1)
ca_school_updated.head()

Unnamed: 0,Zipcode,Total Students Enrolled,Median Household Income,Median Income BLACK/AA,Median Income INDIGENOUS PEOPLE,Median Income ASIAN,Median Income HAW/PAC ISLANDER,Median Income OTHER RACE,Median Income MIXED RACE,Median Income WHITE/NOT LATINO,...,District_Grossmont Union High,District_La Mesa-Spring Valley,District_Oceanside Unified,District_Other,District_Poway Unified,District_San Diego Unified,District_San Marcos Unified,District_Sweetwater Union High,District_Temecula Valley Unified,District_Vista Unified
0,91901,244.0,79859.0,143917.0,53750.0,78625.0,0.0,48026.0,0.0,88039.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,91901,333.0,79859.0,143917.0,53750.0,78625.0,0.0,48026.0,0.0,88039.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,91901,438.0,79859.0,143917.0,53750.0,78625.0,0.0,48026.0,0.0,88039.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,91901,387.0,79859.0,143917.0,53750.0,78625.0,0.0,48026.0,0.0,88039.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,91901,158.0,79859.0,143917.0,53750.0,78625.0,0.0,48026.0,0.0,88039.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [50]:
# Split our preprocessed data into our features and target arrays
X = ca_school_updated.drop("School Rating", axis=1).values
y = ca_school_updated["School Rating"].values.reshape(-1, 1)

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [51]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [52]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
number_input_features = len(X_train_scaled[0])
hidden_nodes_layer1 = 500
hidden_nodes_layer2 = 400
hidden_nodes_layer3 = 300
hidden_nodes_layer4 = 200
hidden_nodes_layer5 = 100

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu"))

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="sigmoid"))

# Third hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer3, activation="sigmoid"))

# Fourth hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer4, activation="sigmoid"))

# Fourth hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer5, activation="relu"))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the model
nn.summary()

Model: "sequential_10"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_40 (Dense)             (None, 400)               22000     
_________________________________________________________________
dense_41 (Dense)             (None, 300)               120300    
_________________________________________________________________
dense_42 (Dense)             (None, 200)               60200     
_________________________________________________________________
dense_43 (Dense)             (None, 100)               20100     
_________________________________________________________________
dense_44 (Dense)             (None, 1)                 101       
Total params: 222,701
Trainable params: 222,701
Non-trainable params: 0
_________________________________________________________________


In [53]:
# Compile the model
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

# Train the model
fit_model = nn.fit(X_train_scaled,y_train,epochs=200)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

In [54]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled, y_test, verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

6/6 - 0s - loss: -1.0508e+03 - accuracy: 0.0115
Loss: -1050.7777099609375, Accuracy: 0.01149425283074379
