## Neural Network Model with Top 20 Feature Importances (including census data)

In [5]:
#import libraries 
import pandas as pd
import geopandas as gpd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

In [None]:
# Load the 2010 parcel table from CSV.
# NOTE(review): the 2020 table is read from 'data/...' elsewhere in this
# notebook, while this file is read from the working directory -- confirm
# that the path is intended.
parcel_data = pd.read_csv(filepath_or_buffer='all_data_joined_2010.csv')



In [None]:
# Index the 2010 parcels by APN so that results can be joined back to other
# parcel information later on.
# set_index() returns a new frame here (no inplace=True), so `parcel_data`
# is left untouched and this cell can be re-run without a KeyError on 'APN'.
parceldata_10 = parcel_data.set_index('APN')
parceldata_10.columns

In [None]:
# Correct the misspelled target column name ('buit' -> 'built').
parceldata_10 = parceldata_10.rename(
    columns={'buit_within_decade': 'built_within_decade'},
)

In [None]:
# Load the 2020 parcel table and index it by APN, so that results can be
# joined back to other parcel information later on.
parceldata_20 = pd.read_csv('data/all_data_joined_2020.csv').set_index('APN')

In [None]:
# Correct the misspelled target column name ('buit' -> 'built').
parceldata_20 = parceldata_20.rename(
    columns={'buit_within_decade': 'built_within_decade'},
)

## Standardize Data

Use the top 20 features by random-forest feature importance (including census variables) to run the model on a reduced set of columns.

In [None]:
# Restrict the 2010 table to the 20 selected predictor columns plus the
# target column, so the model runs on the reduced feature set.
predictor_cols_10 = [
    'LAND_VALUE',
    'ACRES',
    'dollars_per_acre',
    'distances',
    'lon',
    'lat',
    'IL_RATIO',
    'warehouse_start',
    'Industrial',
    'Vacant',
    'Percent $10,000-$14,999',
    'Percent $100,000 - $124,999',
    'Percent White alone',
    'Percent Some other race alone',
    'Percent some_college',
    'Percent less_highschool',
    'Percent Renter Occupied',
    'Percent No schooling completed',
    'Percent Two or more races:',
    'Percent Owner Occupied',
]
parceldata_10 = parceldata_10[predictor_cols_10 + ['built_within_decade']]

In [None]:
# Pick the columns to standardize.
# The target column is left unscaled.
cols_to_exclude = ['built_within_decade']

# Every remaining column, in its original order, will be standardized.
otherCols = [name for name in parceldata_10.columns if name not in cols_to_exclude]
otherCols

In [None]:
# Show dtypes and non-null counts for the columns that are about to be scaled.
parceldata_10.loc[:, otherCols].info()

In [None]:
# Standardize the feature columns listed in `otherCols`.
# NOTE(review): the scaler is fitted on every row, before the train/test
# split made later in this notebook -- confirm that this is acceptable.
from sklearn import preprocessing

scaler = preprocessing.StandardScaler()
scaled_values_10 = scaler.fit_transform(parceldata_10[otherCols])

# Wrap the scaled array in a DataFrame that keeps the original column names
# and the APN index.
df_scaled = pd.DataFrame(
    scaled_values_10,
    columns=otherCols,
    index=parceldata_10.index,
)

# Attach the columns that were deliberately left unscaled.
unscaled_10 = parceldata_10[cols_to_exclude]
df_scaled = df_scaled.join(unscaled_10)

df_scaled.head()

In [None]:


# Target column and predictor columns.
yvar = 'built_within_decade'
cols = df_scaled.columns.to_list()
xvars = [name for name in cols if name != 'APN' and name != yvar]

# Keep only the rows that have a value in every model column.
parceldata_10_model = df_scaled[xvars + [yvar]].dropna()

# Hold out 25% of the rows for testing; the seed is fixed so the split is
# the same on every run.
features_10 = parceldata_10_model[xvars]
target_10 = parceldata_10_model[yvar]
X_train, X_test, y_train, y_test = train_test_split(
    features_10,
    target_10,
    test_size=0.25,
    random_state=1,
)

In [None]:
# Summary statistics of the predictors, used to check the standardization:
# means should be close to 0 and standard deviations close to 1.
parceldata_10_model.loc[:, xvars].describe()

In [None]:
# Train a multilayer perceptron on the training rows and predict the
# held-out test rows.
from sklearn.neural_network import MLPClassifier

# Three hidden layers of 10 units each; random_state is set on the
# classifier itself. X_train comes from the standardized frame.
mlp = MLPClassifier(
    hidden_layer_sizes=(10, 10, 10),
    max_iter=1000,
    random_state=1,
)
mlp.fit(X_train, y_train)
y_pred = mlp.predict(X_test)

In [None]:
# Sanity check: stop unless there is exactly one prediction per test-set row.
assert len(y_pred) == len(X_test)

In [None]:
#check how it did
# Print the classification report for the held-out test rows.
report = classification_report(y_test, y_pred)
print(report)

In [None]:
# Compare the share of rows predicted True with the share actually True.
predicted_share = y_pred.mean()
actual_share = y_test.mean()
print(f'Predicted fraction True: {predicted_share:.4f}. Actual fraction True: {actual_share:.4f}')

In [None]:
# Confusion matrix for the test-set predictions.
# The display is created a single time; calling from_predictions() twice
# would draw a redundant second figure.
cm_display = ConfusionMatrixDisplay.from_predictions(y_test, y_pred)

# Save through the display's own figure rather than pyplot's implicit
# "current figure", so the saved image is always this confusion matrix.
cm_display.figure_.savefig(
    'figures/confusion_matrix_Neuralnet_census.png',
    bbox_inches='tight',
    transparent=True,
)


### Apply Model to 2020 Dataset

### Standardize New Dataframe

In [None]:
# Restrict the 2020 table to the 20 selected predictor columns plus the
# target column, matching the columns the model was trained on.
predictor_cols_20 = [
    'LAND_VALUE',
    'ACRES',
    'dollars_per_acre',
    'distances',
    'lon',
    'lat',
    'IL_RATIO',
    'warehouse_start',
    'Industrial',
    'Vacant',
    'Percent $10,000-$14,999',
    'Percent $100,000 - $124,999',
    'Percent White alone',
    'Percent Some other race alone',
    'Percent some_college',
    'Percent less_highschool',
    'Percent Renter Occupied',
    'Percent No schooling completed',
    'Percent Two or more races:',
    'Percent Owner Occupied',
]
parceldata_20 = parceldata_20[predictor_cols_20 + ['built_within_decade']]

In [None]:
# Pick the 2020 columns to standardize.
# The target column is left unscaled.
cols_to_exclude = ['built_within_decade']

# Every remaining column, in its original order, will be standardized.
otherCols = [name for name in parceldata_20.columns if name not in cols_to_exclude]
otherCols

In [None]:
# Standardize the 2020 feature columns listed in `otherCols`.
# NOTE(review): this fits a fresh scaler on the 2020 data and rebinds
# `scaler`, replacing the one fitted on the 2010 data that `mlp` was trained
# on -- confirm that re-standardizing per year (rather than reusing the 2010
# scaler via scaler.transform) is intended.
from sklearn import preprocessing

scaler = preprocessing.StandardScaler()
scaled_values_20 = scaler.fit_transform(parceldata_20[otherCols])

# Wrap the scaled array in a DataFrame that keeps the original column names
# and the APN index.
df_scaled2 = pd.DataFrame(
    scaled_values_20,
    columns=otherCols,
    index=parceldata_20.index,
)

# Attach the columns that were deliberately left unscaled.
unscaled_20 = parceldata_20[cols_to_exclude]
df_scaled2 = df_scaled2.join(unscaled_20)

df_scaled2.head()

In [None]:
# Target column and predictor columns for the 2020 table.
yvar = 'built_within_decade'
cols = df_scaled2.columns.to_list()
xvars = [name for name in cols if name not in ('APN', yvar)]

# Drop only the rows that are missing a predictor value. The target column is
# not needed to score parcels, so a missing `built_within_decade` must not
# remove a parcel from the 2020 predictions. The target column is still kept
# in the frame for reference.
parceldata_20_model = df_scaled2[xvars + [yvar]].dropna(subset=xvars)

In [None]:
# Score the 2020 parcels with the network trained on the 2010 data.
# This rebinds X_test and y_pred, which previously held the 2010 test split
# and its predictions.
X_test = parceldata_20_model.loc[:, xvars]
y_pred = mlp.predict(X_test)

In [None]:
# Sanity check: stop unless there is exactly one prediction per 2020 row.
assert len(y_pred) == len(X_test)
# Report the share and the count of parcels predicted True.
print(f'Predicted fraction True: {y_pred.mean():.4f}')
print(f'Predicted number True: {y_pred.sum():.0f}')

In [None]:
# Build a table of predicted probabilities that a warehouse will be built on
# each parcel within 10 years, alongside the features used to score it.
# NOTE(review): the column names assume mlp.classes_ is ordered
# [no warehouse, warehouse] -- confirm against mlp.classes_.
class_probs = mlp.predict_proba(X_test)
predictions = pd.DataFrame(class_probs, columns=['pred_noWH', 'pred_WH'])

# reset_index() turns APN into an ordinary column and gives the features a
# default 0..n-1 index, so the join with the probabilities is positional.
features_by_position = X_test.reset_index()
predictions = predictions.join(features_by_position)

# Restore APN as the index of the combined table.
predictions = predictions.set_index('APN')
predictions.head()

In [None]:
#predictions.to_csv('predictions_neuralnet1.csv')