In [None]:
import pickle
import numpy as np
import pandas as pd
from datetime import datetime
from numpy import nanmin, nanmax, nanmean, nanvar, nanmedian
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from scipy.stats import spearmanr, pearsonr
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error


# Random Forest baseline, run once per grid-size index.
# For every target index this loop:
#   1. loads the train / val / test CSV splits,
#   2. mean-imputes and min-max scales the features (both fitted on train only),
#   3. min-max scales the labels with the train label range,
#   4. keeps the max_depth candidate with the lowest validation MSE,
#   5. prints Spearman / Pearson correlation on the test split and writes the
#      selected model and its test predictions to disk.
for target_index in ['index002','index004','index006','index008','index01']:
    # 'index002' -> '002': suffix used in the CSV file names
    grid_size = target_index[5:]
    train_data = pd.read_csv('./NYC_Experiments/data/train-gridsize-{}.csv'.format(grid_size))
    val_data = pd.read_csv('./NYC_Experiments/data/val-gridsize-{}.csv'.format(grid_size))
    test_data = pd.read_csv('./NYC_Experiments/data/test-gridsize-{}.csv'.format(grid_size))

    y_train, y_val, y_test = train_data['label'].values, val_data['label'].values, test_data['label'].values

    # Remove the label and the index column so neither is used as a feature.
    non_feature_columns = ['label', target_index]
    train_data = train_data.drop(columns=non_feature_columns)
    val_data = val_data.drop(columns=non_feature_columns)
    test_data = test_data.drop(columns=non_feature_columns)

    X_train, X_val, X_test = train_data.values, val_data.values, test_data.values

    # impute missing values (column means come from the training split only)
    imputer = SimpleImputer(strategy='mean').fit(X_train)
    X_train, X_val, X_test = imputer.transform(X_train), imputer.transform(X_val), imputer.transform(X_test)

    # min-max scaling (ranges come from the training split only)
    min_max_scaler = preprocessing.MinMaxScaler().fit(X_train)
    X_train, X_val, X_test = min_max_scaler.transform(X_train), min_max_scaler.transform(X_val), min_max_scaler.transform(X_test)

    # Scale the labels with the training range; val/test may fall outside [0, 1].
    y_min, y_max = np.min(y_train), np.max(y_train)
    y_range = y_max - y_min
    if y_range == 0:
        # Without this guard the division below silently yields NaN/inf labels.
        raise ValueError('Training labels for {} are constant; cannot min-max scale them.'.format(target_index))
    y_train, y_val, y_test = (y_train - y_min) / y_range, (y_val - y_min) / y_range, (y_test - y_min) / y_range

    # Random Forest model: choose max_depth by validation MSE
    best_model, best_val_mse = None, np.inf
    for max_depth_i in [4, 8, 16, None]:
        # Fixed seed so that model selection and reported scores are
        # reproducible across kernel restarts.
        regr = RandomForestRegressor(max_depth=max_depth_i, random_state=0)
        regr.fit(X_train, y_train)
        val_mse = mean_squared_error(y_val, regr.predict(X_val))
        if val_mse < best_val_mse:
            best_val_mse = val_mse
            best_model = regr

    y_test_predicted = best_model.predict(X_test)
    print('Random Forest: ',target_index, '---', spearmanr(y_test_predicted, y_test).statistic, '---', pearsonr(y_test_predicted, y_test).statistic)

    # save the model to disk; the context manager guarantees the file is
    # flushed and closed even if pickling fails
    with open('./NYC_Experiments/models/rf_model_{}.sav'.format(target_index), 'wb') as model_file:
        pickle.dump(best_model, model_file)
    (pd.DataFrame({'y_pred':y_test_predicted, 'y_true':y_test})).to_csv('./NYC_Experiments/results/rf_model_{}.csv'.format(target_index), index=False)



In [None]:
import pickle
import numpy as np
import pandas as pd
from datetime import datetime
from numpy import nanmin, nanmax, nanmean, nanvar, nanmedian
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from scipy.stats import spearmanr, pearsonr
from sklearn import preprocessing
import xgboost as xgb
from sklearn.metrics import mean_squared_error

# XGBoost baseline, run once per grid-size index.
# For every target index this loop:
#   1. loads the train / val / test CSV splits,
#   2. mean-imputes and min-max scales the features (both fitted on train only),
#   3. min-max scales the labels with the train label range,
#   4. keeps the max_depth candidate with the lowest validation MSE,
#   5. prints Spearman / Pearson correlation on the test split and writes the
#      selected model and its test predictions to disk.
for target_index in ['index002','index004','index006','index008','index01']:
    # 'index002' -> '002': suffix used in the CSV file names
    grid_size = target_index[5:]
    train_data = pd.read_csv('./NYC_Experiments/data/train-gridsize-{}.csv'.format(grid_size))
    val_data = pd.read_csv('./NYC_Experiments/data/val-gridsize-{}.csv'.format(grid_size))
    test_data = pd.read_csv('./NYC_Experiments/data/test-gridsize-{}.csv'.format(grid_size))

    y_train, y_val, y_test = train_data['label'].values, val_data['label'].values, test_data['label'].values

    # Remove the label and the index column so neither is used as a feature.
    non_feature_columns = ['label', target_index]
    train_data = train_data.drop(columns=non_feature_columns)
    val_data = val_data.drop(columns=non_feature_columns)
    test_data = test_data.drop(columns=non_feature_columns)

    X_train, X_val, X_test = train_data.values, val_data.values, test_data.values

    # impute missing values (column means come from the training split only)
    imputer = SimpleImputer(strategy='mean').fit(X_train)
    X_train, X_val, X_test = imputer.transform(X_train), imputer.transform(X_val), imputer.transform(X_test)

    # min-max scaling (ranges come from the training split only)
    min_max_scaler = preprocessing.MinMaxScaler().fit(X_train)
    X_train, X_val, X_test = min_max_scaler.transform(X_train), min_max_scaler.transform(X_val), min_max_scaler.transform(X_test)

    # Scale the labels with the training range; val/test may fall outside [0, 1].
    y_min, y_max = np.min(y_train), np.max(y_train)
    y_range = y_max - y_min
    if y_range == 0:
        # Without this guard the division below silently yields NaN/inf labels.
        raise ValueError('Training labels for {} are constant; cannot min-max scale them.'.format(target_index))
    y_train, y_val, y_test = (y_train - y_min) / y_range, (y_val - y_min) / y_range, (y_test - y_min) / y_range

    # XGBoost model: choose max_depth by validation MSE
    # NOTE(review): max_depth=None presumably defers to the library default - confirm.
    best_model, best_val_mse = None, np.inf
    for max_depth_i in [4, 8, 16, None]:
        regr = xgb.XGBRegressor(max_depth=max_depth_i)
        regr.fit(X_train, y_train)
        val_mse = mean_squared_error(y_val, regr.predict(X_val))
        if val_mse < best_val_mse:
            best_val_mse = val_mse
            best_model = regr

    y_test_predicted = best_model.predict(X_test)
    print('XGBoost: ',target_index, '---', spearmanr(y_test_predicted, y_test).statistic, '---', pearsonr(y_test_predicted, y_test).statistic)

    # save the model to disk; the context manager guarantees the file is
    # flushed and closed even if pickling fails
    with open('./NYC_Experiments/models/xgb_model_{}.sav'.format(target_index), 'wb') as model_file:
        pickle.dump(best_model, model_file)
    (pd.DataFrame({'y_pred':y_test_predicted, 'y_true':y_test})).to_csv('./NYC_Experiments/results/xgb_model_{}.csv'.format(target_index), index=False)



In [None]:
import pickle
import numpy as np
import pandas as pd
from datetime import datetime
from numpy import nanmin, nanmax, nanmean, nanvar, nanmedian
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from scipy.stats import spearmanr, pearsonr
from sklearn import preprocessing
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error

# Support Vector Regression baseline, run once per grid-size index.
# For every target index this loop:
#   1. loads the train / val / test CSV splits,
#   2. mean-imputes and min-max scales the features (both fitted on train only),
#   3. min-max scales the labels with the train label range,
#   4. keeps the kernel with the lowest validation MSE,
#   5. prints Spearman / Pearson correlation on the test split and writes the
#      selected model and its test predictions to disk.
for target_index in ['index002','index004','index006','index008','index01']:
    # 'index002' -> '002': suffix used in the CSV file names
    grid_size = target_index[5:]
    train_data = pd.read_csv('./NYC_Experiments/data/train-gridsize-{}.csv'.format(grid_size))
    val_data = pd.read_csv('./NYC_Experiments/data/val-gridsize-{}.csv'.format(grid_size))
    test_data = pd.read_csv('./NYC_Experiments/data/test-gridsize-{}.csv'.format(grid_size))

    y_train, y_val, y_test = train_data['label'].values, val_data['label'].values, test_data['label'].values

    # Remove the label and the index column so neither is used as a feature.
    non_feature_columns = ['label', target_index]
    train_data = train_data.drop(columns=non_feature_columns)
    val_data = val_data.drop(columns=non_feature_columns)
    test_data = test_data.drop(columns=non_feature_columns)

    X_train, X_val, X_test = train_data.values, val_data.values, test_data.values

    # impute missing values (column means come from the training split only)
    imputer = SimpleImputer(strategy='mean').fit(X_train)
    X_train, X_val, X_test = imputer.transform(X_train), imputer.transform(X_val), imputer.transform(X_test)

    # min-max scaling (ranges come from the training split only)
    min_max_scaler = preprocessing.MinMaxScaler().fit(X_train)
    X_train, X_val, X_test = min_max_scaler.transform(X_train), min_max_scaler.transform(X_val), min_max_scaler.transform(X_test)

    # Scale the labels with the training range; val/test may fall outside [0, 1].
    y_min, y_max = np.min(y_train), np.max(y_train)
    y_range = y_max - y_min
    if y_range == 0:
        # Without this guard the division below silently yields NaN/inf labels.
        raise ValueError('Training labels for {} are constant; cannot min-max scale them.'.format(target_index))
    y_train, y_val, y_test = (y_train - y_min) / y_range, (y_val - y_min) / y_range, (y_test - y_min) / y_range

    # SVR model: choose the kernel by validation MSE
    best_model, best_val_mse = None, np.inf
    for my_kernel in ["linear", "poly", "rbf", "sigmoid"]:
        regr = SVR(kernel=my_kernel)
        regr.fit(X_train, y_train)
        val_mse = mean_squared_error(y_val, regr.predict(X_val))
        if val_mse < best_val_mse:
            best_val_mse = val_mse
            best_model = regr

    y_test_predicted = best_model.predict(X_test)
    print('SVR: ',target_index, '---', spearmanr(y_test_predicted, y_test).statistic, '---', pearsonr(y_test_predicted, y_test).statistic)

    # save the model to disk; the context manager guarantees the file is
    # flushed and closed even if pickling fails
    with open('./NYC_Experiments/models/SVR_model_{}.sav'.format(target_index), 'wb') as model_file:
        pickle.dump(best_model, model_file)
    (pd.DataFrame({'y_pred':y_test_predicted, 'y_true':y_test})).to_csv('./NYC_Experiments/results/SVR_model_{}.csv'.format(target_index), index=False)



In [None]:
import pickle
import numpy as np
import pandas as pd
from datetime import datetime
from numpy import nanmin, nanmax, nanmean, nanvar, nanmedian
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from scipy.stats import spearmanr, pearsonr
from sklearn import preprocessing

!pip install pytorch-tabnet
from pytorch_tabnet.tab_model import TabNetRegressor


# TabNet baseline, run once per grid-size index: load the three CSV splits,
# impute and scale with statistics fitted on the training split, train a
# TabNetRegressor with the validation split as eval set, then report test
# correlations and persist the model and its test predictions.
for target_index in ['index002','index004','index006','index008','index01']:
    # 'index002' -> '002': suffix used in the CSV file names
    suffix = target_index[5:]
    train_data = pd.read_csv('./NYC_Experiments/data/train-gridsize-{}.csv'.format(suffix))
    val_data = pd.read_csv('./NYC_Experiments/data/val-gridsize-{}.csv'.format(suffix))
    test_data = pd.read_csv('./NYC_Experiments/data/test-gridsize-{}.csv'.format(suffix))
    splits = (train_data, val_data, test_data)

    # Pull the labels out before the label column is dropped below.
    y_train = train_data['label'].values
    y_val = val_data['label'].values
    y_test = test_data['label'].values

    # Strip the label and the index column from every split, in place.
    for frame in splits:
        frame.drop(['label', target_index], axis=1, inplace=True)

    X_train, X_val, X_test = [frame.values for frame in splits]

    # impute missing values with the training-split column means
    fitted_imputer = SimpleImputer(strategy='mean').fit(X_train)
    X_train, X_val, X_test = [fitted_imputer.transform(part) for part in (X_train, X_val, X_test)]

    # min-max scaling with the training-split ranges
    fitted_scaler = preprocessing.MinMaxScaler().fit(X_train)
    X_train, X_val, X_test = [fitted_scaler.transform(part) for part in (X_train, X_val, X_test)]

    # Labels are scaled with the training-split range as well.
    y_min = np.min(y_train)
    y_max = np.max(y_train)
    y_span = y_max - y_min
    y_train = (y_train - y_min) / y_span
    y_val = (y_val - y_min) / y_span
    y_test = (y_test - y_min) / y_span

    # The targets are passed to TabNet as column vectors of shape (n, 1).
    y_train_column = np.reshape(y_train, (-1, 1))
    y_val_column = np.reshape(y_val, (-1, 1))

    my_regressor = TabNetRegressor()
    my_regressor.fit(
        X_train,
        y_train_column,
        eval_set=[(X_val, y_val_column)],
        max_epochs=200,
        batch_size=32,
    )

    # Flatten the predictions back to one value per test row.
    raw_predictions = my_regressor.predict(X_test)
    y_test_predicted = np.reshape(raw_predictions, (raw_predictions.shape[0],))

    rank_corr = spearmanr(y_test_predicted, y_test).statistic
    linear_corr = pearsonr(y_test_predicted, y_test).statistic
    print('TabNet: ',target_index, '---', rank_corr, '---', linear_corr)

    my_regressor.save_model('./NYC_Experiments/models/Tabnet_model_{}'.format(target_index))
    predictions_table = pd.DataFrame({'y_pred':y_test_predicted, 'y_true':y_test})
    predictions_table.to_csv('./NYC_Experiments/results/Tabnet_model_{}.csv'.format(target_index), index=False)


In [None]:
import pickle
import numpy as np
import pandas as pd
from datetime import datetime
from numpy import nanmin, nanmax, nanmean, nanvar, nanmedian
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from scipy.stats import spearmanr, pearsonr
from sklearn import preprocessing
import xgboost as xgb
!pip install catboost
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error

# CatBoost baseline, run once per grid-size index: load the three CSV splits,
# impute and scale with statistics fitted on the training split, pick the
# tree depth with the lowest validation MSE, then report test correlations
# and write the test predictions to disk.
for target_index in ['index002','index004','index006','index008','index01']:
    # 'index002' -> '002': suffix used in the CSV file names
    suffix = target_index[5:]
    train_data = pd.read_csv('./NYC_Experiments/data/train-gridsize-{}.csv'.format(suffix))
    val_data = pd.read_csv('./NYC_Experiments/data/val-gridsize-{}.csv'.format(suffix))
    test_data = pd.read_csv('./NYC_Experiments/data/test-gridsize-{}.csv'.format(suffix))
    splits = (train_data, val_data, test_data)

    # Pull the labels out before the label column is dropped below.
    y_train = train_data['label'].values
    y_val = val_data['label'].values
    y_test = test_data['label'].values

    # Strip the label and the index column from every split, in place.
    for frame in splits:
        frame.drop(['label', target_index], axis=1, inplace=True)

    X_train, X_val, X_test = [frame.values for frame in splits]

    # impute missing values with the training-split column means
    fitted_imputer = SimpleImputer(strategy='mean').fit(X_train)
    X_train, X_val, X_test = [fitted_imputer.transform(part) for part in (X_train, X_val, X_test)]

    # min-max scaling with the training-split ranges
    fitted_scaler = preprocessing.MinMaxScaler().fit(X_train)
    X_train, X_val, X_test = [fitted_scaler.transform(part) for part in (X_train, X_val, X_test)]

    # Labels are scaled with the training-split range as well.
    y_min = np.min(y_train)
    y_max = np.max(y_train)
    y_span = y_max - y_min
    y_train = (y_train - y_min) / y_span
    y_val = (y_val - y_min) / y_span
    y_test = (y_test - y_min) / y_span

    # CatBoostRegressor model: keep the depth with the lowest validation MSE
    # (best_acc holds that MSE, so lower is better).
    best_model = None
    best_acc = np.inf
    for max_depth_i in (4, 8, 16):
        candidate = CatBoostRegressor(max_depth=max_depth_i)
        candidate.fit(X_train, y_train)
        val_predictions = candidate.predict(X_val)
        res = mean_squared_error(val_predictions, y_val)
        if res < best_acc:
            best_acc, best_model = res, candidate

    y_test_predicted = best_model.predict(X_test)
    rank_corr = spearmanr(y_test_predicted, y_test).statistic
    linear_corr = pearsonr(y_test_predicted, y_test).statistic
    print('CatBoostRegressor: ',target_index, '---', rank_corr, '---', linear_corr)

    predictions_table = pd.DataFrame({'y_pred':y_test_predicted, 'y_true':y_test})
    predictions_table.to_csv('./NYC_Experiments/results/CatBoostRegressor_model_{}.csv'.format(target_index), index=False)
