<a href="https://colab.research.google.com/github/MapleWolfe/Milestone_2/blob/main/Supervised_learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Supervised learning notebook

## installs, imports, pre-sets

In [None]:
!git clone https://github.com/rapidsai/rapidsai-csp-utils.git
!python rapidsai-csp-utils/colab/pip-install.py
!pip install google-cloud-storage

In [8]:
#google import options
#from google.colab import drive
from google.cloud import storage

#general usage imports
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
import gc
import os
import multiprocessing
import pickle
import json
import joblib

#model operations imports
from sklearn.cluster import MiniBatchKMeans
from sklearn.model_selection import ParameterGrid
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

#sklearn classifiers
from sklearn.linear_model import SGDClassifier
#from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
# regression
from sklearn.linear_model import Lasso

#GPU imports
import cudf
from cuml.naive_bayes import GaussianNB
from cuml.naive_bayes import ComplementNB
from cuml import LogisticRegression
from cuml.ensemble import RandomForestClassifier

import cupy
import xgboost as xgb



## GCP set up

In [4]:
# Point the Google client at the credentials file, then copy the three dataset
# CSVs from the bucket into the current working directory.
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = '/content/organic-reef-390716-609989a4c6da.json'
client = storage.Client()
bucket = client.get_bucket('fire_train_eval_test_bucket')
# Downloaded in the order test -> eval -> train; `blob` ends up bound to the
# train.csv blob once the loop finishes.
for split_file in ('test.csv', 'eval.csv', 'train.csv'):
  blob = bucket.blob(split_file)
  blob.download_to_filename(split_file)

## Functions to load and clean csv chunks

In [1]:
#remember to add .csv at the end of file name
def read_csv_in_chunks(file_name,number_images):
  """Open a CSV stored under /content/ for chunked reading.

  file_name: file name including the '.csv' extension.
  number_images: how many images each chunk should span; the code counts
    64*64 rows per image, so a chunk holds number_images * 64 * 64 rows.

  Returns the object produced by pd.read_csv when called with chunksize.
  """
  rows_per_image = 64*64
  return pd.read_csv('/content/' + file_name, chunksize=number_images*rows_per_image)

def read_full_csv(file_name):
  """Read the whole CSV named file_name from /content/ with a single pd.read_csv call.

  file_name must include the '.csv' extension.
  """
  return pd.read_csv('/content/' + file_name)

#this is procedure that cleans the data.
# cleaner_1 drops all negative "firemask" values and converts all values above 0 to 1
def cleaner_1(df_chunk):
  """Split a raw chunk into model features and fire targets.

  Rows whose 'FireMask' is negative are removed before the features and the
  two targets are built. The last two return values are read from df_chunk
  before that filtering, so they still contain every input row.

  Returns a 5-tuple:
    1. feature frame: filtered rows, restricted to the feature columns listed below
    2. regressor target: 'FireMask' of the filtered rows
    3. classifier target: numpy array, 1 where the regressor target is > 0, else 0
    4. unfiltered 'PrevFireMask' column
    5. unfiltered 'FireMask' column
  """
  feature_columns = [
    'NDVI_scaled_smoothened_values', 'NDVI_local_gradient', 'NDVI_local_mean',
    'tmmn_scaled_smoothened_values', 'tmmn_local_gradient', 'tmmn_local_mean',
    'elevation_scaled_smoothened_values', 'elevation_local_gradient', 'elevation_local_mean',
    'fire_at_similar_altitude',
    'population_scaled_smoothened_values', 'population_local_gradient', 'population_local_mean',
    'vs_scaled_smoothened_values', 'vs_local_gradient', 'vs_local_mean',
    'pdsi_scaled_smoothened_values', 'pdsi_local_gradient', 'pdsi_local_mean',
    'pr_scaled_smoothened_values', 'pr_local_gradient', 'pr_local_mean',
    'tmmx_scaled_smoothened_values', 'tmmx_local_gradient', 'tmmx_local_mean',
    'sph_scaled_smoothened_values', 'sph_local_gradient', 'sph_local_mean',
    'th_scaled_smoothened_values', 'th_local_gradient', 'th_local_mean',
    'distance_from_fire',
    'erc_scaled_smoothened_values', 'erc_local_gradient', 'erc_local_mean',
  ]

  # untouched copies of both fire columns (taken before any row is dropped)
  prev_fire_all_rows = df_chunk['PrevFireMask']
  next_fire_all_rows = df_chunk['FireMask']

  # shared cleaning step: keep only rows with a non-negative FireMask
  non_negative = df_chunk['FireMask'] >= 0
  valid_rows = df_chunk[non_negative]

  # regression target keeps the FireMask values as they are
  fire_values = valid_rows['FireMask']

  # classification target collapses every positive value to 1
  fire_flags = np.where(fire_values > 0, 1, 0)

  without_targets = valid_rows.drop(labels=['PrevFireMask','FireMask','image_id'], axis=1)
  features = without_targets[feature_columns]
  return features, fire_values, fire_flags, prev_fire_all_rows, next_fire_all_rows


## loading unsupervised models

In [3]:
# Restore the three pickled unsupervised-stage objects from /content/.
# NOTE: pickle.load can execute code stored in the file, so only load
# artifacts that come from a trusted source.
def _unpickle(path):
  """Open path in binary mode and return the object pickled inside it."""
  with open(path, 'rb') as handle:
    return pickle.load(handle)

#standard_scalar_model
loaded_scalar_model = _unpickle('/content/standard_scalar_model')

# pca model chosen:
loaded_pca_model = _unpickle('/content/pca_model_8')

#kmeans model chosen:
loaded_kmean_model = _unpickle('/content/kmean_model_1')

## setting up data

In [4]:
def _prepare_split(split_name):
  """Load '<split_name>.csv' and push it through clean -> scale -> PCA -> k-means.

  Uses read_full_csv and cleaner_1 together with the already-loaded
  loaded_scalar_model, loaded_pca_model and loaded_kmean_model. Each large
  intermediate is deleted and garbage-collected as soon as the next stage
  has consumed it, to keep peak memory down.

  Returns a 6-tuple:
    (pca_data, cluster_labels, regressor_target, classifier_target,
     original_previous_day_fire, original_next_day_fire)
  where the last four items are passed through unchanged from cleaner_1.
  """
  raw_df = read_full_csv(split_name + '.csv')
  print(split_name + ' loaded')
  cleaned_df, regressor_target, classifier_target, previous_day_fire, next_day_fire = cleaner_1(raw_df)
  del raw_df
  gc.collect()

  print('initializing ' + split_name + ' data scaling')
  scaled = loaded_scalar_model.transform(cleaned_df)
  del cleaned_df
  gc.collect()

  print('initializing ' + split_name + ' pca')
  pca_data = loaded_pca_model.transform(scaled)
  del scaled
  gc.collect()

  print('getting ' + split_name + ' cluster labels')
  cluster_labels = loaded_kmean_model.predict(pca_data)
  return pca_data, cluster_labels, regressor_target, classifier_target, previous_day_fire, next_day_fire


#train
(train_data_pca, train_cluster_labels,
 train_regressor_target, train_classifier_target,
 train_original_previous_day_fire, train_original_next_day_fire) = _prepare_split('train')

#evaluation
(eval_data_pca, eval_cluster_labels,
 eval_regressor_target, eval_classifier_target,
 eval_original_previous_day_fire, eval_original_next_day_fire) = _prepare_split('eval')

#test
(test_data_pca, test_cluster_labels,
 test_regressor_target, test_classifier_target,
 test_original_previous_day_fire, test_original_next_day_fire) = _prepare_split('test')



train loaded
initializing train data scaling
initializing train pca
getting train cluster labels
eval loaded
initializing eval data scaling
initializing eval pca
getting eval cluster labels
test loaded
initializing test data scaling
initializing test pca
getting test cluster labels


## breaking data into clusters function

In [5]:
def break_cluster(pca_data,cluster_labels,supervised_target):
  """Yield (label, feature_rows, target_rows) for every distinct cluster label.

  pca_data: array indexed positionally by row.
  cluster_labels: one label per row of pca_data.
  supervised_target: one target per row of pca_data; a numpy array or a
    pandas Series/DataFrame.

  Labels are visited in the sorted order returned by np.unique. Rows are
  always selected by position.
  """
  # `series[int_array]` on a pandas object looks rows up by index label, not by
  # position. A Series that was row-filtered earlier keeps its gapped index, so
  # that lookup would raise or pick the wrong rows. Switch to the raw values.
  if isinstance(supervised_target, (pd.Series, pd.DataFrame)):
    supervised_target = supervised_target.to_numpy()

  for a_label in np.unique(cluster_labels):
    label_index = np.where(cluster_labels == a_label)[0]
    yield a_label, pca_data[label_index], supervised_target[label_index]

## Classifier Model building

#### Logistic regression

In [11]:
def logistic_train(a_label,feature_df,target_array,penalty = ['l2']):
  """Fit and pickle one LogisticRegression per entry of the penalty grid.

  a_label: cluster identifier, used in log lines and in the saved file name.
  feature_df, target_array: training features and labels passed to fit().
  penalty: list of penalty values to try.

  Every fitted model is pickled to 'cluster_<a_label>_logistic_model_<n>',
  where n counts grid points starting at 1.
  Returns a dict mapping each saved file name to the parameters it used.
  """
  saved_models = {}
  grid = ParameterGrid({'penalty': penalty})
  for model_number, grid_point in enumerate(grid, start=1):
    print('cluster: ', a_label, ', logistic regression for params: ', grid_point)
    print('initializing training of logistic regression')
    classifier = LogisticRegression(**grid_point)
    classifier.fit(feature_df,target_array)
    print('training complete')
    print('saving model...')
    file_name = f'cluster_{a_label!s}_logistic_model_{model_number!s}'
    with open(file_name, 'wb') as out_file:
      pickle.dump(classifier, out_file)
    print('model saved')
    saved_models[file_name] = grid_point
  return saved_models


#### SGD CLASSIFIER

In [12]:
def sgd_train(a_label,feature_df,target_array,penalty = ['l1', 'l2', 'elasticnet'],random_state=[0],n_jobs=[-1]):
  """Fit and pickle one SGDClassifier per combination of the given grids.

  a_label: cluster identifier, used in log lines and in the saved file name.
  feature_df, target_array: training features and labels passed to fit().
  penalty, random_state, n_jobs: lists of candidate values; every combination
    produced by ParameterGrid is trained.

  Every fitted model is pickled to 'cluster_<a_label>_SGD_model_<n>', where n
  counts grid points starting at 1.
  Returns a dict mapping each saved file name to the parameters it used.
  """
  saved_models = {}
  grid = ParameterGrid({'penalty': penalty,'random_state':random_state,'n_jobs':n_jobs})
  for model_number, grid_point in enumerate(grid, start=1):
    print('cluster: ', a_label, ', SGD for params: ', grid_point)
    print('initializing training of SGD ')
    classifier = SGDClassifier(**grid_point)
    classifier.fit(feature_df,target_array)
    print('training complete')
    print('saving model...')
    file_name = f'cluster_{a_label!s}_SGD_model_{model_number!s}'
    with open(file_name, 'wb') as out_file:
      pickle.dump(classifier, out_file)
    print('model saved')
    saved_models[file_name] = grid_point
  return saved_models


#### Linear SVC

In [13]:
def linear_svc_train(a_label,feature_df,target_array,penalty = ['l2'],random_state=[0]):
  """Fit and pickle one LinearSVC per combination of the given grids.

  a_label: cluster identifier, used in log lines and in the saved file name.
  feature_df, target_array: training features and labels passed to fit().
  penalty, random_state: lists of candidate values; every combination
    produced by ParameterGrid is trained.

  Every fitted model is pickled to 'cluster_<a_label>_linearSVC_model_<n>',
  where n counts grid points starting at 1.
  Returns a dict mapping each saved file name to the parameters it used.
  """
  saved_models = {}
  grid = ParameterGrid({'penalty': penalty,'random_state':random_state})
  for model_number, grid_point in enumerate(grid, start=1):
    print('cluster: ', a_label, ', linear svc for params: ', grid_point)
    print('initializing training of svc')
    classifier = LinearSVC(**grid_point)
    classifier.fit(feature_df,target_array)
    print('training complete')
    print('saving model...')
    file_name = f'cluster_{a_label!s}_linearSVC_model_{model_number!s}'
    with open(file_name, 'wb') as out_file:
      pickle.dump(classifier, out_file)
    print('model saved')
    saved_models[file_name] = grid_point
  return saved_models

#### Random Forest

In [14]:
def random_forest_train(a_label,feature_df,target_array,n_estimators = [100,200,400],min_samples_split = [4096],random_state=[0]):
  """Fit and pickle one RandomForestClassifier per combination of the given grids.

  a_label: cluster identifier, used in log lines and in the saved file name.
  feature_df, target_array: training features and labels passed to fit().
  n_estimators, min_samples_split, random_state: lists of candidate values;
    every combination produced by ParameterGrid is trained.

  Every fitted model is pickled to 'cluster_<a_label>_random_forest_model_<n>',
  where n counts grid points starting at 1.
  Returns a dict mapping each saved file name to the parameters it used.
  """
  model_counter = 0
  # min_samples_split used to be accepted but left out of the grid, so the
  # caller's value never reached the estimator. It is now searched like the
  # other hyper-parameters.
  hyper_params ={'n_estimators': n_estimators,
                 'min_samples_split': min_samples_split,
                 'random_state':random_state}
  name_param = {}
  for params in ParameterGrid(hyper_params):
    print('cluster: ', a_label, ', random forest for params: ', params)
    model_counter+=1
    print('initializing training of random forest')
    rfc = RandomForestClassifier(**params)
    rfc.fit(feature_df,target_array)
    print('training complete')
    print('saving model...')
    model_name = 'cluster_'+str(a_label)+'_random_forest_model_'+str(model_counter)
    with open(model_name, 'wb') as model_file:
      pickle.dump(rfc, model_file)
    print('model saved')
    name_param[model_name] = params
  return name_param

#### XGB classifier

In [36]:
def xgb_train(a_label,feature_df,target_array,n_estimators = [100,200,400],tree_method =['gpu_hist'], objective = ['binary:logistic'] ):
  """Fit and pickle one xgb.XGBClassifier per combination of the given grids.

  a_label: cluster identifier, used in log lines and in the saved file name.
  feature_df, target_array: training features and labels passed to fit().
  n_estimators, tree_method, objective: lists of candidate values; every
    combination produced by ParameterGrid is trained.

  Every fitted model is pickled to 'cluster_<a_label>_xgb_model_<n>', where n
  counts grid points starting at 1.
  Returns a dict mapping each saved file name to the parameters it used.
  """
  # NOTE(review): newer XGBoost releases deprecate tree_method='gpu_hist' in
  # favour of device='cuda' with tree_method='hist'; confirm against the
  # installed version before changing the default.
  model_counter = 0
  hyper_params ={'n_estimators': n_estimators,'tree_method':tree_method, 'objective':objective}
  name_param = {}
  for params in ParameterGrid(hyper_params):
    print('cluster: ', a_label, ', xgb for params: ', params)
    model_counter+=1
    # this progress line previously said "random forest" (copy-paste slip)
    print('initializing training of xgb classifier')
    xgc = xgb.XGBClassifier(**params)
    xgc.fit(feature_df,target_array)
    print('training complete')
    print('saving model...')
    model_name = 'cluster_'+str(a_label)+'_xgb_model_'+str(model_counter)
    with open(model_name, 'wb') as model_file:
      pickle.dump(xgc, model_file)
    print('model saved')
    name_param[model_name] = params
  return name_param

#### Complement Naive Bayes

In [16]:
def complement_nb_train(a_label,feature_df,target_array):
  """Fit a ComplementNB with default parameters for one cluster and pickle it.

  a_label: cluster identifier, used in log lines and in the saved file name.
  feature_df, target_array: training features and labels passed to fit().

  The fitted model is pickled to 'cluster_<a_label>_ComplementNB_1'.
  Returns {file_name: 'default params'}.
  """
  name_param = {}
  print('cluster: ', a_label, ', Complement for params: ',)
  model_counter=1
  print('initializing training of ComplementNB')
  # This used to build GaussianNB(), so the file saved as "ComplementNB" was
  # just a second Gaussian model.
  # NOTE(review): ComplementNB is normally meant for non-negative features,
  # while PCA output can be negative; confirm it is suitable for this data.
  cnb = ComplementNB()
  cnb.fit(feature_df,target_array)
  print('training complete')
  print('saving model...')
  model_name = 'cluster_'+str(a_label)+'_ComplementNB_'+str(model_counter)
  with open(model_name, 'wb') as model_file:
    pickle.dump(cnb, model_file)
  print('model saved')
  name_param[model_name] = 'default params'
  return name_param

#### Gaussian Naive Bayes

In [17]:
def gaussian_nb_train(a_label,feature_df,target_array):
  """Fit a GaussianNB with default parameters for one cluster and pickle it.

  a_label: cluster identifier, used in log lines and in the saved file name.
  feature_df, target_array: training features and labels passed to fit().

  The fitted model is pickled to 'cluster_<a_label>_guassianNB_1' (the
  spelling is kept as-is because it is the file name written to disk).
  Returns {file_name: 'default params'}.
  """
  print('cluster: ', a_label, ', gaussian for params: ')
  print('initializing training of guassianNB')
  classifier = GaussianNB()
  classifier.fit(feature_df,target_array)
  print('training complete')
  print('saving model...')
  file_name = f'cluster_{a_label!s}_guassianNB_1'
  with open(file_name, 'wb') as out_file:
    pickle.dump(classifier, out_file)
  print('model saved')
  return {file_name: 'default params'}

#### classifier training

In [37]:
# Trainers to run for every cluster. Each one takes (label, feature, target)
# and returns a {model_file_name: params} dict. Add any of logistic_train,
# sgd_train, linear_svc_train, random_forest_train, complement_nb_train or
# gaussian_nb_train to this list to train them as well.
classifier_trainers = [xgb_train]

# main_dict maps each cluster label to the list of dicts returned by the
# trainers. It used to stay empty because the only assignment was commented
# out, so the JSON written afterwards contained no model ids.
main_dict = {}
cluster = break_cluster(train_data_pca,train_cluster_labels,train_classifier_target)
for label,feature,target in cluster:
  # cluster label, row counts, and which classes are present in this cluster
  print(label, len(feature),len(target))
  print(np.unique(target))
  main_dict[label] = [trainer(label,feature,target) for trainer in classifier_trainers]

0 478301 478301
[0 1]
<class 'numpy.ndarray'>
cluster:  0 , xgb for params:  {'n_estimators': 100, 'objective': 'binary:logistic', 'tree_method': 'gpu_hist'}
initializing training of random forest
training complete
saving model...
model saved
cluster:  0 , xgb for params:  {'n_estimators': 200, 'objective': 'binary:logistic', 'tree_method': 'gpu_hist'}
initializing training of random forest
training complete
saving model...
model saved
cluster:  0 , xgb for params:  {'n_estimators': 400, 'objective': 'binary:logistic', 'tree_method': 'gpu_hist'}
initializing training of random forest
training complete
saving model...
model saved


In [35]:
# Re-key main_dict by the string form of each cluster label before serialising.
save_dict = {str(cluster_id): model_info for cluster_id, model_info in main_dict.items()}

# writing out dict as json
with open('classiier_id_dict.json', 'w') as clf_dict:
  json.dump(save_dict, clf_dict)

## Regressor training

#### XGB regressor

In [6]:
def xgb_reg_train(a_label,feature_df,target_array,n_estimators = [100,200,400],tree_method =['gpu_hist'] ):
  """Fit and pickle one xgb.XGBRegressor per combination of the given grids.

  a_label: cluster identifier, used in log lines and in the saved file name.
  feature_df, target_array: training features and targets passed to fit().
  n_estimators, tree_method: lists of candidate values; every combination
    produced by ParameterGrid is trained.

  Every fitted model is pickled to 'cluster_<a_label>_xgb_regressor_model_<n>',
  where n counts grid points starting at 1.
  Returns a dict mapping each saved file name to the parameters it used.
  """
  # NOTE(review): newer XGBoost releases deprecate tree_method='gpu_hist' in
  # favour of device='cuda' with tree_method='hist'; confirm against the
  # installed version before changing the default.
  model_counter = 0
  hyper_params ={'n_estimators': n_estimators,'tree_method':tree_method}
  name_param = {}
  for params in ParameterGrid(hyper_params):
    print('cluster: ', a_label, ', xgb for params: ', params)
    model_counter+=1
    # this progress line previously said "random forest" (copy-paste slip)
    print('initializing training of xgb regressor')
    xgr = xgb.XGBRegressor(**params)
    xgr.fit(feature_df,target_array)
    print('training complete')
    print('saving model...')
    model_name = 'cluster_'+str(a_label)+'_xgb_regressor_model_'+str(model_counter)
    with open(model_name, 'wb') as model_file:
      pickle.dump(xgr, model_file)
    print('model saved')
    name_param[model_name] = params
  return name_param

#### Lasso regressor

In [9]:
def Lasso_reg_train(a_label,feature_df,target_array,alpha = [0.5,1,1.5,2] ):
  """Fit and pickle one Lasso regressor per alpha value.

  a_label: cluster identifier, used in log lines and in the saved file name.
  feature_df, target_array: training features and targets passed to fit().
  alpha: list of alpha values to try.

  Every fitted model is pickled to 'cluster_<a_label>_lasso_<n>', where n
  counts grid points starting at 1.
  Returns a dict mapping each saved file name to the parameters it used.
  """
  saved_models = {}
  grid = ParameterGrid({'alpha' : alpha})
  for model_number, grid_point in enumerate(grid, start=1):
    print('cluster: ', a_label, ', lasso for params: ', grid_point)
    print('initializing training of lasso')
    regressor = Lasso(**grid_point)
    regressor.fit(feature_df,target_array)
    print('training complete')
    print('saving model...')
    file_name = f'cluster_{a_label!s}_lasso_{model_number!s}'
    with open(file_name, 'wb') as out_file:
      pickle.dump(regressor, out_file)
    print('model saved')
    saved_models[file_name] = grid_point
  return saved_models

#### regressor_training

In [None]:
# regressor_dict maps str(cluster label) -> [xgb regressor dict, lasso dict],
# each inner dict being {model_file_name: params}.
regressor_dict = {}
# Train on the regression target (FireMask values of the cleaned rows). This
# cell used to pass train_classifier_target, so the "regressors" were fitted
# on the binary classifier labels.
# np.asarray gives break_cluster a positional array; the pandas Series keeps
# the gapped index left by row filtering.
cluster = break_cluster(train_data_pca,train_cluster_labels,np.asarray(train_regressor_target))
for label,feature,target in cluster:
  xgb_reg_dict = xgb_reg_train(label,feature,target)
  las_reg_dict = Lasso_reg_train(label,feature,target)
  regressor_dict[str(label)] = [xgb_reg_dict,las_reg_dict]

In [11]:
# Persist the per-cluster regressor file names and parameters as JSON.
with open('regressor_id_dict.json', 'w') as regressor_json:
  json.dump(regressor_dict, regressor_json)