<a href="https://colab.research.google.com/github/MapleWolfe/Milestone_2/blob/main/Unsupervised_learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Unsupervised Learning techniques

## installs, imports, pre-sets

In [None]:
#!git clone https://github.com/rapidsai/rapidsai-csp-utils.git
#!python rapidsai-csp-utils/colab/pip-install.py
#!pip install google-cloud-storage

In [1]:
#google import options
#from google.colab import drive
from google.cloud import storage

#general usage imports
import tensorflow as tf
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
import gc
import os
import multiprocessing
import pickle
import json
import joblib
#clustering import
from sklearn.cluster import MiniBatchKMeans
from sklearn.cluster import Birch
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
from sklearn.datasets import make_blobs
from sklearn.model_selection import ParameterGrid
from sklearn.cluster import DBSCAN

#PCA imports
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import OPTICS

## CPU set up

In [None]:
# Count the CPU cores visible to this runtime. search_params_kmeans later reads
# num_cpus to size the MiniBatchKMeans batch (1024 * num_cpus).
num_cpus = multiprocessing.cpu_count()
print("Number of available CPUs:", num_cpus)

Number of available CPUs: 64


## GCP set up

In [None]:
# Point the Google client library at the service-account key file, then copy the
# three dataset splits from the bucket into the working directory.
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = '/content/organic-reef-390716-609989a4c6da.json'
client = storage.Client()
bucket = client.get_bucket('fire_train_eval_test_bucket')

# Same download order as before; `blob` ends up bound to the train.csv blob.
for split_file in ('test.csv', 'eval.csv', 'train.csv'):
  blob = bucket.blob(split_file)
  blob.download_to_filename(split_file)

## Functions to load csv chunks

In [None]:
# let's mount the drive
# drive.mount('/content/drive')

In [None]:
#remember to add .csv at the end of file name
def read_csv_in_chunks(file_name,number_images):
  """Open a CSV stored under /content/ for chunk-by-chunk reading.

  Args:
    file_name: file name including the ``.csv`` suffix; appended to '/content/'.
    number_images: number of images each chunk should hold. One image is
      counted as 64*64 rows.

  Returns:
    The result of ``pd.read_csv`` called with ``chunksize`` set to
    ``number_images * 64 * 64`` rows.
  """
  rows_per_image = 64 * 64

  # Google Drive alternative: '/content/drive/MyDrive/' + file_name
  csv_path = '/content/' + file_name
  return pd.read_csv(csv_path, chunksize=number_images * rows_per_image)

def read_full_csv(file_name):
  """Read a whole CSV stored under /content/ in one call.

  Args:
    file_name: file name including the ``.csv`` suffix; appended to '/content/'.

  Returns:
    The result of ``pd.read_csv`` for that path.
  """
  # Google Drive alternative: '/content/drive/MyDrive/' + file_name
  csv_path = '/content/' + file_name
  return pd.read_csv(csv_path)

## Functions to clean CSV chunks

In [4]:
# This is the procedure that cleans the data.
# cleaner_1 drops all negative "FireMask" values and converts all values above 0 to 1
def cleaner_1(df_chunk):
  """Split a raw chunk into model features and targets.

  Rows whose 'FireMask' is negative are removed, and the same row filter is
  applied to the features and to both targets so the three stay row-aligned.

  Args:
    df_chunk: DataFrame holding every column in ``col_list`` plus
      'PrevFireMask' and 'FireMask'.

  Returns:
    A 5-tuple:
      output_chunk: feature columns (``col_list`` order) for rows with FireMask >= 0.
      regressor_target: 'FireMask' values of those same rows.
      classifier_target: numpy array, 1 where regressor_target > 0 else 0.
      original_previous_day_fire: unfiltered 'PrevFireMask' column.
      original_next_day_fire: unfiltered 'FireMask' column.
  """
  col_list = [
      'NDVI_scaled_smoothened_values', 'NDVI_local_gradient', 'NDVI_local_mean',
      'tmmn_scaled_smoothened_values', 'tmmn_local_gradient', 'tmmn_local_mean',
      'elevation_scaled_smoothened_values', 'elevation_local_gradient', 'elevation_local_mean',
      'fire_at_similar_altitude',
      'population_scaled_smoothened_values', 'population_local_gradient', 'population_local_mean',
      'vs_scaled_smoothened_values', 'vs_local_gradient', 'vs_local_mean',
      'pdsi_scaled_smoothened_values', 'pdsi_local_gradient', 'pdsi_local_mean',
      'pr_scaled_smoothened_values', 'pr_local_gradient', 'pr_local_mean',
      'tmmx_scaled_smoothened_values', 'tmmx_local_gradient', 'tmmx_local_mean',
      'sph_scaled_smoothened_values', 'sph_local_gradient', 'sph_local_mean',
      'th_scaled_smoothened_values', 'th_local_gradient', 'th_local_mean',
      'distance_from_fire',
      'erc_scaled_smoothened_values', 'erc_local_gradient', 'erc_local_mean',
  ]

  # The "original" masks are returned untouched (no row filtering).
  original_previous_day_fire = df_chunk['PrevFireMask']
  original_next_day_fire = df_chunk['FireMask']

  #general cleaning for classifier and regressor
  drop_neg_df = df_chunk[df_chunk['FireMask'] >= 0]

  #only regressor selection
  regressor_target = drop_neg_df['FireMask']

  #cleaning specifically for the classifier
  classifier_target = np.where(regressor_target > 0, 1, 0)

  # Features are taken from the filtered frame so they line up with the targets.
  # Selecting col_list already excludes the mask/id columns, so no separate
  # drop() is needed.
  output_chunk = drop_neg_df[col_list]
  return output_chunk,regressor_target,classifier_target, original_previous_day_fire, original_next_day_fire

In [5]:
# Load the full training CSV, keep only the feature columns returned by
# cleaner_1 (the four target/mask outputs are discarded), and fit the
# StandardScaler that every later cell reuses via `scaler`.
train_df = read_full_csv('train.csv')
print('train loaded')
cleaned_df,_,_,_,_ = cleaner_1(train_df)
print('initializing data scaling')
scaler = StandardScaler()
# data_scaled is the standardised training matrix used by the PCA, Birch and OPTICS cells.
data_scaled = scaler.fit_transform(cleaned_df)

train loaded
initializing data scaling


In [None]:
# Release the raw and cleaned training frames; later cells work from
# `data_scaled` and `scaler` only.
del train_df, cleaned_df
gc.collect()

## Unsupervised Learning

#### PCA

In [None]:
# Fit one PCA per candidate dimensionality on the standardised training data,
# pickle each fitted model as 'pca_model_<n>' and record its explained variance.
# random_state is pinned (as the k-means grid does) so the fit is repeatable
# whenever PCA selects its randomized solver; it is stored with the params.
pca_param_grid = {'n_components': [4,6,8,10,12,14,16,18], 'random_state': [0]}
pca_storage_dict = {}
model_counter = 0
for model_counter, params in enumerate(ParameterGrid(pca_param_grid), start=1):
  print('initializing PCA for param: ', params)
  pca_model = PCA(**params)
  pca_model.fit(data_scaled)
  print('pca completed for model : ', model_counter)

  pca_model_name_string = 'pca_model_'+str(model_counter)
  # tolist() yields plain Python floats, which json.dump can serialise
  pca_storage_dict[pca_model_name_string] = [params,
                                             {'explained_variance': pca_model.explained_variance_.tolist()},
                                             {'explained_variance_ratio': pca_model.explained_variance_ratio_.tolist()}]
  print('storing pca file')
  with open(pca_model_name_string, 'wb') as pca_file:
    pickle.dump(pca_model, pca_file)

# The scaler fitted on the training data is saved alongside the PCA models.
print('storing scalar model')
with open('standard_scalar_model', 'wb') as scaler_file:
  pickle.dump(scaler, scaler_file)

print('storing pca performance')
with open('pca_model_performance.json', 'w') as pca_metric_json:
    json.dump(pca_storage_dict, pca_metric_json)

initializing PCA for param:  {'n_components': 4}
pca completed for model :  1
storing pca file
initializing PCA for param:  {'n_components': 6}
pca completed for model :  2
storing pca file
initializing PCA for param:  {'n_components': 8}
pca completed for model :  3
storing pca file
initializing PCA for param:  {'n_components': 10}
pca completed for model :  4
storing pca file
initializing PCA for param:  {'n_components': 12}
pca completed for model :  5
storing pca file
initializing PCA for param:  {'n_components': 14}
pca completed for model :  6
storing pca file
initializing PCA for param:  {'n_components': 16}
pca completed for model :  7
storing pca file
initializing PCA for param:  {'n_components': 18}
pca completed for model :  8
storing pca file
storing scalar model
storing pca performance


#### Kmeans Clustering

In [9]:
# open pca model:
# 'pca_model_8' is the eighth model written by the PCA grid search
# (n_components=18 according to the log printed by that cell).
# NOTE(review): pickle.load can execute code embedded in the file - only load
# files produced by this notebook.
with open('/content/pca_model_8', 'rb') as pca_file:
    loaded_pca_model = pickle.load(pca_file)

In [None]:
def cluster_evaluation(eval_df, cluster_model):
    """Score a fitted k-means style model on an evaluation set.

    Args:
        eval_df: array-like of evaluation samples in the same feature space
            the model was fitted on.
        cluster_model: fitted estimator exposing ``predict`` and ``score``
            (e.g. MiniBatchKMeans).

    Returns:
        (inertia, calinski, davies_bouldin), all computed on ``eval_df``.
    """
    print('evaluation start')
    eval_labels = cluster_model.predict(eval_df)
    # The fitted attribute `inertia_` describes the data seen while fitting,
    # not eval_df. score() returns the negated k-means objective for the data
    # passed in, so negate it to get the inertia of the evaluation set.
    inertia = float(-cluster_model.score(eval_df))
    calinski = calinski_harabasz_score(eval_df, eval_labels)
    davies_bouldin = davies_bouldin_score(eval_df, eval_labels)
    # silhouette score is skipped: it needs pairwise distances, whose cost
    # grows quadratically with the number of samples
    #silhouette = silhouette_score(eval_df, eval_labels)
    print('evaluation complete')
    return inertia, calinski, davies_bouldin

In [None]:
# lets build a function for our kmeans cluster
def search_params_kmeans(file,scaling_model,pca_model,cluster_list=[8,32,64],initialisation_list = ['k-means++', 'random'], random_state = [0]):
  """Lazily train one MiniBatchKMeans per grid combination.

  For every parameter combination the CSV is streamed in chunks of 1000
  images; each chunk is cleaned, scaled, PCA-reduced and passed to
  ``partial_fit``.

  Args:
    file: CSV file name handed to read_csv_in_chunks.
    scaling_model: fitted transformer applied to the cleaned features.
    pca_model: fitted transformer applied after scaling.
    cluster_list: candidate ``n_clusters`` values.
    initialisation_list: candidate ``init`` values.
    random_state: candidate ``random_state`` values.

  Yields:
    (model, params) once a model has seen every chunk of the file.
  """
  search_space = ParameterGrid({
      'n_clusters': cluster_list,
      'init': initialisation_list,
      'random_state': random_state,
      # batch size follows the module-level CPU count
      'batch_size': [1024*num_cpus],
  })
  for params in search_space:
    print('initializing kmeans for param: ', params)
    kmeans = MiniBatchKMeans(**params)
    # the file is re-read from the start for every combination
    for chunk_index, a_chunk in enumerate(read_csv_in_chunks(file,1000)):
      features_df = cleaner_1(a_chunk)[0]
      reduced_chunk = pca_model.transform(scaling_model.transform(features_df))
      kmeans.partial_fit(reduced_chunk)
      print('iteration completed: ', chunk_index)
    yield kmeans, params

In [None]:
%%time
# a dict to store model performance & eval csv file
print('starting to read evaluation dataset')
evaluation_df = read_full_csv('eval.csv')
print('read eval')
cleaned_eval,_,_,_,_ = cleaner_1(evaluation_df)
eval_scaled = scaler.transform(cleaned_eval)
eval_pca = loaded_pca_model.transform(eval_scaled)

#intermediate memory step
print('starting deletion of raw evaluation data')
del evaluation_df
gc.collect()
print('completed deletion of raw evaluation data')

# initiating model building
model_builders = search_params_kmeans(file='train.csv',scaling_model = scaler, pca_model = loaded_pca_model)
model_perform_dict ={}
model_counter = 0

#this where a lot of time will go, it will iterate over each model across grid search
for a_kmean_model, kmean_params in model_builders:
  model_counter +=1
  print('initializing evaluation')
  inertia, calinski, davies_bouldin = cluster_evaluation(eval_pca, a_kmean_model)
  print('evaluation complete')

  model_name = 'kmean_model_'+str(model_counter)
  model_perform_dict[model_name]=[kmean_params,inertia, calinski, davies_bouldin]
  print('storing model')
  with open(model_name, 'wb') as model_file:
    pickle.dump(a_kmean_model, model_file)


In [None]:
# Attach a name to each metric stored in model_perform_dict. Every entry is
# [params, inertia, calinski, davies_bouldin]; positions 1-3 become
# (name, value) pairs.
for a_key in model_perform_dict.keys():
  entry = model_perform_dict[a_key]
  for position, metric_name in enumerate(('inertia', 'calinski', 'davies_bouldin'), start=1):
    # Raw metrics are numbers; a tuple means this cell already ran, so it is
    # skipped to avoid nesting labels as ('inertia', ('inertia', x)).
    if not isinstance(entry[position], tuple):
      entry[position] = (metric_name, entry[position])

#outputing our evaluation metrics for all the models
with open('kmean_model_performance.json', 'w') as kmeans_metric_json:
    json.dump(model_perform_dict, kmeans_metric_json)

#### Birch

In [None]:

# Grid-search Birch: fit on the PCA-reduced training data, label the
# PCA-reduced evaluation data, score the labelling and pickle every model as
# 'birch_model_<n>'. `Birch` comes from sklearn.cluster (see the import cell).
print('PCA transformation of eval initiating')
eval_pca = loaded_pca_model.transform(eval_scaled)

print('PCA transformation of train initiating')
train_pca = loaded_pca_model.transform(data_scaled)

birch_param_grid ={
    'threshold': [0.2,0.4,0.6],
    'n_clusters': [8,32,64],
}

birch_performance = {}
birch_model_number = 0
for birch_model_number, params in enumerate(ParameterGrid(birch_param_grid), start=1):
  print('initializing birch model for params: ', params)
  birch_model = Birch(**params)
  print('fitting birch')
  birch_model.fit(train_pca)
  print('predicting on birch')
  birch_labels = birch_model.predict(eval_pca)
  print('evaluating using similar metrics')
  # float() keeps the stored metrics as plain Python numbers for json.dump
  calinski = float(calinski_harabasz_score(eval_pca, birch_labels))
  davies_bouldin = float(davies_bouldin_score(eval_pca, birch_labels))
  print('model is complete, now outputing the data')
  birch_model_name = 'birch_model_' +str(birch_model_number)
  birch_performance[birch_model_name] = {'params':params,'calinski':calinski,'davies_bouldin':davies_bouldin }
  print('outputing the model')

  with open(birch_model_name, 'wb') as birch_file:
    pickle.dump(birch_model, birch_file)

# NOTE(review): the file name is spelled 'bich_...'; kept unchanged so anything
# that already reads this artefact keeps working.
with open('bich_model_performance.json', 'w') as birch_metric_json:
    json.dump(birch_performance, birch_metric_json)

PCA transformation of eval initiating
PCA transformation of train initiating
initializing birch model for params:  {'n_clusters': 8, 'threshold': 0.2}
fitting birch


<class 'numpy.ndarray'>


#### OPTICS

In [None]:
# Grid-search OPTICS on the PCA-reduced training data and pickle every model
# as 'optic_model_<n>'.
train_pca = loaded_pca_model.transform(data_scaled)


optics_param_grid = {
    'min_samples': [0.01,0.05,0.1],
    'cluster_method': ['xi','dbscan'],
    'n_jobs': [-1]
}

optics_performance = {}
optics_model_number = 0
for params in ParameterGrid(optics_param_grid):
  optics_model_number +=1
  print('initializing optics model for params: ', params)
  optics_model = OPTICS(**params)
  print('fitting optics')
  print('predicting on optics')
  # fit_predict labels the samples it was fitted on, i.e. train_pca
  optics_labels = optics_model.fit_predict(train_pca)
  print('evaluating using similar metrics')
  # The labels belong to train_pca, so the metrics must be computed on
  # train_pca; scoring them against eval_pca mismatches the sample counts.
  if np.unique(optics_labels).size < 2:
    # both metrics need at least two distinct labels; record "no score"
    calinski, davies_bouldin = None, None
  else:
    calinski = float(calinski_harabasz_score(train_pca, optics_labels))
    davies_bouldin = float(davies_bouldin_score(train_pca, optics_labels))
  print('model is complete, now outputing the data')
  optic_model_name = 'optic_model_' +str(optics_model_number)
  optics_performance[optic_model_name] = {'params':params,'calinski':calinski,'davies_bouldin':davies_bouldin }
  print('outputing the model')

  with open(optic_model_name, 'wb') as optic_file:
    pickle.dump(optics_model, optic_file)

with open('optic_model_performance.json', 'w') as optic_metric_json:
    json.dump(optics_performance, optic_metric_json)

initializing optics model for params:  {'cluster_method': 'xi', 'min_samples': 0.01, 'n_jobs': -1}
fitting optics
predicting on optics
