In [16]:
import numpy as np
import pandas as pd
from torch.utils.data import DataLoader, Dataset

In [166]:
# import the torch_framework package
from analysis_utils.torch_framework.ResNet18 import ResNet18
from analysis_utils.torch_framework.SatDataset import SatDataset
from analysis_utils.torch_framework.FeatureExtractor import FeatureExtractor
from analysis_utils.torch_framework.FeatureExtractor import reduce_dimensions

# load the variable names of the tabular feature data
from analysis_utils.variable_names import *

# load the uids to exclude from the analysis due to missing values in the images
from analysis_utils.flagged_uids import *

# load the functions to do spatial CV
from analysis_utils.spatial_CV import *

In [18]:
# global file paths (relative to the notebook's working directory)
root_data_dir = "../../Data"

# csv file with the processed LSMS labels
lsms_pth = f"{root_data_dir}/lsms/processed/labels_cluster_v1.csv"

# load the LSMS labels and remove every row whose unique_id is flagged
# (flagged_uids is presumably provided by the wildcard import from
# analysis_utils.flagged_uids -- verify)
lsms_df = pd.read_csv(lsms_pth)
# drop=True renumbers the remaining rows 0..n-1 WITHOUT keeping the old row
# labels as a stray 'index' column in the labels frame
lsms_df = lsms_df[~lsms_df.unique_id.isin(flagged_uids)].reset_index(drop=True)

# device handed to the FeatureExtractor further down
device = 'cpu'

In [19]:
# build the ResNet18 wrapper for 3-channel input, requesting pre-trained weights
resnet = ResNet18(
    input_channels=3,
    use_pretrained_weights=True,
    scaled_weight_init=False,
    random_seed=None,
)

# put the stored pre-trained first-layer weights back onto conv1
resnet.model.conv1.weight = resnet.layer1_pretrained_weights

# wrap the underlying model in the feature extractor, using the configured device
feature_extractor = FeatureExtractor(model=resnet.model, device=device)

In [20]:
# root directory of the satellite images
sat_img_dir = f"{root_data_dir}/satellite_imgs"

# LS RGB images; the dataset below looks them up via 'unique_id'
# (these are NOT the cluster-level median images -- those are handled separately)
LS_rgb_img_dir = f"{sat_img_dir}/LS/LS_rgb"

# dataset over all rows of lsms_df; no feature or target transforms are applied
rgb_dat = SatDataset(
    labels_df=lsms_df,
    img_dir=LS_rgb_img_dir,
    data_type="LS",
    target_var='mean_asset_index_yeh',  # required argument, not relevant for feature extraction
    id_var='unique_id',
    feat_transform=None,
    target_transform=None,
    random_seed=None,
)

# shuffle=False keeps the batches in dataset order, so the extracted feature rows
# can later be matched back to lsms_df by position
rgb_loader = DataLoader(rgb_dat, batch_size=512, shuffle=False)

# run the feature extractor over the loader (reduced=False: no reduction requested)
rgb_feats = feature_extractor.extract_feats(rgb_loader, reduced=False)

		Extracting Features


  0%|          | 0/13 [00:00<?, ?it/s]

In [114]:
# extract features for the cluster-level median LS images
LS_median_img_dir = f"{sat_img_dir}/LS/LS_rgb_median_cluster"

# one row per cluster_id: average the asset index over all rows of each cluster
cl_df = (
    lsms_df[['cluster_id', 'mean_asset_index_yeh']]
    .groupby('cluster_id')
    .mean()
    .reset_index()
)

# dataset over the clusters; images are looked up via 'cluster_id'
median_dat = SatDataset(
    labels_df=cl_df,
    img_dir=LS_median_img_dir,
    data_type="LS",
    target_var='mean_asset_index_yeh',  # required argument, not relevant for feature extraction
    id_var='cluster_id',
    feat_transform=None,
    target_transform=None,
    random_seed=None,
)

# shuffle=False keeps the batches in dataset order, so the extracted feature rows
# can later be matched back to cl_df by position
median_loader = DataLoader(median_dat, batch_size=512, shuffle=False)

# run the feature extractor over the loader (reduced=False: no reduction requested)
median_feats = feature_extractor.extract_feats(median_loader, reduced=False)

		Extracting Features


  0%|          | 0/5 [00:00<?, ?it/s]

In [174]:
# stack the per-observation features on top of the cluster-median features
# (row order matters: rgb_feats rows come first, median_feats rows second)
feats = np.concatenate(
    [rgb_feats, median_feats],
    axis=0,
)

# joint PCA over both feature sets, keeping the first 25 components
pca_results = reduce_dimensions(
    feats,
    n_components=25,
)

		Total variance explained by first 25 components: 0.9042


In [178]:
# split the PCA results back into the two datasets and rename the variables
# The combined feature matrix was built as [rgb_feats, median_feats] along axis 0,
# so the first len(lsms_df) rows are expected to belong to lsms_df and the
# remaining rows to cl_df (assumes reduce_dimensions keeps the row order).
n_dyn = len(lsms_df)
n_median = len(cl_df)

# guard against a silent misalignment: if the number of extracted rows does not
# match the two label frames, the positional split below would attach
# components to the wrong ids without raising any error
assert len(pca_results) == n_dyn + n_median, (
    f"pca_results has {len(pca_results)} rows, "
    f"expected {n_dyn} + {n_median} = {n_dyn + n_median}"
)

# per-observation components: columns dyn_rgb_pc_1 .. dyn_rgb_pc_k plus the id
dyn_pca_feats = pd.DataFrame(pca_results[:n_dyn, :])
dyn_pca_feats.columns = [f'dyn_rgb_pc_{i + 1}' for i in dyn_pca_feats.columns]
# .to_numpy() attaches the ids by position instead of by index label, so the
# result does not depend on lsms_df carrying a fresh 0..n-1 index
dyn_pca_feats['unique_id'] = lsms_df['unique_id'].to_numpy()

# cluster-median components: columns median_rgb_pc_1 .. median_rgb_pc_k plus the id
median_pca_feats = pd.DataFrame(pca_results[n_dyn:, :])
median_pca_feats.columns = [f'median_rgb_pc_{i + 1}' for i in median_pca_feats.columns]
median_pca_feats['cluster_id'] = cl_df['cluster_id'].to_numpy()

In [179]:
# write both feature tables to csv files (without the row index)
for pth, feat_table in (
    (f"{root_data_dir}/satellite_imgs/LS/dyn_rgb_feats.csv", dyn_pca_feats),
    (f"{root_data_dir}/satellite_imgs/LS/median_rgb_feats.csv", median_pca_feats),
):
    feat_table.to_csv(pth, index=False)