In [2]:
# Mount Google Drive inside the Colab runtime at /content/drive so the
# notebook can reach the project files stored there.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import os 
import pandas as pd
import numpy as np
import math
from skimage import io

# Project folder on the mounted Google Drive; the %cd below makes it the
# working directory.
DIR = '/content/drive/MyDrive/Penn/MUSA-650/HW4'
# Dataset root, relative to DIR; it is the starting point of the os.walk calls.
DATA_DIR = 'eurosat/'

# Switch into the project folder so relative paths such as DATA_DIR resolve.
%cd {DIR}

/content/drive/MyDrive/Penn/MUSA-650/HW4


In [4]:
# Snapshot of the directory tree below DATA_DIR: a list of
# (root, dirnames, filenames) tuples in os.walk order.
data_dict = list(os.walk(DATA_DIR))

In [5]:
# Build one metadata row per image file found under DATA_DIR.
#
# Expected layout: <DATA_DIR>/<img_type>/<class>/<name>_<number>.<ext>
# Anything that does not fit is skipped rather than crashing the walk:
#   * files with no extension (this includes hidden files like .DS_Store),
#   * CSV files (e.g. the flattened exports written by this notebook),
#   * files that sit less than two folders below DATA_DIR,
#   * names without an integer "_<number>" suffix.
image_records = []
for root, dirs, files in os.walk(DATA_DIR):
    # Work with folder names relative to DATA_DIR so the positions of
    # img_type / class do not depend on how DATA_DIR itself is written.
    rel_parts = os.path.relpath(root, DATA_DIR).split(os.sep)
    if len(rel_parts) < 2:
        continue
    mainfolder, subfolder = rel_parts[0], rel_parts[1]

    for filename in files:
        # splitext cuts at the LAST dot, so names with several dots still
        # yield the real extension.
        stem, ext = os.path.splitext(filename)
        if not ext or ext.lower() == '.csv':
            continue

        # The image number is whatever follows the last underscore.
        _, sep, suffix = stem.rpartition('_')
        if not sep:
            continue
        try:
            num = str(int(suffix)).zfill(4)  # zero-pad to at least 4 characters
        except ValueError:
            continue

        path = os.path.join(root, filename)
        ftype = ext[1:]  # extension without the leading dot
        image_records.append([stem, subfolder, num, ftype, mainfolder, path])

data_df = pd.DataFrame(
    image_records,
    columns=['filename', 'class', 'num', 'file_type', 'img_type', 'path'],
)

# NOTE: despite its name, rgb_df holds the rows whose img_type is 'MS'.
# .copy() makes it an independent frame instead of a slice of data_df.
rgb_df = data_df[data_df['img_type'] == 'MS'].copy()

In [6]:
# Show the metadata rows whose img_type is 'MS' (last expression of the cell,
# so the notebook renders it).
rgb_df

Unnamed: 0,filename,class,num,file_type,img_type,path
0,PermanentCrop_354,PermanentCrop,0354,tif,MS,eurosat/MS/PermanentCrop/PermanentCrop_354.tif
1,PermanentCrop_881,PermanentCrop,0881,tif,MS,eurosat/MS/PermanentCrop/PermanentCrop_881.tif
2,PermanentCrop_1149,PermanentCrop,1149,tif,MS,eurosat/MS/PermanentCrop/PermanentCrop_1149.tif
3,PermanentCrop_735,PermanentCrop,0735,tif,MS,eurosat/MS/PermanentCrop/PermanentCrop_735.tif
4,PermanentCrop_270,PermanentCrop,0270,tif,MS,eurosat/MS/PermanentCrop/PermanentCrop_270.tif
...,...,...,...,...,...,...
26995,River_622,River,0622,tif,MS,eurosat/MS/River/River_622.tif
26996,River_1697,River,1697,tif,MS,eurosat/MS/River/River_1697.tif
26997,River_1130,River,1130,tif,MS,eurosat/MS/River/River_1130.tif
26998,River_996,River,0996,tif,MS,eurosat/MS/River/River_996.tif


In [7]:
# Distinct class names in ascending order; the bare name on the last line
# makes the notebook display the list.
labels = sorted(rgb_df['class'].unique())
labels

['AnnualCrop',
 'Forest',
 'HerbaceousVegetation',
 'Highway',
 'Industrial',
 'Pasture',
 'PermanentCrop',
 'Residential',
 'River',
 'SeaLake']

In [8]:
# Map every class name to the list of file paths recorded for it.
path_dict = rgb_df.groupby('class')['path'].agg(list).to_dict()
def wildcard_path(path):
  """Turn the path of one numbered file into a glob pattern for its siblings.

  The part of the file name after its last underscore (up to the extension)
  is replaced by ``*``:
  ``'eurosat/MS/River/River_622.tif'`` -> ``'eurosat/MS/River/River_*.tif'``.

  Only the file name is analysed, so underscores or dots in directory names,
  and extra underscores in the name prefix, do not disturb the result.

  Args:
    path: path to a file named ``<prefix>_<suffix><ext>``.

  Returns:
    The same path with ``<suffix>`` replaced by ``*``.

  Raises:
    ValueError: if the file name contains no underscore.
  """
  basename = os.path.basename(path)
  stem, ext = os.path.splitext(basename)
  prefix, sep, _ = stem.rpartition('_')
  if not sep:
    raise ValueError(
        "expected a file name like '<prefix>_<number>.<ext>', got {!r}".format(path))
  # Slice instead of re-joining so the directory part is kept exactly as given.
  folder = path[:len(path) - len(basename)]
  return folder + prefix + '_*' + ext
# For each class, derive a wildcard pattern from the first path in its list.
folderpath_dict = dict(
    (iclass, wildcard_path(class_paths[0]))
    for iclass, class_paths in path_dict.items()
)

In [9]:
from skimage.io import imread_collection

def get_images_from_path(folder_path):
  """Return the result of ``imread_collection`` for ``folder_path``.

  ``folder_path`` is handed to ``imread_collection`` unchanged and whatever
  that call returns is passed back to the caller.
  """
  collection = imread_collection(folder_path)
  return collection
# One image collection per class, built from that class's wildcard pattern.
image_dict = dict(
    (iclass, get_images_from_path(pattern))
    for iclass, pattern in folderpath_dict.items()
)

In [10]:
from skimage.color import rgb2gray
import tqdm

# Replace each class's image collection by the result of its concatenate()
# call, filling numpy_dict one class at a time behind a progress bar.
# The recorded run of this cell took about 4 h 20 min.
numpy_dict = {}
progress = tqdm.tqdm(image_dict.items())
for iclass, image_collection in progress:
    class_array = image_collection.concatenate()
    numpy_dict[iclass] = class_array

100%|██████████| 10/10 [4:19:43<00:00, 1558.38s/it]


In [11]:
# Run switches: MS selects the 'MS' output names further down in this cell,
# make_grey decides whether the arrays go through rgb2gray first.
MS = True
make_grey = False

# With make_grey on, every class array is converted with rgb2gray;
# otherwise the very same arrays are reused under the same keys.
if make_grey == True:
    grey_dict = dict(
        (iclass, rgb2gray(rgb_arr)) for iclass, rgb_arr in numpy_dict.items()
    )
else:
    grey_dict = dict(numpy_dict.items())

def prod_list(full_list):
  """Multiply together all items of ``full_list``.

  The running product starts at 1, so an empty sequence gives 1.
  """
  product = 1
  for factor in full_list:
    product = product * factor
  return product

# Make every class array 2-D: the leading axis is kept and all remaining
# axes are collapsed into a single one.
flat_dict = {}
for iclass, grey_arr in grey_dict.items():
    n_rows = grey_arr.shape[0]
    n_values = prod_list(grey_arr.shape[1:])
    flat_dict[iclass] = np.reshape(grey_arr, (n_rows, n_values))

# Row length of the flattened arrays, read from the first class.
first_flat = list(flat_dict.values())[0]
flat_len = first_flat.shape[1]

# Stack the flattened per-class arrays on top of each other, in dict order.
all_arr = np.vstack(list(flat_dict.values()))

# Wrap the stacked array in a DataFrame with columns labelled 0 .. flat_len - 1.
all_cols = list(range(flat_len))
all_df = pd.DataFrame(all_arr, columns=all_cols)

# One class label per row: each class name is repeated as many times as its
# array has entries along the first axis, walking grey_dict in dict order.
class_list = []
for iclass, grey_arr in grey_dict.items():
    class_list.extend([iclass] * grey_arr.shape[0])

# Attach the labels and move the 'class' column in front of the value columns.
all_df['class'] = pd.Series(class_list)
column_order = ['class'] + all_cols
all_df = all_df[column_order]

# Pick the output sub-folder and CSV base name from the run switches:
#   MS on                 -> MS/MS_flat
#   MS off, make_grey on  -> RGB/grey_flat
#   both off              -> RGB/RGB_flat
if MS == True:
  sub_folder = 'MS'
  filename = 'MS_flat'
else:
  sub_folder = 'RGB'
  filename = 'grey_flat' if make_grey == True else 'RGB_flat'

# The write itself is disabled in this cell:
#all_df.to_csv('eurosat/{}/{}.csv'.format(sub_folder, filename))

In [14]:
# Write the flattened table to eurosat/MS/MS_flat.csv. Both name parts are
# set explicitly here rather than taken from earlier cells.
sub_folder = 'MS'
filename = 'MS_flat'

csv_path = 'eurosat/{}/{}.csv'.format(sub_folder, filename)
all_df.to_csv(csv_path)