In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.preprocessing import image

from skimage import io
import seaborn as sns
import geopandas as gpd
import contextily as cx

%matplotlib inline

In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive so the dataset
# archive stored there can be read. force_remount=True requests a fresh mount
# even when the drive is already mounted.
from google.colab import drive
drive.mount('/content/drive',force_remount=True)

In [None]:
# Shell command: extract the dataset archive from the mounted Drive into the
# hurricane_data directory (relative to the current working directory).
!unzip /content/drive/MyDrive/Post-hurricane.zip -d hurricane_data

In [None]:
# Root directory of the extracted dataset; every later path is built from it.
DIR = '/content/hurricane_data'

# Show the top-level folders so the split names used below can be checked.
os.listdir(DIR)

In [None]:
# Report how many files each class folder contains, one section per dataset split.
# Every entry pairs the banner printed for a split with its folder name under DIR.
split_banners = (
    ('-------TRAIN-------', 'train_another'),
    ('-----VALIDATION-----', 'validation_another'),
    ('--------TEST--------', 'test'),
    ('-------TEST 2-------', 'test_another'),
)
# (label printed, class sub-folder) in the order they are reported.
class_labels = (('No Damage', 'no_damage'), ('Damage', 'damage'))

for banner, split_name in split_banners:
    print(banner)
    for label, class_folder in class_labels:
        n_files = len(os.listdir(os.path.join(DIR, split_name, class_folder)))
        print(f"{label}: {n_files}")

In [None]:
# Read one specific "no damage" training tile from disk.
sample_name = '-96.987747_28.489092.jpeg'
tile_path = os.path.join(DIR, 'train_another', 'no_damage', sample_name)
tile = io.imread(tile_path)

# Display it on a 5x5-inch figure; the trailing semicolon hides the object repr.
plt.figure(figsize=(5, 5))
plt.imshow(tile);

In [None]:
# Inspect the dimensions of the sample tile that was just loaded.
tile.shape

In [None]:
# Size every image is resized to, and the number of images per batch.
image_size = (128, 128)
batch_size = 32

train_dir = os.path.join(DIR, 'train_another')
val_dir = os.path.join(DIR, 'validation_another')

# Both splits are loaded with the same settings, so keep them in one place.
loader_kwargs = dict(seed=42, image_size=image_size, batch_size=batch_size)

train_ds = tf.keras.preprocessing.image_dataset_from_directory(train_dir, **loader_kwargs)
val_ds = tf.keras.preprocessing.image_dataset_from_directory(val_dir, **loader_kwargs)

In [None]:
# Scan the training images for files whose leading bytes lack a JFIF marker.
# Only the 'train_another' split is checked. Flagged files are printed and
# counted, not deleted.
JFIF_MARKER = b"JFIF"

num_skipped = 0
for folder_name in ("damage", "no_damage"):
    folder_path = os.path.join(DIR, 'train_another', folder_name)
    for fname in os.listdir(folder_path):
        fpath = os.path.join(folder_path, fname)
        # `with` closes the handle on every path. If open() itself fails, the
        # original error propagates instead of being masked by a close() call
        # on an unbound (or stale) file object.
        with open(fpath, "rb") as fobj:
            # peek() returns buffered bytes without advancing the file position;
            # it may return more than the 10 bytes requested.
            is_jfif = JFIF_MARKER in fobj.peek(10)

        if not is_jfif:
            num_skipped += 1
            print(fpath)

print(f"There are {num_skipped} corrupted images")

In [None]:
# Folder holding the training split.
train_dir = os.path.join(DIR, 'train_another')

# File names (not full paths) of the .jpeg images in each class folder.
damage_imgs = [name for name in os.listdir(os.path.join(train_dir, 'damage'))
               if name.endswith('.jpeg')]
no_damage_imgs = [name for name in os.listdir(os.path.join(train_dir, 'no_damage'))
                  if name.endswith('.jpeg')]

# Draw 6 distinct file names per class at random (no seed is set, so the
# selection changes from run to run).
n_samples = 6
select_damage = np.random.choice(damage_imgs, size=n_samples, replace=False)
select_no_damage = np.random.choice(no_damage_imgs, size=n_samples, replace=False)

In [None]:
# 2 x 3 grid of sample tiles: top row 'damage', bottom row 'no_damage'.
# Only the first three randomly selected names of each class are shown.
fig = plt.figure(figsize=(8, 6))
panels = (
    [('damage', 'DAMAGE', name) for name in select_damage[:3]]
    + [('no_damage', 'NO DAMAGE', name) for name in select_no_damage[:3]]
)
for position, (class_folder, label, name) in enumerate(panels, start=1):
    ax = fig.add_subplot(2, 3, position)
    tile_img = image.load_img(f'{train_dir}/{class_folder}/{name}',
                              target_size=(128, 128), color_mode='rgb')
    ax.imshow(tile_img)
    ax.set_title(label)
    ax.axis('off')
plt.show()

In [None]:
# making n X m matrix
def img2np(path, list_of_filename, size = (128, 128)):
    """Load images and stack them as rows of a 2-D matrix.

    Each file is loaded as RGB, resized to `size`, converted to an array and
    flattened, so every row holds size[0] * size[1] * 3 values.

    Args:
        path: Directory containing the images. A trailing separator is
            optional because the file path is built with os.path.join.
        list_of_filename: Iterable of file names inside `path`.
        size: (height, width) each image is resized to on load.

    Returns:
        numpy array of shape (n_images, size[0] * size[1] * 3). An empty
        input yields an array with zero rows instead of raising.
    """
    # Collect the rows first and stack once: concatenating inside the loop
    # re-copies the whole matrix for every image (quadratic in n_images).
    rows = []
    for fn in list_of_filename:
        fp = os.path.join(path, fn)
        current_image = image.load_img(fp, target_size = size,
                                       color_mode = 'rgb')
        # convert the image to a matrix, then flatten it to a 1-D vector
        rows.append(image.img_to_array(current_image).ravel())

    if not rows:
        # Row width follows from the RGB load above (3 channels).
        # float32 is assumed to match img_to_array's default dtype - TODO confirm.
        return np.empty((0, size[0] * size[1] * 3), dtype = np.float32)

    return np.stack(rows)

In [None]:
# Validation split folder and the .jpeg file names in its 'damage' class.
val_dir = os.path.join(DIR, 'validation_another')
val_damage_imgs = [name for name in os.listdir(os.path.join(val_dir, 'damage'))
                   if name.endswith('.jpeg')]

In [None]:
# Try img2np on the validation 'damage' images: one flattened image per row.
# Despite its name, test_array comes from the validation split.
test_array = img2np(f'{val_dir}/damage/', val_damage_imgs)

In [None]:
# Look at the first flattened image to sanity-check the values.
test_array[0]

In [None]:
# Flatten every training image of each class into one matrix per class
# (one row per image); these feed the mean / standard-deviation plots below.
damage_arrays = img2np(f'{train_dir}/damage/', damage_imgs)
no_damage_arrays = img2np(f'{train_dir}/no_damage/', no_damage_imgs)

Average Image by Class

In [None]:
def find_mean_img(full_mat, title, size = (128, 128)):
    """Plot and return the per-pixel mean of a stack of flattened images.

    Args:
        full_mat: 2-D array-like with one flattened image per row. Each row
            must hold size[0] * size[1] * channels values in row-major
            (height, width, channel) order, i.e. what ravel() produces on an
            image array.
        title: Text appended to "Average " in the plot title.
        size: (height, width) of the images.

    Returns:
        2-D numpy array of shape `size`: the mean over images, averaged over
        the colour channels when there is more than one.

    Raises:
        ValueError: if the row length is not a multiple of size[0] * size[1].
    """
    # average over the image axis -> one flattened "mean image"
    mean_img = np.mean(full_mat, axis = 0)
    # Restore the spatial layout. Rows built from RGB images carry 3 values
    # per pixel, so a plain reshape(size) would fail; fold the channels into a
    # trailing axis and average them to get a single 2-D intensity map.
    # Single-channel input passes through with its values unchanged.
    mean_img = mean_img.reshape(size[0], size[1], -1).mean(axis = -1)
    plt.imshow(mean_img, vmin=85, vmax=110, cmap='plasma')
    plt.title(f'Average {title}')
    plt.axis('off')
    plt.colorbar()
    plt.show()
    return mean_img

In [None]:
# Plot the average image of each class and keep the returned mean images
# for the class-contrast plot further down.
damage_mean = find_mean_img(damage_arrays, 'DAMAGE')
no_damage_mean = find_mean_img(no_damage_arrays, 'NO DAMAGE')

Standard deviation

In [None]:
def find_std_img(full_mat, title, size = (128, 128)):
    """Plot and return the per-pixel standard deviation of a stack of images.

    Args:
        full_mat: 2-D array-like with one flattened image per row. Each row
            must hold size[0] * size[1] * channels values in row-major
            (height, width, channel) order, i.e. what ravel() produces on an
            image array.
        title: Text appended to "Standard Deviation " in the plot title.
        size: (height, width) of the images.

    Returns:
        2-D numpy array of shape `size`: the standard deviation across images,
        averaged over the colour channels when there is more than one.

    Raises:
        ValueError: if the row length is not a multiple of size[0] * size[1].
    """
    # standard deviation across images, computed per flattened position
    std_img = np.std(full_mat, axis = 0)
    # Restore the spatial layout. Rows built from RGB images carry 3 values
    # per pixel, so a plain reshape(size) would fail; fold the channels into a
    # trailing axis and average the per-channel deviations into one 2-D map.
    # Single-channel input passes through with its values unchanged.
    std_img = std_img.reshape(size[0], size[1], -1).mean(axis = -1)
    plt.imshow(std_img, vmin=20, vmax=70, cmap='plasma')
    plt.title(f'Standard Deviation {title}')
    plt.colorbar()
    plt.axis('off')
    plt.show()
    return std_img

In [None]:
# Plot the per-pixel standard deviation of each class and keep the results.
damage_std = find_std_img(damage_arrays, 'DAMAGE')
no_damage_std = find_std_img(no_damage_arrays, 'NO DAMAGE')

Contrast between average images

In [None]:
# Pixel-wise difference of the two class averages: positive where the
# 'damage' mean is larger, negative where the 'no damage' mean is larger.
contrast_mean = damage_mean - no_damage_mean

# Diverging colormap with symmetric limits, so zero difference sits mid-scale.
fig, ax = plt.subplots()
heatmap = ax.imshow(contrast_mean, vmin=-15, vmax=15, cmap='bwr')
ax.set_title('Difference Between Damage & No Damage Average')
fig.colorbar(heatmap, ax=ax)
ax.axis('off')
plt.show()

Geographic distribution

In [None]:
def _coords_from_filenames(filenames, damage_label):
    """Build a per-image frame with coordinates parsed from the file names.

    File names are expected to look like '<lon>_<lat>.jpeg'
    (e.g. '-96.987747_28.489092.jpeg').

    Args:
        filenames: List of image file names for one class.
        damage_label: Value stored in the 'damage' column (1 = damage,
            0 = no damage).

    Returns:
        DataFrame with columns 'Filename', 'lon', 'lat' (both still strings)
        and 'damage'.
    """
    frame = pd.DataFrame(filenames, columns=['Filename'])
    coords = frame['Filename'].str.split('_', expand=True)
    frame = pd.concat([frame, coords], axis=1).rename(columns={0: 'lon', 1: 'lat'})
    # Remove the literal '.jpeg' suffix. str.rstrip('.jpeg') would instead
    # strip any trailing run of the characters '.', 'j', 'p', 'e', 'g'.
    frame['lat'] = frame['lat'].str.replace(r'\.jpeg$', '', regex=True)
    frame['damage'] = damage_label
    return frame

#damage set
damage_df = _coords_from_filenames(damage_imgs, 1)

#no_damage set
no_damage_df = _coords_from_filenames(no_damage_imgs, 0)

#concatenate (the original row indices of each class frame are kept)
all_df = pd.concat([damage_df, no_damage_df])

In [None]:
# Turn the lon/lat columns into point geometries, declare them as
# EPSG:4326 coordinates, then reproject to Web Mercator (EPSG:3857).
point_geoms = gpd.points_from_xy(all_df.lon, all_df.lat)
all_gdf = (
    gpd.GeoDataFrame(all_df, geometry=point_geoms)
    .set_crs(epsg=4326)
    .to_crs(epsg=3857)
)

In [None]:
# Scatter every training image location, coloured by damage class, and
# draw a contextily basemap underneath.
ax = all_gdf.plot(
    column='damage',
    categorical=True,
    figsize=(10, 10),
    alpha=0.8,
    s=1.5,
    legend=True,
    cmap='cool',
)
cx.add_basemap(ax)
ax.set_title('Point locations of Training Images, Damaged or Not')
ax.axis('off')
plt.show()

In [None]:
# The coordinates were parsed from file names, so they are still text;
# cast both columns to float for numeric filtering and plotting.
for coord_col in ('lon', 'lat'):
    all_gdf[coord_col] = all_gdf[coord_col].astype('float')

In [None]:
# Side-by-side histograms of image longitudes, one panel per damage class.
# Both panels share the same x-range so they can be compared directly.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 5))
fig.suptitle('Distribution of Longitudes')

for panel_ax, damage_flag, panel_title in ((ax1, 0, 'No Damage'), (ax2, 1, 'Damage')):
    sns.histplot(all_gdf.loc[all_gdf.damage == damage_flag, 'lon'], ax=panel_ax)
    panel_ax.set_xlim([-97, -93.5])
    panel_ax.set_title(panel_title)


In [None]:
# Zoom in on the south-western cluster: keep only points west of -96 degrees.
southern_gdf = all_gdf.loc[all_gdf['lon'] < -96]

axx = southern_gdf.plot(
    column='damage',
    categorical=True,
    figsize=(10, 10),
    alpha=0.8,
    s=1.5,
    legend=True,
    cmap='cool',
)

# x-limits are in the plotted (EPSG:3857) coordinates, widened by a fixed
# margin on each side.
x_margin = 60000
axx.set_xlim([-10798210.908 - x_margin, -10782077.708 + x_margin])
cx.add_basemap(axx)
axx.set_title('Point locations of Training Images, Damaged or Not (Southwestern Grouping)')
axx.axis('off')
plt.show()

In [None]:
def _train_file_size(row, class_folder):
    """Return the size in bytes of row['Filename'] inside a training class folder."""
    return os.stat(os.path.join(DIR, 'train_another', class_folder, row['Filename'])).st_size


def get_damage_file_size(row):
    """Size in bytes of the training 'damage' image named by row['Filename']."""
    return _train_file_size(row, 'damage')


def get_no_damage_file_size(row):
    """Size in bytes of the training 'no_damage' image named by row['Filename']."""
    return _train_file_size(row, 'no_damage')

In [None]:
# Add each image's on-disk size as a new column, computed row by row.
damage_df['file_size'] = damage_df.apply(get_damage_file_size, axis=1)
no_damage_df['file_size'] = no_damage_df.apply(get_no_damage_file_size, axis=1)

In [None]:
# Rebuild the combined frame so that it includes the new file_size column.
all_df = pd.concat([damage_df, no_damage_df])

In [None]:
# Average file size (os.stat st_size, i.e. bytes) per class:
# first line is 'damage', second line is 'no damage'.
print(damage_df['file_size'].mean())
print(no_damage_df['file_size'].mean())

In [None]:
# Compare the file-size distributions of the two classes (damage: 0 = no, 1 = yes).
sns.boxplot(data=all_df, x='damage', y='file_size')