In [None]:
from data_collection.vso_search_result import VSOSearchResult
from data_collection.fido_search_result import FidoSearchResult
from data_collection.gong_sampler import sample_by_size
from data_collection.gong_sampler import sample_by_cadence
from PIL import Image
from PIL.ImageStat import Stat
import requests
import pandas as pd
import numpy as np
import skimage
import os
from tqdm import tqdm
import time

## Anomaly Detection by Statistical Analysis

In [495]:
def get_each_image(url: str):
    """
    This function accesses a passed url, downloads the image, and saves it to the local folder for processing. The image
    is overwritten each time the function is called.

    Parameters
    ----------
    url
        a URL address corresponding to a single image

    Returns
    -------
        the image

    """
    image_url = url
    image_temp = Image.open(requests.get(image_url, stream=True).raw)
    image_temp.save('.image.jpg')
    img = Image.open('.image.jpg')

    return img

In [496]:
def avg_image_brightness(url_array: list):
    """
    This function calculates the average brightness of an entire sampled image from a list of image urls and appends the
    values to a list of mean values.

    Parameters
    ----------
    url_array
        an array of image urls for which the average brightness is being calculated

    Returns
    -------
        an array of float type values corresponding to the average brightness of the sampled image
    """
    mean_array = []

    for i in url_array:
        img = get_each_image(i)
        image_stat = Stat(img)
        mean_array.append(image_stat.mean)

    return mean_array

In [497]:
def block_reduce_image(img, side_length: int):
    """
    This function down-samples an image to collect the average pixel values and converts them into an array.

    Parameters
    ----------
    img
        the image to be processed
    side_length
        the desired side length of the grid

    Returns
    -------
        a dataframe filled with specified image brightness values
    """
    pixel_array = np.array(img)
    down_sample = int(img.size[0] / side_length)
    array = skimage.measure.block_reduce(pixel_array, (down_sample, down_sample), np.mean)

    return array

In [505]:
def get_pixel_data(urls: list, side_length: int):
    """
    This function calculates the brightness of each resized pixel and stores the data in a dataframe for processing.

    Parameters
    ----------
    urls
        an array of image urls for which the average brightness per pixel is being calculated
    side_length
        the desired side length of the grid

    Returns
    -------
        a dataframe filled with specified image brightness values
    """
    df = pd.DataFrame()

    for i in tqdm(urls):
        bw_img = get_each_image(i).convert("L")
        ds_array = block_reduce_image(bw_img, side_length)

        row = ds_array.flatten()
        row = pd.Series(row)
        row = row.to_frame().T

        df = pd.concat([df, row], ignore_index=True)

    df['URLs'] = urls
    df.set_index('URLs', inplace=True)

    os.remove('.image.jpg')

    return df

In [499]:
def find_range(df):
    """
    This function subtracts the median value of each row from each cell in the row and calculates ranges based on
    preset percentiles. It then calculates a range based on a multiple of the "inner quartile range" and replaces each
    cell value with a boolean value indicating whether that value falls within (True), or outside of (False), the given
    range.

    Parameters
    ----------
    df
        a dataframe of pixel brightness values
    Returns
    -------
        an updated dataframe
    """
    df = df.sub(df.median(axis=1), axis=0)

    for col in df:
        pl = np.percentile(df[col], 4)
        ph = np.percentile(df[col], 96)
        iqr = ph - pl
        lower = pl - iqr * 1.5
        upper = ph + iqr * 1.5
        df[col] = df[col].between(lower, upper)

    return df

In [503]:
def find_corrupt_images(df):
    """
    This function applies a suite of tools to determine which images are likely corrupted and unsuitable for further
    processing.

    Parameters
    ----------
    df
        a data frame of pixel brightness values.

    Returns
    -------
        a list of unique image identifications for corrupted images.
    """
    df = find_range(df)
    counts = df.apply(pd.Series.value_counts, axis=1)

    if False in counts.columns:
        corrupted = counts.index[counts[False] > 0]
    else:
        corrupted = []

    return corrupted

In [504]:
if __name__ == "__main__":
    begin = "2012-01-01 00:00:01"
    end = "2012-02-01 23:59:59"
    image_data: VSOSearchResult = sample_by_size(begin, end, 100, "maunaloa")
    print("{} observations have been found.".format(image_data.n_queried_files))
    image_data.generate_url_metadata(fits_urls=True, header_urls=True, jpg_urls=True)
    image_urls = image_data.jpg_urls
    start = time.time()
    pixel_data = get_pixel_data(image_urls, 16)
    corrupt_images = find_corrupt_images(pixel_data)
    end = time.time()
    print("{} corrupted images were found: ".format(len(corrupt_images)))
    print('\n'.join(map(str, corrupt_images)))
    print("____________________________________________")
    print("The execution time is: ", (end - start) / 60, " minutes")
    print("____________________________________________")

100 observations have been found.


100%|██████████| 100/100 [01:03<00:00,  1.57it/s]


4 corrupted images were found: 
https://gong2.nso.edu/HA/hag/201201/20120102/20120102190114Mh.jpg
https://gong2.nso.edu/HA/hag/201201/20120104/20120104190414Mh.jpg
https://gong2.nso.edu/HA/hag/201201/20120107/20120107192214Mh.jpg
https://gong2.nso.edu/HA/hag/201201/20120111/20120111190014Mh.jpg
____________________________________________
The execution time is:  1.0627101143201192  minutes
____________________________________________


In [None]:
if __name__ == "__main__":
    begin = "2012-01-01 00:00:01"
    end = "2012-02-01 23:59:59"
    image_data: FidoSearchResult = sample_by_cadence(begin, end, 120, "maunaloa")
    print("{} observations have been found.".format(image_data.total_observations))

## Isolation Forest

In [506]:
from sklearn.ensemble import IsolationForest

In [507]:
data = pixel_data #get_pixel_data(image_urls, 16)
columns = data.columns

In [508]:
clf = IsolationForest(contamination=float(0.05))
clf.fit(data[columns])
pred = clf.predict(data[columns])
data['anomaly'] = pred
outliers = data.loc[data['anomaly'] == -1]
print(data['anomaly'].value_counts())

 1    95
-1     5
Name: anomaly, dtype: int64


In [509]:
outliers

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,247,248,249,250,251,252,253,254,255,anomaly
URLs,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
https://gong2.nso.edu/HA/hag/201201/20120102/20120102190114Mh.jpg,50.240784,40.942078,37.127014,38.06665,55.907959,125.563354,172.987549,183.276917,181.634705,168.236328,...,74.374634,73.757812,69.58252,49.178284,19.937012,1.108276,1.0,31.555969,156.397522,-1
https://gong2.nso.edu/HA/hag/201201/20120104/20120104190414Mh.jpg,50.240784,40.942078,37.127014,38.085632,53.05896,113.948242,155.053284,161.261414,157.594177,142.090881,...,86.234558,86.636475,84.62561,63.307556,27.272766,1.149475,1.0,31.555969,156.397522,-1
https://gong2.nso.edu/HA/hag/201201/20120107/20120107192214Mh.jpg,50.240784,40.942078,37.127014,38.203247,55.43042,116.360962,157.207214,161.720825,160.917053,155.704407,...,2.302124,2.045593,2.114624,1.803467,1.405884,1.003784,1.0,31.555969,156.397522,-1
https://gong2.nso.edu/HA/hag/201201/20120111/20120111190014Mh.jpg,50.240784,40.942078,37.127014,38.076233,59.811829,124.953491,136.348633,133.590942,137.300659,156.149841,...,43.070557,54.739868,87.4104,101.785339,35.669922,1.057495,1.0,31.555969,156.397522,-1
https://gong2.nso.edu/HA/hag/201201/20120127/20120127180214Mh.jpg,50.240784,40.942078,37.127014,38.10437,27.153381,50.441223,69.361206,76.667603,75.273315,66.546875,...,71.047485,70.878906,63.479065,44.093567,17.934937,1.08374,1.0,31.555969,156.397522,-1


## More Validation

In [517]:
directory = '/Users/Copeland/Downloads/jpeg_images'

paths = []
for filename in os.listdir(directory):
    f = os.path.join(directory, filename)
    paths.append(f)

paths

['/Users/Copeland/Downloads/jpeg_images/wavelet_20130727031814Mh.jpeg',
 '/Users/Copeland/Downloads/jpeg_images/wavelet_20130727035014Mh.jpeg',
 '/Users/Copeland/Downloads/jpeg_images/wavelet_20130728180014Mh.jpeg',
 '/Users/Copeland/Downloads/jpeg_images/wavelet_20130731214114Mh.jpeg',
 '/Users/Copeland/Downloads/jpeg_images/wavelet_20130729042214Mh.jpeg',
 '/Users/Copeland/Downloads/jpeg_images/wavelet_20130726201214Mh.jpeg',
 '/Users/Copeland/Downloads/jpeg_images/wavelet_20130729004114Mh.jpeg',
 '/Users/Copeland/Downloads/jpeg_images/wavelet_20130801202214Mh.jpeg',
 '/Users/Copeland/Downloads/jpeg_images/wavelet_20130728220114Mh.jpeg',
 '/Users/Copeland/Downloads/jpeg_images/wavelet_20130729000914Mh.jpeg',
 '/Users/Copeland/Downloads/jpeg_images/wavelet_20130731222514Mh.jpeg',
 '/Users/Copeland/Downloads/jpeg_images/wavelet_20130731212014Mh.jpeg',
 '/Users/Copeland/Downloads/jpeg_images/wavelet_20130729180114Mh.jpeg',
 '/Users/Copeland/Downloads/jpeg_images/wavelet_20130727020714Mh

In [511]:
def get_data(pics: list, side_length: int):
    df = pd.DataFrame()

    for i in pics:
        bw_img = Image.open(i).convert("L")
        ds_array = block_reduce_image(bw_img, side_length)

        row = ds_array.flatten()
        row = pd.Series(row)
        row = row.to_frame().T

        df = pd.concat([df, row], ignore_index=True)

    df['Filename'] = pics
    df.set_index('Filename', inplace=True)

    return df

In [512]:
def get_picture(img: str):
    image_temp = Image.open(img)
    image_temp.save('.image.jpg')
    img = Image.open('.image.jpg')

    return img

In [513]:
data1 = get_data(paths, 16)
corrupted1 = find_corrupt_images(data1)

In [514]:
data1

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,246,247,248,249,250,251,252,253,254,255
Filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
/Users/Copeland/Downloads/jpeg_images/wavelet_20130727031814Mh.jpeg,0.0,0.0,0.0,0.028015,4.945374,12.411865,18.525146,26.063232,27.602905,23.059631,...,16.376526,24.697937,22.971497,12.004395,6.348999,1.541016,0.000000,0.0,0.0,0.0
/Users/Copeland/Downloads/jpeg_images/wavelet_20130727035014Mh.jpeg,0.0,0.0,0.0,0.037598,9.951477,24.488708,34.919678,42.942017,46.072449,41.963013,...,40.504211,48.723755,47.445923,37.886475,26.362732,10.604736,0.000000,0.0,0.0,0.0
/Users/Copeland/Downloads/jpeg_images/wavelet_20130728180014Mh.jpeg,0.0,0.0,0.0,0.024109,4.650757,12.192993,20.305908,28.667358,28.529236,21.695374,...,11.769409,21.008667,24.568359,19.083313,12.605469,3.575806,0.000000,0.0,0.0,0.0
/Users/Copeland/Downloads/jpeg_images/wavelet_20130731214114Mh.jpeg,0.0,0.0,0.0,0.029968,6.027039,14.556580,20.780640,27.400452,26.991516,20.015015,...,8.578491,16.529358,17.298218,9.292236,6.048828,2.639832,0.016235,0.0,0.0,0.0
/Users/Copeland/Downloads/jpeg_images/wavelet_20130729042214Mh.jpeg,0.0,0.0,0.0,0.021240,4.441284,10.955261,16.632141,26.156311,29.941406,25.907898,...,15.760864,24.752747,23.489380,11.514099,5.666016,1.741577,0.009338,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
/Users/Copeland/Downloads/jpeg_images/wavelet_20130731224014Mh.jpeg,0.0,0.0,0.0,0.028870,6.300110,15.152893,21.297363,26.790283,26.124451,20.002258,...,9.924683,16.904602,16.375061,8.875244,5.579834,2.341064,0.016846,0.0,0.0,0.0
/Users/Copeland/Downloads/jpeg_images/wavelet_20130801173114Mh.jpeg,0.0,0.0,0.0,0.024841,4.575195,11.344849,18.013062,27.794006,28.912231,22.017883,...,12.797241,24.129333,26.313660,17.322693,10.997131,4.591187,0.034302,0.0,0.0,0.0
/Users/Copeland/Downloads/jpeg_images/wavelet_20130801010314Mh.jpeg,0.0,0.0,0.0,0.028625,6.212769,14.972107,21.426636,28.159790,27.760071,20.622192,...,14.276428,21.414124,19.760742,10.003418,5.164368,1.834595,0.016357,0.0,0.0,0.0
/Users/Copeland/Downloads/jpeg_images/wavelet_20130801031314Mh.jpeg,0.0,0.0,0.0,0.024170,5.297424,13.344299,19.646545,26.744080,27.064941,23.180298,...,14.853943,23.080933,20.959900,10.647095,5.421326,1.201172,0.000000,0.0,0.0,0.0


In [485]:
corrupted1

Index(['/Users/Copeland/Downloads/jpeg_images/wavelet_20130801042814Mh.jpeg',
       '/Users/Copeland/Downloads/jpeg_images/wavelet_20130801022214Mh.jpeg',
       '/Users/Copeland/Downloads/jpeg_images/wavelet_20130801023214Mh.jpeg',
       '/Users/Copeland/Downloads/jpeg_images/wavelet_20130728041914Mh.jpeg'],
      dtype='object', name='Filename')