## **Sample Dataset Creation**
This script is used for the following:

1. import the NL image dataframe for reference
2. further explore the NL dataset and compute count of images in sample sets
3. create separate balanced 'fog'/'nofog' day and night sample datasets for training, validation and testing

In [4]:
# import necessary libraries
import pickle
import pandas as pd

In [5]:
# location of the pickled source image dataframe:
# `infile` is the pickle's file name, `path` the directory that holds it
infile = 'NL_image_df.pkl'
path = '/home/ubuntu/michael/my_pickles/'

In [6]:
# load the NL image dataframe from its pickle file
# (columns shown below: filename, label, day_phase, phase, filepath)
# NOTE: unpickling can execute arbitrary code - only load pickle files from a trusted source
NL_image_df = pd.read_pickle(path + infile)
NL_image_df

Unnamed: 0,filename,label,day_phase,phase,filepath
0,A5-HM92-ID13771_20190427_0101.jpg,Fog,0,Night,/home/ubuntu/michael/nl_images/images_data/A5-...
1,A9-HM462-ID12800_20190811_2310.jpg,No Fog,0,Night,/home/ubuntu/michael/nl_images/images_data/A9-...
2,A5-HM17-ID11089_20190429_2151.jpg,No Fog,0,Night,/home/ubuntu/michael/nl_images/images_data/A5-...
3,A16-HM231-ID12256_20190810_0020.jpg,No Fog,0,Night,/home/ubuntu/michael/nl_images/images_data/A16...
5,A1-HM65-ID12966_20190427_2301.jpg,No Fog,0,Night,/home/ubuntu/michael/nl_images/images_data/A1-...
...,...,...,...,...,...
54207,A1-HM61-ID12961_20191203_0000.jpg,No Fog,0,Night,/home/ubuntu/michael/nl_images/images_data/A1-...
54208,A16-HM211-ID71681_20170907_1200.jpg,No Fog,1,Day,/home/ubuntu/michael/nl_images/images_data/A16...
54209,A27-HM582-ID10977_20170913_0931.jpg,No Fog,1,Day,/home/ubuntu/michael/nl_images/images_data/A27...
54210,A2-HM765-ID10913_20170907_0800.jpg,No Fog,1,Day,/home/ubuntu/michael/nl_images/images_data/A2-...


## **Entire Data Set Stats: Count image types in dataframe**

In [7]:
# subset of rows labelled "Fog"; its row count is the number of fog images
fog_images_count = NL_image_df.loc[NL_image_df['label'] == "Fog"]
print(f'There are {fog_images_count.shape[0]} fog images in the dataset.')  # expected: 2015

There are 2015 fog images in the dataset.


In [8]:
# subset of rows labelled "No Fog"; its row count is the number of nofog images
nofog_images_count = NL_image_df.loc[NL_image_df['label'] == "No Fog"]
print(f'There are {nofog_images_count.shape[0]} nofog images in the dataset.')  # expected: 49230

There are 49230 nofog images in the dataset.


In [9]:
# number of nofog images per single fog image, rounded to a whole number
fog_nofog_ratio = round(nofog_images_count.shape[0] / fog_images_count.shape[0], 0)
print(f'For every 1 fog image, there are {fog_nofog_ratio} non-fog images.')  # expected: 1 fog = 24 nofog

For every 1 fog image, there are 24.0 non-fog images.


In [10]:
# subset of rows whose phase is "Day"; its row count is the number of day images
day_images_count = NL_image_df.loc[NL_image_df['phase'] == "Day"]
print(f'There are {day_images_count.shape[0]} day images in the dataset.')  # expected: 28185

There are 28185 day images in the dataset.


In [11]:
# subset of rows whose phase is "Night"; its row count is the number of night images
night_images_count = NL_image_df.loc[NL_image_df['phase'] == "Night"]
print(f'There are {night_images_count.shape[0]} night images in the dataset.')  # expected: 23060

There are 23060 night images in the dataset.


In [12]:
# sanity check: total number of images across the two label subsets (nofog + fog)
label_images_count = sum(len(subset) for subset in (nofog_images_count, fog_images_count))
label_images_count  # expected: 51245

51245

In [13]:
# sanity check: total number of images across the two phase subsets (day + night)
phase_images_count = sum(len(subset) for subset in (day_images_count, night_images_count))
phase_images_count  # expected: 51245

51245

## **Create dataframes for day images**

In [14]:
# day fog images: rows that are both phase 'Day' and label "Fog"
day_fog_images_df = NL_image_df.loc[
    (NL_image_df['phase'] == 'Day') & (NL_image_df['label'] == "Fog")
]
day_fog_images_df

Unnamed: 0,filename,label,day_phase,phase,filepath
15,A28-HM1954-ID11818_20190201_0820.jpg,Fog,1,Day,/home/ubuntu/michael/nl_images/images_data/A28...
172,A16-HM209-ID71700_20190322_0520.jpg,Fog,10,Day,/home/ubuntu/michael/nl_images/images_data/A16...
345,A9-HM471-ID12830_20190329_0520.jpg,Fog,10,Day,/home/ubuntu/michael/nl_images/images_data/A9-...
349,A16-HM209-ID71700_20190322_0730.jpg,Fog,1,Day,/home/ubuntu/michael/nl_images/images_data/A16...
424,A50-HM1895-ID11628_20190301_0720.jpg,Fog,1,Day,/home/ubuntu/michael/nl_images/images_data/A50...
...,...,...,...,...,...
54063,A15-HM214-ID12374_20200121_1630.jpg,Fog,11,Day,/home/ubuntu/michael/nl_images/images_data/A15...
54081,A15-HM249-ID13551_20170917_0541.jpg,Fog,1,Day,/home/ubuntu/michael/nl_images/images_data/A15...
54155,A9-HM482-ID12824_20200121_0930.jpg,Fog,1,Day,/home/ubuntu/michael/nl_images/images_data/A9-...
54163,A4-HM443-ID12001_20170916_1741.jpg,Fog,1,Day,/home/ubuntu/michael/nl_images/images_data/A4-...


In [15]:
# report the number of day fog images (expected: 561)
print(f'There are {day_fog_images_df.shape[0]} day fog images')

There are 561 day fog images


In [16]:
# day nofog images: rows that are both phase 'Day' and label "No Fog"
day_nofog_images_df = NL_image_df.loc[
    (NL_image_df['phase'] == 'Day') & (NL_image_df['label'] == "No Fog")
]
day_nofog_images_df  # expected: 27624 rows

Unnamed: 0,filename,label,day_phase,phase,filepath
16,A9-HM430-ID12784_20190607_0321.jpg,No Fog,10,Day,/home/ubuntu/michael/nl_images/images_data/A9-...
33,A50-HM1969-ID11602_20190720_0931.jpg,No Fog,1,Day,/home/ubuntu/michael/nl_images/images_data/A50...
38,A1-HM49-ID12953_20191008_0830.jpg,No Fog,1,Day,/home/ubuntu/michael/nl_images/images_data/A1-...
40,A28-HM1954-ID11818_20190921_1720.jpg,No Fog,1,Day,/home/ubuntu/michael/nl_images/images_data/A28...
43,A50-HM1838-ID11510_20190518_0330.jpg,No Fog,10,Day,/home/ubuntu/michael/nl_images/images_data/A50...
...,...,...,...,...,...
54204,A4-HM643-ID111940_20170905_1600.jpg,No Fog,1,Day,/home/ubuntu/michael/nl_images/images_data/A4-...
54206,A5-HM110-ID13775_20170912_1321.jpg,No Fog,1,Day,/home/ubuntu/michael/nl_images/images_data/A5-...
54208,A16-HM211-ID71681_20170907_1200.jpg,No Fog,1,Day,/home/ubuntu/michael/nl_images/images_data/A16...
54209,A27-HM582-ID10977_20170913_0931.jpg,No Fog,1,Day,/home/ubuntu/michael/nl_images/images_data/A27...


In [17]:
# report the number of day nofog images (expected: 27624)
print(f'There are {day_nofog_images_df.shape[0]} day nofog images')

There are 27624 day nofog images


In [18]:
# sanity check: day fog + day nofog row counts
day_images_sum = sum(len(subset) for subset in (day_fog_images_df, day_nofog_images_df))
day_images_sum  # expected: 28185

28185

In [19]:
# all day images, fog and nofog together
day_images_df = NL_image_df.loc[NL_image_df['phase'] == 'Day']
day_images_df  # expected: 28185 rows

Unnamed: 0,filename,label,day_phase,phase,filepath
15,A28-HM1954-ID11818_20190201_0820.jpg,Fog,1,Day,/home/ubuntu/michael/nl_images/images_data/A28...
16,A9-HM430-ID12784_20190607_0321.jpg,No Fog,10,Day,/home/ubuntu/michael/nl_images/images_data/A9-...
33,A50-HM1969-ID11602_20190720_0931.jpg,No Fog,1,Day,/home/ubuntu/michael/nl_images/images_data/A50...
38,A1-HM49-ID12953_20191008_0830.jpg,No Fog,1,Day,/home/ubuntu/michael/nl_images/images_data/A1-...
40,A28-HM1954-ID11818_20190921_1720.jpg,No Fog,1,Day,/home/ubuntu/michael/nl_images/images_data/A28...
...,...,...,...,...,...
54204,A4-HM643-ID111940_20170905_1600.jpg,No Fog,1,Day,/home/ubuntu/michael/nl_images/images_data/A4-...
54206,A5-HM110-ID13775_20170912_1321.jpg,No Fog,1,Day,/home/ubuntu/michael/nl_images/images_data/A5-...
54208,A16-HM211-ID71681_20170907_1200.jpg,No Fog,1,Day,/home/ubuntu/michael/nl_images/images_data/A16...
54209,A27-HM582-ID10977_20170913_0931.jpg,No Fog,1,Day,/home/ubuntu/michael/nl_images/images_data/A27...


In [20]:
# undersample day images so every label has the same number of rows.
# The per-label sample size is the size of the smallest label group
# (561 day fog images in the current data) instead of a hard-coded number,
# so the cell stays balanced if the source dataframe changes.
day_n_per_label = int(day_images_df['label'].value_counts().min())

# random_state is fixed so the same rows are drawn on every run
day_sample_df = day_images_df.groupby('label').sample(n=day_n_per_label, random_state=1)
day_sample_df

Unnamed: 0,filename,label,day_phase,phase,filepath
4881,A2-HM748-ID10907_20190322_0540.jpg,Fog,10,Day,/home/ubuntu/michael/nl_images/images_data/A2-...
47516,A27-HM641-ID10973_20200101_1020.jpg,Fog,1,Day,/home/ubuntu/michael/nl_images/images_data/A27...
31434,A50-HM1864-ID11640_20191231_0931.jpg,Fog,1,Day,/home/ubuntu/michael/nl_images/images_data/A50...
51563,A15-HM236-ID13549_20200121_1450.jpg,Fog,1,Day,/home/ubuntu/michael/nl_images/images_data/A15...
46608,A9-HM458-ID12839_20200121_1600.jpg,Fog,1,Day,/home/ubuntu/michael/nl_images/images_data/A9-...
...,...,...,...,...,...
32928,A5-HM102-ID13773_20170915_1641.jpg,No Fog,1,Day,/home/ubuntu/michael/nl_images/images_data/A5-...
17133,A28-HM1960-ID13520_20170912_1651.jpg,No Fog,1,Day,/home/ubuntu/michael/nl_images/images_data/A28...
51583,A50-HM1841-ID11648_20170917_0541.jpg,No Fog,1,Day,/home/ubuntu/michael/nl_images/images_data/A50...
29703,A1-HM65-ID12966_20180920_1720.jpg,No Fog,1,Day,/home/ubuntu/michael/nl_images/images_data/A1-...


In [19]:
# save the balanced day sample dataframe to a pickle file in the shared
# pickle directory (`path`, set in the configuration cell near the top)
day_sample_df.to_pickle(path + 'day_df.pkl')

In [28]:
# X (input) values for day images: the sampled file names as a plain list
X_values_day = day_sample_df['filename'].tolist()
len(X_values_day)  # expected: 1122

1122

In [29]:
# y (output) values for day images: the sampled labels as a plain list
y_values_day = day_sample_df['label'].tolist()
len(y_values_day)  # expected: 1122

1122

## **Create dataframes for night images**

In [21]:
# night fog images: rows that are both phase 'Night' and label "Fog"
night_fog_images_df = NL_image_df.loc[
    (NL_image_df['phase'] == 'Night') & (NL_image_df['label'] == "Fog")
]
night_fog_images_df  # expected: 1454 rows

Unnamed: 0,filename,label,day_phase,phase,filepath
0,A5-HM92-ID13771_20190427_0101.jpg,Fog,0,Night,/home/ubuntu/michael/nl_images/images_data/A5-...
26,A4-HM449-ID12003_20190505_2220.jpg,Fog,0,Night,/home/ubuntu/michael/nl_images/images_data/A4-...
31,A27-HM645-ID10971_20190322_0230.jpg,Fog,0,Night,/home/ubuntu/michael/nl_images/images_data/A27...
51,A9-HM312-ID11179_20190901_2110.jpg,Fog,0,Night,/home/ubuntu/michael/nl_images/images_data/A9-...
54,A28-HM1982-ID13517_20190429_0110.jpg,Fog,0,Night,/home/ubuntu/michael/nl_images/images_data/A28...
...,...,...,...,...,...
54150,A50-HM1867-ID11639_20191231_1701.jpg,Fog,31,Night,/home/ubuntu/michael/nl_images/images_data/A50...
54159,A27-HM582-ID10977_20170907_0400.jpg,Fog,20,Night,/home/ubuntu/michael/nl_images/images_data/A27...
54167,A15-HM795-ID12044_20170913_0121.jpg,Fog,0,Night,/home/ubuntu/michael/nl_images/images_data/A15...
54192,A50-HM1971-ID11560_20200122_0121.jpg,Fog,0,Night,/home/ubuntu/michael/nl_images/images_data/A50...


In [22]:
# report the number of night fog images (expected: 1454)
print(f'There are {night_fog_images_df.shape[0]} night fog images')

There are 1454 night fog images


In [31]:
# night nofog images: rows that are both phase 'Night' and label "No Fog"
night_nofog_images_df = NL_image_df.loc[
    (NL_image_df['phase'] == 'Night') & (NL_image_df['label'] == "No Fog")
]
night_nofog_images_df  # expected: 21606 rows

Unnamed: 0,filename,label,day_phase,phase,filepath
1,A9-HM462-ID12800_20190811_2310.jpg,No Fog,0,Night,/home/ubuntu/michael/nl_images/images_data/A9-...
2,A5-HM17-ID11089_20190429_2151.jpg,No Fog,0,Night,/home/ubuntu/michael/nl_images/images_data/A5-...
3,A16-HM231-ID12256_20190810_0020.jpg,No Fog,0,Night,/home/ubuntu/michael/nl_images/images_data/A16...
5,A1-HM65-ID12966_20190427_2301.jpg,No Fog,0,Night,/home/ubuntu/michael/nl_images/images_data/A1-...
6,A9-HM446-ID12845_20190504_2211.jpg,No Fog,0,Night,/home/ubuntu/michael/nl_images/images_data/A9-...
...,...,...,...,...,...
54201,A4-HM118-ID13975_20200101_2111.jpg,No Fog,0,Night,/home/ubuntu/michael/nl_images/images_data/A4-...
54202,A27-HM589-ID10974_20170907_0400.jpg,No Fog,20,Night,/home/ubuntu/michael/nl_images/images_data/A27...
54203,A27-HM587-ID10975_20170907_0400.jpg,No Fog,20,Night,/home/ubuntu/michael/nl_images/images_data/A27...
54207,A1-HM61-ID12961_20191203_0000.jpg,No Fog,0,Night,/home/ubuntu/michael/nl_images/images_data/A1-...


In [32]:
# report the number of night nofog images (expected: 21606)
print(f'There are {night_nofog_images_df.shape[0]} night nofog images')

There are 21606 night nofog images


In [33]:
# sanity check: night fog + night nofog row counts
night_images_sum = sum(len(subset) for subset in (night_fog_images_df, night_nofog_images_df))
night_images_sum  # expected: 23060

23060

In [34]:
# sanity check: day total + night total
images_sum = sum((day_images_sum, night_images_sum))
images_sum  # expected: 51245

51245

In [35]:
# number of night images per single day image, rounded to 4 decimals
day_night_ratio = round(night_images_sum / day_images_sum, 4)
print(f'For every 1 day image, there are {day_night_ratio} night images.')  # expected: 1 day ~ 0.8 night images

For every 1 day image, there are 0.8182 night images.


In [27]:
# X_data = filename; y_data = label
# day_phase holds the original phase value
# phase holds the updated value: 'Night' or 'Day'

In [36]:
# all night images, fog and nofog together
night_images_df = NL_image_df.loc[NL_image_df['phase'] == 'Night']
night_images_df  # expected: 23060 rows

Unnamed: 0,filename,label,day_phase,phase,filepath
0,A5-HM92-ID13771_20190427_0101.jpg,Fog,0,Night,/home/ubuntu/michael/nl_images/images_data/A5-...
1,A9-HM462-ID12800_20190811_2310.jpg,No Fog,0,Night,/home/ubuntu/michael/nl_images/images_data/A9-...
2,A5-HM17-ID11089_20190429_2151.jpg,No Fog,0,Night,/home/ubuntu/michael/nl_images/images_data/A5-...
3,A16-HM231-ID12256_20190810_0020.jpg,No Fog,0,Night,/home/ubuntu/michael/nl_images/images_data/A16...
5,A1-HM65-ID12966_20190427_2301.jpg,No Fog,0,Night,/home/ubuntu/michael/nl_images/images_data/A1-...
...,...,...,...,...,...
54202,A27-HM589-ID10974_20170907_0400.jpg,No Fog,20,Night,/home/ubuntu/michael/nl_images/images_data/A27...
54203,A27-HM587-ID10975_20170907_0400.jpg,No Fog,20,Night,/home/ubuntu/michael/nl_images/images_data/A27...
54205,A2-HM784-ID10917_20200123_0650.jpg,Fog,20,Night,/home/ubuntu/michael/nl_images/images_data/A2-...
54207,A1-HM61-ID12961_20191203_0000.jpg,No Fog,0,Night,/home/ubuntu/michael/nl_images/images_data/A1-...


In [37]:
# undersample night images so every label has the same number of rows.
# The per-label sample size is the size of the smallest label group
# (1454 night fog images in the current data) instead of a hard-coded number,
# so the cell stays balanced if the source dataframe changes.
night_n_per_label = int(night_images_df['label'].value_counts().min())

# random_state is fixed so the same rows are drawn on every run
night_sample_df = night_images_df.groupby('label').sample(n=night_n_per_label, random_state=1)
night_sample_df  # expected: 2908 rows

Unnamed: 0,filename,label,day_phase,phase,filepath
52183,A9-HM470-ID12831_20200101_0331.jpg,Fog,0,Night,/home/ubuntu/michael/nl_images/images_data/A9-...
51511,A9-HM467-ID12834_20191231_2051.jpg,Fog,0,Night,/home/ubuntu/michael/nl_images/images_data/A9-...
42537,A9-HM467-ID12834_20200101_0101.jpg,Fog,0,Night,/home/ubuntu/michael/nl_images/images_data/A9-...
37402,A50-HM1867-ID11639_20191231_0651.jpg,Fog,20,Night,/home/ubuntu/michael/nl_images/images_data/A50...
1220,A50-HM1868-ID11520_20190322_0201.jpg,Fog,0,Night,/home/ubuntu/michael/nl_images/images_data/A50...
...,...,...,...,...,...
8855,A27-HM675-ID10959_20170913_0421.jpg,No Fog,20,Night,/home/ubuntu/michael/nl_images/images_data/A27...
3946,A9-HM470-ID12806_20190504_0011.jpg,No Fog,0,Night,/home/ubuntu/michael/nl_images/images_data/A9-...
30941,A50-HM1888-ID11528_20181009_1901.jpg,No Fog,0,Night,/home/ubuntu/michael/nl_images/images_data/A50...
8528,A4-HM52-ID11043_20170906_2250.jpg,No Fog,0,Night,/home/ubuntu/michael/nl_images/images_data/A4-...


In [38]:
# save the balanced night sample dataframe to a pickle file in the shared
# pickle directory (`path`, set in the configuration cell near the top)
night_sample_df.to_pickle(path + 'night_df.pkl')

In [39]:
# X (input) values for night images: the sampled file names as a plain list
X_values_night = night_sample_df['filename'].tolist()
len(X_values_night)  # expected: 2908

2908

In [40]:
# y (output) values for night images: the sampled labels as a plain list
y_values_night = night_sample_df['label'].tolist()
len(y_values_night)  # expected: 2908

2908

In [35]:
# create a night dataframe of input (Filename) and output (Label) variables;
# built in one step from the two lists instead of filling an empty frame
night_io_df = pd.DataFrame({
    'Filename': X_values_night,
    'Label': y_values_night,
})

# save dataframe to pickle file in the shared pickle directory (`path`)
night_io_df.to_pickle(path + 'night_io_df.pkl')

In [41]:
# stack the balanced day and night samples (day rows first) into one dataframe
df_list = [day_sample_df, night_sample_df]
sample_df = pd.concat(df_list, axis=0)
sample_df

Unnamed: 0,filename,label,day_phase,phase,filepath
4881,A2-HM748-ID10907_20190322_0540.jpg,Fog,10,Day,/home/ubuntu/michael/nl_images/images_data/A2-...
47516,A27-HM641-ID10973_20200101_1020.jpg,Fog,1,Day,/home/ubuntu/michael/nl_images/images_data/A27...
31434,A50-HM1864-ID11640_20191231_0931.jpg,Fog,1,Day,/home/ubuntu/michael/nl_images/images_data/A50...
51563,A15-HM236-ID13549_20200121_1450.jpg,Fog,1,Day,/home/ubuntu/michael/nl_images/images_data/A15...
46608,A9-HM458-ID12839_20200121_1600.jpg,Fog,1,Day,/home/ubuntu/michael/nl_images/images_data/A9-...
...,...,...,...,...,...
8855,A27-HM675-ID10959_20170913_0421.jpg,No Fog,20,Night,/home/ubuntu/michael/nl_images/images_data/A27...
3946,A9-HM470-ID12806_20190504_0011.jpg,No Fog,0,Night,/home/ubuntu/michael/nl_images/images_data/A9-...
30941,A50-HM1888-ID11528_20181009_1901.jpg,No Fog,0,Night,/home/ubuntu/michael/nl_images/images_data/A50...
8528,A4-HM52-ID11043_20170906_2250.jpg,No Fog,0,Night,/home/ubuntu/michael/nl_images/images_data/A4-...


In [40]:
# save the combined day + night sample dataframe to a pickle file in the
# shared pickle directory (`path`, set in the configuration cell near the top)
sample_df.to_pickle(path + 'sample_df.pkl')