# **This script does the following for Night images:**

1. takes as input the source directory of all images in the sample (a subset of the entire population of images)
2. distributes images into train/validate/test folders with a 0.8/0.1/0.1 split
3. separates images into classes and stores in above folders (ex: train/fog and train/nofog, etc.)

In [1]:
# import necessary libraries
import os
import numpy as np
import pandas as pd
import pickle
import shutil

In [2]:
# Read the pickle file that contains the dataframe of image filenames,
# image labels and image filepaths.
# NOTE: pickle.load can execute arbitrary code; only load pickles that were
# produced by a trusted source.
file_path = '/home/ubuntu/michael/my_pickles/'
infile_name = 'night_df.pkl'
# The with-block closes the file handle as soon as the dataframe is loaded
# (previously the handle was left open for the rest of the session).
with open(os.path.join(file_path, infile_name), 'rb') as infile:
    night_df = pickle.load(infile, encoding='utf-8')
night_df

Unnamed: 0,filename,label,day_phase,phase,filepath
52183,A9-HM470-ID12831_20200101_0331.jpg,Fog,0,Night,/home/ubuntu/michael/nl_images/images_data/A9-...
51511,A9-HM467-ID12834_20191231_2051.jpg,Fog,0,Night,/home/ubuntu/michael/nl_images/images_data/A9-...
42537,A9-HM467-ID12834_20200101_0101.jpg,Fog,0,Night,/home/ubuntu/michael/nl_images/images_data/A9-...
37402,A50-HM1867-ID11639_20191231_0651.jpg,Fog,20,Night,/home/ubuntu/michael/nl_images/images_data/A50...
1220,A50-HM1868-ID11520_20190322_0201.jpg,Fog,0,Night,/home/ubuntu/michael/nl_images/images_data/A50...
...,...,...,...,...,...
8855,A27-HM675-ID10959_20170913_0421.jpg,No Fog,20,Night,/home/ubuntu/michael/nl_images/images_data/A27...
3946,A9-HM470-ID12806_20190504_0011.jpg,No Fog,0,Night,/home/ubuntu/michael/nl_images/images_data/A9-...
30941,A50-HM1888-ID11528_20181009_1901.jpg,No Fog,0,Night,/home/ubuntu/michael/nl_images/images_data/A50...
8528,A4-HM52-ID11043_20170906_2250.jpg,No Fog,0,Night,/home/ubuntu/michael/nl_images/images_data/A4-...


In [3]:
# Encode the text labels as integers ('No Fog' -> 0, 'Fog' -> 1) and store
# them in a new 'y_true' column; a label missing from the mapping becomes NaN.
mapping = {
    'No Fog': 0,
    'Fog': 1,
}
night_df['y_true'] = y_true = night_df['label'].map(mapping)
night_df

Unnamed: 0,filename,label,day_phase,phase,filepath,y_true
52183,A9-HM470-ID12831_20200101_0331.jpg,Fog,0,Night,/home/ubuntu/michael/nl_images/images_data/A9-...,1
51511,A9-HM467-ID12834_20191231_2051.jpg,Fog,0,Night,/home/ubuntu/michael/nl_images/images_data/A9-...,1
42537,A9-HM467-ID12834_20200101_0101.jpg,Fog,0,Night,/home/ubuntu/michael/nl_images/images_data/A9-...,1
37402,A50-HM1867-ID11639_20191231_0651.jpg,Fog,20,Night,/home/ubuntu/michael/nl_images/images_data/A50...,1
1220,A50-HM1868-ID11520_20190322_0201.jpg,Fog,0,Night,/home/ubuntu/michael/nl_images/images_data/A50...,1
...,...,...,...,...,...,...
8855,A27-HM675-ID10959_20170913_0421.jpg,No Fog,20,Night,/home/ubuntu/michael/nl_images/images_data/A27...,0
3946,A9-HM470-ID12806_20190504_0011.jpg,No Fog,0,Night,/home/ubuntu/michael/nl_images/images_data/A9-...,0
30941,A50-HM1888-ID11528_20181009_1901.jpg,No Fog,0,Night,/home/ubuntu/michael/nl_images/images_data/A50...,0
8528,A4-HM52-ID11043_20170906_2250.jpg,No Fog,0,Night,/home/ubuntu/michael/nl_images/images_data/A4-...,0


In [4]:
# outcome variable (label)
sample_label_list = night_df.label
print(f'Total number of labels: {len(sample_label_list)}') #2908

Total number of labels: 2908


In [5]:
# predictor variable (image)
sample_filename_list = night_df.filename
print(f'Total number of files: {len(sample_filename_list)}') #2908

Total number of files: 2908


## **Randomly shuffle dataset**

In [6]:
# Shuffle every row (frac=1) with a fixed seed so the result is reproducible,
# then renumber the index from 0 so later slices line up with row position.
night_df = (
    night_df
    .sample(frac=1, random_state=22, axis=0)
    .reset_index(drop=True)
)

In [7]:
night_df

Unnamed: 0,filename,label,day_phase,phase,filepath,y_true
0,A50-HM1841-ID11511_20200101_2041.jpg,Fog,0,Night,/home/ubuntu/michael/nl_images/images_data/A50...,1
1,A1-HM49-ID12953_20170914_0041.jpg,No Fog,0,Night,/home/ubuntu/michael/nl_images/images_data/A1-...,0
2,A15-HM793-ID12043_20190811_0020.jpg,No Fog,0,Night,/home/ubuntu/michael/nl_images/images_data/A15...,0
3,A50-HM1927-ID11617_20200121_2050.jpg,No Fog,0,Night,/home/ubuntu/michael/nl_images/images_data/A50...,0
4,A9-HM470-ID12831_20200121_2100.jpg,Fog,0,Night,/home/ubuntu/michael/nl_images/images_data/A9-...,1
...,...,...,...,...,...,...
2903,A9-HM469-ID12805_20190814_0130.jpg,No Fog,0,Night,/home/ubuntu/michael/nl_images/images_data/A9-...,0
2904,A50-HM1941-ID11612_20200122_2311.jpg,Fog,0,Night,/home/ubuntu/michael/nl_images/images_data/A50...,1
2905,A50-HM1894-ID11530_20200101_0621.jpg,Fog,20,Night,/home/ubuntu/michael/nl_images/images_data/A50...,1
2906,A50-HM1924-ID11541_20190812_0140.jpg,Fog,0,Night,/home/ubuntu/michael/nl_images/images_data/A50...,1


## **Create training, testing, and validation sets (of images)**

In [8]:
# 80/10/10 split: train/val/test
# split_1 and split_2 are the row positions where the validation and test
# portions begin; int() truncates, so any remainder rows land in the test set.
filenames = night_df['filename']
split_1 = int(0.8 * len(filenames))
split_2 = int(0.9 * len(filenames))
train_filenames, val_filenames, test_filenames = (
    filenames[:split_1],
    filenames[split_1:split_2],
    filenames[split_2:],
)

In [9]:
# total number of rows (images) in the shuffled night dataframe
len(night_df.filename) #2908

2908

In [10]:
len(train_filenames) #2326
len(val_filenames) # 291
len(test_filenames) # 291

291

In [30]:
night_train_filenames = train_filenames
night_val_filenames = val_filenames

In [31]:
night_train_filenames = train_filenames # these files cannot be used in test set
night_val_filenames = val_filenames # these files cannot be used in test set
night_filenames = pd.concat([night_train_filenames, night_val_filenames])
night_filenames # list of filenames that cannot be used in day test set

0       A50-HM1841-ID11511_20200101_2041.jpg
1          A1-HM49-ID12953_20170914_0041.jpg
2        A15-HM793-ID12043_20190811_0020.jpg
3       A50-HM1927-ID11617_20200121_2050.jpg
4         A9-HM470-ID12831_20200121_2100.jpg
                        ...                 
2612    A50-HM1873-ID11522_20200122_0100.jpg
2613    A50-HM1922-ID11619_20200101_2011.jpg
2614      A2-HM742-ID10906_20200101_1920.jpg
2615      A9-HM456-ID12840_20191231_1831.jpg
2616    A28-HM1995-ID11816_20170915_2131.jpg
Name: filename, Length: 2617, dtype: object

In [34]:
len(night_filenames)

2617

In [32]:
import pickle
file_path = '/home/ubuntu/michael/my_pickles/'
# Persist the combined train + validation filenames (night_filenames) so they
# can be reloaded later and kept out of other test sets.
night_filenames_path = file_path + 'night_filenames.pkl'
with open(night_filenames_path, 'wb') as f:
    pickle.dump(night_filenames, f)

In [11]:
# check % of entire sample_df is composed of training, validation and testing sets
train_percent = len(train_filenames) / len(night_df) # 0.8
validate_percent = len(val_filenames) / len(night_df) # 0.1
test_percent = len(test_filenames) / len(night_df) # 0.1

In [12]:
# check to ensure fog and nofog files add up to sum in sample_df
sum_sets = len(train_filenames) + len(val_filenames) + len(test_filenames)
#sum_sets #2908

In [13]:
# create a split for training
# (sanity check: recomputes the 80% boundary index and confirms its share of the data)
split_1 = int(len(night_df.filename) * 0.8)
split_1 #2326
round(split_1 / len(night_df.filename),2) #0.80

0.8

In [14]:
# create a split that will be used for testing and validation
# (sanity check: recomputes the 90% boundary index and confirms its share of the data)
split_2 = int(0.9 * len(night_df.filename)) 
split_2 #2617
round(split_2 / len(night_df.filename),2) #0.90

0.9

In [15]:
# check difference between splits
# (diff_1 is the number of rows between the two boundaries, i.e. the validation set size)
diff_1 = split_2 - split_1 #291
round(diff_1 / len(night_df.filename),2) #0.10

0.1

In [16]:
# check difference between sample_df and split 2
# (diff_2 is the number of rows after the second boundary, i.e. the test set size)
diff_2 = len(night_df.filename) - split_2 #291
round(diff_2 / len(night_df.filename),2) #0.10

0.1

## **Check distribution of "fog" and "no fog" in each set**

In [17]:
# split sample_df for training
night_train_df = night_df[:split_1]
night_train_df #2326

Unnamed: 0,filename,label,day_phase,phase,filepath,y_true
0,A50-HM1841-ID11511_20200101_2041.jpg,Fog,0,Night,/home/ubuntu/michael/nl_images/images_data/A50...,1
1,A1-HM49-ID12953_20170914_0041.jpg,No Fog,0,Night,/home/ubuntu/michael/nl_images/images_data/A1-...,0
2,A15-HM793-ID12043_20190811_0020.jpg,No Fog,0,Night,/home/ubuntu/michael/nl_images/images_data/A15...,0
3,A50-HM1927-ID11617_20200121_2050.jpg,No Fog,0,Night,/home/ubuntu/michael/nl_images/images_data/A50...,0
4,A9-HM470-ID12831_20200121_2100.jpg,Fog,0,Night,/home/ubuntu/michael/nl_images/images_data/A9-...,1
...,...,...,...,...,...,...
2321,A1-HM65-ID12966_20170916_0121.jpg,Fog,0,Night,/home/ubuntu/michael/nl_images/images_data/A1-...,1
2322,A28-HM1960-ID13520_20200101_0340.jpg,Fog,0,Night,/home/ubuntu/michael/nl_images/images_data/A28...,1
2323,A5-HM80-ID13769_20170917_0111.jpg,No Fog,0,Night,/home/ubuntu/michael/nl_images/images_data/A5-...,0
2324,A9-HM493-ID12818_20191003_1940.jpg,Fog,0,Night,/home/ubuntu/michael/nl_images/images_data/A9-...,1


In [18]:
# Class balance of the training set: count of 'Fog' rows divided by count of
# 'No Fog' rows (a value of 1.0 would mean the two classes are the same size).
train_labels = night_train_df['label']
night_train_fog = night_train_df.loc[train_labels == 'Fog']
night_train_nofog = night_train_df.loc[train_labels == 'No Fog']
night_train_ratio = len(night_train_fog) / len(night_train_nofog)
night_train_ratio # 0.9914

0.9914383561643836

In [19]:
len(night_train_fog) #1158
#len(night_train_nofog) #1168

1158

In [20]:
# split sample_df for validation
night_val_df = night_df[split_1:split_2]
#night_val_df #291

In [21]:
# Class balance of the validation set: count of 'Fog' rows divided by count of
# 'No Fog' rows (a value of 1.0 would mean the two classes are the same size).
val_labels = night_val_df['label']
night_val_fog = night_val_df.loc[val_labels == 'Fog']
night_val_nofog = night_val_df.loc[val_labels == 'No Fog']
night_val_ratio = len(night_val_fog) / len(night_val_nofog)
night_val_ratio # 0.9662

0.9662162162162162

In [22]:
len(night_val_fog) #143

143

In [23]:
len(night_val_nofog) #148

148

In [24]:
# split sample_df for testing
night_test_df = night_df[split_2:]
#night_test_df #291

In [25]:
# Class balance of the testing set: count of 'Fog' rows divided by count of
# 'No Fog' rows (a value of 1.0 would mean the two classes are the same size).
test_labels = night_test_df['label']
night_test_fog = night_test_df.loc[test_labels == 'Fog']
night_test_nofog = night_test_df.loc[test_labels == 'No Fog']
night_test_ratio = len(night_test_fog) / len(night_test_nofog)
#night_test_ratio # 1.1087

In [27]:
len(night_test_nofog) #138

138

In [28]:
len(night_test_fog) #153

153

In [39]:
# Write each split dataframe (train, validation, test) to its own pickle file.
for split_frame, pickle_path in (
    (night_train_df, '/home/ubuntu/michael/my_pickles/night_train_df.pkl'),
    (night_val_df, '/home/ubuntu/michael/my_pickles/night_val_df.pkl'),
    (night_test_df, '/home/ubuntu/michael/my_pickles/night_test_df.pkl'),
):
    split_frame.to_pickle(pickle_path)

## **Save images to train/test/validation folders**

In [90]:
# create lists of training filenames
night_train_fog_filenames = night_train_fog.filename.tolist() 
night_train_nofog_filenames = night_train_nofog.filename.tolist() 
#len(night_train_fog_filenames) # 1158
#len(night_train_nofog_filenames) # 1168
#night_train_fog_filenames[0]
#night_train_nofog_filenames[0]

In [92]:
# create lists of validation filenames
night_val_fog_filenames = night_val_fog.filename.tolist()
night_val_nofog_filenames = night_val_nofog.filename.tolist()
#len(night_val_nofog_filenames) # 148
#len(night_val_fog_filenames) # 143
#night_val_nofog_filenames[0]
#night_val_fog_filenames[0]

143

In [94]:
# create list of testing filenames
night_test_fog_filenames = night_test_fog.filename.tolist()
night_test_nofog_filenames = night_test_nofog.filename.tolist()
#len(night_test_nofog_filenames) # 138
#len(night_test_fog_filenames) # 153
#night_test_nofog_filenames[0]
#night_test_fog_filenames[0]

153

In [100]:
# create a function to iterate through list of files in src_dir and copy to appropriate dst_dir

src_dir = '/home/ubuntu/michael/nl_images/images_data/'

def copyFiles(fn_list, src_dir, dst_dir):
    """Copy every file named in fn_list from src_dir into dst_dir.

    Parameters
    ----------
    fn_list : iterable of str
        Bare filenames (not paths) to look up inside src_dir.
    src_dir : str
        Directory that holds the source files.
    dst_dir : str
        Directory to copy into; it is created if it does not exist yet.

    Returns
    -------
    None

    Filenames that are not regular files in src_dir are skipped (best effort),
    and a one-line summary of how many were skipped is printed afterwards so
    the omission is visible instead of silent.
    """
    # Create the destination up front so the first copy cannot fail on a
    # missing folder.
    os.makedirs(dst_dir, exist_ok=True)

    skipped = []
    for filename in fn_list:
        path = os.path.join(src_dir, filename)
        if not os.path.isfile(path):
            skipped.append(filename)
            continue
        # os.path.join copes with dst_dir given with or without a trailing '/'.
        shutil.copyfile(path, os.path.join(dst_dir, filename))

    if skipped:
        print(f'copyFiles: skipped {len(skipped)} file(s) not found in {src_dir}')
    return

In [112]:
# call function to copy files from src_dir to dst_dir folder
#
# NOTE(review): only the test/nofog copy at the bottom is active; the other
# five calls are commented out. Presumably each was run once by toggling the
# comments (the file counts later in this notebook show all six folders
# populated) -- confirm, and uncomment them to rebuild every folder on a
# fresh run.

#dst_dir_train_fog = '/home/ubuntu/michael/night/train/fog' 
#copyFiles(night_train_fog_filenames, src_dir, dst_dir_train_fog)

#dst_dir_train_nofog = '/home/ubuntu/michael/night/train/nofog/' 
#copyFiles(night_train_nofog_filenames, src_dir, dst_dir_train_nofog)

#dst_dir_val_fog = '/home/ubuntu/michael/night/validate/fog/' 
#copyFiles(night_val_fog_filenames, src_dir, dst_dir_val_fog)

#dst_dir_val_nofog = '/home/ubuntu/michael/night/validate/nofog/'
#copyFiles(night_val_nofog_filenames, src_dir, dst_dir_val_nofog )

#dst_dir_test_fog = '/home/ubuntu/michael/night/test/fog/' 
#copyFiles(night_test_fog_filenames, src_dir, dst_dir_test_fog)

# copy the 'No Fog' test images into the test/nofog folder
dst_dir_test_nofog = '/home/ubuntu/michael/night/test/nofog/' 
copyFiles(night_test_nofog_filenames, src_dir, dst_dir_test_nofog )

In [102]:
# count files in each folder
def fileCount(img_dir):
    """Return the number of entries in img_dir whose name ends with '.jpg'.

    The match is case-sensitive and looks only at the directory's immediate
    entries (no recursion into sub-folders).
    """
    jpg_total = 0
    for entry_name in os.listdir(img_dir):
        if entry_name.endswith('.jpg'):
            jpg_total += 1
    return jpg_total

In [103]:
n_tr_f_count = fileCount('/home/ubuntu/michael/night/train/fog')
n_tr_f_count

1158

In [105]:
n_tr_nf_count = fileCount('/home/ubuntu/michael/night/train/nofog/')
n_tr_nf_count

1168

In [107]:
n_val_f_count = fileCount('/home/ubuntu/michael/night/validate/fog/')
n_val_f_count

143

In [109]:
n_val_nf_count = fileCount('/home/ubuntu/michael/night/validate/nofog/')
n_val_nf_count

148

In [111]:
n_te_f_count = fileCount('/home/ubuntu/michael/night/test/fog/')
n_te_f_count

153

In [113]:
n_te_nf_count = fileCount('/home/ubuntu/michael/night/test/nofog/')
n_te_nf_count

138