In [None]:
import pandas as pd

## Get annotations from roboflow in the right format
Reasons for this notebook:

Roboflow outputted blank lines for tiles that were marked null (i.e. did not have any annotations on them). These blank lines cause errors when running anchor-optimization scripts and during training of the model. Typically, images without annotations should have the format:
`/path/to/image.jpg,,,,,` See [here](https://github.com/fizyr/keras-retinanet#csv-datasets). The below code removes these blank lines. 

Additionally, Roboflow limits the number of images per free project to 10,000 so we used two projects. The first project is stored in `roboflow2` and the second project is stored in `roboflow_split5`. The below code concatenates the csv files.

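Finally, some accidental annotations were made. These annotations are less than a pixel wide/tall; the code below removes them by dropping every bounding box whose width or height is under 10 pixels (the EDA section further down shows how this cutoff was chosen).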

In [None]:
# Raw per-split annotation CSVs (train / validation / test), relative to this notebook.
train_path, val_path, test_path = (
    "../Data/image-level-split/train/train_anno.csv",
    "../Data/image-level-split/valid/valid_anno.csv",
    "../Data/image-level-split/test/test_anno.csv",
)

In [None]:
def process_csv(input_path, output_path, min_size=10):
    """Clean a headerless annotation CSV and write the filtered copy.

    The file at ``input_path`` is read without a header row, its positional
    columns are given names, and every row whose bounding box is narrower or
    shorter than ``min_size`` is discarded. The surviving rows are written to
    ``output_path`` with neither header nor index.

    Parameters
    ----------
    input_path : str or path-like
        Location of the raw annotation CSV (no header row).
    output_path : str or path-like
        Destination for the filtered CSV.
    min_size : int or float, optional
        Smallest width and height (in the coordinate units of the file) a box
        must have to be kept. Defaults to 10, the cutoff this function
        previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        The filtered annotations with the renamed columns.
    """
    # Blank lines are dropped at parse time; this is the pandas default and is
    # spelled out here because removing them is one purpose of this function.
    raw = pd.read_csv(input_path, header=None, skip_blank_lines=True)
    print("dataframe 1 shape", raw.shape)

    # Name the positional columns. Keys absent from the file are ignored by
    # DataFrame.rename.
    # NOTE(review): a 7th column, if present, would also be called "image" and
    # duplicate the first label -- confirm that is intended.
    annotations = raw.rename(
        columns={0:"image", 1:"x1", 2:"y1", 3:"x2", 4:"y2", 5:"class", 6:"image"}
    )

    # Box extents; abs() makes the result independent of corner ordering.
    width = (annotations['x1'] - annotations['x2']).abs()
    height = (annotations['y1'] - annotations['y2']).abs()

    # Comparisons against NaN are False, so rows without coordinates are
    # dropped here as well (same as the earlier sequential filters).
    # NOTE(review): this also removes any "image,,,,," negative-example rows --
    # confirm that is desired for training.
    annotations = annotations[(width >= min_size) & (height >= min_size)]

    # Persist the cleaned annotations and hand them back to the caller.
    annotations.to_csv(output_path, index=False, header=False)
    return annotations

In [None]:
# Clean each split's raw annotations and write the result next to its source file.
train = process_csv(
    input_path=train_path,
    output_path='../Data/image-level-split/train/annotations_final.csv',
)
val = process_csv(
    input_path=val_path,
    output_path='../Data/image-level-split/valid/annotations_final.csv',
)
test = process_csv(
    input_path=test_path,
    output_path='../Data/image-level-split/test/annotations_final.csv',
)

In [None]:
# Display the filtered training annotations returned by process_csv.
train

### Determine smallest legitimate bounding box

Here we get a sense of the size of bounding boxes using the annotations in `roboflow2/train` which contains the bulk of our training images.

In [None]:
# Load the raw (unfiltered) training annotations for the bounding-box size EDA.
# `train_path` is the training CSV path configured near the top of the notebook;
# reading it directly keeps this cell runnable on a fresh kernel.
df_train1 = pd.read_csv(train_path, header=None).rename(
    columns={0:"image", 1:"x1", 2:"y1", 3:"x2", 4:"y2", 5:"class", 6:"image"}
)
df_train1

In [None]:
# Absolute box extents along each axis, stored as new columns for the EDA below.
df_train1['width'] = (df_train1['x1'] - df_train1['x2']).abs()
df_train1['height'] = (df_train1['y1'] - df_train1['y2']).abs()

In [None]:
# The 20 narrowest boxes (ascending sort is the default).
df_train1.sort_values(by='width').head(n=20)

In [None]:
# The 20 shortest boxes (ascending sort is the default).
df_train1.sort_values(by='height').head(n=20)

### Quick summarization
Determine the number of elephant seals in each class for each split of the data.

In [None]:
# Per-class tallies for the training split: count the True entries of each class mask.
train_cows = int(train['class'].eq('cow').sum())
train_bulls = int(train['class'].eq('bull').sum())
train_pups = int(train['class'].eq('pup').sum())

print(
    "In the training dataset, there are", train_bulls, "bulls,", train_cows, "cows, and", train_pups,
    "pups for a total of", len(train), "seals."
)

In [None]:
# Per-class tallies for the validation split: count the True entries of each class mask.
val_cows = int(val['class'].eq('cow').sum())
val_bulls = int(val['class'].eq('bull').sum())
val_pups = int(val['class'].eq('pup').sum())

print(
    "In the validation dataset, there are", val_bulls, "bulls,", val_cows, "cows, and", val_pups,
    "pups for a total of", len(val), "seals."
)

In [None]:
# Per-class tallies for the test split: count the True entries of each class mask.
test_cows = int(test['class'].eq('cow').sum())
test_bulls = int(test['class'].eq('bull').sum())
test_pups = int(test['class'].eq('pup').sum())

print(
    "In the testing dataset, there are", test_bulls, "bulls,", test_cows, "cows, and", test_pups,
    "pups for a total of", len(test), "seals."
)