# Generate CSV dataset file

In [1]:
import os
import pandas as pd

## French dataset

In [2]:
DATASET_PATH = "/gpfsscratch/rech/ads/commun/datasets/extracted/Deep learning lynx - data/0_dataset_raw"

In [3]:
individual_folders = os.listdir(DATASET_PATH)

In [4]:
dataset_dict = {
    "filepath": [],
    "lynx_id": [],
    "source": [],
    "pattern": [],
    "date": [],
    "location": [],
    "image_number": []
}

In [5]:
def parse_filename(filename):
    filename = filename.split(".")[0]
    parts = filename.split("_")
    
    return {"id_lynx": parts[0], "source": parts[1], "pattern": parts[2], "date": parts[3], "location": parts[4], "image_number": parts[5]}

In [6]:
for current_directory, _, files in os.walk(DATASET_PATH):
    if "0_dataset_Marie_3_individus" in current_directory :
        continue

    for file in files:
        absolute_path = os.path.abspath(os.path.join(current_directory, file))
        if file.endswith((".txt", ".xlsx", ".db")):
            continue
        if file.startswith(('broken')):
            print(file)
            continue
            
        try:
            parsed_filename = parse_filename(file)
        except:
            print(absolute_path)  # TODO: problem with these filenames
            continue  # skip them
            
        dataset_dict["filepath"].append(absolute_path)
        dataset_dict["lynx_id"].append(parsed_filename["id_lynx"])
        dataset_dict["source"].append(parsed_filename["source"])
        dataset_dict["pattern"].append(parsed_filename["pattern"])
        dataset_dict["date"].append(parsed_filename["date"])
        dataset_dict["location"].append(parsed_filename["location"])
        dataset_dict["image_number"].append(parsed_filename["image_number"])

In [7]:
df_france = pd.DataFrame(dataset_dict)
df_france

Unnamed: 0,filepath,lynx_id,source,pattern,date,location,image_number
0,/gpfsscratch/rech/ads/commun/datasets/extracte...,Van-Gogh,OCS,,2012-04-27,,5
1,/gpfsscratch/rech/ads/commun/datasets/extracte...,Van-Gogh,OCS,,2012-05-05,,1
2,/gpfsscratch/rech/ads/commun/datasets/extracte...,Van-Gogh,OCS,,2012-03-17,,3
3,/gpfsscratch/rech/ads/commun/datasets/extracte...,Van-Gogh,OCS,,2012-03-16,,6
4,/gpfsscratch/rech/ads/commun/datasets/extracte...,Van-Gogh,OCS,,2012-03-05,,7
...,...,...,...,...,...,...,...
3327,/gpfsscratch/rech/ads/commun/datasets/extracte...,679=L192,OFB,SPOTS,2015-07-21,Goumois,4
3328,/gpfsscratch/rech/ads/commun/datasets/extracte...,679=L192,OFB,SPOTS,2015-07-21,Goumois,3
3329,/gpfsscratch/rech/ads/commun/datasets/extracte...,679=L192,OFB,SPOTS,2015-07-21,Goumois,1
3330,/gpfsscratch/rech/ads/commun/datasets/extracte...,1017,OFB,SPOTS,2017-08-08,Chaux-Neuve,2


In [8]:
df_france["pattern"].value_counts()

pattern
OCELLES    2397
SPOTS       803
NA          132
Name: count, dtype: int64

In [9]:
df_france.to_csv("/gpfsscratch/rech/ads/commun/datasets/extracted/lynx_dataset_france.csv", index=False)

## Croatian dataset

In [10]:
df_croatia = pd.read_excel("/gpfsscratch/rech/ads/commun/datasets/extracted/croatia/list_of_lynx_photos1.xlsx")
df_croatia

Unnamed: 0,datum_vrijeme,pouzdanost,broj_zivotinja,broj_mladih,img,oznaka,ime,naziv,spol,lokacija,slikana_strana_zivotinje,lat,lon
0,2016-05-19 01:15:00,1,1,0,lynx/photos/RIS.JPG,Varošina 2016,Image,Geonatura d.o.o.,Unknown,POINT (15.439715 44.630702),right,15.439715,44.630702
1,2016-10-22 03:18:00,1,1,0,lynx/photos/PICT0290.JPG,Goran ZIP L09,Image,JU Priroda,M,POINT (14.9107031584 45.2327798187),left,14.910703,45.232780
2,2016-10-23 11:35:00,1,1,0,lynx/photos/PICT0310_GfUlKy6.JPG,Goran ZIP L09,Image,JU Priroda,M,POINT (14.9107031584 45.2327798187),right,14.910703,45.232780
3,2013-12-10 10:21:00,1,1,0,lynx/photos/PICT0236.JPG,Pakleni,Image,NP Paklenica,Unknown,POINT (15.4829004594 44.3279410748),none,15.482900,44.327941
4,2014-09-13 17:49:00,1,1,0,lynx/photos/PICT0001_8J6PCAA.JPG,Nik,Image,NP Paklenica,Unknown,POINT (15.4792073827 44.3398982543),left,15.479207,44.339898
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2007,2023-05-13 18:06:00,1,1,0,lynx/photos/B1000363.JPG,Slavko L35,Image,Veterinarski fakultet,M,POINT (14.949509 45.144032),left,14.949509,45.144032
2008,2022-04-05 05:31:00,1,1,0,lynx/photos/PICT0082_8hHOzSQ.JPG,Vilim,Image,PP Velebit,M,POINT (15.043362 44.881497),left,15.043362,44.881497
2009,2023-10-20 06:40:00,1,2,1,lynx/photos/IMG-20231112-WA0007.jpg,Crno jezero2 mladunac 2023,Image,Veterinarski fakultet,Unknown,POINT (15.24886 44.828308),left,15.248860,44.828308
2010,2023-10-20 06:40:00,1,2,1,lynx/photos/IMG-20231112-WA0007.jpg,Crno jezero 2,Image,Veterinarski fakultet,Z,POINT (15.24886 44.828308),left,15.248860,44.828308


#### Column renaming

In [11]:
names = {
    "datum_vrijeme": "date",
    "img": "filepath",
    "oznaka": "lynx_id",
    "spol": "sex",
    "lat": "latitude",
    "lon": "longitude",
    "slikana_strana_zivotinje": "side",
    "broj_zivotinja": "number_animals",
    "broj_mladih": "number_children",
    "naziv": "source", # doubt
}

df_croatia.rename(columns=names, inplace=True)
df_croatia

Unnamed: 0,date,pouzdanost,number_animals,number_children,filepath,lynx_id,ime,source,sex,lokacija,side,latitude,longitude
0,2016-05-19 01:15:00,1,1,0,lynx/photos/RIS.JPG,Varošina 2016,Image,Geonatura d.o.o.,Unknown,POINT (15.439715 44.630702),right,15.439715,44.630702
1,2016-10-22 03:18:00,1,1,0,lynx/photos/PICT0290.JPG,Goran ZIP L09,Image,JU Priroda,M,POINT (14.9107031584 45.2327798187),left,14.910703,45.232780
2,2016-10-23 11:35:00,1,1,0,lynx/photos/PICT0310_GfUlKy6.JPG,Goran ZIP L09,Image,JU Priroda,M,POINT (14.9107031584 45.2327798187),right,14.910703,45.232780
3,2013-12-10 10:21:00,1,1,0,lynx/photos/PICT0236.JPG,Pakleni,Image,NP Paklenica,Unknown,POINT (15.4829004594 44.3279410748),none,15.482900,44.327941
4,2014-09-13 17:49:00,1,1,0,lynx/photos/PICT0001_8J6PCAA.JPG,Nik,Image,NP Paklenica,Unknown,POINT (15.4792073827 44.3398982543),left,15.479207,44.339898
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2007,2023-05-13 18:06:00,1,1,0,lynx/photos/B1000363.JPG,Slavko L35,Image,Veterinarski fakultet,M,POINT (14.949509 45.144032),left,14.949509,45.144032
2008,2022-04-05 05:31:00,1,1,0,lynx/photos/PICT0082_8hHOzSQ.JPG,Vilim,Image,PP Velebit,M,POINT (15.043362 44.881497),left,15.043362,44.881497
2009,2023-10-20 06:40:00,1,2,1,lynx/photos/IMG-20231112-WA0007.jpg,Crno jezero2 mladunac 2023,Image,Veterinarski fakultet,Unknown,POINT (15.24886 44.828308),left,15.248860,44.828308
2010,2023-10-20 06:40:00,1,2,1,lynx/photos/IMG-20231112-WA0007.jpg,Crno jezero 2,Image,Veterinarski fakultet,Z,POINT (15.24886 44.828308),left,15.248860,44.828308


#### Drop useless columns
- lokacija=localisation : useless because this information is found in latitude and longitude
- pouzdanost=reliability : useless because always the same value (1)
- ime=image : useless because always the same value (Image)

In [12]:
df_croatia.drop(columns=["lokacija", "pouzdanost", "ime"], inplace=True)
df_croatia

Unnamed: 0,date,number_animals,number_children,filepath,lynx_id,source,sex,side,latitude,longitude
0,2016-05-19 01:15:00,1,0,lynx/photos/RIS.JPG,Varošina 2016,Geonatura d.o.o.,Unknown,right,15.439715,44.630702
1,2016-10-22 03:18:00,1,0,lynx/photos/PICT0290.JPG,Goran ZIP L09,JU Priroda,M,left,14.910703,45.232780
2,2016-10-23 11:35:00,1,0,lynx/photos/PICT0310_GfUlKy6.JPG,Goran ZIP L09,JU Priroda,M,right,14.910703,45.232780
3,2013-12-10 10:21:00,1,0,lynx/photos/PICT0236.JPG,Pakleni,NP Paklenica,Unknown,none,15.482900,44.327941
4,2014-09-13 17:49:00,1,0,lynx/photos/PICT0001_8J6PCAA.JPG,Nik,NP Paklenica,Unknown,left,15.479207,44.339898
...,...,...,...,...,...,...,...,...,...,...
2007,2023-05-13 18:06:00,1,0,lynx/photos/B1000363.JPG,Slavko L35,Veterinarski fakultet,M,left,14.949509,45.144032
2008,2022-04-05 05:31:00,1,0,lynx/photos/PICT0082_8hHOzSQ.JPG,Vilim,PP Velebit,M,left,15.043362,44.881497
2009,2023-10-20 06:40:00,2,1,lynx/photos/IMG-20231112-WA0007.jpg,Crno jezero2 mladunac 2023,Veterinarski fakultet,Unknown,left,15.248860,44.828308
2010,2023-10-20 06:40:00,2,1,lynx/photos/IMG-20231112-WA0007.jpg,Crno jezero 2,Veterinarski fakultet,Z,left,15.248860,44.828308


#### Duplicates

There are no duplicate lines.

In [13]:
df_croatia.duplicated().value_counts()

False    2012
Name: count, dtype: int64

However, some filepaths repeat themselves. **Some images may have several individuals on them. The associated lynx_id are therefore different.**

In [14]:
filepath_value_counts = df_croatia["filepath"].value_counts()
filepath_value_counts[filepath_value_counts != 1]

filepath
lynx/photos/1_27.JPG                        4
lynx/photos/1_5.JPG                         4
lynx/photos/04.12.20_mlad_1_L.png           3
lynx/photos/19.08.2020_Crna_ruja_2.JPG      3
lynx/photos/19.08.2020_Crna_ruja_1.JPG      3
                                           ..
lynx/photos/Screenshot_82.png               2
lynx/photos/IMG_0240_ulBxe30.JPG            2
lynx/photos/3._sterna_09.10_3.JPG           2
lynx/photos/I_00013b_Pcys5fy.JPG            2
lynx/photos/2023_01_24__Golo_trlo_11.JPG    2
Name: count, Length: 110, dtype: int64

In [15]:
df_croatia[df_croatia["filepath"] == "lynx/photos/1_5.JPG"]

Unnamed: 0,date,number_animals,number_children,filepath,lynx_id,source,sex,side,latitude,longitude
1715,2023-02-04 18:15:00,4,3,lynx/photos/1_5.JPG,Suzi mladunac3 2022,Rewilding Velebit,Unknown,right,14.958253,44.677339
1716,2023-02-04 18:15:00,4,3,lynx/photos/1_5.JPG,Suzi mladunac2 2022,Rewilding Velebit,Unknown,right,14.958253,44.677339
1717,2023-02-04 18:15:00,4,3,lynx/photos/1_5.JPG,Suzi mladunac1 2022,Rewilding Velebit,Unknown,right,14.958253,44.677339
1718,2023-02-04 18:15:00,4,3,lynx/photos/1_5.JPG,Suzi,Rewilding Velebit,Z,right,14.958253,44.677339


**Note**: it seems complicated to use images with several lynxes in a single image. This is because we don't know which lynx the `lynx_id` refers to in the image.

#### Fix filepath

Checks whether all filepaths currently contain the same prefix

In [16]:
df_croatia["filepath"].str.startswith("lynx/photos/").all()

True

In [17]:
df_croatia["filepath"] = df_croatia["filepath"].str.replace("lynx/photos/", "/gpfsscratch/rech/ads/commun/datasets/extracted/croatia/lynx_pic/")
df_croatia

Unnamed: 0,date,number_animals,number_children,filepath,lynx_id,source,sex,side,latitude,longitude
0,2016-05-19 01:15:00,1,0,/gpfsscratch/rech/ads/commun/datasets/extracte...,Varošina 2016,Geonatura d.o.o.,Unknown,right,15.439715,44.630702
1,2016-10-22 03:18:00,1,0,/gpfsscratch/rech/ads/commun/datasets/extracte...,Goran ZIP L09,JU Priroda,M,left,14.910703,45.232780
2,2016-10-23 11:35:00,1,0,/gpfsscratch/rech/ads/commun/datasets/extracte...,Goran ZIP L09,JU Priroda,M,right,14.910703,45.232780
3,2013-12-10 10:21:00,1,0,/gpfsscratch/rech/ads/commun/datasets/extracte...,Pakleni,NP Paklenica,Unknown,none,15.482900,44.327941
4,2014-09-13 17:49:00,1,0,/gpfsscratch/rech/ads/commun/datasets/extracte...,Nik,NP Paklenica,Unknown,left,15.479207,44.339898
...,...,...,...,...,...,...,...,...,...,...
2007,2023-05-13 18:06:00,1,0,/gpfsscratch/rech/ads/commun/datasets/extracte...,Slavko L35,Veterinarski fakultet,M,left,14.949509,45.144032
2008,2022-04-05 05:31:00,1,0,/gpfsscratch/rech/ads/commun/datasets/extracte...,Vilim,PP Velebit,M,left,15.043362,44.881497
2009,2023-10-20 06:40:00,2,1,/gpfsscratch/rech/ads/commun/datasets/extracte...,Crno jezero2 mladunac 2023,Veterinarski fakultet,Unknown,left,15.248860,44.828308
2010,2023-10-20 06:40:00,2,1,/gpfsscratch/rech/ads/commun/datasets/extracte...,Crno jezero 2,Veterinarski fakultet,Z,left,15.248860,44.828308


#### Deleting broken images

In [18]:
all_files_exist = all(df_croatia['filepath'].apply(os.path.exists))
all_files_exist

True

In [19]:
df_croatia.to_csv("/gpfsscratch/rech/ads/commun/datasets/extracted/croatia/lynx_dataset_croatia.csv", index=False)

## Merge the two dataframes

In [20]:
df_full = pd.concat([df_france, df_croatia], ignore_index=True)
df_full

Unnamed: 0,filepath,lynx_id,source,pattern,date,location,image_number,number_animals,number_children,sex,side,latitude,longitude
0,/gpfsscratch/rech/ads/commun/datasets/extracte...,Van-Gogh,OCS,,2012-04-27,,5,,,,,,
1,/gpfsscratch/rech/ads/commun/datasets/extracte...,Van-Gogh,OCS,,2012-05-05,,1,,,,,,
2,/gpfsscratch/rech/ads/commun/datasets/extracte...,Van-Gogh,OCS,,2012-03-17,,3,,,,,,
3,/gpfsscratch/rech/ads/commun/datasets/extracte...,Van-Gogh,OCS,,2012-03-16,,6,,,,,,
4,/gpfsscratch/rech/ads/commun/datasets/extracte...,Van-Gogh,OCS,,2012-03-05,,7,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5339,/gpfsscratch/rech/ads/commun/datasets/extracte...,Slavko L35,Veterinarski fakultet,,2023-05-13 18:06:00,,,1.0,0.0,M,left,14.949509,45.144032
5340,/gpfsscratch/rech/ads/commun/datasets/extracte...,Vilim,PP Velebit,,2022-04-05 05:31:00,,,1.0,0.0,M,left,15.043362,44.881497
5341,/gpfsscratch/rech/ads/commun/datasets/extracte...,Crno jezero2 mladunac 2023,Veterinarski fakultet,,2023-10-20 06:40:00,,,2.0,1.0,Unknown,left,15.248860,44.828308
5342,/gpfsscratch/rech/ads/commun/datasets/extracte...,Crno jezero 2,Veterinarski fakultet,,2023-10-20 06:40:00,,,2.0,1.0,Z,left,15.248860,44.828308


In [21]:
df_full.to_csv("/gpfsscratch/rech/ads/commun/datasets/extracted/lynx_dataset_full.csv", index=False)