In [10]:
import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import train_test_split

## Load Data

In [11]:
import os
import shutil

df = pd.read_csv('../buffelgrass-onetime.csv')
df_filtered = df[['Observation_ID', 'Observation_Date', 'Create_Date','Latitude', 'Longitude', 'Abundance_Name']]
df_filtered['Abundance_Binary'] = df_filtered['Abundance_Name'].apply(lambda x: 1 if x == '75-94%' or x == '50-74%' or x == '95% or more' else 0)
files = os.listdir('planet-imgs-green/')
#files = [int(file[:4]) for file in files if file.endswith('green.png')]
df_filtered = df_filtered[np.isin(df_filtered.Observation_ID.values, files)].reset_index(drop=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['Abundance_Binary'] = df_filtered['Abundance_Name'].apply(lambda x: 1 if x == '75-94%' or x == '50-74%' or x == '95% or more' else 0)


In [5]:
df_filtered

Unnamed: 0,Observation_ID,Observation_Date,Create_Date,Latitude,Longitude,Abundance_Name,Abundance_Binary
0,2149,7/9/2019 7:00,7/9/2019,32.230428,-110.951362,,0
1,2150,7/9/2019 7:00,7/9/2019,32.194713,-110.914275,,0
2,2152,7/9/2019 7:00,7/9/2019,32.145890,-110.958221,5-24%,0
3,2153,7/10/2019 7:00,7/10/2019,32.225557,-110.953864,Less than 5%,0
4,2154,7/3/2019 7:00,7/10/2019,32.180766,-110.842864,75-94%,1
...,...,...,...,...,...,...,...
189,3218,11/28/2021 7:00,11/28/2021,32.235053,-111.002037,25-49%,0
190,3220,11/28/2021 7:00,11/28/2021,32.219952,-110.982477,5-24%,0
191,3221,11/28/2021 7:00,11/28/2021,32.208569,-110.988585,Less than 5%,0
192,3222,11/28/2021 7:00,11/28/2021,32.202816,-110.988831,50-74%,1


In [10]:
from sklearn.model_selection import KFold
df_train, df_test_split = train_test_split(df_filtered, test_size=0.2, random_state=42, stratify=df_filtered['Abundance_Binary'])
kf = KFold(n_splits=5)

In [21]:
df_train.to_csv('../buffelgrass-onetime-train.csv', index=False)
df_test_split.to_csv('../buffelgrass-onetime-test.csv', index=False)

In [6]:
for i, (train_index, val_index) in enumerate(kf.split(df_train)):
    
    ## generate splits of train & validation
    print('=====================')
    print(f'Split: {i}')
    df_val_split = df_train.iloc[val_index]
    df_train_split = df_train.iloc[train_index]
    print(f'Train Split: {df_train_split.shape}')
    print(f'Val Split: {df_val_split.shape}')
    print(f'Test Split: {df_test_split.shape}')
    
    ## get image ids
    train_imgs_ids = df_train_split.Observation_ID.values
    val_imgs_ids = df_val_split.Observation_ID.values
    test_imgs_ids = df_test_split.Observation_ID.values
    
    ## save images
    for img in train_imgs_ids:
        shutil.copyfile(f'planet-imgs-original/{img}_planet.png', f'planet-imgs-original/split{i+1}/train/{img}_planet.png')        
    for img in val_imgs_ids:
        shutil.copyfile(f'planet-imgs-original/{img}_planet.png', f'planet-imgs-original/split{i+1}/val/{img}_planet.png')
    for img in test_imgs_ids:
        shutil.copyfile(f'planet-imgs-original/{img}_planet.png', f'planet-imgs-original/split{i+1}/test/{img}_planet.png')
        
    ## save metadata
    df_train_meta = df_train_split[['Observation_ID', 'Abundance_Binary']].copy()
    df_train_meta.columns = ['file_name', 'labels']
    df_train_meta['file_name'] = [str(file)+"_planet.png" for file in df_train_meta.file_name.values]
    
    df_val_meta = df_val_split[['Observation_ID', 'Abundance_Binary']].copy()
    df_val_meta.columns = ['file_name', 'labels']
    df_val_meta['file_name'] = [str(file)+"_planet.png" for file in df_val_meta.file_name.values]
    
    df_test_meta = df_test_split[['Observation_ID', 'Abundance_Binary']].copy()
    df_test_meta.columns = ['file_name', 'labels']
    df_test_meta['file_name'] = [str(file)+"_planet.png" for file in df_test_meta.file_name.values]
    
    df_train_meta.to_csv(f'planet-imgs-original/split{i+1}/train/metadata.csv', index=False)
    df_val_meta.to_csv(f'planet-imgs-original/split{i+1}/val/metadata.csv', index=False)
    df_test_meta.to_csv(f'planet-imgs-original/split{i+1}/test/metadata.csv', index=False)

Split: 0
Train Split: (124, 7)
Val Split: (31, 7)
Test Split: (39, 7)
Split: 1
Train Split: (124, 7)
Val Split: (31, 7)
Test Split: (39, 7)
Split: 2
Train Split: (124, 7)
Val Split: (31, 7)
Test Split: (39, 7)
Split: 3
Train Split: (124, 7)
Val Split: (31, 7)
Test Split: (39, 7)
Split: 4
Train Split: (124, 7)
Val Split: (31, 7)
Test Split: (39, 7)


In [13]:
import os
from PIL import Image
for i in range(5):
    
    ## train
    train_files = os.listdir(f'planet-imgs-original/split{i+1}/train/')
    for file in train_files:
        if file.endswith('.png'):
            img = Image.open(f'planet-imgs-original/split{i+1}/train/{file}').convert("RGB")
            img.save(f'planet-imgs-original/split{i+1}/train/{file}')
        
    ## validation
    val_files = os.listdir(f'planet-imgs-original/split{i+1}/val/')
    for file in val_files:
        if file.endswith('.png'):    
            img = Image.open(f'planet-imgs-original/split{i+1}/val/{file}').convert("RGB")
            img.save(f'planet-imgs-original/split{i+1}/val/{file}')
        
    ## test
    test_files = os.listdir(f'planet-imgs-original/split{i+1}/test/')
    for file in test_files:
        if file.endswith('.png'):
            img = Image.open(f'planet-imgs-original/split{i+1}/test/{file}').convert("RGB")
            img.save(f'planet-imgs-original/split{i+1}/test/{file}')        