In [14]:
import pandas as pd
import numpy as np
import joblib
import os

In [2]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [3]:
base_dir = "/content/drive/MyDrive/Data 298B Project Data/Rice Image Datasets - with Location and Time/Rice Leaf Diseases - Taiwan Filtered"

# **Image Metadata**

In [4]:
image_metadata = pd.read_csv(f"{base_dir}/image_metadata_taiwan_filtered_location.csv")

In [5]:
image_metadata.head()

Unnamed: 0,Id,Latitude,Longitude,Date,Class,Date and Time
0,P_20181227_153331_vHDR_Auto.jpg,24.073258,120.661451,2018-12-27,Brown Spot,2018:12:27 15:33:31
1,P_20181227_153343_vHDR_Auto (1).jpg,24.073258,120.661451,2018-12-27,Brown Spot,2018:12:27 15:33:43
2,P_20181227_153711_vHDR_Auto.jpg,24.073297,120.661364,2018-12-27,Brown Spot,2018:12:27 15:37:11
3,P_20181227_153709_vHDR_Auto.jpg,24.073297,120.661364,2018-12-27,Brown Spot,2018:12:27 15:37:09
4,P_20181227_154446_vHDR_Auto (1).jpg,24.07435,120.661598,2018-12-27,Brown Spot,2018:12:27 15:44:46


# **Weather Data**

In [6]:
weather_data = pd.read_csv(f"{base_dir}/weather_data_taiwan.csv")

In [7]:
weather_data.head()

Unnamed: 0,Latitude,Longitude,Date,Avg Temp 14d,Avg Humidity 14d,Total Precipitation 14d,Avg Wind Speed 14d
0,24.07,120.66,2018-12-27,19.328571,76.664286,5.7,29.171429
1,24.08,120.65,2018-12-27,19.328571,76.642857,5.7,29.171429
2,24.07,120.65,2018-12-27,19.307143,76.678571,5.7,29.171429
3,23.41,120.32,2019-03-15,20.335714,78.492857,50.8,19.428571
4,23.41,120.32,2020-01-24,18.614286,72.257143,1.9,21.385714


# **Remote Sensing Data**

In [8]:
remote_sensing_data = pd.read_csv(f"{base_dir}/remote_sensing_modis_taiwan.csv")

In [9]:
remote_sensing_data.head()

Unnamed: 0,Latitude,Longitude,Date,NDVI MODIS,NDVI - 1 MODIS,NDVI - 2 MODIS,EVI MODIS,EVI - 1 MODIS,EVI - 2 MODIS
0,24.073258,120.661451,2018-12-27,0.316,0.3335,0.2184,0.2176,0.1857,0.1328
1,24.073297,120.661364,2018-12-27,0.316,0.3335,0.2184,0.2176,0.1857,0.1328
2,24.07435,120.661598,2018-12-27,0.316,0.3335,0.2184,0.2176,0.1857,0.1328
3,24.074811,120.660849,2018-12-27,0.5403,0.398,0.3624,0.4185,0.2271,0.2348
4,24.074337,120.661612,2018-12-27,0.316,0.3335,0.2184,0.2176,0.1857,0.1328


# **Combined Dataset**

In [10]:
combined_data = pd.read_csv(f"{base_dir}/combined_data_taiwan.csv")

In [11]:
combined_data.head()

Unnamed: 0,Id,Latitude,Longitude,Date,Class,Date and Time,Avg Temp 14d,Avg Humidity 14d,Total Precipitation 14d,Avg Wind Speed 14d,NDVI MODIS,NDVI - 1 MODIS,NDVI - 2 MODIS,EVI MODIS,EVI - 1 MODIS,EVI - 2 MODIS
0,P_20181227_153331_vHDR_Auto.jpg,24.073258,120.661451,2018-12-27,Brown Spot,2018:12:27 15:33:31,19.328571,76.664286,5.7,29.171429,0.316,0.3335,0.2184,0.2176,0.1857,0.1328
1,P_20181227_153343_vHDR_Auto (1).jpg,24.073258,120.661451,2018-12-27,Brown Spot,2018:12:27 15:33:43,19.328571,76.664286,5.7,29.171429,0.316,0.3335,0.2184,0.2176,0.1857,0.1328
2,P_20181227_153711_vHDR_Auto.jpg,24.073297,120.661364,2018-12-27,Brown Spot,2018:12:27 15:37:11,19.328571,76.664286,5.7,29.171429,0.316,0.3335,0.2184,0.2176,0.1857,0.1328
3,P_20181227_153709_vHDR_Auto.jpg,24.073297,120.661364,2018-12-27,Brown Spot,2018:12:27 15:37:09,19.328571,76.664286,5.7,29.171429,0.316,0.3335,0.2184,0.2176,0.1857,0.1328
4,P_20181227_154446_vHDR_Auto (1).jpg,24.07435,120.661598,2018-12-27,Brown Spot,2018:12:27 15:44:46,19.328571,76.664286,5.7,29.171429,0.316,0.3335,0.2184,0.2176,0.1857,0.1328


# **Combined Dataset - with Indicators and Standardization**

In [12]:
combined_data_2 = pd.read_csv(f"{base_dir}/combined_data_taiwan_for_modeling.csv")

In [15]:
# Going to be show the dataframe with standardization for the train set

train_image_dir = f'{base_dir}/Train_Dataset_with_Augmentation_Rice'

# Function to filter a dataframe based on image directory content
def filter_df_based_on_directory(df, directory):
    existing_files = {f for f in os.listdir(directory) if os.path.isfile(os.path.join(directory, f))}
    return df[df['Id'].apply(lambda x: f"{x}" in existing_files)]

# Filtering the numerical dataframe for the training set only
train_df = filter_df_based_on_directory(combined_data_2, train_image_dir)

In [16]:
# Loading the label encoding and scaler from previously made joblib files
scaler = joblib.load(f'{base_dir}/scaler.joblib')

In [17]:
# Specifying which numerical features to standardize
features_to_standardize = ['Avg Temp 14d', 'Avg Humidity 14d', 'Total Precipitation 14d', 'Avg Wind Speed 14d']

train_df[features_to_standardize] = scaler.transform(train_df[features_to_standardize])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df[features_to_standardize] = scaler.transform(train_df[features_to_standardize])


In [18]:
train_df.head()

Unnamed: 0,Id,Latitude,Longitude,Date,Class,Date and Time,Avg Temp 14d,Avg Humidity 14d,Total Precipitation 14d,Avg Wind Speed 14d,NDVI MODIS,NDVI - 1 MODIS,NDVI - 2 MODIS,EVI MODIS,EVI - 1 MODIS,EVI - 2 MODIS,NDVI 1 Decrease,NDVI 2 Decrease,EVI 1 Decrease,EVI 2 Decrease
1,P_20181227_153343_vHDR_Auto (1).jpg,24.073258,120.661451,2018-12-27,Brown Spot,2018:12:27 15:33:43,-0.723365,0.060275,-0.868913,1.263967,0.316,0.3335,0.2184,0.2176,0.1857,0.1328,1,0,0,0
2,P_20181227_153711_vHDR_Auto.jpg,24.073297,120.661364,2018-12-27,Brown Spot,2018:12:27 15:37:11,-0.723365,0.060275,-0.868913,1.263967,0.316,0.3335,0.2184,0.2176,0.1857,0.1328,1,0,0,0
3,P_20181227_153709_vHDR_Auto.jpg,24.073297,120.661364,2018-12-27,Brown Spot,2018:12:27 15:37:09,-0.723365,0.060275,-0.868913,1.263967,0.316,0.3335,0.2184,0.2176,0.1857,0.1328,1,0,0,0
5,P_20181227_155134_vHDR_Auto.jpg,24.074811,120.660849,2018-12-27,Brown Spot,2018:12:27 15:51:34,-0.723365,0.060275,-0.868913,1.263967,0.5403,0.398,0.3624,0.4185,0.2271,0.2348,0,0,0,0
6,P_20181227_154452_vHDR_Auto_HP (1).jpg,24.074337,120.661612,2018-12-27,Brown Spot,2018:12:27 15:44:52,-0.723365,0.060275,-0.868913,1.263967,0.316,0.3335,0.2184,0.2176,0.1857,0.1328,1,0,0,0
