In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import os



# Import Classification DF

In [3]:

# Import the classification dataframe

# Force the 8th column (positional index 7) to string to avoid an error on read
dtype = {7: str}

# Import csv
classification_df = pd.read_csv('../variety_classification.csv', dtype=dtype)

### General cleaning

# The image file name is the last path component of 'variety_image_path'
image_file_names = classification_df['variety_image_path'].apply(lambda x: str(x).split('/')[-1])

# Create a working file path for the folders: <subset>/<species>_<variety>/<file name>
classification_df['image_path'] = (classification_df['subset'].astype(str) + '/'
                                + classification_df['species'].astype(str) + '_'
                                + classification_df['variety'].astype(str) + '/'
                                + image_file_names)
classification_df = classification_df.drop(columns=['variety_image_path'])

# Create a column with the file name. It is the last component of image_path, which is
# exactly the file name appended above, so it does not need to be split out again.
classification_df['file_name'] = image_file_names

# Combined species/variety label, stored as 'fruit_class' so the column name does not
# clash with the Python keyword 'class'
classification_df['fruit_class'] = classification_df['species'] + '_' + classification_df['variety']

classification_df = classification_df.drop(columns=['class'])

# Change variable types: identifiers and the raw date are handled as text
for column in ['layout_id', 'cam', 'shop', 'date']:
    classification_df[column] = classification_df[column].astype('str')

# Non-numeric weights become NaN
classification_df['weight'] = pd.to_numeric(classification_df['weight'], errors='coerce')

# Strip surrounding whitespace and lowercase every text column in a single pass.
# select_dtypes('object') already restricts the frame to object columns, so no
# per-column dtype check is needed.
text_columns = classification_df.select_dtypes('object').columns
classification_df[text_columns] = classification_df[text_columns].apply(
    lambda column: column.str.strip().str.lower()
)



In [4]:
# Display the column names of classification_df at this point in the notebook
classification_df.columns

Index(['species', 'variety', 'layout_id', 'for_cropping', 'packed', 'amount',
       'weight', 'uniform_background', 'spoiled', 'cam', 'city', 'shop',
       'crowd', 'date', 'subset', 'simp_amount', 'image_path', 'file_name',
       'fruit_class'],
      dtype='object')

## Cleaning Date Column

In [5]:
# Uncomment to inspect the raw date values:
# classification_df.groupby('date').count()

# The format looks really messy, we are interested in the year, month and day,
# so normalise the text first and then extract those parts.

# Use '/' as the only separator. regex=False treats every pattern as a literal string,
# so '.' is never interpreted as the regex wildcard, whatever the pandas default is.
for separator in ('-', '.', ';'):
    classification_df['date'] = classification_df['date'].str.replace(separator, '/', regex=False)

# Repair a mistyped year and turn spelled-out month names into numbers
classification_df['date'] = classification_df['date'].str.replace('0202', '2022', regex=False)
classification_df['date'] = classification_df['date'].str.replace(' may ', '/05/', regex=False)
classification_df['date'] = classification_df['date'].str.replace(' june ', '/06/', regex=False)

# Convert Excel serial dates (e.g., '44741') if the string starts with '447' and return as 'YYYY/MM/DD' format
excel_epoch = pd.to_datetime('1899-12-30')  # computed once instead of once per row
classification_df['date'] = classification_df['date'].apply(
    lambda row: (excel_epoch + pd.to_timedelta(float(row), unit='D')).strftime('%Y/%m/%d')
    if isinstance(row, str) and row.startswith('447') else row
)

# Data was collected in 2022; anything that does not mention 2022 gets the placeholder 0
classification_df['year'] = classification_df['date'].apply(lambda row: '2022' if '2022' in row else 0)

# Extracting months: second '/'-separated field, or 0 when there is no separator
classification_df['month'] = classification_df['date'].apply(lambda row: row.split('/')[1] if len(row.split('/')) > 1 else 0)
# Left-pad single-character months with '0'
classification_df['month'] = classification_df['month'].astype('str').apply(lambda row: '0' + row if len(row) == 1 else row)

# Extracting days:
#  - first field longer than 2 characters and a third field present -> first two
#    characters of the third field
#  - first field of at most 2 characters -> the first field
#  - anything else -> '0'
classification_df['day'] = classification_df['date'].apply(
    lambda x: x.split('/', 2)[2][:2] if len(x.split('/', 2)[0]) > 2 and len(x.split('/', 2)) > 2
    else x.split('/', 2)[0] if len(x.split('/', 2)[0]) <= 2 else '0'
)
# Left-pad single-character days with '0'
classification_df['day'] = classification_df['day'].astype('str').apply(lambda row: '0' + row if len(row) == 1 else row)

# Assemble 'year-month-day'; combinations that are not a valid date become NaT
classification_df['full_date'] = classification_df['year'].astype('str') + '-' + classification_df['month'] + '-' + classification_df['day']
classification_df['full_date'] = pd.to_datetime(classification_df['full_date'], errors='coerce')

classification_df = classification_df.drop(columns=['date'])



  classification_df['date'] = classification_df['date'].str.replace('-', '/').str.replace('.', '/').str.replace(';','/')


In [6]:
# Columns kept for the rest of the notebook, in their final order
columns_to_keep = [
    'file_name', 'fruit_class', 'layout_id', 'packed', 'amount', 'weight',
    'crowd', 'subset', 'simp_amount', 'uniform_background', 'spoiled', 'full_date',
]

# From here on the 'fruit_class' column goes by the name 'variety'
classification_df = classification_df[columns_to_keep].rename(columns={'fruit_class': 'variety'})



In [7]:
# Remove rows we cannot use

# layout_id of zero (as text or as a number) is discarded
layout_is_valid = (classification_df['layout_id'] != '0') & (classification_df['layout_id'] != 0)
classification_df = classification_df[layout_is_valid]

# Only keep layouts that appear on exactly 4 rows
layout_id_counts = classification_df['layout_id'].value_counts()
complete_layouts = layout_id_counts[layout_id_counts == 4].index
classification_df = classification_df[classification_df['layout_id'].isin(complete_layouts)]

# Number the rows of each layout 1, 2, 3, ... in their current order
classification_df = classification_df.assign(
    rank=classification_df.groupby('layout_id').cumcount() + 1
)

# Rows with an unknown amount are dropped: weight can be estimated from the amount of
# fruit in the picture, but the amount itself cannot be estimated from the images
amount_is_known = (classification_df['amount'] != 0) & classification_df['amount'].notna()
classification_df = classification_df[amount_is_known]

# Varieties whose amounts add up to 0 are dropped as well.
# NOTE(review): zero and missing amounts were already removed just above, so this can
# only match a variety whose remaining amounts cancel out -- confirm it is still needed.
sum_amount = classification_df.groupby('variety')['amount'].sum().sort_values()
amount_0 = sum_amount.index[sum_amount == 0].tolist()
classification_df = classification_df[~classification_df['variety'].isin(amount_0)]

In [8]:
# Fill the remaining NaN in weight with amount * mean single-fruit weight of the variety

# Varieties that have at least one row with a missing weight
variety_weight_nan = sorted(
    classification_df.loc[classification_df['weight'].isna(), 'variety'].dropna().unique()
)

# Reference rows for those varieties: a single fruit with a recorded weight
classification_weight_nan = classification_df[(classification_df['variety'].isin(variety_weight_nan)) &
                                            (classification_df['amount'] == 1) &
                                            (~classification_df['weight'].isna())]

mean_weights = classification_weight_nan.groupby('variety')['weight'].mean().reset_index()

# Convert the result to a dictionary {variety: mean single-fruit weight rounded to 2 decimals}
mean_weights_dict = mean_weights.set_index('variety')['weight'].apply(lambda x: round(x, 2)).to_dict()

# Vectorised fill (replaces a row-by-row apply): varieties without a reference weight
# fall back to 0, so their missing weights become 0. Rows that already have a weight
# are left untouched by fillna.
per_fruit_weight = classification_df['variety'].map(mean_weights_dict).fillna(0)
estimated_weight = classification_df['amount'] * per_fruit_weight
classification_df['weight'] = classification_df['weight'].fillna(estimated_weight)

In [9]:
# Cases with 0 in weight: a weight of 0 is treated as unknown and re-estimated from
# rows of the same variety that do have a recorded weight.

def _fill_zero_weights(df, variety, reference_amount):
    """Replace the zero weights of one variety with amount * per-fruit weight.

    The per-fruit weight is the mean weight of the variety's rows that show exactly
    ``reference_amount`` fruits, divided by ``reference_amount``. Rows whose weight
    is not positive are left out of that mean, so the zeros being replaced do not
    drag the estimate down.

    Parameters:
        df: frame with 'variety', 'amount' and 'weight' columns; modified in place.
        variety: value of the 'variety' column to process.
        reference_amount: amount whose rows are used to estimate the per-fruit weight.

    Returns:
        The estimated per-fruit weight (NaN when there are no usable reference rows).
    """
    is_variety = df['variety'] == variety
    is_reference = is_variety & (df['amount'] == reference_amount) & (df['weight'] > 0)
    per_fruit_weight = df.loc[is_reference, 'weight'].mean() / reference_amount

    # Without reference rows there is nothing to estimate from; leave the zeros as
    # they are instead of overwriting them with NaN.
    if pd.isna(per_fruit_weight):
        return per_fruit_weight

    needs_fill = is_variety & (df['weight'] == 0)
    df.loc[needs_fill, 'weight'] = df.loc[needs_fill, 'amount'] * per_fruit_weight
    return per_fruit_weight


# For pepper_sweet_elonged: per-fruit weight estimated from rows with amount == 3
average_weight_amount_pepper_sweet_elonged = _fill_zero_weights(
    classification_df, 'pepper_sweet_elonged', reference_amount=3
)

# For lime_lime: per-fruit weight estimated from rows with amount == 2
average_weight_amount_lime = _fill_zero_weights(
    classification_df, 'lime_lime', reference_amount=2
)

## Checking Outliers in Weight Column

In [10]:
# Mean weight for every (variety, amount) combination
mean_weight_by_group = classification_df.groupby(['variety', 'amount'])['weight'].mean()
average_weight_per_variety = mean_weight_by_group.reset_index()

# Keep only the rows that describe a single fruit
is_single_fruit = average_weight_per_variety['amount'] == 1
avg_weight_1_fruit = average_weight_per_variety[is_single_fruit]

# {variety: mean single-fruit weight rounded to 2 decimals}
avg_weight_1_dict = {
    variety: round(weight, 2)
    for variety, weight in avg_weight_1_fruit.set_index('variety')['weight'].items()
}


In [11]:
# A row is an outlier when its recorded weight is too far from the expected weight,
# where expected weight = amount * average single-fruit weight of the variety.
# The tolerated deviation is +/-30% of the expected weight.
margin_of_error = 0.3

# Expected weight of each row
single_fruit_weight = classification_df['variety'].map(avg_weight_1_dict)
classification_df['expected_weight'] = classification_df['amount'] * single_fruit_weight

# Lower and upper bounds of the accepted range
lower_factor = 1 - margin_of_error
upper_factor = 1 + margin_of_error
classification_df['min_weight'] = classification_df['expected_weight'] * lower_factor
classification_df['max_weight'] = classification_df['expected_weight'] * upper_factor


In [12]:
# Mark every row whose recorded weight lies below min_weight or above max_weight
too_light = classification_df['weight'] < classification_df['min_weight']
too_heavy = classification_df['weight'] > classification_df['max_weight']
classification_df['is_outlier'] = too_light | too_heavy

print(classification_df['is_outlier'].value_counts())

# The True count printed above is the number of rows outside the range; earlier
# filtering kept layouts with 4 rows each, so it is roughly 4x the number of layouts.

False    58421
True     23900
Name: is_outlier, dtype: int64


In [13]:
# Function to adjust the weight with a random margin within the allowed range
def adjust_weight(row):
    """Return the weight to keep for one row.

    Rows not flagged in 'is_outlier' keep their recorded 'weight'. Flagged rows get
    amount * average single-fruit weight of the variety * a random factor drawn
    uniformly between 1 - margin_of_error and 1 + margin_of_error.
    """
    # Rows inside the accepted range are returned unchanged
    if not row['is_outlier']:
        return row['weight']

    # Average single-fruit weight for the variety; 0 when the variety has no entry
    per_fruit_weight = avg_weight_1_dict.get(row['variety'], 0)
    # Random scaling factor within the margin of error
    lowest, highest = 1 - margin_of_error, 1 + margin_of_error
    scale = np.random.uniform(lowest, highest)
    return row['amount'] * per_fruit_weight * scale

# Seed NumPy's global random generator so the random factors drawn inside
# adjust_weight, and therefore the adjusted weights, are the same on every run
np.random.seed(42)

# Apply the adjustment row by row
classification_df['adjusted_weight'] = classification_df.apply(adjust_weight, axis=1)

# The helper columns used for outlier detection are no longer needed
classification_df = classification_df.drop(columns=['expected_weight', 'min_weight', 'max_weight','is_outlier'])

classification_df.head()

Unnamed: 0,file_name,variety,layout_id,packed,amount,weight,crowd,subset,simp_amount,uniform_background,spoiled,full_date,rank,adjusted_weight
0,007576.jpg,apple_fuji,1246,1,5,776.0,0,train,4-6,0,0,2022-06-10,1,776.0
1,085095.jpg,apple_fuji,1246,1,5,776.0,0,train,4-6,0,0,2022-06-10,2,776.0
2,077142.jpg,apple_fuji,1246,1,5,776.0,0,train,4-6,0,0,2022-06-10,3,776.0
3,091219.jpg,apple_fuji,1246,1,5,776.0,0,train,4-6,0,0,2022-06-10,4,776.0
4,019846.jpg,apple_fuji,4610,1,5,776.0,1,test,4-6,0,0,2022-06-10,1,776.0


# Export cleaned CSV: all fruits

In [14]:
# Write the cleaned frame to disk with a header row and without the index column
classification_df.to_csv(
    'cleaned_csv_all_fruits.csv',
    header=True,
    index=False,
)

# Export cleaned CSV: 19 classes

In [15]:
# These are the varieties we will be working with, following the labels used for YOLOv8.
# apple_fuji replaces pear_williams_rouge because there was not enough information
# for pear_williams_rouge.
fruits_only = [
    'carrot_carrot',
    'apple_red_delicious',
    'tomato_pink',
    'cucumber_long',
    'banana_yellow',
    'apple_granny',
    'apple_fuji',
    'pepper_sweet_red',
    'orange_orange',
    'onion_white',
    'apple_ligol',
    'lime_lime',
    'avocado_hass',
    'apple_golden',
    'kiwi_kiwi',
    'tomato_cherry_red',
    'pepper_sweet_green',
    'pepper_sweet_yellow',
    'lemon_yellow',
]

# Position in the list is the class index, so the indices match the YOLO ones
varieties_dict = dict(enumerate(fruits_only))
print(varieties_dict)

{0: 'carrot_carrot', 1: 'apple_red_delicious', 2: 'tomato_pink', 3: 'cucumber_long', 4: 'banana_yellow', 5: 'apple_granny', 6: 'apple_fuji', 7: 'pepper_sweet_red', 8: 'orange_orange', 9: 'onion_white', 10: 'apple_ligol', 11: 'lime_lime', 12: 'avocado_hass', 13: 'apple_golden', 14: 'kiwi_kiwi', 15: 'tomato_cherry_red', 16: 'pepper_sweet_green', 17: 'pepper_sweet_yellow', 18: 'lemon_yellow'}


In [16]:
# Lookup from variety name to its class index
reversed_varieties_dict = {name: index for index, name in varieties_dict.items()}

# Independent copy holding only the selected varieties
is_selected_variety = classification_df['variety'].isin(fruits_only)
filtered_df = classification_df[is_selected_variety].copy()

# Numeric class label for every row
filtered_df['label'] = filtered_df['variety'].map(reversed_varieties_dict)
filtered_df.head()

Unnamed: 0,file_name,variety,layout_id,packed,amount,weight,crowd,subset,simp_amount,uniform_background,spoiled,full_date,rank,adjusted_weight,label
0,007576.jpg,apple_fuji,1246,1,5,776.0,0,train,4-6,0,0,2022-06-10,1,776.0,6
1,085095.jpg,apple_fuji,1246,1,5,776.0,0,train,4-6,0,0,2022-06-10,2,776.0,6
2,077142.jpg,apple_fuji,1246,1,5,776.0,0,train,4-6,0,0,2022-06-10,3,776.0,6
3,091219.jpg,apple_fuji,1246,1,5,776.0,0,train,4-6,0,0,2022-06-10,4,776.0,6
4,019846.jpg,apple_fuji,4610,1,5,776.0,1,test,4-6,0,0,2022-06-10,1,776.0,6


In [17]:
# Filter out rows with amount > 12

# Bucket distribution before filtering
print(filtered_df['simp_amount'].value_counts())

# .copy() makes the filtered result an independent frame, so the column assignment
# below is not made on a slice of the previous frame
filtered_df = filtered_df[filtered_df['amount'] <= 12].copy()
# With amounts above 12 removed, the open-ended '7+' bucket is relabelled '7-12'
filtered_df['simp_amount'] = filtered_df['simp_amount'].replace('7+', '7-12')

# Bucket distribution after filtering
print(filtered_df['simp_amount'].value_counts())

filtered_df = filtered_df.drop(columns=['full_date'])

4-6    8104
2-3    7924
1      7404
7+     6996
Name: simp_amount, dtype: int64
4-6     8104
2-3     7924
1       7404
7-12    6952
Name: simp_amount, dtype: int64


In [19]:
# Write the 19-class subset to disk with a header row and without the index column
filtered_df.to_csv(
    'cleaned_csv_19_class.csv',
    header=True,
    index=False,
)
