# Predicting avocado ripeness | PyCon Greece conference

### Steps

1. Load data
2. Clean data
3. Explore data
4. Select predictors
5. Build model
6. Validate model
7. Repeat from step 2 until the validation results are satisfactory

In [1]:
# General tasks
import pandas as pd

# Create plots
import plotly.express as px
import plotly.subplots as sp
import plotly.graph_objects as go

# Explore data
import itertools
import math

# Select predictors
import statsmodels.api as sm

# Build model
from sklearn.model_selection import train_test_split, KFold, LeaveOneOut
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Validate model
import numpy as np
# Do not truncate long cell contents when DataFrames are displayed
# (None removes the column-width limit).
pd.set_option('display.max_colwidth', None) 

## 1. Load data

In [2]:
# Load data
# Raw string: backslashes are taken literally, so each path separator is
# written once (doubling them inside r"..." yields repeated separators).
DATA_PATH = r"C:\Users\PinelopiTheotoki\Desktop\fruit_W15.csv"  # replace with your path

# Read the raw measurements into a DataFrame
fruit = pd.read_csv(DATA_PATH)

## 2. Clean data

In [3]:
# Handle missing data and outliers in a single filter:
#   - rows without a pressure reading are dropped
#   - readings of 29 or above are treated as outliers and dropped
# .copy() yields an independent frame, so the column assignment below does
# not write into a slice of the loaded data (avoids SettingWithCopyWarning).
has_pressure = fruit['pressure'].notna()
below_cutoff = fruit['pressure'] < 29
fruit = fruit[has_pressure & below_cutoff].copy()

# Change format: parse the 'created' column into datetimes
fruit['created'] = pd.to_datetime(fruit['created'])

# Keep only relevant data: rows whose label contains 'W15'.
# na=False makes a missing label count as "no match" instead of putting NaN
# into the boolean mask.
fruit = fruit[fruit['label'].str.contains('W15', na=False)].copy()

## 3. Explore data

In [4]:
# Mean pressure for every (day, room label) pair, flattened back to columns
daily_mean = fruit.groupby(['created_day','label'])['pressure'].mean()
fruit2 = daily_mean.reset_index()
rooms = sorted(fruit2['label'].unique())

# Grid layout: fixed number of columns, as many rows as the rooms require
ncols = 3
nrows = math.ceil(len(rooms)/ncols)

# One subplot per room, titled with the room label
panel_titles = [f"Room: {room}" for room in rooms]
fig = sp.make_subplots(rows=nrows, cols=ncols,
                       subplot_titles=panel_titles,
                       shared_xaxes=True)

# Fill the grid left-to-right, top-to-bottom with one line per room
for position, room in enumerate(rooms):
    grid_row = position // ncols + 1   # plotly rows/cols are 1-based
    grid_col = position % ncols + 1
    room_data = fruit2[fruit2['label'] == room]
    room_trace = go.Scatter(x=room_data['created_day'],
                            y=room_data['pressure'],
                            mode='lines+markers',
                            name=room)
    fig.add_trace(room_trace, row=grid_row, col=grid_col)

# Presentation styling: height scales with the number of rows, no legend
fig.update_layout(height=300*nrows,
                  title="Daily mean pressure per room",
                  template="simple_white",
                  showlegend=False)

# Same date window, tick format and axis titles on every subplot
fig.update_xaxes(range=['2025-04-07','2025-04-18'],
                 tickformat="%b %d",
                 title="Date")
fig.update_yaxes(title="Pressure (kg)")

fig.show()

## 4. Select predictors

In [5]:
# Read the findings workbook, then discard the rows labelled 0, 1 and 2
# (the first three rows of the freshly loaded sheet).
findings = pd.read_excel("C://Users//PinelopiTheotoki//Desktop//Findings_ARE.xlsx")
fruit_mean = findings.drop(index=[0, 1, 2])

In [7]:
# Define all predictors and generate all possible non-empty combinations
predictors = ['Size','Temp', 'Avg CO2 (%)', 'Avg O2 (%)', 'Initial Pressure']
target = 'Final Pressure'

combinations = [combo for i in range(1, len(predictors) + 1)
                for combo in itertools.combinations(predictors, i)]

# Remove rows with NaN in any predictor or in the target: OLS performs no
# NaN checking by default, so a missing target value would otherwise
# propagate into the fitted statistics. .copy() keeps the cleaned frame
# independent, so columns can be added to it later.
fruit_mean = fruit_mean.dropna(subset=predictors + [target]).copy()

# The response is the same for every candidate model, so build it once
y = fruit_mean[target]

# Compute Adjusted R² and AIC for each combination
results = []
for combo in combinations:
    # add_constant adds the intercept column to the design matrix
    X = sm.add_constant(fruit_mean[list(combo)])
    model = sm.OLS(y, X).fit()
    results.append([combo, model.rsquared_adj, model.aic])

# Convert to DataFrame
results_fruit_mean = pd.DataFrame(results, columns=['Predictors', 'AdjR2', 'AIC'])

# Sort by AIC (ascending, so the lowest AIC comes first)
results_fruit_mean = results_fruit_mean.sort_values(by='AIC').reset_index(drop=True)

# Round to 2 decimals
results_fruit_mean = results_fruit_mean.round(2)

# Show the five best-ranked combinations
results_fruit_mean.head(5)


Unnamed: 0,Predictors,AdjR2,AIC
0,"(Size, Temp, Initial Pressure)",0.86,28.65
1,"(Size, Temp, Avg O2 (%), Initial Pressure)",0.86,29.87
2,"(Temp, Initial Pressure)",0.86,30.09
3,"(Size, Temp, Avg CO2 (%), Initial Pressure)",0.86,30.55
4,"(Temp, Avg O2 (%), Initial Pressure)",0.86,30.6


## 5. Build the model

In [8]:
# Data definition: plain NumPy arrays so rows can be selected by position
X = fruit_mean[["Temp", "Initial Pressure", "Size"]].values
y = fruit_mean['Final Pressure'].values

# Leave-One-Out Cross Validation: each row is held out once as the test set
loo = LeaveOneOut()

# Lists to store per-fold results
intercepts, coefficients, predictions, errors = [], [], [], []

# LOOCV: fit on all rows but one, predict the held-out row
for train_index, test_index in loo.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    model = LinearRegression()
    model.fit(X_train, y_train)

    # The test fold holds exactly one row, hence the [0]
    y_pred = model.predict(X_test)[0]
    predictions.append(y_pred)
    # Error is actual - predicted, the same sign convention as the
    # 'Error' column computed below
    errors.append(y_test[0] - y_pred)
    intercepts.append(model.intercept_)
    coefficients.append(model.coef_)

# Add predictions to dataframe (folds are produced in row order, so the
# list lines up with the rows of fruit_mean)
fruit_mean['Prediction'] = predictions
fruit_mean['Error'] = fruit_mean['Final Pressure'] - fruit_mean['Prediction']
fruit_mean['Squared Error'] = fruit_mean['Error'] ** 2

## 6. Validate model

In [9]:
# Mean absolute error over the per-fold errors collected in the previous step
absolute_errors = np.abs(errors)
avg_absolute_error = np.mean(absolute_errors)

# Report the metric
print(f"Average Absolute Error: {avg_absolute_error}")

Average Absolute Error: 0.23341648690455907


In [10]:
# Actual vs predicted scatter with a y = x reference line
actual = fruit_mean['Final Pressure']
predicted = fruit_mean['Prediction']

# Cross-validated predictions plotted against the observed values
prediction_points = go.Scatter(
    x=actual,
    y=predicted,
    mode='markers',
    name='Predictions',
    marker=dict(color='green', size=8, opacity=0.7),
)

# Perfect prediction line (y=x), spanning the range of the actual values
diagonal_ends = [actual.min(), actual.max()]
perfect_line = go.Scatter(
    x=diagonal_ends,
    y=diagonal_ends,
    mode='lines',
    name='Perfect Prediction',
    line=dict(color='black', dash='dash'),
)

fig = go.Figure()
fig.add_trace(prediction_points)
fig.add_trace(perfect_line)

# Titles, legend position and theme
fig.update_layout(
    title='Actual vs Predicted Final Pressure (LOOCV)',
    xaxis_title='Actual Final Pressure',
    yaxis_title='Predicted Final Pressure',
    legend=dict(x=0.02, y=0.98),
    template='plotly_white',
)

fig.show()