# Task for Today  

***

## Blueberry Yield Prediction  
  
Given *data about wild blueberries*, let's try to predict the **yield** for a given record.  
  
We will use a random forest regression model to make our predictions.

# Getting Started

In [None]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.ensemble import RandomForestRegressor


In [None]:
# Load the blueberry simulation CSV into a DataFrame.
# NOTE(review): this is an absolute path (looks like a Colab upload location) — adjust it when running elsewhere.
data = pd.read_csv('/content/WildBlueberryPollinationSimulationData.csv')

In [None]:
# Display the loaded frame as the cell output.
data

Unnamed: 0,Row#,clonesize,honeybee,bumbles,andrena,osmia,MaxOfUpperTRange,MinOfUpperTRange,AverageOfUpperTRange,MaxOfLowerTRange,MinOfLowerTRange,AverageOfLowerTRange,RainingDays,AverageRainingDays,fruitset,fruitmass,seeds,yield
0,0,37.5,0.750,0.250,0.250,0.250,86.0,52.0,71.9,62.0,30.0,50.8,16.00,0.26,0.410652,0.408159,31.678898,3813.165795
1,1,37.5,0.750,0.250,0.250,0.250,86.0,52.0,71.9,62.0,30.0,50.8,1.00,0.10,0.444254,0.425458,33.449385,4947.605663
2,2,37.5,0.750,0.250,0.250,0.250,94.6,57.2,79.0,68.2,33.0,55.9,16.00,0.26,0.383787,0.399172,30.546306,3866.798965
3,3,37.5,0.750,0.250,0.250,0.250,94.6,57.2,79.0,68.2,33.0,55.9,1.00,0.10,0.407564,0.408789,31.562586,4303.943030
4,4,37.5,0.750,0.250,0.250,0.250,86.0,52.0,71.9,62.0,30.0,50.8,24.00,0.39,0.354413,0.382703,28.873714,3436.493543
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
772,772,10.0,0.537,0.117,0.409,0.058,86.0,52.0,71.9,62.0,30.0,50.8,3.77,0.06,0.486815,0.428012,33.447471,5333.873335
773,773,40.0,0.537,0.117,0.409,0.058,86.0,52.0,71.9,62.0,30.0,50.8,3.77,0.06,0.342841,0.377915,28.462005,3373.436842
774,774,20.0,0.537,0.117,0.409,0.058,86.0,52.0,71.9,62.0,30.0,50.8,24.00,0.39,0.404617,0.401670,30.748240,4203.027624
775,775,20.0,0.537,0.117,0.409,0.058,89.0,39.0,65.6,66.0,28.0,45.3,3.77,0.06,0.401538,0.399935,30.582161,4166.299735


In [None]:
# Summarize column dtypes and non-null counts.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 777 entries, 0 to 776
Data columns (total 18 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Row#                  777 non-null    int64  
 1   clonesize             777 non-null    float64
 2   honeybee              777 non-null    float64
 3   bumbles               777 non-null    float64
 4   andrena               777 non-null    float64
 5   osmia                 777 non-null    float64
 6   MaxOfUpperTRange      777 non-null    float64
 7   MinOfUpperTRange      777 non-null    float64
 8   AverageOfUpperTRange  777 non-null    float64
 9   MaxOfLowerTRange      777 non-null    float64
 10  MinOfLowerTRange      777 non-null    float64
 11  AverageOfLowerTRange  777 non-null    float64
 12  RainingDays           777 non-null    float64
 13  AverageRainingDays    777 non-null    float64
 14  fruitset              777 non-null    float64
 15  fruitmass             7

# Preprocessing

In [None]:
def preprocess_inputs(df, target='yield', train_size=0.7, random_state=1):
    """Split a raw data frame into train/test features and targets.

    The ``Row#`` column and the target column are removed from the feature
    matrix; the target column becomes the label vector.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data containing a ``Row#`` column and the ``target`` column.
        It is not modified.
    target : str, default 'yield'
        Name of the column to predict.
    train_size : float, default 0.7
        Proportion of rows placed in the training split.
    random_state : int, default 1
        Seed for the shuffled split, so repeated runs give the same partition.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    # drop() returns a new frame, so the caller's DataFrame is left untouched
    # without needing an explicit copy.
    X = df.drop(columns=['Row#', target])
    y = df[target]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=train_size, shuffle=True, random_state=random_state
    )

    return X_train, X_test, y_train, y_test

In [None]:
# Build the train/test split from the raw frame.
X_train, X_test, y_train, y_test = preprocess_inputs(data)

In [None]:
# Inspect the training features.
X_train

Unnamed: 0,clonesize,honeybee,bumbles,andrena,osmia,MaxOfUpperTRange,MinOfUpperTRange,AverageOfUpperTRange,MaxOfLowerTRange,MinOfLowerTRange,AverageOfLowerTRange,RainingDays,AverageRainingDays,fruitset,fruitmass,seeds
214,12.5,0.25,0.250,0.50,0.50,94.6,57.2,79.0,68.2,33.0,55.9,1.00,0.10,0.582954,0.488176,40.559770
88,12.5,0.25,0.250,0.25,0.50,86.0,52.0,71.9,62.0,30.0,50.8,34.00,0.56,0.435969,0.419720,32.815794
479,25.0,0.50,0.250,0.38,0.63,94.6,57.2,79.0,68.2,33.0,55.9,24.00,0.39,0.364565,0.391617,29.908518
602,25.0,0.50,0.250,0.75,0.50,86.0,52.0,71.9,62.0,30.0,50.8,1.00,0.10,0.523846,0.460305,37.277297
147,12.5,0.25,0.250,0.38,0.38,86.0,52.0,71.9,62.0,30.0,50.8,16.00,0.26,0.553730,0.471250,38.534569
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
715,25.0,0.50,0.380,0.50,0.63,94.6,57.2,79.0,68.2,33.0,55.9,16.00,0.26,0.527592,0.464639,37.782288
767,20.0,0.00,0.585,0.00,0.00,86.0,52.0,71.9,62.0,30.0,50.8,3.77,0.06,0.599984,0.529791,46.585105
72,12.5,0.25,0.250,0.25,0.38,86.0,52.0,71.9,62.0,30.0,50.8,34.00,0.56,0.416271,0.409438,31.577558
235,12.5,0.25,0.250,0.50,0.63,77.4,46.8,64.7,55.8,27.0,45.8,16.00,0.26,0.589306,0.488616,40.546480


In [None]:
# Inspect the training targets.
y_train

214    7243.226111
88     4684.893205
479    3723.523376
602    6521.291119
147    6683.200614
          ...     
715    6327.477365
767    7575.801245
72     4350.424670
235    7560.205645
37     4356.945873
Name: yield, Length: 543, dtype: float64

# Training

In [None]:
# Baseline: a random forest with default hyperparameters, seeded for reproducibility.
model = RandomForestRegressor(random_state=1)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# Root-mean-squared error on the held-out test set.
rmse = np.sqrt(np.mean((y_test - y_pred)**2))

# Coefficient of determination: 1 - (residual sum of squares / total sum of squares).
r2 = 1 - (np.sum((y_test - y_pred)**2) / np.sum((y_test - y_test.mean())**2))

print("     RMSE: {:.2f}".format(rmse))
# Precision is given explicitly (".6f"); a bare "5f" would be a minimum field
# width, not a number of decimals.
print("R^2 Score: {:.6f}".format(r2))

     RMSE: 186.23
R^2 Score: 0.982435


In [None]:
# Hyperparameter grid: every combination of forest size and tree depth is evaluated.
params = {
    'n_estimators': [50, 100, 150, 200],
    'max_depth': [2, 4, 6, 8, 10]
}

# n_jobs=-1 spreads the cross-validation fits over all available cores; the
# selected parameters are unaffected because the estimator is seeded.
model = GridSearchCV(RandomForestRegressor(random_state=1), params, n_jobs=-1)

model.fit(X_train, y_train)

# With the default refit=True, predict() delegates to the best estimator found.
y_pred = model.predict(X_test)

# Root-mean-squared error on the held-out test set.
rmse = np.sqrt(np.mean((y_test - y_pred)**2))

# Coefficient of determination: 1 - (residual sum of squares / total sum of squares).
r2 = 1 - (np.sum((y_test - y_pred)**2) / np.sum((y_test - y_test.mean())**2))

print("     RMSE: {:.2f}".format(rmse))
# Precision is given explicitly (".6f"); a bare "5f" would be a minimum field
# width, not a number of decimals.
print("R^2 Score: {:.6f}".format(r2))

     RMSE: 185.37
R^2 Score: 0.982596


In [None]:
# Show the hyperparameter combination selected by the grid search.
model.best_params_

{'max_depth': 10, 'n_estimators': 200}

# Data Every Day  

This notebook is featured on Data Every Day, a YouTube series where I train models on a new dataset each day.  

***

Check it out!  
https://youtu.be/AHF7mbBOUgM

# Author
- Gabriel Atkin - https://www.kaggle.com/gcdatkin
- Kaggle link - https://www.kaggle.com/code/gcdatkin/blueberry-yield-prediction/notebook
- Datasets - https://www.kaggle.com/code/gcdatkin/blueberry-yield-prediction/input

# Kelompok 1
- Adzira Rafisha Najlaffaiza
- Mohammad Fahmi Aziz
- Raisya Marsandra