In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv('data/yield_prediction.csv', index_col=False)
df.head()

Unnamed: 0,id,water,uv,area,fertilizer_usage,yield,pesticides,region,categories
0,0,0.072,80.179,9.414,0,29.878,2.231,6,c
1,1,5.413,58.359,9.681,3,53.416,1.81,6,c
2,2,9.731,78.506,7.189,1,63.391,2.455,1,d
3,3,10.995,69.248,1.738,3,17.984,0.603,2,a
4,4,2.617,87.658,9.706,1,49.768,2.91,6,c


In [3]:
print(f"Missing values: \n{df.isnull().sum()}")
print(f"Number of rows: {df.shape[0]}")

Missing values: 
id                   0
water               42
uv                   0
area                 0
fertilizer_usage     0
yield                0
pesticides           0
region               0
categories           0
dtype: int64
Number of rows: 1000


In [4]:
df.describe()

Unnamed: 0,id,water,uv,area,fertilizer_usage,yield,pesticides,region
count,1000.0,958.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,499.5,6.668327,78.701995,8.098848,2.294,58.758571,3.452301,3.039
std,288.819436,2.84299,245.848978,2.692632,1.554986,24.563683,2.076921,1.883886
min,0.0,0.072,-2500.0,0.263,0.0,2.843,0.014,0.0
25%,249.75,4.58475,66.493,6.297,1.0,40.698,1.8045,2.0
50%,499.5,6.476,73.7,7.9875,3.0,55.6025,3.2755,2.0
75%,749.25,8.75875,80.60825,9.90025,3.0,73.6455,4.916,5.0
max,999.0,18.1,7400.62,18.311,5.0,148.845,9.532,6.0


In [5]:
fig = px.histogram(df, x="uv", title="UV Histogram")
fig2 = px.box(df, x="uv", title="UV Boxplot")
fig.show()
fig2.show()

In [6]:
fig = px.histogram(df, x="water", title="Water Histogram")
fig2 = px.box(df, x="water", title="Water Boxplot")
fig.show()
fig2.show()

In [7]:
fig = px.histogram(df, x="area", title="Area Histogram")
fig2 = px.box(df, x="area", title="Area Boxplot")
fig.show()
fig2.show()

In [8]:
fig = px.histogram(df, x="fertilizer_usage", title="Fertilizer Usage Histogram")
fig2 = px.box(df, x="fertilizer_usage", title="Fertilizer Usage Boxplot")
fig.show()
fig2.show()

In [9]:
fig = px.histogram(df, x="pesticides", title="Pesticides Histogram")
fig2 = px.box(df, x="pesticides", title="Pesticides Boxplot")
fig.show()
fig2.show()

In [10]:
fig = px.histogram(df, x="region", title="Region Histogram")
fig2 = px.box(df, x="region", title="Region Boxplot")
fig.show()
fig2.show()

##### After looking at the histograms and box plots of all numerical columns of the dataset. It was visually determined that the only column that have outliers is the "uv" column. Based on this observation. we defined outliers as follows: $$Outlier \ge C\sigma+M \text{ or } Outlier \le M - C\sigma$$ Where $C = 5$ and $\sigma$ is the standard deviation and $M$ is median.

In [11]:
# Replace NaN values Medians for a list of columns
def replace_nan_median(df, columns):
    for column in columns:
        median = df[column].median()
        df[column].fillna(median, inplace=True)

replace_nan_median(df, ["uv", "water", "area", "fertilizer_usage", "pesticides"])
print(f"Missing values: \n{df.isnull().sum()}")

Missing values: 
id                  0
water               0
uv                  0
area                0
fertilizer_usage    0
yield               0
pesticides          0
region              0
categories          0
dtype: int64


In [12]:
C = 5
#Replace outliers with the median of a list of columns
def replace_outliers_median(df, columns):
    for column in columns:
        median = df[column].median()
        std = df[column].std()
        outliers = df[(df[column] > median + C * std) | (df[column] < median - C * std)]
        df[column][outliers.index] = median

replace_outliers_median(df, ["uv", "water", "area", "fertilizer_usage", "pesticides"])
df.describe()



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/i

Unnamed: 0,id,water,uv,area,fertilizer_usage,yield,pesticides,region
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,499.5,6.660249,73.948775,8.098848,2.294,58.758571,3.452301,3.039
std,288.819436,2.782853,9.932102,2.692632,1.554986,24.563683,2.076921,1.883886
min,0.0,0.072,45.264,0.263,0.0,2.843,0.014,0.0
25%,249.75,4.6955,66.50125,6.297,1.0,40.698,1.8045,2.0
50%,499.5,6.476,73.7,7.9875,3.0,55.6025,3.2755,2.0
75%,749.25,8.611,80.56625,9.90025,3.0,73.6455,4.916,5.0
max,999.0,18.1,106.31,18.311,5.0,148.845,9.532,6.0


In [13]:
fig = px.histogram(df, x="uv", title="Adjusted UV Histogram")
fig2 = px.box(df, x="uv", title="Adjusted UV Boxplot")
fig.show()
fig2.show()

In [14]:
def create_category_one_hot(row):
    categories = row['categories'].split(',')
    for category in categories:
        col_name = f"category_{category}"
        row[col_name] = 1
    return row

df = df.apply(create_category_one_hot, axis=1)
df = df.fillna(0)
df.drop(columns=['categories'], inplace=True)
df["category_a"] = df["category_a"].astype(int)
df["category_c"] = df["category_c"].astype(int)
df["category_d"] = df["category_d"].astype(int)
df = df[['uv', 'water', 'area', 'fertilizer_usage', 'pesticides', 'category_a', 'category_c', 'category_d', 'yield']]
df.head(n=20)

Unnamed: 0,uv,water,area,fertilizer_usage,pesticides,category_a,category_c,category_d,yield
0,80.179,0.072,9.414,0.0,2.231,0,1,0,29.878
1,58.359,5.413,9.681,3.0,1.81,0,1,0,53.416
2,78.506,9.731,7.189,1.0,2.455,0,0,1,63.391
3,69.248,10.995,1.738,3.0,0.603,1,0,0,17.984
4,87.658,2.617,9.706,1.0,2.91,0,1,0,49.768
5,51.727,2.19,9.152,0.0,1.788,0,0,1,35.488
6,79.767,10.679,6.311,1.0,0.458,1,0,0,43.157
7,80.43,3.62,9.473,0.0,3.533,1,1,0,32.752
8,61.683,9.857,5.24,1.0,5.09,1,0,0,57.811
9,79.671,4.321,6.818,0.0,1.255,0,0,1,11.65


In [15]:
X = df.drop('yield', axis=1)
y = df['yield']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scalar = StandardScaler()
X_train = scalar.fit_transform(X_train)
X_test = scalar.transform(X_test)
print(f"X_train shape: {X_train.shape}")

X_train shape: (800, 8)


In [16]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import accuracy_score, mean_squared_error

model = DecisionTreeRegressor()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")

Mean Squared Error: 365.522905205


## 2. Regression Tree Optimization (30 Marks)
Apply regression tree and tune parameters to prevent overfitting.
Implement post-pruning without Python libraries and compare it with results with pre-pruning and SKlearn post-pruning.
Discuss the effectiveness of each method in this dataset.

## 3. Comparison with Random Forest and SVR (25 Marks)
Optimize parameters for Decision Trees (DTs), Random Forest, and Support Vector Regression (SVR).
Compare results using sutable metrics.
Discuss the strengths and weaknesses of each method.

## 4. Classification Task (10 Marks)
Add a threshold to the label column for a classification task.
Assign classes: 20% lowest yield per hectare as "low" and 80% as "high."
Apply SVM, DTs, and optimize parameters then compare them with suitable classification metrics.

## 5. Regression Tree as a Classifier (20-30 Marks)
Take the best regression tree from question 2.
Evaluate it as a classification problem following question 4 criteria.
Perform threshold analysis ( like ROC curve) to find the optimal threshold.
Compare the tree with optimal threshold with DTs from question 4. When comparing the results with Decision Trees (DTs) from question 4, consider the scenario where the underlying data is originally regression-based, but the task at hand requires a binary classification of 'yes' and 'no.' Discuss the implications of using regression labels directly versus thresholding them for classification purposes. Provide detailed insights into why one approach may be preferred over the other

# Submit a PDF report.
# Include a Google Colab link at the top of the report.
# Ensure accessibility to the Colab link for everyone.
# Follow these steps, keeping the analysis clear and straightforward, to successfully complete SYDE 522 Assignment 2.