In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error


In [None]:
data = pd.read_excel("/content/onion.xlsx")

In [None]:
data.head()

Unnamed: 0,State Name,District Name,Market Name,Group,Arrivals (Tonnes),Min Price (Rs./Quintal),Max Price (Rs./Quintal),Modal Price (Rs./Quintal),Reported Date
0,Tamil Nadu,Kancheepuram,Acharapakkam,Onion,33.5,1060,1085,1075,2011-03-05
1,Tamil Nadu,Kancheepuram,Acharapakkam,Onion,33.0,1075,1150,1100,2011-03-04
2,Tamil Nadu,Kancheepuram,Acharapakkam,Onion,35.0,980,1200,1000,2011-03-03
3,Tamil Nadu,Kancheepuram,Acharapakkam,Onion,37.7,975,1000,985,2011-03-02
4,Tamil Nadu,Kancheepuram,Acharapakkam,Onion,37.5,980,1025,1000,2011-03-01


In [None]:
data.tail()

Unnamed: 0,State Name,District Name,Market Name,Group,Arrivals (Tonnes),Min Price (Rs./Quintal),Max Price (Rs./Quintal),Modal Price (Rs./Quintal),Reported Date
545,Tamil Nadu,Namakkal,Namagiripettai,Onion,0.1,3650,4000,3800,2024-01-08
546,Tamil Nadu,Namakkal,Namagiripettai,Onion,1.17,4950,6600,5775,2023-11-17
547,Tamil Nadu,Namakkal,Namagiripettai,Onion,0.2,6810,7000,6900,2023-10-27
548,Tamil Nadu,Namakkal,Namagiripettai,Onion,0.2,7810,8000,7905,2023-10-25
549,Tamil Nadu,Namakkal,Namagiripettai,Onion,1.2,4700,4950,4825,2023-10-18


In [None]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 550 entries, 0 to 549
Data columns (total 9 columns):
 #   Column                     Non-Null Count  Dtype         
---  ------                     --------------  -----         
 0   State Name                 550 non-null    object        
 1   District Name              550 non-null    object        
 2   Market Name                550 non-null    object        
 3   Group                      550 non-null    object        
 4   Arrivals (Tonnes)          536 non-null    float64       
 5   Min Price (Rs./Quintal)    550 non-null    int64         
 6   Max Price (Rs./Quintal)    550 non-null    int64         
 7   Modal Price (Rs./Quintal)  550 non-null    int64         
 8   Reported Date              550 non-null    datetime64[ns]
dtypes: datetime64[ns](1), float64(1), int64(3), object(4)
memory usage: 38.8+ KB


In [None]:
data = data.dropna()

data['Reported Date'] = pd.to_datetime(data['Reported Date']).astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Reported Date'] = pd.to_datetime(data['Reported Date']).astype(int)


In [None]:
features = ['Arrivals (Tonnes)', 'Min Price (Rs./Quintal)', 'Max Price (Rs./Quintal)', 'Reported Date']
target = 'Modal Price (Rs./Quintal)'

In [None]:
train, test = train_test_split(data, test_size=0.2, random_state=42)

scaler = StandardScaler()
train_scaled = scaler.fit_transform(train[features])
test_scaled = scaler.transform(test[features])


In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import make_scorer

# Define a pipeline with scaler and MLPRegressor
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('regressor', MLPRegressor(max_iter=1000, random_state=42))
])

# Define hyperparameters to tune
param_grid = {
    'regressor__hidden_layer_sizes': [(100,), (100, 90), (100, 50, 25),(100,75,50,25)],
    'regressor__alpha': [0.0001, 0.001, 0.01, 0.1, 1.0],
    'regressor__activation': ['relu', 'tanh']
}

# Define Mean Squared Error as the scoring metric for GridSearchCV
scorer = make_scorer(mean_absolute_error, greater_is_better=False)

# Perform grid search with cross-validation
grid_search = GridSearchCV(pipeline, param_grid, scoring=scorer, cv=5)
grid_search.fit(train[features], train[target])

# Get the best model
best_model = grid_search.best_estimator_

# Make predictions on the test set
predictions = best_model.predict(test[features])

comparison_df = pd.DataFrame({'Actual Price': test[target], 'Predicted Price': predictions})
print(comparison_df)
mse = mean_absolute_error(test[target], predictions)
print("Mean Squared Error:", mse)



     Actual Price  Predicted Price
130          1600      1594.566906
145          1700      1691.027008
167          1400      1395.763236
258          1100      1093.933356
97           1000       993.747023
..            ...              ...
388          2300      2298.863332
381          2400      2395.362672
549          4825      4839.771178
397          2400      2399.083492
297          1100      1091.199482

[108 rows x 2 columns]
Mean Squared Error: 46.015276419248266


In [None]:
from sklearn.metrics import mean_absolute_error
predictions_model1 = model1.predict(test_scaled)
predictions_model2 = model2.predict(test_scaled)
predictions_best_model = best_model.predict(test[features])

mae_model1 = mean_absolute_error(test[target], predictions_model1)
print("Mean Absolute Error (Gradient Boosting):", mae_model1)



mae_model2 = mean_absolute_error(test[target], predictions_model2)
print("Mean Absolute Error (Random Forest):", mae_model2)



mae_model3 = mean_absolute_error(test[target], predictions_best_model)
print("Mean Absolute Error (MLP Regressor):", mae_model3)

plt.figure(figsize=(18, 6))

plt.subplot(1, 3, 1)
plt.scatter(test[target], predictions_model1, alpha=0.5)
plt.xlabel('Actual Price')
plt.ylabel('Predicted Price')
plt.title('Model 1 - Gradient Boosting')

plt.subplot(1, 3, 2)
plt.scatter(test[target], predictions_model2, alpha=0.5)
plt.xlabel('Actual Price')
plt.ylabel('Predicted Price')
plt.title('Model 2 - Random Forest')

plt.subplot(1, 3, 3)
plt.scatter(test[target], predictions_best_model, alpha=0.5)
plt.xlabel('Actual Price')
plt.ylabel('Predicted Price')
plt.title('Best Model - MLPRegressor')

plt.tight_layout()
plt.show()

NameError: name 'model1' is not defined

In [None]:
from sklearn.metrics import mean_squared_error
predictions_model1 = model1.predict(test_scaled)
predictions_model2 = model2.predict(test_scaled)
predictions_best_model = best_model.predict(test[features])

# Calculate mean squared error for each model
mse_model1 = mean_squared_error(test[target], predictions_model1)
mse_model2 = mean_squared_error(test[target], predictions_model2)
mse_best_model = mean_squared_error(test[target], predictions_best_model)

# Print MSE for each model
print("Mean Squared Error (Model 1 - Gradient Boosting):", mse_model1)
print("Mean Squared Error (Model 2 - Random Forest):", mse_model2)
print("Mean Squared Error (Best Model - MLPRegressor):", mse_best_model)

# Optionally, you can plot the actual vs predicted values for each model
plt.figure(figsize=(18, 6))

plt.subplot(1, 3, 1)
plt.scatter(test[target], predictions_model1, alpha=0.5)
plt.xlabel('Actual Price')
plt.ylabel('Predicted Price')
plt.title('Model 1 - Gradient Boosting')

plt.subplot(1, 3, 2)
plt.scatter(test[target], predictions_model2, alpha=0.5)
plt.xlabel('Actual Price')
plt.ylabel('Predicted Price')
plt.title('Model 2 - Random Forest')

plt.subplot(1, 3, 3)
plt.scatter(test[target], predictions_best_model, alpha=0.5)
plt.xlabel('Actual Price')
plt.ylabel('Predicted Price')
plt.title('Best Model - MLPRegressor')

plt.tight_layout()
plt.show()

In [None]:
#Scatterplot
plt.figure(figsize=(10, 6))
plt.scatter(test.index, test[target], color='red', label='Actual Price')
plt.scatter(test.index, predictions, color='blue', label='Predicted Price')
plt.title('Actual vs Predicted Prices')
plt.xlabel('Actual Price')
plt.ylabel('Predicted Price')
plt.legend()
plt.show()

In [None]:
#Histogram
plt.figure(figsize=(10, 6))
sns.histplot(comparison_df['Actual Price'], kde=True, color='red', label='Actual Price')
sns.histplot(comparison_df['Predicted Price'], kde=True, color='blue', label='Predicted Price')
plt.title('Histogram of Actual vs Predicted Prices')
plt.xlabel('Actual Price')
plt.ylabel('Predicted Price')
plt.legend()
plt.show()

In [None]:
model1 = GradientBoostingRegressor()
model1.fit(train_scaled, train[target])

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
model2 = RandomForestRegressor(n_estimators=100, random_state=42)
model2.fit(train_scaled, train[target])

# Make predictions
predictions = model2.predict(test_scaled)

# Create a DataFrame for actual and predicted values
comparison_data = pd.DataFrame({'Actual Price': test[target], 'Predicted Price': predictions})

# Plotting the distributional graph
plt.figure(figsize=(10, 6))
sns.histplot(comparison_data['Actual Price'], kde=True, color='blue', label='Actual Price')
sns.histplot(comparison_data['Predicted Price'], kde=True, color='red', label='Predicted Price')
plt.title('Distribution of Actual and Predicted Prices (Random Forest)')
plt.xlabel('Price')
plt.ylabel('Frequency')
plt.legend()
mae = mean_absolute_error(comparison_data['Actual Price'], comparison_data['Predicted Price'])
print("Mean Absolute Error:", mae)

NameError: name 'RandomForestRegressor' is not defined

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Model names
models = ['Model 1 (Gradient Boosting)', 'Model 2 (Random Forest)', 'Best Model (MLPRegressor)']

# Mean Squared Errors (MSE) for each model
mse_values = [mse_model1, mse_model2, mse_best_model]

# Number of models
num_models = len(models)

# Create a radar chart
angles = np.linspace(0, 2 * np.pi, num_models, endpoint=False).tolist()

fig, ax = plt.subplots(figsize=(6, 6), subplot_kw=dict(polar=True))
ax.fill(angles, mse_values, color='red', alpha=0.25)
ax.plot(angles, mse_values, color='red', linewidth=2)

ax.set_yticklabels([])
ax.set_xticks(angles)
ax.set_xticklabels(models, fontsize=10, fontweight='bold')

plt.title('Mean Squared Error (MSE) Comparison')
plt.show()


NameError: name 'mse_model1' is not defined

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Define the number of models and angles
num_models = 3  # Number of models
angles = np.linspace(0, 2 * np.pi, num_models, endpoint=False).tolist()

# Predictions for each model
predictions_models = [predictions_model1, predictions_model2, predictions_best_model]

# Create a circular plot for model predictions
plt.figure(figsize=(6, 6))

for i, predictions in enumerate(predictions_models):
    theta = np.linspace(i * (2 * np.pi / num_models), (i + 1) * (2 * np.pi / num_models), len(predictions))
    plt.plot(theta, predictions, label=f'Model {i + 1}')

plt.xlabel('Angle')
plt.ylabel('Predicted Price')
plt.title('Circular Plot of Model Predictions')
plt.legend()
plt.show()


In [None]:
import numpy as np

# Sort the test[target] and predictions arrays based on test[target]
sorted_indices = np.argsort(test[target])
sorted_actual = test[target].values[sorted_indices]
sorted_predictions_model1 = predictions_model1[sorted_indices]
sorted_predictions_model2 = predictions_model2[sorted_indices]
sorted_predictions_best_model = predictions_best_model[sorted_indices]

# Scatter plot for comparing predicted values of different models
plt.figure(figsize=(8, 6))

plt.scatter(sorted_actual, sorted_predictions_model1, label='Model 1 (Gradient Boosting)', alpha=0.5)
plt.scatter(sorted_actual, sorted_predictions_model2, label='Model 2 (Random Forest)', alpha=0.5)
plt.scatter(sorted_actual, sorted_predictions_best_model, label='Best Model (MLPRegressor)', alpha=0.5)

plt.xlabel('Actual Price')
plt.ylabel('Predicted Price')
plt.title('Comparison of Predicted Prices by Different Models')
plt.legend()
plt.grid(True)
plt.show()
