In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import accuracy_score, mean_squared_error

# Step 1: Load the data
data_path = r"C:\Users\Hiremath\OneDrive\Desktop\New folder\05.08.2023 fliprobo\2\avocado.csv"
data = pd.read_csv(data_path)

# Step 2: Exploratory Data Analysis (EDA) - if needed, perform data exploration here

# Step 3: Preprocessing and Feature Engineering
# Convert 'Date' column to pandas datetime
data['Date'] = pd.to_datetime(data['Date'])

# Extract year, month, and day as numerical features
data['Year'] = data['Date'].dt.year
data['Month'] = data['Date'].dt.month
data['Day'] = data['Date'].dt.day

# Drop the original 'Date' column as it's no longer needed
data.drop('Date', axis=1, inplace=True)

# Encode 'type' column to numerical values (0 for conventional, 1 for organic)
label_encoder = LabelEncoder()
data['type'] = label_encoder.fit_transform(data['type'])

# One-hot encode the 'region' column
data = pd.get_dummies(data, columns=['region'])

# Step 4: Split the data into train and test sets
X = data.drop(['AveragePrice'], axis=1)
y_regression = data['AveragePrice']
y_classification = data['type']

X_train, X_test, y_train_regression, y_test_regression, y_train_classification, y_test_classification = \
    train_test_split(X, y_regression, y_classification, test_size=0.2, random_state=42)

# Step 5: Model Building (Classification)
classification_model = RandomForestClassifier()  # You can try other classifiers as well
classification_model.fit(X_train, y_train_classification)

# Step 6: Model Evaluation (Classification)
y_pred_classification = classification_model.predict(X_test)
classification_accuracy = accuracy_score(y_test_classification, y_pred_classification)
print("Classification Accuracy:", classification_accuracy)

# Step 7: Model Building (Regression)
regression_model = RandomForestRegressor()  # You can try other regression models as well
regression_model.fit(X_train, y_train_regression)

# Step 8: Model Evaluation (Regression)
y_pred_regression = regression_model.predict(X_test)
regression_mse = mean_squared_error(y_test_regression, y_pred_regression)
print("Regression Mean Squared Error:", regression_mse)

# Save the best model for production
# (You can choose the best model based on classification_accuracy and regression_mse)


Classification Accuracy: 1.0
Regression Mean Squared Error: 0.014574551342465745
