In [None]:
pip install pandas numpy matplotlib scikit-learn


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, mean_squared_error

# Load the dataset
# The URL points at a zip archive; pandas would infer that from the '.zip' suffix,
# but stating the compression explicitly documents the expectation.
url = 'https://raw.githubusercontent.com/dsrscientist/Data-Science-ML-Capstone-Projects/master/avocado.csv.zip'
df = pd.read_csv(url, compression='zip')

# Explore the dataset
print(df.head())  # Display the first few rows of the dataset
# DataFrame.info() writes its report itself and returns None, so wrapping it in
# print() would emit a stray "None" line after the report.
df.info()  # Check data types and missing values

# Data Preprocessing

# Convert Date column to datetime type so calendar parts can be extracted below
df['Date'] = pd.to_datetime(df['Date'])

# Drop irrelevant columns (the leftover CSV index); errors='ignore' keeps this
# step safe to re-run after the column is already gone
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

# scikit-learn estimators cannot convert datetime64 values to floats, so expose the
# date as plain integer features; the raw Date column is left out of both feature
# matrices further down
df['Month'] = df['Date'].dt.month
df['Day'] = df['Date'].dt.day

# Encoding categorical columns
encoder = LabelEncoder()
df['type'] = encoder.fit_transform(df['type'])

# Resolve the region column regardless of header capitalisation.
# NOTE(review): the public avocado CSV appears to spell it 'region' (lowercase),
# in which case a hard-coded 'Region' lookup raises KeyError — confirm against the data.
region_col = next((col for col in df.columns if str(col).lower() == 'region'), None)
if region_col is None:
    raise KeyError("No 'Region' column found in the dataset")

# Separate features and target for classification
# (the region labels are the target, so they are removed from the features)
X_cls = df.drop(columns=[region_col, 'Date'])
y_cls = df[region_col]

# Separate features and target for regression
# The region is a nominal category, so it is one-hot encoded rather than passed
# through as raw labels, which StandardScaler / LinearRegression cannot consume.
# drop_first avoids a redundant indicator column; dtype=float keeps the matrix numeric.
# NOTE(review): assumes every remaining column is numeric — verify with df.info().
X_reg = pd.get_dummies(
    df.drop(columns=['AveragePrice', 'Date']),
    columns=[region_col],
    drop_first=True,
    dtype=float,
)
y_reg = df['AveragePrice']

# Split the data for both tasks (same seed and test fraction for each)
X_cls_train, X_cls_test, y_cls_train, y_cls_test = train_test_split(X_cls, y_cls, test_size=0.2, random_state=42)
X_reg_train, X_reg_test, y_reg_train, y_reg_test = train_test_split(X_reg, y_reg, test_size=0.2, random_state=42)

# Scaling the features for regression
# The scaler is fitted on the training split only and then applied to the test
# split, so no test-set statistics leak into training.
scaler = StandardScaler()
X_reg_train_scaled = scaler.fit_transform(X_reg_train)
X_reg_test_scaled = scaler.transform(X_reg_test)

# Classification task: fit a seeded random forest on the classification training
# split, then predict labels for the held-out split.
# (.fit() returns the estimator itself, so construction and fitting can be chained.)
clf = RandomForestClassifier(random_state=42).fit(X_cls_train, y_cls_train)
y_cls_pred = clf.predict(X_cls_test)

# Regression task: fit ordinary least squares on the scaled training features,
# then predict the target for the scaled held-out features.
regressor = LinearRegression().fit(X_reg_train_scaled, y_reg_train)
y_reg_pred = regressor.predict(X_reg_test_scaled)

# Classification metric: fraction of held-out rows whose predicted label
# matches the true label.
accuracy = accuracy_score(y_true=y_cls_test, y_pred=y_cls_pred)
print("Classification Accuracy:", accuracy)

# Regression metric: mean squared difference between actual and predicted values.
mse = mean_squared_error(y_true=y_reg_test, y_pred=y_reg_pred)
print("Regression Mean Squared Error:", mse)

# Scatter of actual vs. predicted values for the regression task, drawn through
# the explicit figure/axes interface.
fig, ax = plt.subplots()
ax.scatter(y_reg_test, y_reg_pred)
ax.set_xlabel("Actual Average Price")
ax.set_ylabel("Predicted Average Price")
ax.set_title("Actual Average Price vs. Predicted Average Price")
plt.show()
