# Avocado price prediction

* **Task type:** regression
* **Models used:** linear regression, random forest regression, XGBoost regression


In [None]:
import numpy as np
import pandas as pd
import seaborn as sns 

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score


In [None]:
# Read the raw avocado price data from the Kaggle input directory.
csv_path = "../input/avocado-prices/avocado.csv"
df = pd.read_csv(csv_path)

In [None]:
# Display the frame exactly as loaded from the CSV.
df

# Data preprocessing

In [None]:
# Drop the 'Unnamed: 0' column (presumably a row index saved into the CSV).
# errors='ignore' keeps this cell safe to re-run: `df` is rebound here, so a
# second execution would otherwise raise KeyError on the already-removed column.
df = df.drop(columns='Unnamed: 0', errors='ignore')

In [None]:
# Display the frame again after removing 'Unnamed: 0'.
df

**As there are two types of avocados, let's see the price distribution of each one.**

In [None]:
import plotly.express as px

# Price histograms coloured by avocado type, with a box plot drawn in the margin.
# Every column is exposed in the hover tooltip.
fig = px.histogram(
    df,
    x='AveragePrice',
    color='type',
    marginal='box',
    hover_data=df.columns,
)
fig.show()

**So, on average, organic avocados are more expensive (as expected).**

**Let's also check whether geography influences the price.**

In [None]:
# One box per region showing the spread of average prices.
fig = px.box(
    data_frame=df,
    x='region',
    y='AveragePrice',
)
fig.show()

**Correlation matrix**

In [None]:
# Correlate the numeric columns only. The frame still holds text columns
# (e.g. 'type', 'region'); pandas >= 2.0 raises on them instead of silently
# dropping them, so select the numeric dtypes explicitly.
corr = df.select_dtypes(include='number').corr()

f, ax = plt.subplots(nrows=1, ncols=1, figsize=(12, 10))
ax.set_title('Correlation Matrix', fontsize=16)

# Draw on the axes created above; the colour range is pinned to [-1, 1],
# the full range of a correlation coefficient.
sns.heatmap(corr, vmin=-1, vmax=1, cmap='viridis', annot=True, ax=ax);

**Checking for missing & duplicated data.**

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# True if at least one row is flagged as a duplicate of another row.
df.duplicated().any()

In [None]:
# Display the frame once more before moving on to modeling.
df

# Modeling

In [None]:
from sklearn.preprocessing import LabelEncoder

# Work on a copy so the frame used for exploration (`df`) stays untouched.
df1 = df.copy()

# --- New feature: season derived from the calendar month ---
df1['Date'] = pd.to_datetime(df1['Date'])
df1['month'] = df1['Date'].dt.month

# Winter wraps around the year end (Dec, Jan, Feb), so it cannot be expressed
# as between(12, 2): that range is empty, and np.select would fall back to its
# default of 0, silently labelling every winter row as spring. Match the three
# months explicitly instead. between() is inclusive on both ends by default.
conditions = [df1['month'].between(3, 5),
              df1['month'].between(6, 8),
              df1['month'].between(9, 11),
              df1['month'].isin([12, 1, 2])]

# spring = 0, summer = 1, fall = 2, winter = 3
values = [0, 1, 2, 3]
df1['seasons'] = np.select(conditions, values)

# --- Encode 'type' as integer labels ---
le = LabelEncoder()
df1['type'] = le.fit_transform(df1['type'])

# --- One-hot encode 'region' rather than assigning integer labels ---
ohe = pd.get_dummies(data=df1, columns=['region'])

# Features: every remaining column except the target, the raw date and the
# columns listed below. The target is the average price.
X = ohe.drop(['AveragePrice', 'Date', '4046', '4225', '4770',
              'Small Bags', 'Large Bags', 'XLarge Bags'], axis=1)
y = df1['AveragePrice']

In [None]:
# Hold out 33% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=0,
)

In [None]:
# Display the training feature matrix produced by the split.
X_train

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

# Baseline model: standardise the features, then fit a linear regression.
pipe0 = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('lr', LinearRegression()),
])
pipe0.fit(X_train, y_train)

# R^2 on the held-out test set.
y_pred0 = pipe0.predict(X_test)
r2_score(y_true=y_test, y_pred=y_pred0)

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

# Random forest on standardised features. The forest is seeded (same seed as
# the train/test split) so the reported score is reproducible across runs;
# without a seed, bootstrap sampling makes R^2 differ on every execution.
pipe = Pipeline([('scaler', StandardScaler()),
                 ('rf', RandomForestRegressor(random_state=0))])
pipe.fit(X_train, y_train)

# R^2 on the held-out test set.
y_pred = pipe.predict(X_test)
r2_score(y_test, y_pred)

In [None]:
from xgboost import XGBRegressor

# Gradient-boosted trees on standardised features, with default hyperparameters.
pipe2 = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('xgb', XGBRegressor()),
])
pipe2.fit(X_train, y_train)

# R^2 on the held-out test set.
y_pred2 = pipe2.predict(X_test)
r2_score(y_true=y_test, y_pred=y_pred2)

**Apparently, the best model is the one with boosting (XGB).**

In [None]:
# Importance of each feature in the fitted XGB step, one row per training column.
pd.DataFrame(
    data=pipe2.named_steps['xgb'].feature_importances_,
    index=X_train.columns,
    columns=['Feature Importances'],
)

**So, avocado type is by far the most important feature: it accounts for about 86% of the XGB model's total feature importance.**