In [2]:
import numpy as np
import pandas as pd
import seaborn as sns 

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
import pandas_profiling as pp

In [3]:
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [4]:
df = pd.read_csv("C:\Users\bhato\Downloads\avocado.csv")

SyntaxError: ignored

DATA OVERVIEW

In [5]:
pp.ProfileReport(df)

NameError: ignored

DATA PREPROCESSING

In [None]:
df = df.drop('Unnamed: 0', axis=1)

In [None]:
df

As there are two types of avocados, let's see the price distribution of each one

In [None]:
import plotly.express as px
#conventional = df[df['type'] == 'conventional']
#organic = df[df['type'] == 'organic']

fig = px.histogram(df, x='AveragePrice', color='type',
                   marginal='box', # or violin, rug
                   hover_data=df.columns)


fig.show()

So, on average, organic avocados are more expensive (as expected).

Let's also check whether geography influences the price.



In [None]:
fig = px.box(df, x='region', y='AveragePrice')
fig.show()

Correlation matrix

In [None]:
corr = df.corr()
corr

f, ax = plt.subplots(nrows=1, ncols=1, figsize=(12, 10))
ax.set_title('Correlation Matrix', fontsize=16)

sns.heatmap(corr, vmin=-1, vmax=1, cmap='viridis', annot=True)

Checking for missing & duplicated data.

In [None]:
df.isnull().any()

In [None]:
df.duplicated().any()

In [None]:
df

Modeling

In [None]:
df1 = df.copy()

#Introducing new feature = 'season'


df1['Date'] = pd.to_datetime(df1['Date'])
df1['month'] = df1['Date'].dt.month

conditions = [(df1['month'].between(3,5,inclusive=True)),
           (df1['month'].between(6,8,inclusive=True)),
           (df1['month'].between(9,11,inclusive=True)),
           (df1['month'].between(12,2,inclusive=True))]

values = [0,1,2,3]
#spring = 0, summer = 1, fall = 2, winter = 3
df1['seasons'] = np.select(conditions, values)


#encoding labels for 'type'

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df1['type'] = le.fit_transform(df1['type'])

# and region (One Hot Encoding instead of labelizing)
ohe = pd.get_dummies(data=df1, columns=['region'])


X = ohe.drop(['AveragePrice','Date','4046','4225','4770','Small Bags','Large Bags','XLarge Bags'], axis=1)
y = df1['AveragePrice']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)

In [None]:
X_train

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

pipe0 = Pipeline([('scaler', StandardScaler()), ('lr', LinearRegression())])
pipe0.fit(X_train, y_train)
y_pred0 = pipe0.predict(X_test)
r2_score(y_test, y_pred0)

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

pipe = Pipeline([('scaler', StandardScaler()), ('rf', RandomForestRegressor())])
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
r2_score(y_test, y_pred)

In [None]:
from xgboost import XGBRegressor

pipe2 = Pipeline([('scaler', StandardScaler()), ('xgb', XGBRegressor())])
pipe2.fit(X_train, y_train)
y_pred2 = pipe2.predict(X_test)
r2_score(y_test, y_pred2)

Apparently, the best model is the one with boosting (XGB).

In [None]:
pd.DataFrame(pipe2['xgb'].feature_importances_, index=X_train.columns, columns=['Feature Importances'])

So, avocado type accounts for 86% of the price prediction.

PLEASE NOTE! This is a historical analysis, and does not actually produce real-life value for the avocado market analysis. A predictory model will be built further on.

We have a plenty of features now, due to the presence of dummies. Let's see if we can do dimensiality reduction and preserve the R2 score.

In [None]:
from sklearn.decomposition import PCA
pca = PCA(n_components=6)

X_train_cut, X_test_cut, y_train_cut, y_test_cut = train_test_split(pca.fit_transform(X), y, test_size=0.33, random_state=0)


from xgboost import XGBRegressor

pipe3 = Pipeline([('scaler', StandardScaler()), ('xgb', XGBRegressor())])
pipe3.fit(X_train_cut, y_train_cut)
y_pred3 = pipe3.predict(X_test_cut)
r2_score(y_test_cut, y_pred3)

So, by reducing the number of features we sacrifice ~15% R2. Luckily, the dataset is not that huge and we have the option to stay with the initial number of feature. Anyway, PCA is a very good exercise

Prediction for the next year
For this part of the notebook, I will use 2 techniques:

Build a simple linear model using scipy.
Use FB prophet package and see what the result will be.
Building a simple linear model

In [None]:
df = pd.read_csv("C:\Users\bhato\Downloads\avocado.csv")

In [None]:
df2 = df[df['region'] == 'California'].drop(['Date','region'], axis=1)
df2 = df2[df['type'] == 'organic']

In [None]:
from scipy import stats

X_lin = df2['year'].reset_index(drop=True)
y_lin = df2['AveragePrice'].reset_index(drop=True)


slope, intercept, r, p, std_err = stats.linregress(X_lin, y_lin) # scipy

def prediction(x):
  return slope * x + intercept

name = 'Avg. Avocado price (organic) in 2019'
md = list(map(prediction, X_lin)) # scipy

X_pred_lin = 2019
y_pred_lin = prediction(X_pred_lin)

print('Predicted avicado price in California in 2019 is: %f USD' % y_pred_lin)

X_lin2 = X_lin.append(pd.Series(X_pred_lin))
y_lin2 = y_lin.append(pd.Series(y_pred_lin))
md2 = list(map(prediction, X_lin2)) 

plt.scatter(X_lin2, y_lin2) # Scatter Plot
plt.plot(X_lin2, md2, color='green')
plt.xticks(np.arange(min(X_lin2), max(X_lin2+1), 1.0))
plt.show()

#plt.ylim(ymin=0) # starts at zero
#plt.legend(['Model Prediction using Linear Regression', 'Avocado Prices (2015-2018)'])
#plt.show()

Using FB Prophet

In [None]:
#pip install fbprophet

In [None]:
from fbprophet import Prophet 
from fbprophet.plot import add_changepoints_to_plot

In [None]:
df = df[df['region'] == 'California']
df['Date'] = df['Date'].str[:-3] 
df = df[df['type'] == 'organic']

In [None]:
agg = {'AveragePrice': 'mean'}
data = df.groupby(df['Date']).aggregate(agg).reset_index()
data.head()

In [None]:
df_ts = pd.DataFrame() 
df_ts['ds'] = pd.to_datetime(data['Date']) 
df_ts['y'] = data['AveragePrice'] 
df_ts.head()

In [None]:
m = Prophet(yearly_seasonality=True, \
            daily_seasonality=False, weekly_seasonality=False) 
m.fit(df_ts)
future = m.make_future_dataframe(periods=12*5, freq='M')

In [None]:
forecast = m.predict(future) 
forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper', 'trend', 'trend_lower', 'trend_upper']].tail()

In [None]:
fig = m.plot(forecast) 
fig.show()
a = add_changepoints_to_plot(fig.gca(), m, forecast)

Interesting. So, the Prophet suggests the price of around $ 1.75 in 2019, and then the price for organic avocados will go up in California.