In [5]:
import pandas as pd
import numpy as np

In [6]:
df = pd.read_csv('avocado.csv', index_col='Unnamed: 0')
df

Unnamed: 0,Date,AveragePrice,Total Volume,4046,4225,4770,Total Bags,Small Bags,Large Bags,XLarge Bags,type,year,region
0,2015-12-27,1.33,64236.62,1036.74,54454.85,48.16,8696.87,8603.62,93.25,0.0,conventional,2015,Albany
1,2015-12-20,1.35,54876.98,674.28,44638.81,58.33,9505.56,9408.07,97.49,0.0,conventional,2015,Albany
2,2015-12-13,0.93,118220.22,794.70,109149.67,130.50,8145.35,8042.21,103.14,0.0,conventional,2015,Albany
3,2015-12-06,1.08,78992.15,1132.00,71976.41,72.58,5811.16,5677.40,133.76,0.0,conventional,2015,Albany
4,2015-11-29,1.28,51039.60,941.48,43838.39,75.78,6183.95,5986.26,197.69,0.0,conventional,2015,Albany
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7,2018-02-04,1.63,17074.83,2046.96,1529.20,0.00,13498.67,13066.82,431.85,0.0,organic,2018,WestTexNewMexico
8,2018-01-28,1.71,13888.04,1191.70,3431.50,0.00,9264.84,8940.04,324.80,0.0,organic,2018,WestTexNewMexico
9,2018-01-21,1.87,13766.76,1191.92,2452.79,727.94,9394.11,9351.80,42.31,0.0,organic,2018,WestTexNewMexico
10,2018-01-14,1.93,16205.22,1527.63,2981.04,727.01,10969.54,10919.54,50.00,0.0,organic,2018,WestTexNewMexico


### Task 1 (Regression): Avg Prices

In [100]:
df = df.sample(frac=1).reset_index(drop=True)                                       # shuffle df first

reg_df = df.copy()
reg_df['type'] = 1*(reg_df['type'] == 'organic')                                    # organic - 1, conventional - 0
reg_df['region'] = pd.factorize(reg_df['region'])[0] + 1                            # region - [0, 54]
reg_df['Month'] = df['Date'].apply(pd.to_datetime, errors = 'coerce').dt.month      # convert the Data column to only have the Month since already have year
reg_df = reg_df.drop(columns=['Date'])

reg_df

Unnamed: 0,AveragePrice,Total Volume,4046,4225,4770,Total Bags,Small Bags,Large Bags,XLarge Bags,type,year,region,Month
0,1.49,1889.99,624.20,39.22,0.00,1226.57,995.08,231.49,0.00,1,2015,1,5
1,1.56,5580.67,27.66,438.56,0.00,5114.45,5111.12,3.33,0.00,1,2017,2,12
2,1.42,514742.18,96728.68,134993.60,920.60,282099.30,165320.50,116481.78,297.02,0,2018,3,3
3,0.88,295706.69,1673.41,211337.24,6471.52,76224.52,67998.45,7823.60,402.47,0,2016,4,5
4,0.99,259151.84,139176.17,7391.96,36.98,112546.73,86638.12,23799.21,2109.40,0,2017,5,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...
18244,1.15,243366.13,59620.01,30542.73,19975.22,133228.17,123961.02,9260.42,6.73,0,2018,52,1
18245,1.64,63734.67,7949.89,22176.16,6.27,33602.35,28969.13,4633.22,0.00,1,2016,31,11
18246,0.92,4829487.03,2562669.53,971082.67,23769.90,1271964.93,531132.05,696042.61,44790.27,0,2016,33,5
18247,1.64,1862.24,22.53,1036.23,0.00,803.48,803.48,0.00,0.00,1,2017,4,2


In [101]:
features_reg = reg_df.columns.tolist()
features_reg.remove('AveragePrice')           # delete target column of 'AveragePrice'
features_reg

['Total Volume',
 '4046',
 '4225',
 '4770',
 'Total Bags',
 'Small Bags',
 'Large Bags',
 'XLarge Bags',
 'type',
 'year',
 'region',
 'Month']

#### Alg 1: Linear Regression

In [102]:
from sklearn import pipeline, preprocessing, model_selection
from sklearn import linear_model

In [103]:
pipe_lin = pipeline.Pipeline([
    ('scaling', preprocessing.StandardScaler()),
    ('model', linear_model.LinearRegression())
])
xval = model_selection.KFold(10, shuffle=True)

result_lin = model_selection.cross_validate(pipe_lin, reg_df[features_reg], reg_df['AveragePrice'], cv=xval, scoring='r2', return_estimator=True, return_train_score=True)

print('Output of LinearRegression:')
print('R^2 mean (test):', result_lin['test_score'].mean())     # worse than random guessing, :(
print('R^2 STD (test):', result_lin['test_score'].std())       # not overfitting!
print('R^2 mean (train):', result_lin['train_score'].mean())
print('R^2 STD (train):', result_lin['test_score'].std())      # not overfitting!

Output of LinearRegression:
R^2 mean (test): 0.4386324596912451
R^2 STD (test): 0.016102208776286764
R^2 mean (train): 0.43991826091300384
R^2 STD (train): 0.016102208776286764


#### Alg 2: Random Forest - need to try out different max_depth here!

In [2]:
from sklearn import ensemble

In [105]:
pipe_rf = pipeline.Pipeline([
    ('scaling', preprocessing.StandardScaler()),
    ('model', ensemble.RandomForestRegressor(max_depth=20))
]) 
xval = model_selection.KFold(10, shuffle=True)

result_rf = model_selection.cross_validate(pipe_rf, reg_df[features_reg], reg_df['AveragePrice'], cv=xval, scoring='r2', return_estimator=True, return_train_score=True)

print('Output of RandomForest (max_depth=20):')
print('R^2 mean (test):', result_rf['test_score'].mean())     # pretty good
print('R^2 STD (test):', result_rf['test_score'].std())       # not overfitting!
print('R^2 mean (train):', result_rf['train_score'].mean())
print('R^2 STD (train):', result_rf['test_score'].std())      # not overfitting!

Output of RandomForest (max_depth=20):
R^2 mean (test): 0.8965524386655946
R^2 STD (test): 0.006356476926948599
R^2 mean (train): 0.984851266277993
R^2 STD (train): 0.006356476926948599


In [106]:
pipe_rf = pipeline.Pipeline([
    ('scaling', preprocessing.StandardScaler()),
    ('model', ensemble.RandomForestRegressor(max_depth=25))
]) 
xval = model_selection.KFold(10, shuffle=True)

result_rf = model_selection.cross_validate(pipe_rf, reg_df[features_reg], reg_df['AveragePrice'], cv=xval, scoring='r2', return_estimator=True, return_train_score=True)

print('Output of RandomForest (max_depth=25):')
print('R^2 mean (test):', result_rf['test_score'].mean())     # improving!
print('R^2 STD (test):', result_rf['test_score'].std())       # not overfitting!
print('R^2 mean (train):', result_rf['train_score'].mean())
print('R^2 STD (train):', result_rf['test_score'].std())      # not overfitting!

Output of RandomForest (max_depth=25):
R^2 mean (test): 0.8970267407051382
R^2 STD (test): 0.004097097964961787
R^2 mean (train): 0.985503079392816
R^2 STD (train): 0.004097097964961787


In [107]:
pipe_rf_best = pipeline.Pipeline([
    ('scaling', preprocessing.StandardScaler()),
    ('model', ensemble.RandomForestRegressor(max_depth=30))
]) 
xval = model_selection.KFold(10, shuffle=True)

result_rf_best = model_selection.cross_validate(pipe_rf_best, reg_df[features_reg], reg_df['AveragePrice'], cv=xval, scoring='r2', return_estimator=True, return_train_score=True)

print('Output of RandomForest (max_depth=30):')
print('R^2 mean (test):', result_rf_best['test_score'].mean())     # best version right now
print('R^2 STD (test):', result_rf_best['test_score'].std())       # not overfitting!
print('R^2 mean (train):', result_rf_best['train_score'].mean())
print('R^2 STD (train):', result_rf_best['test_score'].std())      # not overfitting!

Output of RandomForest (max_depth=30):
R^2 mean (test): 0.8977815011458536
R^2 STD (test): 0.0038777943602128356
R^2 mean (train): 0.985463699343606
R^2 STD (train): 0.0038777943602128356


In [108]:
pipe_rf = pipeline.Pipeline([
    ('scaling', preprocessing.StandardScaler()),
    ('model', ensemble.RandomForestRegressor(max_depth=35))
]) 
xval = model_selection.KFold(10, shuffle=True)

result_rf = model_selection.cross_validate(pipe_rf, reg_df[features_reg], reg_df['AveragePrice'], cv=xval, scoring='r2', return_estimator=True, return_train_score=True)

print('Output of RandomForest (max_depth=35):')
print('R^2 mean (test):', result_rf['test_score'].mean())     # declined
print('R^2 STD (test):', result_rf['test_score'].std())       # not overfitting!
print('R^2 mean (train):', result_rf['train_score'].mean())
print('R^2 STD (train):', result_rf['test_score'].std())      # not overfitting!

Output of RandomForest (max_depth=35):
R^2 mean (test): 0.8974150690237739
R^2 STD (test): 0.00799453573591934
R^2 mean (train): 0.9855062431364733
R^2 STD (train): 0.00799453573591934


#### Alg 3: Neural Network

In [7]:
from sklearn import neural_network

In [110]:
pipe_nn = pipeline.Pipeline([
    ('scaling', preprocessing.StandardScaler()),
    ('model', neural_network.MLPRegressor())
])
xval = model_selection.KFold(10, shuffle=True)

result_nn = model_selection.cross_validate(pipe_nn, reg_df[features_reg], reg_df['AveragePrice'], cv=xval, scoring='r2', return_train_score=True)

print('Output of NeuralNetwork:')
print('R^2 mean (test):', result_nn['test_score'].mean())     # perform much worse than random forest actually
print('R^2 STD (test):', result_nn['test_score'].std())       # not overfitting!
print('R^2 mean (train):', result_nn['train_score'].mean())
print('R^2 STD (train):', result_nn['test_score'].std())      # not overfitting!

Output of NeuralNetwork:
R^2 mean (test): 0.6354031448078876
R^2 STD (test): 0.013890302619691708
R^2 mean (train): 0.6441296580077622
R^2 STD (train): 0.013890302619691708


### Task 2 (Classification): Organicity

In [8]:
df = df.sample(frac=1).reset_index(drop=True)                                       # shuffle df first

cla_df = df.copy()
cla_df['type'] = 1*(cla_df['type'] == 'organic')                                    # organic - 1, conventional - 0
cla_df['region'] = pd.factorize(cla_df['region'])[0] + 1                            # region - [0, 54]
cla_df['Month'] = df['Date'].apply(pd.to_datetime, errors = 'coerce').dt.month      # convert the Data column to only have the Month since already have year
cla_df = cla_df.drop(columns=['Date'])

cla_df

Unnamed: 0,AveragePrice,Total Volume,4046,4225,4770,Total Bags,Small Bags,Large Bags,XLarge Bags,type,year,region,Month
0,0.72,6680323.25,3128527.61,1539488.77,537748.12,1474558.75,1071647.44,348148.30,54763.01,0,2016,1,5
1,1.50,644584.67,235569.01,258475.58,5308.79,145231.29,109325.32,35905.97,0.00,1,2015,2,3
2,1.76,415130.61,113711.41,118369.81,14533.01,168516.38,106243.97,61915.21,357.20,0,2017,3,9
3,1.50,492249.41,166509.01,261622.93,9937.63,54179.84,52113.51,561.33,1505.00,0,2017,4,6
4,1.23,6492.79,344.80,3395.88,524.90,2227.21,915.26,1311.95,0.00,1,2015,5,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...
18244,1.50,4941.74,150.16,3819.72,0.00,971.86,903.33,68.53,0.00,1,2015,50,2
18245,1.59,3257752.57,208054.74,2048824.18,17032.95,983840.70,922618.70,60122.74,1099.26,0,2016,47,10
18246,1.17,3329.28,18.38,24.23,0.00,3286.67,3286.67,0.00,0.00,1,2016,29,12
18247,1.10,128848.68,4270.64,75604.83,13860.26,35112.95,21300.25,11566.09,2246.61,0,2015,15,10


In [10]:
features_cla_removed = cla_df.columns.tolist()
features_cla_removed.remove('type')               # delete target column of 'type'
features_cla_removed.remove('year')
features_cla_removed.remove('region')             # delete these 2 variables first
features_cla_removed

['AveragePrice',
 'Total Volume',
 '4046',
 '4225',
 '4770',
 'Total Bags',
 'Small Bags',
 'Large Bags',
 'XLarge Bags',
 'Month']

In [15]:
from sklearn import pipeline, preprocessing, model_selection, metrics

In [16]:
pipe_rf_cla = pipeline.Pipeline([
    ('scaling', preprocessing.StandardScaler()),
    ('model', ensemble.RandomForestClassifier(max_depth=5))
])
xval = model_selection.KFold(10, shuffle=True)

scorer = metrics.make_scorer(metrics.cohen_kappa_score)
result_rf_cla_no_year = model_selection.cross_validate(pipe_rf_cla, cla_df[features_cla_removed], cla_df['type'], cv=xval, scoring=scorer, return_estimator=True, return_train_score=True)

print('Output of RandomForest on Organicity (with None of the 2):')
print('Kappa mean (test):', result_rf_cla_no_year['test_score'].mean())     # better than linear model
print('Kappa STD (test):', result_rf_cla_no_year['test_score'].std())       # not overfitting!
print('Kappa mean (train):', result_rf_cla_no_year['train_score'].mean())
print('Kappa STD (train):', result_rf_cla_no_year['test_score'].std())      # not overfitting!

Output of RandomForest on Organicity (with None of the 2):
Kappa mean (test): 0.9427761062499563
Kappa STD (test): 0.006293301016752833
Kappa mean (train): 0.9462247368539256
Kappa STD (train): 0.006293301016752833


#### Would including the variable Year and Region improve our model’s accuracy? - Year

In [17]:
features_cla_with_year = features_cla_removed .copy()
features_cla_with_year.append('year')
features_cla_with_year

['AveragePrice',
 'Total Volume',
 '4046',
 '4225',
 '4770',
 'Total Bags',
 'Small Bags',
 'Large Bags',
 'XLarge Bags',
 'Month',
 'year']

In [18]:
pipe_rf_cla = pipeline.Pipeline([
    ('scaling', preprocessing.StandardScaler()),
    ('model', ensemble.RandomForestClassifier(max_depth=5))
])
xval = model_selection.KFold(10, shuffle=True)

scorer = metrics.make_scorer(metrics.cohen_kappa_score)
result_rf_cla_no_year = model_selection.cross_validate(pipe_rf_cla, cla_df[features_cla_with_year], cla_df['type'], cv=xval, scoring=scorer, return_estimator=True, return_train_score=True)

print('Output of RandomForest on Organicity (with Year):')
print('Kappa mean (test):', result_rf_cla_no_year['test_score'].mean())
print('Kappa STD (test):', result_rf_cla_no_year['test_score'].std())
print('Kappa mean (train):', result_rf_cla_no_year['train_score'].mean())
print('Kappa STD (train):', result_rf_cla_no_year['test_score'].std())

Output of RandomForest on Organicity (with Year):
Kappa mean (test): 0.9421191732008743
Kappa STD (test): 0.008065257595597336
Kappa mean (train): 0.9457012874394032
Kappa STD (train): 0.008065257595597336


#### What about Region?

In [19]:
features_cla_with_region = features_cla_removed .copy()
features_cla_with_region.append('region')
features_cla_with_region

['AveragePrice',
 'Total Volume',
 '4046',
 '4225',
 '4770',
 'Total Bags',
 'Small Bags',
 'Large Bags',
 'XLarge Bags',
 'Month',
 'region']

In [20]:
pipe_rf_cla = pipeline.Pipeline([
    ('scaling', preprocessing.StandardScaler()),
    ('model', ensemble.RandomForestClassifier(max_depth=5))
])
xval = model_selection.KFold(10, shuffle=True)

scorer = metrics.make_scorer(metrics.cohen_kappa_score)
result_rf_cla_no_year = model_selection.cross_validate(pipe_rf_cla, cla_df[features_cla_with_region], cla_df['type'], cv=xval, scoring=scorer, return_estimator=True, return_train_score=True)

print('Output of RandomForest on Organicity (with Region):')
print('Kappa mean (test):', result_rf_cla_no_year['test_score'].mean())
print('Kappa STD (test):', result_rf_cla_no_year['test_score'].std())
print('Kappa mean (train):', result_rf_cla_no_year['train_score'].mean())
print('Kappa STD (train):', result_rf_cla_no_year['test_score'].std())

Output of RandomForest on Organicity (with Region):
Kappa mean (test): 0.9578966200044103
Kappa STD (test): 0.004889872619771657
Kappa mean (train): 0.9624937538605909
Kappa STD (train): 0.004889872619771657


#### With both of them?

In [22]:
features_cla_both = cla_df.columns.tolist()
features_cla_both.remove('type')               # delete target column of 'type'
features_cla_both

['AveragePrice',
 'Total Volume',
 '4046',
 '4225',
 '4770',
 'Total Bags',
 'Small Bags',
 'Large Bags',
 'XLarge Bags',
 'year',
 'region',
 'Month']

In [23]:
pipe_rf_cla = pipeline.Pipeline([
    ('scaling', preprocessing.StandardScaler()),
    ('model', ensemble.RandomForestClassifier(max_depth=5))
])
xval = model_selection.KFold(10, shuffle=True)

scorer = metrics.make_scorer(metrics.cohen_kappa_score)
result_rf_cla_no_year = model_selection.cross_validate(pipe_rf_cla, cla_df[features_cla_both], cla_df['type'], cv=xval, scoring=scorer, return_estimator=True, return_train_score=True)

print('Output of RandomForest on Organicity (with both):')
print('Kappa mean (test):', result_rf_cla_no_year['test_score'].mean())
print('Kappa STD (test):', result_rf_cla_no_year['test_score'].std())
print('Kappa mean (train):', result_rf_cla_no_year['train_score'].mean())
print('Kappa STD (train):', result_rf_cla_no_year['test_score'].std())

Output of RandomForest on Organicity (with both):
Kappa mean (test): 0.9554850060512188
Kappa STD (test): 0.006033030579584947
Kappa mean (train): 0.9608132800616891
Kappa STD (train): 0.006033030579584947
