In [38]:
#Import libraries
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeRegressor
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder

#Import dataset
df = pd.read_csv('dataset/final_df.csv', sep = ';', index_col = 0)


In [39]:
#2016 and 2017 will be used to test
test_2016 = df.loc[df['year'] == 2016]
test_2017 = df.loc[df['year'] == 2017]

test = pd.concat([test_2016, test_2017])

train = df[~df.year.isin([2016,2017])]

#Adjust the model

X_train = train.iloc[:,1:5].values
y_train = train.iloc[:,0].values

X_test = test.iloc[:,1:5].values
y_test = test.iloc[:,0].values

label_encoder_city_code = LabelEncoder()
label_encoder_product = LabelEncoder()
label_encoder_product_type = LabelEncoder()

X_train[:,0] = label_encoder_city_code.fit_transform(X_train[:,0])
X_train[:,1] = label_encoder_product.fit_transform(X_train[:,1])
X_train[:,2] = label_encoder_product_type.fit_transform(X_train[:,2])

X_test[:,0] = label_encoder_city_code.fit_transform(X_test[:,0])
X_test[:,1] = label_encoder_product.fit_transform(X_test[:,1])
X_test[:,2] = label_encoder_product_type.fit_transform(X_test[:,2])

regressor_tree = DecisionTreeRegressor()
regressor_tree.fit(X_train, y_train)
regressor_tree.score(X_test, y_test)

#Predict the values
predictions = regressor_tree.predict(X_test)

#Insert values into dataframe
test['predicted_area'] = predictions.tolist()
test.loc[:,'destinated_area'] = test.loc[:,'destinated_area'].map(lambda x: 0 if x < 0 else x)

In [40]:
def wmape(df):
    real = df.iloc[:,0].values
    predict = df.iloc[:,5].values
    return abs(real-predict).sum()/real.sum()

In [41]:
#Calculate the error
(wmape(test.loc[test['product_type'] == 'pasture']) + \
wmape(test.loc[test['product_type'] == 'temporary']) + \
wmape(test.loc[test['product_type'] == 'permanent']))/3

0.338518714530131

In [42]:
wmape(test.loc[test['product_type'] == 'pasture'])

0.03795314030092393

In [43]:
wmape(test.loc[test['product_type'] == 'temporary'])

0.6421681617522302

In [44]:
wmape(test.loc[test['product_type'] == 'permanent'])

0.33543484153723896

In [45]:
import pandas
import random

#Set seed
random.seed(10)

#Import dataset
dataset = pd.read_csv('dataset/dataframe_after_exploratory_analysis_and_cleaning_data_1.csv',index_col = 0, sep = ';')

#Cross Validation

#Separe the test dataset
auxiliar_test = dataset.loc[dataset.year.isin([2016,2017])]
dataset_without_2016_2017 = dataset.loc[~dataset.year.isin([2016,2017])]
temporary = auxiliar_test.loc[auxiliar_test.product_type == 'temporary']
permanent = auxiliar_test.loc[auxiliar_test.product_type == 'permanent']
pasture = auxiliar_test.loc[auxiliar_test.product_type == 'pasture']

#Temporary test
temporary_without_test_cases = pd.DataFrame([], columns = ['destinated_area', 'city_code', 'product', 'product_type','year'])
temporary_test_cases = pd.DataFrame([], columns = ['destinated_area', 'city_code', 'product', 'product_type','year'])
for year,sub_df in temporary.groupby('year'):
    position = random.sample([i for i in range(len(sub_df))],int(len(sub_df)*(0.05)))
    temporary_without_test_cases = pd.concat([temporary_without_test_cases,sub_df.iloc[list(set(range(len(sub_df)))-set(position)),:]])
    temporary_test_cases = pd.concat([temporary_test_cases,sub_df.iloc[position,:]])

#Permanent test
permanent_without_test_cases = pd.DataFrame([], columns = ['destinated_area', 'city_code', 'product', 'product_type','year'])
permanent_test_cases = pd.DataFrame([], columns = ['destinated_area', 'city_code', 'product', 'product_type','year'])
for year,sub_df in permanent.groupby('year'):
    position = random.sample([i for i in range(len(sub_df))],int(len(sub_df)*(0.05)))
    permanent_without_test_cases = pd.concat([permanent_without_test_cases,sub_df.iloc[list(set(range(len(sub_df)))-set(position)),:]])
    permanent_test_cases = pd.concat([permanent_test_cases,sub_df.iloc[position,:]])

#Pasture test
pasture_without_test_cases = pd.DataFrame([], columns = ['destinated_area', 'city_code', 'product', 'product_type','year'])
pasture_test_cases = pd.DataFrame([], columns = ['destinated_area', 'city_code', 'product', 'product_type','year'])
for year,sub_df in pasture.groupby('year'):
    position = random.sample([i for i in range(len(sub_df))],int(len(sub_df)*(0.05)))
    pasture_without_test_cases = pd.concat([pasture_without_test_cases,sub_df.iloc[list(set(range(len(sub_df)))-set(position)),:]])
    pasture_test_cases = pd.concat([pasture_test_cases,sub_df.iloc[position,:]])

test = pd.concat([temporary_test_cases,permanent_test_cases,pasture_test_cases])
dataset = pd.concat([temporary_without_test_cases,permanent_without_test_cases,pasture_without_test_cases,dataset_without_2016_2017])

#Split dataset into 10 sub_dataset
temporary = dataset.loc[dataset.product_type == 'temporary']
permanent = dataset.loc[dataset.product_type == 'permanent']
pasture = dataset.loc[dataset.product_type == 'pasture']
size = int(len(dataset)*0.1)
df = []
for _ in range(10):
    df.append(pd.DataFrame([], columns = ['destinated_area', 'city_code', 'product', 'product_type','year']))
for product_type,sub_df_1 in dataset.groupby('product_type'):
    for year,sub_df_2 in sub_df_1.groupby('year'):
        positions_splited = []
        all_positions = [i for i in range(len(sub_df_2))]
        chosen_positions = set()
        for _ in range(9):
            aux = random.sample(list(set(all_positions)-chosen_positions),int(len(sub_df_2)*0.10))
            positions_splited.append(aux)
            chosen_positions = chosen_positions.union(aux)
        positions_splited.append(list(set(all_positions)-chosen_positions))
        for i in range(10):
            df[i] = pd.concat([df[i],sub_df_2.iloc[positions_splited[i],:]])
            dataset = pd.concat([dataset,sub_df_2.iloc[list(set(range(len(sub_df_2)))-set(positions_splited[i])),:]])

In [46]:
def wmape(df):
    real = df.iloc[:,0].values
    predict = df.iloc[:,5].values
    return abs(real-predict).sum()/real.sum()

from sklearn.model_selection import GridSearchCV

cross_wmape = []
cross_score = []

params = {'min_samples_split': [2,3,4], 'splitter': ['best','random'],
           "max_features":["log2","sqrt",None], "max_leaf_nodes":[None,10,20,30,40,50,60,70,80,90]}

for i, value in enumerate(df):
    
    #We applied cross validation by dividing the dataset into 9 training dataset and 1 testing dataset
    train = pd.concat(df[0:i].copy() + df[i+1:].copy())
    test_cross = df[i].copy()

    X_train = train.iloc[:,1:5].values
    y_train = train.iloc[:,0].values
    X_test = test_cross.iloc[:,1:5].values
    y_test = test_cross.iloc[:,0].values

    label_encoder_city_code = LabelEncoder()
    label_encoder_product = LabelEncoder()
    label_encoder_product_type = LabelEncoder()

    X_train[:,0] = label_encoder_city_code.fit_transform(X_train[:,0])
    X_train[:,1] = label_encoder_product.fit_transform(X_train[:,1])
    X_train[:,2] = label_encoder_product_type.fit_transform(X_train[:,2])

    X_test[:,0] = label_encoder_city_code.fit_transform(X_test[:,0])
    X_test[:,1] = label_encoder_product.fit_transform(X_test[:,1])
    X_test[:,2] = label_encoder_product_type.fit_transform(X_test[:,2])

    regressor_tree = DecisionTreeRegressor()
    regressor_tree.fit(X_train, y_train)
    
    #Predict the values
    predictions = regressor_tree.predict(X_test)
    
    #Insert values into dataframe
    test_cross['predicted_area'] = predictions.tolist()
    test_cross.loc[:,'destinated_area'] = test_cross.loc[:,'destinated_area'].map(lambda x: 0 if x < 0 else x)
    
    #Calculate the w_mape
    k = (wmape(test_cross.loc[test_cross['product_type'] == 'pasture']) + \
    wmape(test_cross.loc[test_cross['product_type'] == 'temporary']) + \
    wmape(test_cross.loc[test_cross['product_type'] == 'permanent']))/3
    
    #cross_wmape is an array that shows each w_mape
    cross_wmape.append(k)
    
    #tuning the parameters
    grid_search = GridSearchCV(estimator= DecisionTreeRegressor(), param_grid = params)
    grid_search.fit(X_train,y_train)
    best_params = grid_search.best_params_
    best_score = grid_search.best_score_


In [35]:
print(cross_wmape)

[0.1263523833768869, 0.14322005242096947, 0.12894151566728215, 0.1419711736397079, 0.11845909875248477, 0.11664508803699958, 0.13648249337322485, 0.11870072940861891, 0.12700241843809154, 0.12466871332662399]
