In [1]:
import pandas as pd
import xgboost as xgb
import numpy as np
import itertools
import json
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder



In [2]:
# Load the raw dataset into df.
# NOTE(review): 'syn_dasta.csv' looks like it could be a typo for
# 'syn_data.csv' -- confirm against the file name on disk before changing it.
df = pd.read_csv('syn_dasta.csv')

# LabelEncoder object: le
le = LabelEncoder()

# Integer-code the species names (code i maps back to le.classes_[i]).
# The encoded array is assigned directly: wrapping it in a DataFrame first
# makes the assignment index-aligned, which only works by accident when df
# has the default 0..n-1 index and would otherwise silently produce NaNs.
df['proxy_species'] = le.fit_transform(df['species'])

In [3]:
# Create OneHotEncoder: ohe
# The encoder is left at its default (sparse) output and the result is
# densified with .toarray() below. The `sparse=False` keyword was renamed to
# `sparse_output` in newer scikit-learn releases, so passing it ties the
# notebook to one library version; the default + .toarray() works on all.
ohe = OneHotEncoder()

# df_encoded: one indicator column per species code, rows in the same order as df
df_encoded = pd.DataFrame(
    ohe.fit_transform(df['proxy_species'].values.reshape(-1, 1)).toarray()
)

# Renaming columns: indicator column i corresponds to label code i, i.e. to
# le.classes_[i]
df_encoded.columns = le.classes_

# print() with a single argument behaves identically on Python 2 and Python 3,
# unlike the Python 2-only print statement.
print(le.classes_)

# Joining df_encoded to df (aligned on the row index; the indicator columns
# end up to the right of the original columns)
df = df.join(df_encoded)

# Dropping proxy_species and species: the indicator columns replace them
df = df.drop(['species', 'proxy_species'], axis=1)

['African elephant  ' 'African giant pouched rat' 'Arctic Fox  '
 'Asian elephant  ' 'Baboon   ' 'Big brown bat ' 'Brazilian tapir  '
 'Cat   ' 'Chimpanzee   ' 'Chinchilla   ']


In [4]:
# df_encoded.head()

In [5]:
# encoder_list = zip(range(len(le.classes_)), le.classes_)
# data = json.dumps({key: value for (key, value) in encoder_list})
# with open("encoder.json","w") as f:
#   f.write(data)

In [6]:
# age = 2.1
# species = 'African giant pouched rat'

# with open("encoder.json", "r") as f:
#     data = json.loads(f.read())
    
# cols = ['age']

# colvals = [age]

# for key in data:
#     cols.append(str(data[key]))
#     if data[key] == species:
#         colvals.append(1)
#     else:
#         colvals.append(0)
        
# if species not in cols:
#     return 'unknown species'

# d = pd.DataFrame(columns=cols)

# d.loc[0] = colvals


In [7]:
# Separating features and target variable.
# The one-hot species columns were joined onto the right-hand side of df, so
# the last column of df is a species indicator, not the target. Taking
# df.iloc[:, -1] would train the model to predict that indicator and leak the
# real target into the features. Instead, take the last column that is NOT a
# species indicator.
# NOTE(review): this assumes the target is the last non-species column of the
# CSV (which is what the original positional split implies) -- confirm.
encoded_cols = set(df_encoded.columns)
original_cols = [col for col in df.columns if col not in encoded_cols]
if not original_cols:
    raise ValueError("no non-species column left in df to use as the target")
target_col = original_cols[-1]
x, y = df.drop(target_col, axis=1), df[target_col]

# Splitting into train and test. Only the (currently disabled) hold-out
# evaluation uses these; the grid search below runs on the full data.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=111)

# Creating DMatrix from the full dataset; xgb.cv builds its own folds from it
df_dm = xgb.DMatrix(data=x, label=y)
# df_dm = xgb.DMatrix(data=x_train, label=y_train)
# df_dm_test = xgb.DMatrix(data=x_test)

# parameter dictionary: params (the tuned entries are filled in per grid point)
params = {"objective":"reg:linear"}

# Hyper-parameter grid: learning rate, L2 regularisation, tree depth and
# row subsampling ratio
eta_vals = [0.001, 0.01, 0.1]
reg_params = [1, 10]
max_depths = [1, 2, 5]
subsamples = [0.3, 0.6, 0.9]

# Every combination of the grid values, as (eta, lambda, max_depth, subsample)
list_of_params = [eta_vals, reg_params, max_depths, subsamples]
params_vary = list(itertools.product(*list_of_params))

# Number of grid points; print() with one argument works on Python 2 and 3
print(len(params_vary))

best_rmse = []

# Systematically vary params
for eta_val, reg_lambda, depth, subsample_frac in params_vary:

    params.update({
        "eta": eta_val,
        "lambda": reg_lambda,
        "max_depth": depth,
        "subsample": subsample_frac,
    })

    # Perform cross-validation
    cv_results = xgb.cv(dtrain=df_dm, params=params, nfold=10, num_boost_round=100, early_stopping_rounds=25, metrics="rmse", as_pandas=True, seed=123)

    # Append the RMSE from the last row of the returned CV history
    best_rmse.append(cv_results["test-rmse-mean"].iloc[-1])

# One row per grid point: the parameter tuple and its cross-validated RMSE
comb = pd.DataFrame(list(zip(params_vary, best_rmse)),columns=["params","best_rmse"])

54


In [8]:
# Unpack the parameter tuple of the grid point with the lowest CV RMSE
eta, lam, max_depth, subsample = comb.loc[comb['best_rmse'].idxmin(), 'params']

In [9]:
# print eta

In [10]:
# Final training parameters: fixed objective plus the selected grid values
params = dict([
    ("objective", "reg:linear"),
    ("eta", eta),
    ("lambda", lam),
    ("max_depth", max_depth),
    ("subsample", subsample),
])

In [11]:
# Fit the final booster (xg_reg) on the full DMatrix with the chosen params
xg_reg = xgb.train(params, df_dm, num_boost_round=100)

# Persist the fitted booster to disk
xg_reg.save_model('trained.model')

In [13]:
# # load model
# bst = xgb.Booster()  # init model
# bst.load_model('trained.model')  # load data

# # Predict the labels of the test set: preds
# preds = bst.predict(df_dm_test)

# # Compute and print the RMSE
# rmse = np.sqrt(mean_squared_error(y_test,preds))
# print("RMSE: %f" % (rmse))