In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb

In [2]:
train_data = pd.read_csv('/content/XGB_train.csv', index_col = 0)
test_data = pd.read_csv('/content/XGB_test.csv')

In [3]:
train_data.head()

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500
3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500
4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,140000
5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,,,,0,12,2008,WD,Normal,250000


In [4]:
#Feature Engineering

In [5]:
X = train_data.select_dtypes(include = ['number']).copy().drop(columns = 'SalePrice')
y = train_data['SalePrice']

In [6]:
X.columns

Index(['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond',
       'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',
       'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces',
       'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
       'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal',
       'MoSold', 'YrSold'],
      dtype='object')

In [7]:
X['n_bathrooms'] = X['BsmtFullBath'] + (X['BsmtFullBath'] * 0.5) + X['FullBath'] + (X['FullBath'] * 0.5)

X['area_with_basement'] = X['GrLivArea'] + X['TotalBsmtSF']

In [8]:
X.shape

(1460, 38)

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1)

In [11]:
### Model Train

In [12]:
xgb_r = xgb.XGBRegressor()

In [27]:
from sklearn.model_selection import GridSearchCV

In [14]:
param_grid = {
    'max_depth':[4, 5, 6],
    'n_estimators':[500, 600, 700],
    'learning_rate':[0.01, 0.015]
}

In [15]:
search = GridSearchCV(xgb_r, param_grid, cv = 5).fit(X_train, y_train)

print("The best hyperparameters are ", search.best_params_)

The best hyperparameters are  {'learning_rate': 0.015, 'max_depth': 5, 'n_estimators': 600}


In [16]:
regressor = xgb.XGBRegressor(learning_rate = search.best_params_['learning_rate'],
                             n_estimators = search.best_params_['n_estimators'],
                             max_depth = search.best_params_['max_depth'])

regressor.fit(X_train, y_train)
y_pred = regressor.predict(X_test)

In [17]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

def metrics(y_test, y_pred):
  mse = mean_squared_error(y_test, y_pred)
  mae = mean_absolute_error(y_test, y_pred)
  rmse = np.sqrt(mse)
  r2 = r2_score(y_test, y_pred)

  print('MSE = ', mse)
  print('RMSE = ', rmse)
  print('MAE = ', mae)
  print('r2 = ', r2)

In [18]:
metrics(y_test, y_pred)

MSE =  1582734453.5878878
RMSE =  39783.5952823257
MAE =  18374.108893407534
r2 =  0.8455784674232467


In [19]:
import lightgbm as lgb

In [20]:
hyper_params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': ['l1','l2'],
    'learning_rate': 0.005,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.7,
    'bagging_freq': 10,
    'verbose': 0,
    "max_depth": 8,
    "num_leaves": 128,
    "max_bin": 512,
    "num_iterations": 100000
}

In [21]:
gbm = lgb.LGBMRegressor(**hyper_params)

In [None]:
gbm.fit(X_train, y_train,
        #eval_set=[(X_test, y_test)],
        #eval_metric='l1',

        )

In [25]:
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration_)



In [26]:
metrics(y_test, y_pred)

MSE =  1438653632.5920978
RMSE =  37929.58782523345
MAE =  18705.29122576856
r2 =  0.8596358989416232


In [None]:
### Walmart

In [None]:
walmart = pd.read_csv('/content/Walmart.csv', parse_dates=True)

In [None]:
walmart.head()

In [None]:
walmart['Date'] = pd.to_datetime(walmart['Date'])

In [None]:
walmart.info()

In [None]:
walmart['month'] = walmart['Date'].dt.month

In [None]:
walmart['day'] = walmart['Date'].dt.day

In [None]:
walmart['year'] = walmart['Date'].dt.year

In [None]:
walmart.head()

In [None]:
def encode(data, col, max_val):
  data[col + '_sin'] = np.sin(2 * np.pi * data[col]/max_val)
  data[col + '_cos'] = np.cos(2 * np.pi * data[col]/max_val)
  return data

In [None]:
data = encode(walmart, 'day', 31)
data = encode(walmart, 'month', 12)

In [None]:
data.head()

In [None]:
data.plot.scatter('day_sin', 'day_cos').set_aspect('equal')

In [None]:
data.plot.scatter('month_sin', 'month_cos').set_aspect('equal')