In [26]:
import pandas as pd
import numpy as np

# Model Selection
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold
from sklearn.preprocessing import MinMaxScaler

# Models
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

In [3]:
data = pd.read_csv('dataset/train.csv')
data

Unnamed: 0,ID,target,48df886f9,0deb4b6a8,34b15f335,a8cb14b00,2f0771a37,30347e683,d08d1fbe3,6ee66e115,...,3ecc09859,9281abeea,8675bec0b,3a13ed79a,f677d4d13,71b203550,137efaa80,fb36b89d9,7e293fbaf,9fc776466
0,000d6aaf2,38000000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
1,000fbd867,600000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
2,0027d6b71,10000000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
3,0028cbf45,2000000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
4,002a68644,14400000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4454,ff85154c8,1065000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
4455,ffb6b3f4f,48000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,80000.0,0,0,0,0,0,0,0
4456,ffcf61eb6,2800000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
4457,ffea67e98,10000000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0


In [4]:
data.shape

(4459, 4993)

In [5]:
# Data Cleaning
# Will do the same operations done in the previous notebook

In [6]:
# Removing constant columns (columns that have transactions/data entries whose data deviates to 0)
colsToRemove = []

for col in data.columns:
    if col != 'ID' and col != 'target':
        if data[col].std() == 0: 
            colsToRemove.append(col)
        
# remove constant columns in the training set
data.drop(colsToRemove, axis=1, inplace=True)

print("Removed `{}` Constant Columns\n".format(len(colsToRemove)))
print(colsToRemove)

Removed `256` Constant Columns

['d5308d8bc', 'c330f1a67', 'eeac16933', '7df8788e8', '5b91580ee', '6f29fbbc7', '46dafc868', 'ae41a98b6', 'f416800e9', '6d07828ca', '7ac332a1d', '70ee7950a', '833b35a7c', '2f9969eab', '8b1372217', '68322788b', '2288ac1a6', 'dc7f76962', '467044c26', '39ebfbfd9', '9a5ff8c23', 'f6fac27c8', '664e2800e', 'ae28689a2', 'd87dcac58', '4065efbb6', 'f944d9d43', 'c2c4491d5', 'a4346e2e2', '1af366d4f', 'cfff5b7c8', 'da215e99e', '5acd26139', '9be9c6cef', '1210d0271', '21b0a54cb', 'da35e792b', '754c502dd', '0b346adbd', '0f196b049', 'b603ed95d', '2a50e001c', '1e81432e7', '10350ea43', '3c7c7e24c', '7585fce2a', '64d036163', 'f25d9935c', 'd98484125', '95c85e227', '9a5273600', '746cdb817', '6377a6293', '7d944fb0c', '87eb21c50', '5ea313a8c', '0987a65a1', '2fb7c2443', 'f5dde409b', '1ae50d4c3', '2b21cd7d8', '0db8a9272', '804d8b55b', '76f135fa6', '7d7182143', 'f88e61ae6', '378ed28e0', 'ca4ba131e', '1352ddae5', '2b601ad67', '6e42ff7c7', '22196a84c', '0e410eb3d', '992e6d1d3', '90a7

In [7]:
data.shape

(4459, 4737)

## Dimension Reduction

Will attempt a variety of techniques and perform modeling. The purpose of this is to project the high number of features onto a low dimension dataset so as to aid in modeling and avoid a number of issues.


The best performing dimension reduction technique will be the go to


### PCA

PCA is a linear dimensionality reductiion technique. It transforms a set of features into a smaller set of features, while retaining as much of variation in the data as the previous dataset.

The algorithm works by checking for correlation in the features, and any highly correlated features are combined, and the process is repeated until the 'principal components are found'



In [8]:
# Data splitting

drop_cols = ['ID', 'target']

X = data.drop(drop_cols, axis = 1)

X



Unnamed: 0,48df886f9,0deb4b6a8,34b15f335,a8cb14b00,2f0771a37,30347e683,d08d1fbe3,6ee66e115,20aa07010,dc5a8f1d8,...,3ecc09859,9281abeea,8675bec0b,3a13ed79a,f677d4d13,71b203550,137efaa80,fb36b89d9,7e293fbaf,9fc776466
0,0.0,0,0.0,0,0,0,0,0,0.0,0.0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
1,0.0,0,0.0,0,0,0,0,0,2200000.0,0.0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
2,0.0,0,0.0,0,0,0,0,0,0.0,0.0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
3,0.0,0,0.0,0,0,0,0,0,0.0,0.0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
4,0.0,0,0.0,0,0,0,0,0,2000000.0,0.0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4454,0.0,0,0.0,0,0,0,0,0,70000.0,0.0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
4455,0.0,0,0.0,0,0,0,0,0,375000.0,0.0,...,0.0,0.0,80000.0,0,0,0,0,0,0,0
4456,0.0,0,0.0,0,0,0,0,0,0.0,0.0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
4457,0.0,0,0.0,0,0,0,0,0,0.0,0.0,...,0.0,0.0,0.0,0,0,0,0,0,0,0


In [9]:
y = np.log1p(data['target'].values)
y

array([17.45309674, 13.3046866 , 16.11809575, ..., 14.84513033,
       16.11809575, 16.81124288])

In [10]:
# The X dataset will be reduced to 6 components

from sklearn.decomposition import PCA

pca = PCA(n_components=6)
pca.fit(X)
pca_samples = pca.transform(X)

In [11]:
pca_samples

array([[-15245717.01152265,   -549759.08425384,   -363804.23305556,
          -808035.2556682 ,   -446984.25835245,   -340935.77239863],
       [-21474297.23987216,   -701261.08363575,   -399587.18435009,
          -856127.92434564,   -483144.93723169,   -302578.1800781 ],
       [-29071156.46069926,   -748901.00660913,   -471447.55167374,
          -936413.44890905,   -487209.38034237,   -330520.85600716],
       ...,
       [ 31705332.49484184,   -896473.14865579,   -573040.47186668,
          -931254.52278825,   -837590.4431572 ,   -455781.97507371],
       [-27472957.34981967,   -747602.85345935,   -471903.2275619 ,
          -937413.84475503,   -504634.93314937,   -314318.61659796],
       [  -386989.72927109,   -935713.29436082,   -559531.9472478 ,
          -766682.65006733,   -667338.56965426,   -457139.56550334]])

In [12]:
data_PCA = pd.DataFrame(pca_samples)
data_PCA

Unnamed: 0,0,1,2,3,4,5
0,-1.524572e+07,-549759.084254,-363804.233056,-808035.255668,-446984.258352,-340935.772399
1,-2.147430e+07,-701261.083636,-399587.184350,-856127.924346,-483144.937232,-302578.180078
2,-2.907116e+07,-748901.006609,-471447.551674,-936413.448909,-487209.380342,-330520.856007
3,-3.082328e+07,-742542.892412,-465596.446288,-943410.269433,-501543.943795,-309822.503177
4,-2.621677e+07,-724835.222689,-474196.142496,-932446.172349,-517663.209619,-312166.595312
...,...,...,...,...,...,...
4454,-2.859725e+05,-682862.844529,-428164.160678,-836029.047545,-466338.331503,-331865.200047
4455,-2.997172e+07,-741339.930405,-466549.323759,-932944.918728,-492062.635183,-302559.184414
4456,3.170533e+07,-896473.148656,-573040.471867,-931254.522788,-837590.443157,-455781.975074
4457,-2.747296e+07,-747602.853459,-471903.227562,-937413.844755,-504634.933149,-314318.616598


Will proceed to do modelling as previous notebook, but this time with reduced data

In [13]:
data_PCA.shape, y.shape

((4459, 6), (4459,))

In [14]:
X_train, X_valid, y_train, y_valid = train_test_split(data_PCA, y, test_size = 0.2, random_state = 0)

In [15]:
X_train.shape, X_valid.shape

((3567, 6), (892, 6))

## Linear Regression

In [16]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso

In [17]:
reg = LinearRegression()
reg.fit(X_train, y_train)

y_pred = reg.predict(X_valid)

rmse = np.sqrt(metrics.mean_squared_error(y_valid, y_pred))
rmse

1.8443474239804085

#### The RMSE reduced to 1.8 after PCA, from trillions before PCA!!!!

## Ridge Regression

In [18]:
## Ridge Regression
ridge = Ridge(alpha = 10, solver = 'cholesky')
ridge.fit(X_train, y_train)

y_pred_ridge = ridge.predict(X_valid)

rmse = np.sqrt(metrics.mean_squared_error(y_valid, y_pred_ridge))
rmse


1.8443474239804087

Ridge Regression RMSE is 1.84, same as linear regression

In [19]:
ridge = Ridge(alpha = 1, solver = 'cholesky')
ridge.fit(X_train, y_train)

y_pred_ridge = ridge.predict(X_valid)

rmse = np.sqrt(metrics.mean_squared_error(y_valid, y_pred_ridge))
rmse


1.8443474239804087

No difference in RMSE with an alpha of 1

In [20]:
ridge = Ridge(alpha = 0.1, solver = 'cholesky')
ridge.fit(X_train, y_train)

y_pred_ridge = ridge.predict(X_valid)

rmse = np.sqrt(metrics.mean_squared_error(y_valid, y_pred_ridge))
rmse

1.8443474239804087

## Lasso regression

In [21]:
# lasso 
# alpha is 0.1
lass = Lasso(alpha = 0.1)
lass.fit(X_train, y_train)

y_pred_lass = lass.predict(X_valid)

rmse = np.sqrt(metrics.mean_squared_error(y_valid, y_pred_lass))
rmse

1.8443474064053624

In [22]:
# alpha is 1.0
lass = Lasso(alpha = 1.0)
lass.fit(X_train, y_train)

y_pred_lass = lass.predict(X_valid)

rmse = np.sqrt(metrics.mean_squared_error(y_valid, y_pred_lass))
rmse

1.8443472480915546

The RMSE is still 1.844

## Support Vector Machines. (SVM)

In [23]:
# SVR with a kernel of rbf
svr = SVR(kernel = 'rbf')
svr.fit(X_train, y_train)

y_pred_svr = svr.predict(X_valid)

rmse = np.sqrt(metrics.mean_squared_error(y_valid, y_pred_svr))
rmse

1.6457334365471676

The RMSE is 1.645, improvement over the linear models

## Decision Trees

In [25]:
# Regression using decision trees

dt = DecisionTreeRegressor()
dt.fit(X_train, y_train)

y_pred_dt = dt.predict(X_valid)

rmse = np.sqrt(metrics.mean_squared_error(y_valid, y_pred_dt))
rmse

2.288371694580132

The RMSE using decision trees is higher

## Random Forests

In [28]:
# Regression using random forest

forest = RandomForestRegressor()

forest.fit(X_train, y_train)

y_pred_forest = forest.predict(X_valid)

rmse = np.sqrt(metrics.mean_squared_error(y_valid, y_pred_forest))
rmse

1.6255186069734835

In [27]:
# Regression using random forest

forest = RandomForestRegressor(n_estimators=300, max_depth=6, min_samples_split=30, random_state=30)

forest.fit(X_train, y_train)

y_pred_forest = forest.predict(X_valid)

rmse = np.sqrt(metrics.mean_squared_error(y_valid, y_pred_forest))
rmse

1.5729854199525097

The RMSE of random forest is the best one so far, at 1.5.

Thats an improvement over the default parameters, where we increased the no of n_estimators, min sample split to 30 and max depth to 6

## Light Gradient Boost Modeling (LGBM)

In [25]:
# Will include test data and perform some operations

data2 = pd.read_csv('dataset/test.csv')
data2


Unnamed: 0,ID,48df886f9,0deb4b6a8,34b15f335,a8cb14b00,2f0771a37,30347e683,d08d1fbe3,6ee66e115,20aa07010,...,3ecc09859,9281abeea,8675bec0b,3a13ed79a,f677d4d13,71b203550,137efaa80,fb36b89d9,7e293fbaf,9fc776466
0,000137c73,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,00021489f,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0004d7953,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,00056a333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,00056d8eb,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49337,fff73b677,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
49338,fff7b5923,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
49339,fff7c698f,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
49340,fff8dba89,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [26]:
data2.shape

(49342, 4992)

In [27]:
colsToRemove2 = []

for col in data2.columns:
    if col != 'ID':
        if data2[col].std() == 0: 
            colsToRemove2.append(col)
        
# remove constant columns in the training set
data2.drop(colsToRemove2, axis=1, inplace=True)

print("Removed `{}` Constant Columns\n".format(len(colsToRemove2)))
print(colsToRemove2)

Removed `0` Constant Columns

[]


In [29]:
data2.shape

(49342, 4992)

In [30]:
data2 = data2.drop(['ID'], axis = 1)
data2

Unnamed: 0,48df886f9,0deb4b6a8,34b15f335,a8cb14b00,2f0771a37,30347e683,d08d1fbe3,6ee66e115,20aa07010,dc5a8f1d8,...,3ecc09859,9281abeea,8675bec0b,3a13ed79a,f677d4d13,71b203550,137efaa80,fb36b89d9,7e293fbaf,9fc776466
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
49338,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
49339,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
49340,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [29]:
# pca = PCA(n_components=6)
# pca.fit(data2)
# pca_samples2 = pca.transform(data2)

In [None]:
X_test_PCA = pd.DataFrame(pca_samples2)
X_test_PCA

In [35]:
from sklearn import preprocessing, model_selection, metrics
import lightgbm as lgb

In [40]:
#Defining LGMB model

def run_lgb(X_train, y_train, X_valid, y_valid, X_test_PCA):
    params = {
        "objective" : "regression",
        "metric" : "rmse",
        #"num_leaves" : 80,
        "learning_rate" : 0.001,
        "bagging_fraction" : 0.7,
        "feature_fraction" : 0.7,
        "bagging_frequency" : 5,
        "bagging_seed" : 42,
        "verbosity" : -1
    }
    
    lgtrain = lgb.Dataset(X_train, label = y_train)
    lgval   = lgb.Dataset(X_valid,   label = y_valid)
    
    evals_result = {}
    
    model = lgb.train(params, lgtrain, 1000, valid_sets=[lgval], early_stopping_rounds=100, verbose_eval=200, evals_result=evals_result)
    
    pred_test = model.predict(X_test_PCA, num_iteration=model.best_iteration)
    
    return pred_test, model, evals_result

In [41]:
import warnings
warnings.filterwarnings('ignore')
pred_test_LGBM, model_LGBM, evals_result_LGBM = run_lgb(X_train, y_train, X_valid, y_valid, X_test_PCA)

Training until validation scores don't improve for 100 rounds
[200]	valid_0's rmse: 1.67474
[400]	valid_0's rmse: 1.64702
[600]	valid_0's rmse: 1.62811
[800]	valid_0's rmse: 1.61482
[1000]	valid_0's rmse: 1.60462
Did not meet early stopping. Best iteration is:
[1000]	valid_0's rmse: 1.60462


#### After PCA, the LGB model's RMSE is 1.6. Without PCA it was 1.4

## XGBoost

In [53]:
import xgboost as xgb

In [60]:

def run_xgb(X_train, y_train, X_valid, y_valid, X_test_PCA):
    params = {'objective': 'reg:linear', 
          'eval_metric': 'rmse',
          'eta': 0.005,
          'max_depth': 15, 
          'subsample': 0.7, 
          'colsample_bytree': 0.5,
          'alpha':0,
          'random_state': 42, 
          'silent': True}
    
    tr_data = xgb.DMatrix(X_train, y_train)
    va_data = xgb.DMatrix(X_valid, y_valid)
    
    watchlist = [(tr_data, 'train'), (va_data, 'valid')]
    
    model_xgb = xgb.train(params, tr_data, 2000, watchlist, maximize=False, early_stopping_rounds = 30, verbose_eval=100)
    
    dtest = xgb.DMatrix(X_test_PCA)
    xgb_pred_y = np.expm1(model_xgb.predict(dtest, ntree_limit=model_xgb.best_ntree_limit))
    
    return xgb_pred_y, model_xgb

In [61]:
warnings.filterwarnings('ignore')
pred_test_xgb, model_xgb = run_xgb(X_train, X_valid, y_train, y_valid, X_test_PCA)
print("XGB Training Completed...")

XGBoostError: [15:59:36] /Users/runner/miniforge3/conda-bld/xgboost-split_1667849653518/work/src/data/data.cc:455: Check failed: this->labels.Size() % this->num_row_ == 0 (1785 vs. 0) : Incorrect size for labels.
Stack trace:
  [bt] (0) 1   libxgboost.dylib                    0x00000001217d7fc4 dmlc::LogMessageFatal::~LogMessageFatal() + 116
  [bt] (1) 2   libxgboost.dylib                    0x00000001218753dd xgboost::MetaInfo::SetInfoFromHost(xgboost::GenericParameter const&, xgboost::StringView, xgboost::Json) + 4301
  [bt] (2) 3   libxgboost.dylib                    0x00000001218741df xgboost::MetaInfo::SetInfo(xgboost::GenericParameter const&, xgboost::StringView, xgboost::StringView) + 159
  [bt] (3) 4   libxgboost.dylib                    0x00000001217ef2c9 XGDMatrixSetInfoFromInterface + 249
  [bt] (4) 5   libffi.7.dylib                      0x000000010c87aead ffi_call_unix64 + 85
  [bt] (5) 6   ???                                 0x00007ff7b3b991e0 0x0 + 140701848932832



In [24]:
X_test_PCA

NameError: name 'X_test_PCA' is not defined