# Models 1

## imports

In [2]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, make_scorer

from sklearn.ensemble import RandomForestRegressor

from sklearn.linear_model import SGDRegressor

from sklearn.model_selection import KFold, RandomizedSearchCV, cross_val_score
import lightgbm as lgbm
from scipy import stats


## load data

In [3]:
dfcombo = pd.read_csv('combo.csv')
dfcombo.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
zipcode,217324.0,48161.549309,27812.434636,1001.0,25315.0,47171.0,71241.0,99901.0
year,217324.0,2016.619789,2.857517,2012.0,2014.0,2017.0,2019.0,2021.0
population,217324.0,14192.464555,15809.836341,25.0,2512.0,7483.0,21630.0,130352.0
median_household_income,217324.0,60253.62238,24950.906483,2499.0,43789.75,54638.5,70286.0,250001.0
median_age,217324.0,41.236737,6.945752,15.9,36.7,40.9,45.2,84.3
median_rent,217324.0,923.939557,374.824226,99.0,671.0,821.0,1074.0,3501.0
mean_travel_time_to_work,217324.0,6207.458339,7099.169141,0.0,1035.0,3109.0,9409.0,60956.0
median_value,217324.0,212311.41693,179707.66554,9999.0,108500.0,158200.0,247500.0,2000001.0
ave_num_rooms,217324.0,5.731863,0.833573,1.3,5.3,5.7,6.2,10.0
gini,217324.0,0.424868,0.055501,0.128,0.388275,0.4209,0.4573,0.7865


In [47]:
# dfcombo.info()

Per the previous notebook, 2019, 2020, and 2021 seem to be abormal. These correspond to COVID years (where 2019 represents price changes between Dec 2019 and Dec 2020). I will add one-hot encoding for those years. (to be used in linear regression and SGD)

Also, I will make "year" a categorical column (to be used in random forest and lightGBM) 

In [4]:
years_to_encode = [2019, 2020, 2021]
for year in years_to_encode:
    dfcombo[f'year_{year}'] = (dfcombo['year'] == year).astype(int)
# dfcombo.sample(10)

In [56]:
dfcombo['year'] = dfcombo['year'].astype('category')
dfcombo['state'] = dfcombo['state'].astype('category')

* featuresA to be use in linear regression and SGD (one-hot encoding of years)
* featuresB to be used in random forest and lightGBM (year as categorixal variable)

In [50]:
featuresX = dfcombo.columns.tolist()
del featuresX[31:]
del featuresX[0:2]
featuresA = featuresX + ['year_2019', 'year_2020', 'year_2021']
featuresB = featuresX + ['year'] + ['state']
ycol='pct_next_1yr'
y = [ycol]
featuresA

['population',
 'median_household_income',
 'median_age',
 'median_rent',
 'mean_travel_time_to_work',
 'median_value',
 'ave_num_rooms',
 'gini',
 'cost_of_living_perc',
 'median_RE_tax',
 'labor_force_perc',
 'unemployed_perc',
 'bach_degr_perc',
 'masters_degr_perc',
 'peops_per_household',
 'owner_occ_perc',
 'new_units_perc',
 'families_wU18_perc',
 'poverty_perc',
 'non_families_perc',
 'vacant_perc',
 'perc_moved_fr_same_county',
 'perc_moved_fr_other_county',
 'perc_moved_fr_other_state',
 'perc_moved_fr_abroad',
 'single_fam_perc',
 'public_trans_perc',
 'foreign_born_perc',
 'male_perc',
 'year_2019',
 'year_2020',
 'year_2021']

## data for linear regression and SGD

In [48]:
X, y = dfcombo[featuresA], dfcombo[ycol]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

kf = KFold(n_splits=5, shuffle=True, random_state=42)


## linear regression

In [40]:


model = LinearRegression()


scores = cross_val_score(model, X_train_scaled, y_train, cv=kf, scoring=make_scorer(r2_score))

mean_cv_score = np.mean(scores)
std_cv_score = np.std(scores)

model.fit(X_train_scaled, y_train)

test_score = r2_score(y_test, model.predict(X_test_scaled))

print(f"Mean R^2 Score from CV: {mean_cv_score:.4f}")
print(f"Standard Deviation of R^2 Score from CV: {std_cv_score:.4f}")
print(f"R^2 Score on Test Set: {test_score:.4f}")

Mean R^2 Score from CV: 0.2835
Standard Deviation of R^2 Score from CV: 0.0028
R^2 Score on Test Set: 0.2879


In [41]:
# df_lr = dfcombo.dropna(subset=y+featuresA).copy()
# scaler=StandardScaler()
# df_lr[featuresA] = scaler.fit_transform(df_lr[featuresA])
# train, test = train_test_split(df_lr, test_size=0.2, random_state=42)
# # test, val = train_test_split(temp, test_size=0.5, random_state=42)



# X_train, y_train = train[featuresA], train[ycol]
# X_test, y_test = test[featuresA], test[ycol]
# X_val, y_val = val[featuresA], val[ycol]

# # 4. Perform linear regression
# model = LinearRegression()
# model.fit(X_train, y_train)

# # Coefficients and intercept
# # print("Coefficients:", model.coef_)
# # print("Intercept:", model.intercept_)

# # Predictions on the validation set
# y_val_pred = model.predict(X_val)

# # Performance metrics on validation set
# print("Validation Mean Squared Error:", mean_squared_error(y_val, y_val_pred))
# print("Validation R^2 Score:", r2_score(y_val, y_val_pred))

Coefficients: [ 0.02575564  0.00295885  0.0045707   0.00095735 -0.02483927 -0.00223225
 -0.00082236 -0.00100647  0.00876684 -0.01091657  0.00042327  0.00141879
  0.00428933 -0.00283555  0.00111581 -0.00177144  0.00126464  0.0021219
  0.00123232  0.00062886 -0.00199209  0.00412314  0.00116142  0.00137308
 -0.00102011  0.00331685 -0.00249056  0.00776216  0.00010761  0.01813233
  0.02755689  0.01104623]
Intercept: 0.07257931847840812
Validation Mean Squared Error: 0.002619342181344577
Validation R^2 Score: 0.29033849842290627


When I added the year dummies, the R2 went from .09 to .29

## SGDRegressor

In [45]:
# I examined the parameters, and the only one that might matter much is 'loss'.
loss_params = ['squared_error', 'huber', 'epsilon_insensitive', 'squared_epsilon_insensitive']

for param in loss_params:
    sgd_regressor = SGDRegressor(random_state=42)   # max_iter=1000, tol=1e-3, 
    scores = cross_val_score(sgd_regressor, X_train_scaled, y_train, cv=kf, scoring=make_scorer(r2_score))

    mean_cv_score = np.mean(scores)
    std_cv_score = np.std(scores)

    sgd_regressor.fit(X_train_scaled, y_train)
    test_score = r2_score(y_test, sgd_regressor.predict(X_test_scaled))

    print('loss param: ', param)
    print(f"Mean R^2 Score from CV: {mean_cv_score:.4f}")
    print(f"Standard Deviation of R^2 Score from CV: {std_cv_score:.4f}")
    print(f"R^2 Score on Test Set: {test_score:.4f}")


loss param squared_error
Mean R^2 Score from CV: 0.2795
Standard Deviation of R^2 Score from CV: 0.0035
R^2 Score on Test Set: 0.2841
loss param huber
Mean R^2 Score from CV: 0.2795
Standard Deviation of R^2 Score from CV: 0.0035
R^2 Score on Test Set: 0.2841
loss param epsilon_insensitive
Mean R^2 Score from CV: 0.2795
Standard Deviation of R^2 Score from CV: 0.0035
R^2 Score on Test Set: 0.2841
loss param squared_epsilon_insensitive
Mean R^2 Score from CV: 0.2795
Standard Deviation of R^2 Score from CV: 0.0035
R^2 Score on Test Set: 0.2841


In [44]:
# sgd_regressor = SGDRegressor(random_state=42)   # max_iter=1000, tol=1e-3, 
# sgd_regressor.fit(X_train, y_train)

# # Predictions on the validation set
# y_pred = sgd_regressor.predict(X_val)

# # Performance metrics on validation set
# print("Validation Mean Squared Error:", mean_squared_error(y_val, y_val_pred))
# print("Validation R^2 Score:", r2_score(y_val, y_val_pred))

When I added the year dummies, the R2 went from .09 to .29

## redo data for random forest and lightGMB
* because year is now handled as categorical
* also add state as categorical
* remove standard scaler?

In [57]:
X = dfcombo[featuresB]
y = dfcombo[ycol]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


ValueError: could not convert string to float: 'ID'

In [44]:
# df_lr = dfcombo.dropna(subset=y+featuresB).copy()
# scaler=StandardScaler()
# df_lr[featuresB] = scaler.fit_transform(df_lr[featuresB])
# train, temp = train_test_split(df_lr, test_size=0.2, random_state=42)
# test, val = train_test_split(temp, test_size=0.5, random_state=42)

# X_train, y_train = train[featuresB], train[ycol]
# X_test, y_test = test[featuresB], test[ycol]
# X_val, y_val = val[featuresB], val[ycol]

## random forest regressor (<span style="color:red">slow</span> ~8 minutes on laptop)

In [51]:
rf_regressor = RandomForestRegressor(random_state=42) 
                            # n_estimators=100, <- default, slow, R2=.238. with year 0.51
                            # n_estimators=10, <-         faster, R2=.154
rf_regressor.fit(X_train, y_train)

# Predictions on the validation set
y_val_pred = rf_regressor.predict(X_test)
print("Validation Mean Squared Error:", mean_squared_error(y_val, y_val_pred))
print("Validation R^2 Score:", r2_score(y_val, y_val_pred))

NameError: name 'y_val' is not defined

In [52]:
y_test_pred = rf_regressor.predict(X_test)
print("Test Mean Squared Error:", mean_squared_error(y_test, y_test_pred))
print("Test R^2 Score:", r2_score(y_test, y_test_pred))

Test Mean Squared Error: 0.0020574369614385075
Test R^2 Score: 0.4548789012534332


In [45]:
rf_regressor = RandomForestRegressor(random_state=42) 
                            # n_estimators=100, <- default, slow, R2=.238. with year 0.51
                            # n_estimators=10, <-         faster, R2=.154
rf_regressor.fit(X_train, y_train)

# Predictions on the validation set
y_val_pred = rf_regressor.predict(X_test)
print("Validation Mean Squared Error:", mean_squared_error(y_val, y_val_pred))
print("Validation R^2 Score:", r2_score(y_val, y_val_pred))

Validation Mean Squared Error: 0.0017972674753294884
Validation R^2 Score: 0.5130641791049713


## lightGBM with KFold and RandomizedSearchCV

In [48]:
X=dfcombo[featuresB].copy()
y=dfcombo[[ycol]].copy()
y.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
pct_next_1yr,217324.0,0.072615,0.061442,-0.395326,0.033453,0.064001,0.10372,0.75288


In [52]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 217324 entries, 0 to 217323
Data columns (total 30 columns):
 #   Column                      Non-Null Count   Dtype   
---  ------                      --------------   -----   
 0   population                  217324 non-null  int64   
 1   median_household_income     217324 non-null  float64 
 2   median_age                  217324 non-null  float64 
 3   median_rent                 217324 non-null  float64 
 4   mean_travel_time_to_work    217324 non-null  float64 
 5   median_value                217324 non-null  float64 
 6   ave_num_rooms               217324 non-null  float64 
 7   gini                        217324 non-null  float64 
 8   cost_of_living_perc         217324 non-null  float64 
 9   median_RE_tax               217324 non-null  float64 
 10  labor_force_perc            217324 non-null  float64 
 11  unemployed_perc             217324 non-null  float64 
 12  bach_degr_perc              217324 non-null  float64 
 13 

In [53]:
X_scaled = scaler.fit_transform(X)

# needed?
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)


param_dist_old = {'n_estimators': [50, 75, 100, 150, 200, 300],
              'learning_rate': stats.uniform(0.01, 0.2),
              'objective': ["regression","poisson", "tweedie"#"gamma"
              #, "tweedie"
              ],
              'max_depth': [None] + [x for x in range(1,12)],
              'num_leaves': [x for x in range(10, 40, 3)],
              'min_data_in_leaf': [20,100,250],
              'reg_alpha': np.linspace(0,2, num=4),
              'reg_lambda': np.linspace(0,2, num=4),
              'subsample': [0.8, 1],
              'colsample_bytree': [0.8,1],
              'boosting_type': ["gbdt","dart"],
              'random_state':[2102021],
              #'n_jobs':[5],
}

param_dist = {# 'n_estimators': [50],
              # 'learning_rate': [.15], # stats.uniform(0.05, 0.2),
              # 'objective': ["regression",    # "poisson", "tweedie"#"gamma"
              #, "tweedie"
              # ],
              # 'max_depth': [6,12],# [None] + [x for x in range(1,12)],
              'num_leaves': [31,62,127], # [x for x in range(10, 40, 3)],
              'min_data_in_leaf': [20,250],
              'reg_alpha': [.1,.5], #np.linspace(0,2, num=4),
              # 'reg_lambda': [0,2], # np.linspace(0,2, num=4),
              'lambda_l1': [0,1,1.5],
              'lambda_l2': [0,1],
              # 'subsample': [0.8, 1],
              # 'colsample_bytree': [0.8,1],
              # 'boosting_type': ["gbdt"],
              'random_state':[42],
              #'n_jobs':[5],
}


myfolds = KFold(n_splits=5)


lgbm_mod = lgbm.LGBMRegressor()
# rand_cv_hist = RandomizedSearchCV(lgbm_mod, param_dist, cv=myfolds, n_jobs=1, n_iter=1000, scoring='neg_mean_absolute_error')
rand_cv_hist = RandomizedSearchCV(lgbm_mod, param_dist, cv=myfolds, n_jobs=-1, n_iter=50, scoring='r2', random_state=42, verbose=1)

In [54]:
# import os
# os.environ['KMP_DUPLICATE_LIB_OK']='True'

rand_cv_hist.fit(X_train,y_train)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.032784 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 7225
[LightGBM] [Info] Number of data points in the train set: 139087, number of used features: 30
[LightGBM] [Info] Start training from score 0.072462
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.049280 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7226
[LightGBM] [Info] Number of data points in the train set: 139087, number of used features: 30
[LightGBM] [Info] Start training from score 0.072474
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.028070 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7224
[LightGBM] [Info] Number of data points in the train s

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.026456 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7224
[LightGBM] [Info] Number of data points in the train set: 139088, number of used features: 30
[LightGBM] [Info] Start training from score 0.072609
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.017973 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 7226
[LightGBM] [Info] Number of data points in the train set: 139087, number of used features: 30
[LightGBM] [Info] Start training from score 0.072467
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.105778 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7224
[LightGBM] [Info] Number of data points in the train s

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.023076 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7226
[LightGBM] [Info] Number of data points in the train set: 139087, number of used features: 30
[LightGBM] [Info] Start training from score 0.072467
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.058377 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7224
[LightGBM] [Info] Number of data points in the train set: 139088, number of used features: 30
[LightGBM] [Info] Start training from score 0.072609
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.028186 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 7225
[LightGBM] [Info] Number of data points in the train s

  y = column_or_1d(y, warn=True)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007139 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 7226
[LightGBM] [Info] Number of data points in the train set: 139087, number of used features: 30
[LightGBM] [Info] Start training from score 0.072474
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010720 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7226
[LightGBM] [Info] Number of data points in the train set: 139087, number of used features: 30
[LightGBM] [Info] Start training from score 0.072474
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.026650 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total 

  y = column_or_1d(y, warn=True)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006219 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 7225
[LightGBM] [Info] Number of data points in the train set: 139087, number of used features: 30
[LightGBM] [Info] Start training from score 0.072531
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.097210 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7225
[LightGBM] [Info] Number of data points in the train set: 139087, number of used features: 30
[LightGBM] [Info] Start training from score 0.072462
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.016964 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7226
[LightGBM] [Info] Number of data points in the train s

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004792 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 7225
[LightGBM] [Info] Number of data points in the train set: 139087, number of used features: 30
[LightGBM] [Info] Start training from score 0.072462
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.022718 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7224
[LightGBM] [Info] Number of data points in the train set: 139088, number of used features: 30
[LightGBM] [Info] Start training from score 0.072609
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.023562 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7226
[LightGBM] [Info] Number of data points in the train s

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.010910 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 7226
[LightGBM] [Info] Number of data points in the train set: 139087, number of used features: 30
[LightGBM] [Info] Start training from score 0.072474
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.017587 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7225
[LightGBM] [Info] Number of data points in the train set: 139087, number of used features: 30
[LightGBM] [Info] Start training from score 0.072531
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.007993 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7226
[LightGBM] [Info] Number of data points in the train s

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.023635 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7226
[LightGBM] [Info] Number of data points in the train set: 139087, number of used features: 30
[LightGBM] [Info] Start training from score 0.072467
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.059313 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7226
[LightGBM] [Info] Number of data points in the train set: 139087, number of used features: 30
[LightGBM] [Info] Start training from score 0.072474
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009135 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7224
[LightGBM] [Info] Number of data points in the train set: 139088, number of used features: 30
[LightGBM] [Info] Start 

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.016565 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7224
[LightGBM] [Info] Number of data points in the train set: 139088, number of used features: 30
[LightGBM] [Info] Start training from score 0.072609
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003178 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 7226
[LightGBM] [Info] Number of data points in the train set: 139087, number of used features: 30
[LightGBM] [Info] Start training from score 0.072467
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.008466 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7225
[LightGBM] [Info] Number of data points in the train s

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.022426 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7225
[LightGBM] [Info] Number of data points in the train set: 139087, number of used features: 30
[LightGBM] [Info] Start training from score 0.072531
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.034508 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7225
[LightGBM] [Info] Number of data points in the train set: 139087, number of used features: 30
[LightGBM] [Info] Start training from score 0.072531
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.029323 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7226
[LightGBM] [Info] Number of data points in the train set: 139087, number of used features: 30
[LightGBM] [Info] Start 

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.090600 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7226
[LightGBM] [Info] Number of data points in the train set: 139087, number of used features: 30
[LightGBM] [Info] Start training from score 0.072467
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.131370 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7225
[LightGBM] [Info] Number of data points in the train set: 139087, number of used features: 30
[LightGBM] [Info] Start training from score 0.072462
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.080936 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7226
[LightGBM] [Info] Number of data points in the train set: 139087, number of used features: 30
[LightGBM] [Info] Start 

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.070450 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 7226
[LightGBM] [Info] Number of data points in the train set: 139087, number of used features: 30
[LightGBM] [Info] Start training from score 0.072467
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.044840 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7226
[LightGBM] [Info] Number of data points in the train set: 139087, number of used features: 30
[LightGBM] [Info] Start training from score 0.072467
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.021725 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7224
[LightGBM] [Info] Number of data points in the train s

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.013078 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 7226
[LightGBM] [Info] Number of data points in the train set: 139087, number of used features: 30
[LightGBM] [Info] Start training from score 0.072474
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.054942 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 7225
[LightGBM] [Info] Number of data points in the train set: 139087, number of used features: 30
[LightGBM] [Info] Start training from score 0.072462
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.018790 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not eno

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.137946 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7225
[LightGBM] [Info] Number of data points in the train set: 139087, number of used features: 30
[LightGBM] [Info] Start training from score 0.072462
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.077922 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7226
[LightGBM] [Info] Number of data points in the train set: 139087, number of used features: 30
[LightGBM] [Info] Start training from score 0.072474
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.111027 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7224
[LightGBM] [Info] Number of data points in the train set: 139088, number of used features: 30
[LightGBM] [Info] Start 

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.041547 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 7226
[LightGBM] [Info] Number of data points in the train set: 139087, number of used features: 30
[LightGBM] [Info] Start training from score 0.072467
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.064098 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 7226
[LightGBM] [Info] Number of data points in the train set: 139087, number of used features: 30
[LightGBM] [Info] Start training from score 0.072467
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.052046 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total 

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.086625 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7226
[LightGBM] [Info] Number of data points in the train set: 139087, number of used features: 30
[LightGBM] [Info] Start training from score 0.072474
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.016676 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7224
[LightGBM] [Info] Number of data points in the train set: 139088, number of used features: 30
[LightGBM] [Info] Start training from score 0.072609
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.055948 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7225
[LightGBM] [Info] Number of data points in the train set: 139087, number of used features: 30
[LightGBM] [Info] Start 

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.008577 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7225
[LightGBM] [Info] Number of data points in the train set: 139087, number of used features: 30
[LightGBM] [Info] Start training from score 0.072531
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.026278 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7225
[LightGBM] [Info] Number of data points in the train set: 139087, number of used features: 30
[LightGBM] [Info] Start training from score 0.072531
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.020460 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7224
[LightGBM] [Info] Number of data points in the train set: 139088, number of used features: 30
[LightGBM] [Info] Start 

  y = column_or_1d(y, warn=True)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.052452 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7224
[LightGBM] [Info] Number of data points in the train set: 139088, number of used features: 30
[LightGBM] [Info] Start training from score 0.072609
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.124878 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7225
[LightGBM] [Info] Number of data points in the train set: 139087, number of used features: 30
[LightGBM] [Info] Start training from score 0.072531
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.014857 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7226
[LightGBM] [Info] Number of data points in the train set: 139087, number of used features: 30
[LightGBM] [Info] Start 

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.081296 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7224
[LightGBM] [Info] Number of data points in the train set: 139088, number of used features: 30
[LightGBM] [Info] Start training from score 0.072609
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.042832 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 7225
[LightGBM] [Info] Number of data points in the train set: 139087, number of used features: 30
[LightGBM] [Info] Start training from score 0.072531
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.031909 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total 

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.025246 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 7226
[LightGBM] [Info] Number of data points in the train set: 139087, number of used features: 30
[LightGBM] [Info] Start training from score 0.072467
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.057960 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7225
[LightGBM] [Info] Number of data points in the train set: 139087, number of used features: 30
[LightGBM] [Info] Start training from score 0.072462
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007486 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total 

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.113651 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 7225
[LightGBM] [Info] Number of data points in the train set: 139087, number of used features: 30
[LightGBM] [Info] Start training from score 0.072531
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.040321 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 7226
[LightGBM] [Info] Number of data points in the train set: 139087, number of used features: 30
[LightGBM] [Info] Start training from score 0.072467
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.064609 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not eno

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.079659 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7224
[LightGBM] [Info] Number of data points in the train set: 139088, number of used features: 30
[LightGBM] [Info] Start training from score 0.072609
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007201 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 7225
[LightGBM] [Info] Number of data points in the train set: 139087, number of used features: 30
[LightGBM] [Info] Start training from score 0.072462
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.070824 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7226
[LightGBM] [Info] Number of data points in the train s

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.051941 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7224
[LightGBM] [Info] Number of data points in the train set: 139088, number of used features: 30
[LightGBM] [Info] Start training from score 0.072609
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.109373 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7225
[LightGBM] [Info] Number of data points in the train set: 139087, number of used features: 30
[LightGBM] [Info] Start training from score 0.072462
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.010224 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 7226
[LightGBM] [Info] Number of data points in the train s

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.018024 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 7225
[LightGBM] [Info] Number of data points in the train set: 139087, number of used features: 30
[LightGBM] [Info] Start training from score 0.072531
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.032524 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7225
[LightGBM] [Info] Number of data points in the train set: 139087, number of used features: 30
[LightGBM] [Info] Start training from score 0.072531
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.059556 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total 

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.010458 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 7226
[LightGBM] [Info] Number of data points in the train set: 139087, number of used features: 30
[LightGBM] [Info] Start training from score 0.072467
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.071082 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7225
[LightGBM] [Info] Number of data points in the train set: 139087, number of used features: 30
[LightGBM] [Info] Start training from score 0.072462
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.061934 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7226
[LightGBM] [Info] Number of data points in the train s

  y = column_or_1d(y, warn=True)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.020672 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7224
[LightGBM] [Info] Number of data points in the train set: 139088, number of used features: 30
[LightGBM] [Info] Start training from score 0.072609
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.026727 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 7225
[LightGBM] [Info] Number of data points in the train set: 139087, number of used features: 30
[LightGBM] [Info] Start training from score 0.072531
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.043837 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total 

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.019762 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7224
[LightGBM] [Info] Number of data points in the train set: 139088, number of used features: 30
[LightGBM] [Info] Start training from score 0.072609
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.053123 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7226
[LightGBM] [Info] Number of data points in the train set: 139087, number of used features: 30
[LightGBM] [Info] Start training from score 0.072474
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.009016 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 7225
[LightGBM] [Info] Number of data points in the train s

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.020432 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 7225
[LightGBM] [Info] Number of data points in the train set: 139087, number of used features: 30
[LightGBM] [Info] Start training from score 0.072531
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.085970 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7225
[LightGBM] [Info] Number of data points in the train set: 139087, number of used features: 30
[LightGBM] [Info] Start training from score 0.072531
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.084046 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7225
[LightGBM] [Info] Number of data points in the train s

  y = column_or_1d(y, warn=True)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.080653 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7226
[LightGBM] [Info] Number of data points in the train set: 139087, number of used features: 30
[LightGBM] [Info] Start training from score 0.072467
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.074081 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7226
[LightGBM] [Info] Number of data points in the train set: 139087, number of used features: 30
[LightGBM] [Info] Start training from score 0.072474
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.028521 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7225
[LightGBM] [Info] Number of data points in the train set: 139087, number of used features: 30
[LightGBM] [Info] Start 

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.076177 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7224
[LightGBM] [Info] Number of data points in the train set: 139088, number of used features: 30
[LightGBM] [Info] Start training from score 0.072609
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.038528 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 7225
[LightGBM] [Info] Number of data points in the train set: 139087, number of used features: 30
[LightGBM] [Info] Start training from score 0.072531
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.112218 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7224
[LightGBM] [Info] Number of data points in the train s

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.044410 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7225
[LightGBM] [Info] Number of data points in the train set: 139087, number of used features: 30
[LightGBM] [Info] Start training from score 0.072531
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.099398 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7225
[LightGBM] [Info] Number of data points in the train set: 139087, number of used features: 30
[LightGBM] [Info] Start training from score 0.072462
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.112238 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7226
[LightGBM] [Info] Number of data points in the train set: 139087, number of used features: 30
[LightGBM] [Info] Start 

  y = column_or_1d(y, warn=True)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001164 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 7226
[LightGBM] [Info] Number of data points in the train set: 173859, number of used features: 30
[LightGBM] [Info] Start training from score 0.072509


In [55]:
# save CV result to csv 
# pd.DataFrame(rand_cv_hist.cv_results_).to_csv("LightGBM-Hyper-CV.csv")

best_model = rand_cv_hist.best_estimator_
# Figure out the model that you want to interpret
# fit the best model for that comment code
# interpret as below

# lgbm = bestmodel.fit(X,Y)

y_pred = best_model.predict(X_test)
r2 = r2_score(y_test, y_pred)
print(f"R2 Score: {r2}")

# variable importance from lgbm
# lgbm.plot_importance()
# plot_importance(best_model)
# plt.show()


R2 Score: 0.503668893989339


R^2 went from .233 without year, to .504 with year