# Models 1

## imports

In [1]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

from sklearn.ensemble import RandomForestRegressor

from sklearn.linear_model import SGDRegressor

from sklearn.model_selection import KFold, RandomizedSearchCV
import lightgbm as lgbm
from scipy import stats


## load data

In [2]:
df_combo = pd.read_csv('combo.csv')
df_combo.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 263170 entries, 0 to 263169
Data columns (total 46 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   zipcode                     263170 non-null  int64  
 1   year                        263170 non-null  int64  
 2   population                  263170 non-null  int64  
 3   median_household_income     262955 non-null  float64
 4   median_age                  263170 non-null  float64
 5   median_rent                 262890 non-null  float64
 6   mean_travel_time_to_work    263163 non-null  float64
 7   median_value                262986 non-null  float64
 8   ave_num_rooms               263170 non-null  float64
 9   gini                        263163 non-null  float64
 10  cost_of_living_perc         263163 non-null  float64
 11  median_RE_tax               262178 non-null  float64
 12  labor_force_perc            263106 non-null  float64
 13  unemployed_per

In [3]:
# df_combo.columns.tolist()

In [4]:
ycol = 'pct_next_1yr'
dependent = [ycol]
features = ['population',
            'median_household_income',
            'median_age',
            'median_rent',
            'mean_travel_time_to_work',
            'median_value',
            'ave_num_rooms',
            'gini',
            'cost_of_living_perc',
            'median_RE_tax',
            'labor_force_perc',
            'unemployed_perc',
            'bach_degr_perc',
            'masters_degr_perc',
            'peops_per_household',
            'owner_occ_perc',
            'new_units_perc',
            'families_wU18_perc',
            'poverty_perc',
            'non_families_perc',
            'vacant_perc',
            'perc_moved_fr_same_county',
            'perc_moved_fr_other_county',
            'perc_moved_fr_other_state',
            'perc_moved_fr_abroad',
            'single_fam_perc',
            'public_trans_perc',
            'foreign_born_perc',
            'male_perc',
            'typ_price',]

## linear regression

In [5]:
df_lr = df_combo.dropna(subset=dependent+features).copy()
scaler=StandardScaler()
df_lr[features] = scaler.fit_transform(df_lr[features])
train, temp = train_test_split(df_lr, test_size=0.2, random_state=42)
test, val = train_test_split(temp, test_size=0.5, random_state=42)

X_train, y_train = train[features], train[ycol]
X_test, y_test = test[features], test[ycol]
X_val, y_val = val[features], val[ycol]

# 4. Perform linear regression
model = LinearRegression()
model.fit(X_train, y_train)

# Coefficients and intercept
print("Coefficients:", model.coef_)
print("Intercept:", model.intercept_)

# Predictions on the validation set
y_val_pred = model.predict(X_val)

# Performance metrics on validation set
print("Validation Mean Squared Error:", mean_squared_error(y_val, y_val_pred))
print("Validation R^2 Score:", r2_score(y_val, y_val_pred))

Coefficients: [ 3.77776509e+00 -9.41142757e-02 -4.03254056e-02 -2.08927941e-01
 -3.54818972e+00 -4.14957478e-02 -6.05197039e-03  2.62088287e-02
  7.50469674e-02 -4.67143237e-02 -1.06070111e-01 -4.74760072e-01
  1.34663221e-01 -5.01954105e-01  2.27713173e-02 -1.89204650e-01
  9.56847695e-01 -2.24868844e-01  5.54929221e-02  2.06658086e-01
 -5.00846635e-01  1.30632881e-01  7.93088059e-04  7.89956194e-02
 -2.50600955e-01  2.19520367e-01 -5.09920994e-01  9.18104051e-01
  6.92636249e-03  1.78378733e-01]
Intercept: 7.295682076165879
Validation Mean Squared Error: 39.14405727617852
Validation R^2 Score: -0.035114980271853335


<span style="color:red">The multicollinearity is probably a big part of the problem</span>

## random forest regressor <span style="color:red">(slow)</span>

In [6]:
rf_regressor = RandomForestRegressor(n_estimators=100, random_state=42)
rf_regressor.fit(X_train, y_train)

# Predictions on the validation set
y_val_pred = rf_regressor.predict(X_val)
print("Validation Mean Squared Error:", mean_squared_error(y_val, y_val_pred))
print("Validation R^2 Score:", r2_score(y_val, y_val_pred))

KeyboardInterrupt: 

## SGDRegressor

In [7]:
sgd_regressor = SGDRegressor(random_state=42)   # max_iter=1000, tol=1e-3, 
sgd_regressor.fit(X_train, y_train)

# Predictions on the validation set
y_val_pred = sgd_regressor.predict(X_val)

# Performance metrics on validation set
print("Validation Mean Squared Error:", mean_squared_error(y_val, y_val_pred))
print("Validation R^2 Score:", r2_score(y_val, y_val_pred))

Validation Mean Squared Error: 5.1747452033596536e+20
Validation R^2 Score: -1.368395780052983e+19


In [8]:
len(df_combo)

263170

## lightGBM with KFold and RandomizedSearchCV

In [12]:
pd.set_option('mode.use_inf_as_na', True)

In [13]:
# subset = df_combo.head(50000)

# X=subset[features].copy()
# y=subset[dependent].copy()
X=df_combo[features].copy()
y=df_combo[dependent].copy()

In [22]:
# X.iloc[0:5,0:5]
# X.describe().iloc[:,0:5]
X.describe().iloc[:,5:10]
X.describe().iloc[:,10:]

Unnamed: 0,labor_force_perc,unemployed_perc,bach_degr_perc,masters_degr_perc,peops_per_household,owner_occ_perc,new_units_perc,families_wU18_perc,poverty_perc,non_families_perc,vacant_perc,perc_moved_fr_same_county,perc_moved_fr_other_county,perc_moved_fr_other_state,perc_moved_fr_abroad,single_fam_perc,public_trans_perc,foreign_born_perc,male_perc,typ_price
count,263106.0,263106.0,263113.0,263113.0,263093.0,263093.0,263093.0,263093.0,263106.0,263093.0,263093.0,263106.0,263106.0,263106.0,263106.0,263093.0,262977.0,263113.0,263113.0,230850.0
mean,49.01211,3.291356,11.081654,4.729735,2.625477,73.227603,1.156123,29.994586,3.823812,32.189429,29.537499,6.472336,3.189382,1.909446,0.361722,93.248353,2.113216,6.741636,49.870896,236255.0
std,8.130011,2.276322,6.679312,4.017262,,15.602309,,9.66857,3.684217,10.822451,,4.638885,3.204131,2.416389,0.800608,,6.133908,9.497177,4.130237,225103.6
min,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5995.128
25%,44.491991,1.857003,6.327631,2.094735,2.372197,66.317326,0.0,24.544735,1.19167,25.423729,7.420269,3.252033,1.171155,0.438871,0.0,72.693231,0.0,0.927193,47.870872,119256.5
50%,49.682529,2.937407,9.526101,3.563461,2.572667,76.231263,0.147804,29.979821,2.892949,31.314286,13.645458,5.781549,2.483723,1.276781,0.089486,86.662011,0.197796,2.933819,49.488505,174514.5
75%,54.080904,4.312975,14.494681,6.17181,2.804146,83.807267,1.163707,35.466313,5.380669,37.564767,25.525526,8.815812,4.257417,2.577141,0.431202,99.960506,1.427036,8.357429,51.420839,272156.6
max,100.0,100.0,100.0,100.0,42.523077,100.0,2250.0,100.0,68.181818,100.0,62300.0,100.0,71.717172,100.0,69.724771,50600.0,83.237617,100.0,100.0,7482104.0


In [15]:
X_scaled = scaler.fit_transform(X)

# needed?
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)


param_dist_old = {'n_estimators': [50, 75, 100, 150, 200, 300],
              'learning_rate': stats.uniform(0.01, 0.2),
              'objective': ["regression","poisson", "tweedie"#"gamma"
              #, "tweedie"
              ],
              'max_depth': [None] + [x for x in range(1,12)],
              'num_leaves': [x for x in range(10, 40, 3)],
              'min_data_in_leaf': [20,100,250],
              'reg_alpha': np.linspace(0,2, num=4),
              'reg_lambda': np.linspace(0,2, num=4),
              'subsample': [0.8, 1],
              'colsample_bytree': [0.8,1],
              'boosting_type': ["gbdt","dart"],
              'random_state':[2102021],
              #'n_jobs':[5],
}

param_dist = {# 'n_estimators': [50],
              # 'learning_rate': [.15], # stats.uniform(0.05, 0.2),
              # 'objective': ["regression",    # "poisson", "tweedie"#"gamma"
              #, "tweedie"
              # ],
              # 'max_depth': [6,12],# [None] + [x for x in range(1,12)],
              'num_leaves': [31,62,127], # [x for x in range(10, 40, 3)],
              'min_data_in_leaf': [20,250],
              'reg_alpha': [.1,.5], #np.linspace(0,2, num=4),
              # 'reg_lambda': [0,2], # np.linspace(0,2, num=4),
              'lambda_l1': [0,1,1.5],
              'lambda_l2': [0,1],
              # 'subsample': [0.8, 1],
              # 'colsample_bytree': [0.8,1],
              # 'boosting_type': ["gbdt"],
              'random_state':[42],
              #'n_jobs':[5],
}


myfolds = KFold(n_splits=5, random_state=42)


lgbm_mod = lgbm.LGBMRegressor()
# rand_cv_hist = RandomizedSearchCV(lgbm_mod, param_dist, cv=myfolds, n_jobs=1, n_iter=1000, scoring='neg_mean_absolute_error')
rand_cv_hist = RandomizedSearchCV(lgbm_mod, param_dist, cv=myfolds, n_jobs=-1, n_iter=50, scoring='r2', random_state=42, verbose=1)

ValueError: Input X contains infinity or a value too large for dtype('float64').

In [None]:
# import os
# os.environ['KMP_DUPLICATE_LIB_OK']='True'

rand_cv_hist.fit(X_train,y_train)

In [None]:
# save CV result to csv 
# pd.DataFrame(rand_cv_hist.cv_results_).to_csv("LightGBM-Hyper-CV.csv")

best_model = rand_cv_hist.best_estimator_
# Figure out the model that you want to interpret
# fit the best model for that comment code
# interpret as below

# lgbm = bestmodel.fit(X,Y)

y_pred = best_model.predict(X_test)
r2 = r2_score(y_test, y_pred)
print(f"R2 Score: {r2}")

# variable importance from lgbm
# lgbm.plot_importance()
plot_importance(best_model)
plt.show()
