In [37]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
import lightgbm as lgb



## (the following code is the same as in s4_models_regression)

In [5]:
# Read the combined dataset from the working directory, then display transposed
# summary statistics (one row per numeric column) as a quick sanity check.
dfcombo = pd.read_csv('combo.csv')
dfcombo.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
zipcode,217324.0,48161.549309,27812.434636,1001.0,25315.0,47171.0,71241.0,99901.0
year,217324.0,2016.619789,2.857517,2012.0,2014.0,2017.0,2019.0,2021.0
population,217324.0,14192.464555,15809.836341,25.0,2512.0,7483.0,21630.0,130352.0
median_household_income,217324.0,60253.62238,24950.906483,2499.0,43789.75,54638.5,70286.0,250001.0
median_age,217324.0,41.236737,6.945752,15.9,36.7,40.9,45.2,84.3
median_rent,217324.0,923.939557,374.824226,99.0,671.0,821.0,1074.0,3501.0
mean_travel_time_to_work,217324.0,6207.458339,7099.169141,0.0,1035.0,3109.0,9409.0,60956.0
median_value,217324.0,212311.41693,179707.66554,9999.0,108500.0,158200.0,247500.0,2000001.0
ave_num_rooms,217324.0,5.731863,0.833573,1.3,5.3,5.7,6.2,10.0
gini,217324.0,0.424868,0.055501,0.128,0.388275,0.4209,0.4573,0.7865


In [6]:
# Build 0/1 indicator columns for the listed years only; rows from any other
# year end up with all three indicators equal to 0.
years_to_encode = [2019, 2020, 2021]
for year in years_to_encode:
    dfcombo[f'year_{year}'] = dfcombo['year'].eq(year).astype(int)

In [7]:
# Store 'year' as a pandas categorical column. The year_20xx indicator columns
# were already derived from it in the previous cell; this categorical version is
# the one included in the featuresB list defined in the next cell.
dfcombo['year'] = dfcombo['year'].astype('category')

In [8]:
# Base predictors are selected by position: columns 0-1 are skipped and everything
# from position 31 onward is excluded, leaving positions 2..30.
featuresX = dfcombo.columns.tolist()[2:31]

# Variant A: base predictors plus the explicit 2019-2021 indicator columns.
featuresA = [*featuresX, 'year_2019', 'year_2020', 'year_2021']
# Variant B: base predictors plus the single 'year' column.
featuresB = [*featuresX, 'year']

ycol = 'pct_next_1yr'
y = [ycol]
featuresA

['population',
 'median_household_income',
 'median_age',
 'median_rent',
 'mean_travel_time_to_work',
 'median_value',
 'ave_num_rooms',
 'gini',
 'cost_of_living_perc',
 'median_RE_tax',
 'labor_force_perc',
 'unemployed_perc',
 'bach_degr_perc',
 'masters_degr_perc',
 'peops_per_household',
 'owner_occ_perc',
 'new_units_perc',
 'families_wU18_perc',
 'poverty_perc',
 'non_families_perc',
 'vacant_perc',
 'perc_moved_fr_same_county',
 'perc_moved_fr_other_county',
 'perc_moved_fr_other_state',
 'perc_moved_fr_abroad',
 'single_fam_perc',
 'public_trans_perc',
 'foreign_born_perc',
 'male_perc',
 'year_2019',
 'year_2020',
 'year_2021']

## add the class variable 'winner'
A zip code is a winner for a given year if its `pct_next_1yr` is strictly greater than the weighted average of `pct_next_1yr` across all zip codes for that year.
* The average is weighted by population. (alternative: weight by num_units)

In [19]:
# Population-weighted mean of 'pct_next_1yr' within each year, broadcast back to
# every row of that year.
# Rows with a missing target get no weight, so the numerator and the denominator
# always cover the same rows. When no target is missing this is identical to
# dividing the plain per-year sums.
target = dfcombo['pct_next_1yr']
weights = dfcombo['population'].where(target.notna())
weighted_sum = (weights * target).groupby(dfcombo['year']).transform('sum')
total_weight = weights.groupby(dfcombo['year']).transform('sum')
dfcombo['weighted_avg'] = weighted_sum / total_weight

# Vectorised comparison (replaces a row-wise apply over the whole frame):
# 1 when the row is strictly above its year's weighted average, else 0.
# A missing target compares as False, so such rows are labelled 0.
dfcombo['winner'] = (target > dfcombo['weighted_avg']).astype(int)

print("Weighted Averages for each year:")
print(dfcombo.groupby('year')['weighted_avg'].first())
print("\nCounts for each 'winner' value:")
print(dfcombo['winner'].value_counts())

# dfcombo = dfcombo.drop(columns=['weighted_avg'])


Weighted Averages for each year:
year
2012    0.093404
2013    0.059667
2014    0.065022
2015    0.056611
2016    0.068893
2017    0.065557
2018    0.046874
2019    0.095980
2020    0.147883
2021    0.087071
Name: weighted_avg, dtype: float64

Counts for each 'winner' value:
0    128435
1     88889
Name: winner, dtype: int64


In [22]:
# dfcombo.info()

## logistic regression

In [32]:
# Predictors: base features plus the year indicator columns; target: the 0/1 'winner' flag.
X, y = dfcombo[featuresA], dfcombo['winner']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Learn the scaling statistics from the training rows only, then apply the same
# transformation to both partitions.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Generous iteration cap; raise it further if the solver reports convergence issues.
logreg = LogisticRegression(max_iter=10000)
logreg.fit(X_train_scaled, y_train)

y_pred = logreg.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy*100:.2f}%")

Accuracy: 64.64%


## SGD Classifier

In [33]:
# Fit an SGD-trained classifier on the scaled training matrix created in the
# logistic-regression cell, then score it on the scaled test matrix.
sgd_clf = SGDClassifier(random_state=42, max_iter=10000).fit(X_train_scaled, y_train)

y_pred = sgd_clf.predict(X_test_scaled)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy*100:.2f}%")

Accuracy: 64.39%


## Random Forest Classifier

In [39]:
# Predictors: base features plus the 'year' column.
X = dfcombo[featuresB]
# Define the target here instead of reusing whatever `y` was left behind by an
# earlier cell: the feature-list cell binds `y` to a list of column names, so this
# cell would otherwise only work if the logistic-regression cell had run first.
y = dfcombo['winner']

# no need for StandardScaler
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

rf_clf = RandomForestClassifier(random_state=42)
rf_clf.fit(X_train, y_train)

y_pred = rf_clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy*100:.2f}%")

Accuracy: 73.34%


## lightGBM

In [38]:
# Build the split inside this cell so it does not depend on which model cell ran
# last. The same arguments as the random-forest cell are used (featuresB, 30% test,
# random_state=42), so both models see the same train/test partition.
X = dfcombo[featuresB]
y = dfcombo['winner']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

d_train = lgb.Dataset(X_train, label=y_train)

params = {
   'objective': 'binary',
   'metric': 'binary_logloss',
   'boosting_type': 'gbdt',
   'num_leaves': 31,
   'learning_rate': 0.05,
   'feature_fraction': 0.9
}

# num_boost_round=100 is the library default, written out so it is easy to tune.
clf = lgb.train(params, d_train, num_boost_round=100)
y_pred_prob = clf.predict(X_test)

# Convert probabilities to class labels (strictly above 0.5 -> 1, otherwise 0)
y_pred = (y_pred_prob > 0.5).astype(int)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy*100:.2f}%")

[LightGBM] [Info] Number of positive: 62177, number of negative: 89949
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001189 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 7224
[LightGBM] [Info] Number of data points in the train set: 152126, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.408720 -> initscore=-0.369258
[LightGBM] [Info] Start training from score -0.369258
Accuracy: 71.79%
