In [111]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import re
import random

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet

from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

from sklearn.preprocessing import OneHotEncoder


# To set up a temporary directory for caching pipeline results
from tempfile import mkdtemp

# To build a pipeline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer

# To do a cross-validated grid search
from sklearn.model_selection import GridSearchCV

In [2]:
from functions import *

In [84]:
df=pd.read_csv('../data/interim/model.csv',index_col=0)

* [ ] For each georaphy, we would want to census

---
**Dataframge Shape Aim:**
| `Census Tract_Year` | Mean Income Percentage Variation from the City Average (from year x to x+n) : Change Combined | Mean Income in Year x | Numerical X Feature 1 (from year x-t to x)|
| --- | --- | --- | --- |
| 10100_2021 | XXX |XXX |XXX |
| 10100_2020 | XXX |XXX |XXX |
| 10100_2019 | XXX |XXX |XXX |
| --- | --- | --- |--- |

  **Census Tract -- Year -- Median Income for this year**

(Median_Income_year_(x+n) - Median_Income_year_(x))*(Median_Income_year_(x+n) / Median_Income_year_(x))**2

---
**<center><h3>Baseline Classification Model<center><h3>**

In [5]:
df_base_train=pd.read_csv('../data/interim/baseline_model_rem.csv',index_col=0)
df_base_test=pd.read_csv('../data/interim/baseline_model_test.csv',index_col=0)

* OHE:
  * Fit & transform on train
  * Transform only on test
  * `All columns are numeric`
* Group by Census Tract & Year
* Determine intial train and prediction windows
  * Baseline model: split at 2013
    * training on 2006 to 2013 data
    * predicting from 2013 to 2021
* Split X and y

---
* **OHE**

In [6]:
ohe_col=list(df_base_train.select_dtypes(include=['object']).columns)
ohe_col

['PERMIT_TYPE',
 'REVIEW_TYPE',
 'CONTACT_1_TYPE',
 'CONTACT_1_CITY',
 'CONTACT_1_STATE']

In [15]:
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer

* Custom Column Transformer: One Hot Encoding that returns a whole dataframe with column names and one hot encoded column dropped

In [74]:
class CustomOneHotEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, **kwargs):
        self.encoder = OneHotEncoder(**kwargs)
    
    def fit(self, X, y=None):
        self.encoder.fit(X[ohe_col])
        return self
    
    def transform(self, X):
        transformed_data = self.encoder.transform(X[ohe_col])
        column_names = self.encoder.get_feature_names_out()
        df = pd.DataFrame(transformed_data.toarray(), columns=column_names)
        df=pd.concat([X,df],axis=1)
        df = df.drop(columns=ohe_col)
        return df

    def get_feature_names(self, X):
        ohe_feature_names = self.encoder.get_feature_names().tolist()
        return ohe_feature_names

* `Function Transformer 2`: Grouping by Census Tract and Year
  * based on the year we are making the prediction from and the training length

In [110]:
def df_window_multi_type(df,year,t):

    '''
    Combines descriptions & numeric data
    '''
    #copy dataframe to avoid accidental overwriting
    df_temp=df.copy()

    #Confirm that no columns are set as indexs (eg. if census_tract was set as index, would cause issues)
    assert df_temp.index.is_monotonic_increasing, 'Check Indexing: Should be a simple arithmetic sequence'

    #Set 'Census_Tract' as index
    df_temp=df_temp.set_index('Census_Tract')

    #select relevant years
    #year+1 as the range end to esnure data for the current year is also included
    #drop YEAR column as we no longer need it
    df_temp=df_temp[df_temp['YEAR'].isin(range(year-t,year+1))].drop(columns='YEAR')

    #instantiate the output dataframe
    df_result=pd.DataFrame()

    #select columns with distriptions
    obj_cols = df_temp.select_dtypes(include=['object']).columns
    num_cols = df_temp.select_dtypes(include=['number']).columns

    ### CAN ADD MORE FEATURES HERE ###

    #Taking averages for numeric columns
    for col in num_cols:
        df_result[col]=df_temp.groupby(level=0)[col].mean()

    #Concatenating qualitative columns
    for col in obj_cols:
        #need to keep in mind that some stings (descriptions) might be missing
        df_result[col]=df_temp.groupby(level=0)[col].apply(lambda x: ' '.join(str(i) for i in x))

    return df_result

In [112]:
#set up a FunctionTransformer based on df_window_multi_type
grouping_transformer = FunctionTransformer(df_window_multi_type, kw_args={'year': year, 't': train_period})

* `Custom Transformer 3`: Convert income data into a Binary Target Column calculating if gentrification occured
  * based on the year we are making the prediction from and the prediction length

In [203]:
class GentrificationTarget(BaseEstimator, TransformerMixin):
    def __init__(self,year,prediction_period):
        self.year=year
        self.prediction_period=prediction_period
        self.current_income_col=None
        self.future_income_col=None
        self.median_change=None

    def fit(self,df,y=None):

        #name of the column containing income for the current year
        self.current_income_col='Median_Income_'+str(self.year)

        #the year we are making the prediction for
        prediction_year=self.year+self.prediction_period

        #name of the column containing income for the year we are making the prediction for
        self.future_income_col='Median_Income_'+str(prediction_year)

        current_income=df[self.current_income_col]
        future_income=df[self.future_income_col]

        absolute_change=future_income-current_income
        percentage_change=future_income/current_income
        combined_change=absolute_change*(percentage_change)**2

        #fitting in our case is just finding the mean change
        self.median_change=np.median(combined_change)
        #returning self with updated attributes that can be accessed in the transform method later
        return self

    def transform(self, df, y=None):
        #this method transforms the existing dataframe to calculate if the change for each geography was more or less than the mean
        #dropping all the other columns

        df_temp=df.copy()

        current_income=df[self.current_income_col]
        future_income=df[self.future_income_col]

        absolute_change=future_income-current_income
        percentage_change=future_income/current_income
        combined_change=absolute_change*(percentage_change)**2

        #this is y_target column as a list
        target=[1 if i>self.median_change else 0 for i in combined_change]

        y_col=[col for col in list(df_temp.columns) if bool(re.search('Median',col))]
        df_temp=df_temp.drop(columns=y_col)
        df_temp['Income']=df[self.current_income_col]
        df_temp['Target']=target

        return df_temp

In [204]:
gt=GentrificationTarget(2015,5)
gt.fit(df_train_transformed)
df_train_final=gt.transform(df_train_transformed)
df_test_final=gt.transform(df_test_transformed)

* The aim of this pipeline is to:
  * One Hot Enocode
  * Group 

In [206]:
cachedir = mkdtemp()

year=2015
train_period=7
prediction_period=5

X_y_prep = Pipeline(
    [#one hot encoding
    ('ohe', CustomOneHotEncoder()),
    #grouping based on the year we are making the prediction from and the training length
    ('group',grouping_transformer),
    #convert income change into y_target based on the year we are making the prediction from and the prediction length and drop the remaining of Income columns
    ('target',GentrificationTarget(year=2015,prediction_period=5))
    #split into X & y
    
    ],
    memory=cachedir)

In [207]:
X_y_prep.fit(df_base_train)
df_train_transformed=X_y_prep.transform(df_base_train)
df_test_transformed=X_y_prep.transform(df_base_test)

If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  X, fitted_transformer = fit_transform_one_cached(
If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  X, fitted_transformer = fit_transform_one_cached(


In [209]:
df_train_transformed

Unnamed: 0_level_0,PROCESSING_TIME,BUILDING_FEE_PAID,ZONING_FEE_PAID,OTHER_FEE_PAID,SUBTOTAL_PAID,TOTAL_FEE,REPORTED_COST,Household_Count,PERMIT_TYPE_DROP,PERMIT_TYPE_EASY PERMIT PROCESS,...,CONTACT_1_STATE_IN,CONTACT_1_STATE_MI,CONTACT_1_STATE_NJ,CONTACT_1_STATE_NY,CONTACT_1_STATE_OTHER,CONTACT_1_STATE_TX,CONTACT_1_STATE_UT,CONTACT_1_STATE_WI,Income,Target
Census_Tract,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10100,15.312057,280.376879,40.070922,3.900709,348.146312,360.927128,21738.129894,2224.5,0.042553,0.326241,...,0.014184,0.000000,0.000000,0.000000,0.003546,0.007092,0.0,0.000000,32188.0,0
10202,14.145455,231.389061,62.363636,3.181818,301.346848,307.943455,21032.957576,1109.5,0.218182,0.206061,...,0.006061,0.000000,0.000000,0.000000,0.012121,0.006061,0.0,0.000000,27318.0,1
10300,14.659280,310.280886,40.200831,2.631579,371.240526,403.619695,36315.792244,2830.0,0.141274,0.313019,...,0.000000,0.005540,0.000000,0.002770,0.002770,0.013850,0.0,0.002770,37111.0,0
10400,11.754098,196.563267,30.679157,1.756440,237.985550,473.777400,58994.726019,1940.5,0.105386,0.203747,...,0.002342,0.000000,0.000000,0.000000,0.011710,0.009368,0.0,0.000000,38384.0,0
10503,20.709091,416.965545,79.772727,4.318182,517.749364,554.577818,64575.268182,1079.5,0.290909,0.163636,...,0.000000,0.000000,0.000000,0.000000,0.009091,0.000000,0.0,0.000000,19813.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
843200,14.214391,335.748862,48.395742,3.546256,419.133054,469.727871,34158.976799,877.0,0.145374,0.196769,...,0.022026,0.000000,0.002937,0.000000,0.010279,0.002937,0.0,0.004405,38929.0,1
843600,12.333333,386.711852,52.835648,8.569444,508.708819,523.470486,36205.437500,1182.5,0.083333,0.317130,...,0.000000,0.000000,0.000000,0.002315,0.009259,0.013889,0.0,0.004630,30726.0,0
843900,11.209486,339.449921,40.859684,4.446640,411.652292,459.247549,40527.802411,1890.0,0.075099,0.233202,...,0.000000,0.000000,0.000000,0.000000,0.011858,0.007905,0.0,0.019763,26975.0,1
844600,20.326316,354.748421,41.381579,11.315789,497.275526,754.008474,66573.731632,601.5,0.063158,0.242105,...,0.000000,0.005263,0.000000,0.000000,0.000000,0.010526,0.0,0.000000,16018.5,1


In [106]:
pipe_f.fit(df_base_train)
df_train_transformed=pipe_f.transform(df_base_train)
df_test_transformed=pipe_f.transform(df_base_test)

If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  X, fitted_transformer = fit_transform_one_cached(


In [None]:
preprocess= ColumnTransformer( transformers=
    #apply MinMaxScaler to all columns
    [('MinMaxScaler', MinMaxScaler(year,),slice(None))],
    remainder='passthrough')

In [113]:
df_train_transformed

Unnamed: 0_level_0,PROCESSING_TIME,BUILDING_FEE_PAID,ZONING_FEE_PAID,OTHER_FEE_PAID,SUBTOTAL_PAID,TOTAL_FEE,REPORTED_COST,Median_Income_2010,Median_Income_2011,Median_Income_2012,...,CONTACT_1_STATE_FL,CONTACT_1_STATE_IL,CONTACT_1_STATE_IN,CONTACT_1_STATE_MI,CONTACT_1_STATE_NJ,CONTACT_1_STATE_NY,CONTACT_1_STATE_OTHER,CONTACT_1_STATE_TX,CONTACT_1_STATE_UT,CONTACT_1_STATE_WI
Census_Tract,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10100,15.312057,280.376879,40.070922,3.900709,348.146312,360.927128,21738.129894,36905.0,31919.0,31063.0,...,0.000000,0.975177,0.014184,0.000000,0.000000,0.000000,0.003546,0.007092,0.0,0.000000
10202,14.145455,231.389061,62.363636,3.181818,301.346848,307.943455,21032.957576,35724.0,44107.0,36369.0,...,0.000000,0.975758,0.006061,0.000000,0.000000,0.000000,0.012121,0.006061,0.0,0.000000
10300,14.659280,310.280886,40.200831,2.631579,371.240526,403.619695,36315.792244,45224.0,45964.0,41315.0,...,0.008310,0.963989,0.000000,0.005540,0.000000,0.002770,0.002770,0.013850,0.0,0.002770
10400,11.754098,196.563267,30.679157,1.756440,237.985550,473.777400,58994.726019,44018.0,48138.0,43125.0,...,0.009368,0.967213,0.002342,0.000000,0.000000,0.000000,0.011710,0.009368,0.0,0.000000
10503,20.709091,416.965545,79.772727,4.318182,517.749364,554.577818,64575.268182,18250.0,18952.0,20524.0,...,0.000000,0.990909,0.000000,0.000000,0.000000,0.000000,0.009091,0.000000,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
843200,14.214391,335.748862,48.395742,3.546256,419.133054,469.727871,34158.976799,40133.0,44537.0,34881.0,...,0.000000,0.955947,0.022026,0.000000,0.002937,0.000000,0.010279,0.002937,0.0,0.004405
843600,12.333333,386.711852,52.835648,8.569444,508.708819,523.470486,36205.437500,24844.0,22606.0,22373.0,...,0.004630,0.965278,0.000000,0.000000,0.000000,0.002315,0.009259,0.013889,0.0,0.004630
843900,11.209486,339.449921,40.859684,4.446640,411.652292,459.247549,40527.802411,35663.0,31774.0,29094.0,...,0.000000,0.960474,0.000000,0.000000,0.000000,0.000000,0.011858,0.007905,0.0,0.019763
844600,20.326316,354.748421,41.381579,11.315789,497.275526,754.008474,66573.731632,33571.0,29052.0,36979.0,...,0.000000,0.984211,0.000000,0.005263,0.000000,0.000000,0.000000,0.010526,0.0,0.000000


In [35]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import joblib
import tempfile
from tempfile import mkdtemp

In [41]:
cachedir = mkdtemp()

pipe_f = Pipeline(
    [('group', grouping_transformer),
    ('process', preprocess)],

    memory=cachedir)

In [None]:
pipe_f.fit()

In [12]:
train_transformed=my_ohe.transform(df_base_train[ohe_col])

In [13]:
train_transformed

Unnamed: 0,PERMIT_TYPE_DROP,PERMIT_TYPE_EASY PERMIT PROCESS,PERMIT_TYPE_ELECTRIC WIRING,PERMIT_TYPE_NEW CONSTRUCTION,PERMIT_TYPE_REINSTATE REVOKED PMT,PERMIT_TYPE_RENOVATION/ALTERATION,PERMIT_TYPE_SCAFFOLDING,REVIEW_TYPE_CONVEYANCE DEVICE PERMIT,REVIEW_TYPE_DEMOLITION PERMIT,REVIEW_TYPE_DIRECT DEVELOPER SERVICES,...,CONTACT_1_STATE_FL,CONTACT_1_STATE_IL,CONTACT_1_STATE_IN,CONTACT_1_STATE_MI,CONTACT_1_STATE_NJ,CONTACT_1_STATE_NY,CONTACT_1_STATE_OTHER,CONTACT_1_STATE_TX,CONTACT_1_STATE_UT,CONTACT_1_STATE_WI
0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
535506,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
535507,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
535508,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
535509,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [91]:
class CustomOneHotEncoding(OneHotEncoder):
    
    def fit(self, X, y=None):
        super().fit(X)
        return self
    
    def transform(self, X):
        transformed_data = super().transform(X)
        columns = self.get_feature_names_out(X.columns)
        return pd.DataFrame(transformed_data.toarray(), columns=columns)

In [18]:
#ohe_preprocesses.fit(df_base_train)
train_transformed=ohe_preprocesses.train_transform(df_base_train)
train_transformed

AttributeError: 'ColumnTransformer' object has no attribute 'train_transform'

In [None]:
test_transformed=ohe_preprocesses.transform(df_base_test)

In [155]:
train_transformed

array([[0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 7.1174e+04, 1.3205e+03,
        2.0060e+03],
       [1.0000e+00, 0.0000e+00, 0.0000e+00, ..., 7.1174e+04, 1.3205e+03,
        2.0060e+03],
       [0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 7.1174e+04, 1.3205e+03,
        2.0060e+03],
       ...,
       [0.0000e+00, 1.0000e+00, 0.0000e+00, ..., 7.4847e+04, 1.6320e+03,
        2.0070e+03],
       [0.0000e+00, 0.0000e+00, 1.0000e+00, ..., 6.6720e+04, 1.5610e+03,
        2.0070e+03],
       [0.0000e+00, 0.0000e+00, 1.0000e+00, ..., 4.4080e+04, 7.6050e+02,
        2.0070e+03]])

In [149]:
#grouping_transformer.transform(train_transformed)

In [None]:
# Define the pipeline
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('scaler', StandardScaler()),
                           ('classifier', DecisionTreeClassifier())])

In [33]:
df_base_train

Unnamed: 0,PERMIT_TYPE,REVIEW_TYPE,ISSUE_DATE,PROCESSING_TIME,BUILDING_FEE_PAID,ZONING_FEE_PAID,OTHER_FEE_PAID,SUBTOTAL_PAID,TOTAL_FEE,CONTACT_1_TYPE,...,Median_Income_2014,Median_Income_2015,Median_Income_2016,Median_Income_2017,Median_Income_2018,Median_Income_2019,Median_Income_2020,Median_Income_2021,Household_Count,YEAR
0,RENOVATION/ALTERATION,STANDARD PLAN REVIEW,2006-01-03,81.0,125.0,75.0,0.0,200.0,200.0,OWNER AS GENERAL CONTRACTOR,...,40000.0,45472.0,54034.0,61186.0,61733.0,64392.0,63594.0,71174.0,1320.5,2006
1,DROP,STANDARD PLAN REVIEW,2006-06-27,36.0,200.0,50.0,0.0,250.0,250.0,ARCHITECT,...,40000.0,45472.0,54034.0,61186.0,61733.0,64392.0,63594.0,71174.0,1320.5,2006
2,RENOVATION/ALTERATION,STANDARD PLAN REVIEW,2006-08-01,0.0,85.0,75.0,0.0,160.0,160.0,ARCHITECT,...,40000.0,45472.0,54034.0,61186.0,61733.0,64392.0,63594.0,71174.0,1320.5,2006
3,ELECTRIC WIRING,EASY PERMIT WEB,2006-01-12,9.0,40.0,0.0,0.0,40.0,40.0,CONTRACTOR-ELECTRICAL,...,40000.0,45472.0,54034.0,61186.0,61733.0,64392.0,63594.0,71174.0,1320.5,2006
4,DROP,STANDARD PLAN REVIEW,2006-08-02,13.0,85.0,50.0,0.0,135.0,135.0,OWNER AS GENERAL CONTRACTOR,...,40000.0,45472.0,54034.0,61186.0,61733.0,64392.0,63594.0,71174.0,1320.5,2006
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
535506,ELECTRIC WIRING,EASY PERMIT WEB,2006-04-19,0.0,230.0,0.0,0.0,230.0,230.0,CONTRACTOR-ELECTRICAL,...,46602.0,43815.0,44293.0,46434.0,54625.0,60672.0,60500.0,66726.0,1872.0,2006
535507,RENOVATION/ALTERATION,STANDARD PLAN REVIEW,2007-08-24,31.0,175.0,75.0,0.0,250.0,250.0,STRUCTURAL ENGINEER,...,67784.0,63808.0,62217.0,64750.0,64102.0,70497.0,71644.0,74847.0,1632.0,2007
535508,EASY PERMIT PROCESS,EASY PERMIT,2007-10-25,16.0,523.0,50.0,75.0,1198.0,1198.0,CONTRACTOR-ELECTRICAL,...,67784.0,63808.0,62217.0,64750.0,64102.0,70497.0,71644.0,74847.0,1632.0,2007
535509,ELECTRIC WIRING,EASY PERMIT WEB,2007-10-23,6.0,40.0,0.0,0.0,40.0,40.0,CONTRACTOR-ELECTRICAL,...,55069.0,50927.0,51853.0,53750.0,62500.0,65438.0,64347.0,66720.0,1561.0,2007


In [None]:

test = pd.DataFrame(preprocesses.fit_transform(X_rem), columns=preprocesses.get_feature_names_out())
display(test.head(5))

display(X_rem_baseline.head(2))

In [20]:
from sklearn.compose import ColumnTransformer

preprocesses = ColumnTransformer(
    [('OHE', OneHotEncoder(sparse=False), ['INDUSTRY']),
    ('StandardScale', StandardScaler())]
    )

In [None]:
ohe_col=['PERMIT_TYPE','REVIEW_TYPE','CONTACT_1_TYPE']

In [13]:
ohe = OneHotEncoder(sparse=False, handle_unknown='ignore')

ohe_cat(df_base_train,ohe_col)

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.]])

In [21]:
df_temp=df[(df['YEAR']>2010)&(df['YEAR']<2016)]

df_temp=df_temp.groupby('Census_Tract').mean().drop(columns=['YEAR','CENSUS_TRACT'])

df_X=df_temp.drop(columns=y_li).reset_index(drop=True)

df_Y=select_y(df_temp)

y=df_Y['5y_inc_change_2021']

X_train,X_test,y_train,y_test=train_test_split(df_X,y,test_size=0.3,random_state=5)

scaler=MinMaxScaler()
scaler.fit(X_train)
X_train_scaled=scaler.transform(X_train)
X_test_scaled=scaler.transform(X_test)


my_linreg_d=LinearRegression()
my_linreg_d.fit(X_pca_train,y_train)
print(f'Baseline train accuracy score {my_linreg_d.score(X_pca_train,y_train)}')
print(f'Baseline test accuracy score {my_linreg_d.score(X_pca_test,y_test)}')

y_pred = my_linreg_d.predict(X_pca_test)
mse = mean_squared_error(y_test, y_pred)
#print('MSE:', mse)

y_pred = my_linreg_d.predict(X_pca_test)
r2 = r2_score(y_test, y_pred)
print('R2 score:', r2)

KeyError: "['CENSUS_TRACT'] not found in axis"

***
**<center><h3>Classification Model Development<center><h3>**

In [14]:
scores_df_classification=pd.DataFrame({'prediction_ahead_window':[],'train_on_window':[],'train_score':[],'test_score':[]})
i=0
year_from=2010

#n_window loop
for n in range(1,8):
    #year_last_from - the last year for which we can try to predict given the chosen n
    year_last_from=2021-n

    #t_window loop: predicting using fata from the past 1 to 5 years
    for t in range(1,6):

        year_from=2010
        y_target=[] #instantiate y_target
        df_X_result=pd.DataFrame() #instantiate an empty df for X features
        
        #df_X - should contain only x_features, but keep median income for year_from and YEAR - drop
        #df_X=df.drop(columns=[....]) 

        while year_from<=year_last_from:

            #filter and average
            df_temp=df_window(df,year_from,t)

            #y_target
            y_target.extend(abs_income_change(df_temp,year_from,n))

            #x_features: drop columns with median, apart from 'Median_Income_' for the current year

            #Select all column containing 'Median' in the name
            y_col=[col for col in list(df_temp.columns) if bool(re.search('Median',col))]
            #Drop columns containing 'Median' in the name
            df_X=df_temp.drop(columns=y_col)

            #Create a string that is the name of the column containing income data for the year we are making the prediction from
            current_income='Median_Income_'+str(year_from) #current year

            #Add this column back the X dataframe
            df_X['Income']=df_temp[current_income]

            #for each year in the loop keeping adding new results
            df_X_result=pd.concat([df_X_result,df_X])
            
            #switch to the next year
            year_from+=1

        #CLASSIFICATION: THRESHOLD APPLICATION
        mean_change=np.median(y_target)
        y=np.array([1 if y>mean_change else 0 for y in y_target])
        X=df_X_result.reset_index()
        
        #Split data into test and train based on census tracts
 
        geo_set=set(X['Census_Tract'])
        test_len=int(len(geo_set)*0.25)

        geo_test=random.sample(list(geo_set),k=test_len)
        test_mask=X['Census_Tract'].isin(geo_test)

        y_test=y[test_mask]
        y_train=y[~test_mask]

        X_test=X[test_mask]
        X_train=X[~test_mask]

        #SCALE

        scaler=MinMaxScaler()
        X_train_scaled=scaler.fit_transform(X_train)
        X_test_scaled=scaler.transform(X_test)

        #PCA

        my_pca=PCA(n_components=3)
        X_pca_train=my_pca.fit_transform(X_train_scaled)
        X_pca_test=my_pca.transform(X_test_scaled)

        #MODEL

        my_logreg1=LogisticRegression(C=0.1,max_iter=1000,random_state=5)
        my_logreg1.fit(X_pca_train,y_train)
        train_score=my_logreg1.score(X_pca_train,y_train)
        test_score=my_logreg1.score(X_pca_test,y_test)

        scores_df_classification.loc[i]=[n,t,train_score,test_score]   
        i+=1


In [15]:
scores_df_classification.sort_values('test_score',ascending=False)

Unnamed: 0,prediction_ahead_window,train_on_window,train_score,test_score
20,5.0,1.0,0.655153,0.699752
34,7.0,5.0,0.676493,0.691537
22,5.0,3.0,0.660367,0.68277
16,4.0,2.0,0.669209,0.681686
30,7.0,1.0,0.663121,0.678738
14,3.0,5.0,0.66305,0.675558
21,5.0,2.0,0.660176,0.672935
13,3.0,4.0,0.661583,0.670013
11,3.0,2.0,0.653438,0.669866
24,5.0,5.0,0.671011,0.665045


* Predictions are better for larger n -> less datapoints
* Need to find a way to differentiate between different years
* Could be because of the volatility, but also could be because different years market behaviour differs

In [9]:
df_X

Unnamed: 0_level_0,YEAR,PROCESSING_TIME,BUILDING_FEE_PAID,ZONING_FEE_PAID,OTHER_FEE_PAID,SUBTOTAL_PAID,TOTAL_FEE,REPORTED_COST,CENSUS_TRACT,Median_Income_2020,...,CONTACT_1_TYPE_OWNER AS ARCHITECT & CONTRACTR,CONTACT_1_TYPE_OWNER AS GENERAL CONTRACTOR,CONTACT_1_TYPE_OWNER OCCUPIED,CONTACT_1_TYPE_PLUMBING CONTRACTOR,CONTACT_1_TYPE_RESIDENTAL REAL ESTATE DEV,CONTACT_1_TYPE_SELF CERT ARCHITECT,CONTACT_1_TYPE_SIGN CONTRACTOR,CONTACT_1_TYPE_STRUCTURAL ENGINEER,CONTACT_1_TYPE_TENT CONTRACTOR,CONTACT_1_TYPE_UNKNOWN
Census_Tract,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10100,2019.5,22.430778,505.626104,40.496281,2.249428,561.054863,583.764519,84149.541762,10100.0,42891.0,...,0.000000,0.026316,0.021739,0.0,0.0,0.000000,0.000000,0.010870,0.000000,0.000000
10201,2019.5,8.123848,379.978093,56.574021,2.678571,450.215985,454.903381,26338.300634,10201.0,39955.0,...,0.000000,0.000000,0.085829,0.0,0.0,0.032258,0.000000,0.008929,0.008929,0.041187
10202,2019.5,8.615187,446.414469,48.133848,3.812741,512.152574,518.413050,40145.471042,10202.0,43839.0,...,0.000000,0.050837,0.027027,0.0,0.0,0.061133,0.000000,0.074646,0.000000,0.050837
10300,2019.5,9.680060,371.170342,43.489583,1.955357,423.162902,423.162902,28745.775298,10300.0,44375.0,...,0.000000,0.102679,0.090774,0.0,0.0,0.032738,0.000000,0.023810,0.077381,0.020833
10400,2019.5,8.785714,396.127262,21.875000,0.892857,443.580476,492.608810,59262.142857,10400.0,37198.0,...,0.011905,0.047619,0.083333,0.0,0.0,0.059524,0.011905,0.011905,0.000000,0.047619
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
843700,2019.5,10.751773,466.987685,78.499310,2.996946,580.869423,647.840689,57180.173288,843700.0,153424.0,...,0.000000,0.035165,0.059693,0.0,0.0,0.082151,0.000000,0.029846,0.000000,0.009259
843800,2019.5,13.337428,371.008600,36.056511,5.651106,445.917269,465.810299,20052.121212,843800.0,40966.0,...,0.000000,0.101147,0.082719,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.028665
843900,2019.5,5.859307,567.725353,66.420455,7.350649,694.248390,721.986424,69252.149134,843900.0,50496.0,...,0.000000,0.044805,0.062987,0.0,0.0,0.099351,0.063636,0.027273,0.047619,0.056710
844600,2019.5,15.402256,451.785056,65.531015,8.599624,561.390940,580.172368,38498.810150,844600.0,48955.0,...,0.000000,0.107143,0.105263,0.0,0.0,0.035714,0.000000,0.000000,0.000000,0.000000


In [19]:
def compile_df(df,year_from,prediction_period):

        #last year we can make prediction from given how far we want to predict and that the last income data is from 2021
        year_last_from=2021-prediction_period
        y_target=[] #instantiate y_target
        df_X_result=pd.DataFrame() #instantiate an empty df for X features

        while year_from<=year_last_from:

                #filter and average
                df_temp=df_window(df,year_from,t)

                #y_target
                y_target.extend(abs_income_change(df_temp,year_from,n))

                #x_features: drop columns with median, apart from 'Median_Income_' for the current year
                current_income='Median_Income_'+str(year_from) #current year
                y_col=[col for col in list(df_temp.columns) if bool(re.search('Median',col))]
                df_X=df_temp.drop(columns=y_col)
                df_X['Income']=df_temp[current_income]

                #x_features: concat to the existing
                df_X_result=pd.concat([df_X_result,df_X])

                #switch to the next year
                year_from+=1

        #CLASSIFICATION: THRESHOLD APPLICATION
        mean_change=np.median(y_target)
        y=np.array([1 if y>mean_change else 0 for y in y_target])
        X=df_X_result.reset_index()

        return X,y

In [18]:
scores_df_classification=pd.DataFrame({'prediction_ahead_window':[],'train_on_window':[],'train_score':[],'test_score':[]})
i=0
year_from=2010

#n_window loop
for n in range(1,8):

    #t_window loop: predicting using fata from the past 1 to 5 years
    for t in range(1,6):

        year_from=2010
        y_target=[] #instantiate y_target
        df_X_result=pd.DataFrame() #instantiate an empty df for X features

        X,y=compile_df(df,year_from,n)
        
        #Split data into test and train based on census tracts

        geo_set=set(X['Census_Tract'])
        test_len=int(len(geo_set)*0.25)

        geo_test=random.sample(list(geo_set),k=test_len)
        test_mask=X['Census_Tract'].isin(geo_test)

        y_test=y[test_mask]
        y_train=y[~test_mask]

        X_test=X[test_mask]
        X_train=X[~test_mask]

        scaler=MinMaxScaler()
        X_train_scaled=scaler.fit_transform(X_train)
        X_test_scaled=scaler.transform(X_test)

        my_logreg1=LogisticRegression(C=0.1,max_iter=1000,random_state=5)
        my_logreg1.fit(X_train_scaled,y_train)
        train_score=my_logreg1.score(X_train_scaled,y_train)
        test_score=my_logreg1.score(X_test_scaled,y_test)

        scores_df_classification.loc[i]=[n,t,train_score,test_score]   
        i+=1

scores_df_classification.sort_values('test_score',ascending=False)

Unnamed: 0,prediction_ahead_window,train_on_window,train_score,test_score
28,6.0,4.0,0.705718,0.727794
27,6.0,3.0,0.700157,0.724461
32,7.0,3.0,0.709909,0.71156
30,7.0,1.0,0.708383,0.708089
15,4.0,1.0,0.681994,0.70438
25,6.0,1.0,0.703257,0.703704
29,6.0,5.0,0.705901,0.699531
34,7.0,5.0,0.722367,0.698092
19,4.0,5.0,0.685124,0.697707
18,4.0,4.0,0.6823,0.695258


In [11]:
scores_df_classification=pd.DataFrame({'prediction_ahead_window':[],'train_on_window':[],'train_score':[],'test_score':[]})
i=0
year_from=2010

#n_window loop
for n in range(1,8):
    #year_last_from - the last year for which we can try to predict given the chosen n
    year_last_from=2021-n

    #t_window loop: predicting using fata from the past 1 to 5 years
    for t in range(1,6):

        year_from=2010
        y_target=[] #instantiate y_target
        df_X_result=pd.DataFrame() #instantiate an empty df for X features
        
        #df_X - should contain only x_features, but keep median income for year_from and YEAR - drop
        #df_X=df.drop(columns=[....]) 

        while year_from<=year_last_from:

            #filter and average
            df_temp=df_window(df,year_from,t)

            #y_target
            y_target.extend(abs_income_change(df_temp,year_from,n))

            #x_features: drop columns with median, apart from 'Median_Income_' for the current year
            current_income='Median_Income_'+str(year_from) #current year
            y_col=[col for col in list(df_temp.columns) if bool(re.search('Median',col))]
            df_X=df_temp.drop(columns=y_col)
            df_X['Income']=df_temp[current_income]

            #x_features: concat to the existing
            df_X_result=pd.concat([df_X_result,df_X])
            
            #switch to the next year
            year_from+=1

        #CLASSIFICATION: THRESHOLD APPLICATION
        mean_change=np.median(y_target)
        y=np.array([1 if y>mean_change else 0 for y in y_target])
        X=df_X_result.reset_index()

        #break
        
        #Split data into test and train based on census tracts
 
        geo_set=set(X['Census_Tract'])
        test_len=int(len(geo_set)*0.25)

        geo_test=random.sample(list(geo_set),k=test_len)
        test_mask=X['Census_Tract'].isin(geo_test)

        y_test=y[test_mask]
        y_train=y[~test_mask]

        X_test=X[test_mask]
        X_train=X[~test_mask]

        scaler=MinMaxScaler()
        X_train_scaled=scaler.fit_transform(X_train)
        X_test_scaled=scaler.transform(X_test)

        my_logreg1=LogisticRegression(C=0.1,max_iter=1000,random_state=5)
        my_logreg1.fit(X_train_scaled,y_train)
        train_score=my_logreg1.score(X_train_scaled,y_train)
        test_score=my_logreg1.score(X_test_scaled,y_test)

        scores_df_classification.loc[i]=[n,t,train_score,test_score]   
        i+=1

scores_df_classification.sort_values('test_score',ascending=False)

Unnamed: 0,prediction_ahead_window,train_on_window,train_score,test_score
33,7.0,4.0,0.702955,0.725424
29,6.0,5.0,0.702526,0.720779
32,7.0,3.0,0.709411,0.708525
31,7.0,2.0,0.706358,0.708525
34,7.0,5.0,0.707463,0.706013
30,7.0,1.0,0.704957,0.700704
26,6.0,2.0,0.711054,0.69883
13,3.0,4.0,0.670155,0.696818
27,6.0,3.0,0.713076,0.695157
23,5.0,4.0,0.692884,0.693117


* Divergence between the years

---
**5 Years Ahead & 4 Years Before**

* Are there any patterns in the predictions that the model is getting wrong?
* `5.0---4.0---0.718170---0.741754`

To test this, let's first look at a single model, making the prediction from 2016 to 2021 (predicting 5 years ahead) and training on the data from 2012 to 2016 (training on 4 years prior)

In [19]:
help(abs_perc_income_change)

Help on function abs_perc_income_change in module functions:

abs_perc_income_change(df, year_x, n)
    year_x - current year (the year we are making the prediction from)
    n - the number of years ahead we are making the prediction



In [21]:
def coef_eval(X,model):
    return pd.DataFrame({'features':X.columns,'coeff':model.coef_[0]})
    

In [59]:
current_year=2016
train_years=5
prediction_years=4

y_target=abs_perc_income_change(df,current_year,train_years)
mean_change=np.median(y_target)
y=np.array([1 if y>mean_change else 0 for y in y_target])

df_temp=df_window(df,current_year,train_years).drop(columns=['YEAR','CENSUS_TRACT'])

y_target=abs_perc_income_change(df_temp,current_year,prediction_years)
mean_change=np.median(y_target)
y=np.array([1 if y>mean_change else 0 for y in y_target])

current_income='Median_Income_'+str(year_from) #current year
y_col=[col for col in list(df_temp.columns) if (bool(re.search('Median',col)) & (col!=current_income))]
df_X=df_temp.drop(columns=y_col)

X_train,X_test,y_train,y_test=train_test_split(df_X,y,test_size=0.3,random_state=5)

scaler=MinMaxScaler()
scaler.fit(X_train)
X_train_scaled=scaler.transform(X_train)
X_test_scaled=scaler.transform(X_test)

my_logreg1=LogisticRegression(C=0.001,max_iter=1000,random_state=5)
my_logreg1.fit(X_train_scaled,y_train)
print(f'Baseline train accuracy score {my_logreg1.score(X_train_scaled,y_train):.3f}%')
print(f'Baseline test accuracy score {my_logreg1.score(X_test_scaled,y_test):.3f}%')

coef_eval(df_X,my_logreg1).sort_values('coeff',ascending=False)

Baseline train accuracy score 0.619%
Baseline test accuracy score 0.638%


Unnamed: 0,features,coeff
7,Median_Income_2015,0.012786
4,SUBTOTAL_PAID,0.010764
13,PERMIT_TYPE_NEW CONSTRUCTION,0.010248
27,REVIEW_TYPE_SELF CERT,0.010148
3,OTHER_FEE_PAID,0.009593
5,TOTAL_FEE,0.009515
1,BUILDING_FEE_PAID,0.009452
53,CONTACT_1_TYPE_SELF CERT ARCHITECT,0.008743
0,PROCESSING_TIME,0.008565
50,CONTACT_1_TYPE_OWNER OCCUPIED,0.006694


In [None]:
'''52	CONTACT_1_TYPE_RESIDENTAL REAL ESTATE DEV	0.000000
44	CONTACT_1_TYPE_MASON - CONCRETE ONLY	0.000000
43	CONTACT_1_TYPE_MASON - BRICK ONLY	0.000000
42	CONTACT_1_TYPE_MASON - BRICK AND CONCRETE	0.000000
12	PERMIT_TYPE_FOR EXTENSION OF PMT	0.000000
22	REVIEW_TYPE_DIRECT DEVELOPER SERVICES	0.000000
14	PERMIT_TYPE_PORCH CONSTRUCTION	0.000000'''

In [None]:
# Looking back at EDA
#Let's drop these

# Merge CONTACT_1_TYPE_CONTRACTOR-WRECKING, CONTACT_1_TYPE_CONTRACTOR-ELECTRICAL, CONTACT_1_TYPE_CONTRACTOR-WRECKING	
# Merge CONTACT_1_TYPE_OWNER AS ARCHITECT & CONTRACTR and CONTACT_1_TYPE_BUILDING OWNER


In [26]:
from sklearn.metrics import confusion_matrix

y_pred = my_logreg1.predict(X_test_scaled)
cm = confusion_matrix(y_test, y_pred)

print(cm)

from sklearn.metrics import recall_score

recall = recall_score(y_test, y_pred)
print('Recall score:', recall)
print(recall)

[[90 16]
 [61 46]]
Recall score: 0.42990654205607476
0.42990654205607476


* More false negatives than false positives
* The model predicted more false gentrification (false positives) than false negatives

In [27]:
##What happends if we vary the threshold?

---
***Code_Actions:***
* Create DataFrame `results_df`
* In `results_df`, creare a column `results` with 0,1,2,3 values where [0-true negative (not gentrified), 1-true postiive (gentrified), 2 - false negative (gentrified, but predicted not), 3 - false positive (not gentrified, but predicted gentrified)]
* Join  **X_test[['Household_Count','Median_Income_2015']]** to `results_df`
* Add Lat and Lon back
* Plot a map
  * Size by Household_Counts
  * Size by Median_Income in 2015

In [28]:
from functions import coord
import joblib

lat_lon= joblib.load('../data/interim/wip/lat_lon.pkl').reset_index()

In [29]:
result=y_pred-y_test

results_df=pd.DataFrame({'Census_Tract':X_test.index,'actual':y_test,'predicted':y_pred})
results_li=[]
for i in range(len(results_df)):
    if results_df.loc[i,'actual']==results_df.loc[i,'predicted']:
        results_li.append(results_df.loc[i,'actual'])
    elif results_df.loc[i,'actual']==1:
        #2 for false negative
        results_li.append(2)
    elif results_df.loc[i,'actual']==0:
        #3 for false positive
        results_li.append(3)

#create a new column from list results_li
results_df['results']=results_li

df_temp=results_df.merge(X_test[['Household_Count','Median_Income_2015']],on='Census_Tract',how='inner')

#add lat and lon back
df_plot=coord(df_temp,lat_lon)
df_plot

Unnamed: 0,Census_Tract,actual,predicted,results,Household_Count,Median_Income_2015,CENSUS_TRACT,LATITUDE,LONGITUDE
0,670400,0,0,0,1275.0,28952.0,670400,41.790303,-87.671740
1,241500,1,1,1,1311.5,82050.0,241500,41.908200,-87.671098
2,252201,0,0,0,946.5,29402.0,252201,41.877301,-87.751735
3,30702,0,0,0,1020.5,48870.0,30702,41.980644,-87.654957
4,431301,0,0,0,1249.0,22760.0,431301,41.753315,-87.561191
...,...,...,...,...,...,...,...,...,...
208,251200,0,0,0,1117.0,45125.0,251200,41.898037,-87.760442
209,450300,0,0,0,1466.5,43594.0,450300,41.741612,-87.580454
210,700402,0,0,0,1265.5,67212.0,700402,41.738285,-87.711667
211,650500,1,0,2,1480.0,57625.0,650500,41.761052,-87.733460


---
***Residuals***

In [30]:
import plotly.express as px


fig = px.scatter_mapbox(df_plot, size='Household_Count',color="results",lat="LATITUDE", lon="LONGITUDE", zoom=10)

fig.update_layout(width=1500,height=1000, mapbox_style="carto-positron")

fig.show()

In [31]:
fig = px.scatter_mapbox(df_plot, size='Median_Income_2015',color="results",lat="LATITUDE", lon="LONGITUDE", zoom=10)

fig.update_layout(width=1500,height=1000, mapbox_style="carto-positron")

fig.show()

In [41]:
def drop_column_if_present(df, column_name):
    if column_name in df.columns:
        df.drop(column_name, axis=1, inplace=True)

* Improve Window Function:
  * Instead of simply looking at the average consider how the planning parameter changed 

In [32]:
# Split data into city centre and not city centre and study collinearity
# What features are allowing the model split between the city centre and the suburbs
# The model apepars to be predicting for all centre geographies to be gentrified

* Even though Longitude and Latitude were not part of the model, the model appeared to find a pattern similar to geographic division

In [None]:
#, color="5y_inc_change_2021", size="Household_Count"

1) Loop through each t & n, fit linear regression for each & record the r2 scores
   1) scored_di={}
   2) What is the test/train split?
2) Income Change from year x to year x+n, where n is the number of years for the optimal prediction window
   1) x_first=2010
   2) max_x_last=2020 for n=1 (predicting 1 year ahead), min_x_last=2014 (predicting 7 years ahead)
   3) for each x loop through all possible n
   4) grater the x_end -> smaller the n
   5) **for `n` in range(1,8)** (1 to 7 incl years)
      1) x_last=2021-n
      2) x=2010
      3) **while x=<x_last**
         1) x+=1
         2) for each x (year) & n (prediction ahead window) calculate:
            1)  city_change=the mean perc_abs difference for the whole city
            2)  geo_change=the mean perc_abs difference for each geograpgy
            3)  for each geo `TARGET`=geo_change/city_change
      3) ***now for each `n` and all `x` for this n loop through `t` (past window) & fit a logistic regression***
            1) consider predicting based on 1 to 5 years prioir
            2) **for `t` in range(1,6):**
               1) permit_year_end=x (year we are in at the time of making the prediction)
               2) permit_year_start=x-t
               3) X=df[df['YEAR'].isin(range(x-t,x))].mean()
               4) y=`TARGET`
               5) Split into train and test best on census tracts
                  1) X_train=X[X['Census_Tract'].isin(randomly_selected_geos)]
                  2) Liear_Regression_Fit(X_train)
                  3) score_di={n,t,r2_score}

In [12]:
'''scores_df=pd.DataFrame({'prediction_ahead_window':[],'train_on_window':[],'train_score':[],'test_score/r2':[]})
extra_scores_di={}
i=0
year_from=2010

#n_window loop
for n in range(1,8):
    #year_last_from - the last year for which we can try to predict given the chosen n
    year_last_from=2021-n

    #t_window loop: predicting using fata from the past 1 to 5 years
    for t in range(1,6):

        year_from=2010
        y_target=[] #instantiate y_target
        df_X_result=pd.DataFrame() #instantiate an empty df for X features
        
        #df_X - should contain only x_features, but keep median income for year_from and YEAR - drop
        #df_X=df.drop(columns=[....]) 

        while year_from<=year_last_from:

            #filter and average
            df_temp=df_window(df,year_from,t)

            #y_target
            y_target.extend(abs_perc_income(df_temp,year_from,n))

            #x_features: drop columns with median, apart from 'Median_Income_' for the current year
            current_income='Median_Income_'+str(year_from) #current year
            y_col=[col for col in list(df_temp.columns) if (bool(re.search('Median',col)) & (col!=current_income))]
            df_X=df_temp.drop(columns=y_col)

            #x_features: concat to the existing
            df_X_result=pd.concat([df_X_result,df_temp])
            
            #switch to the next year
            year_from+=1
        
        y=np.array(y_target)
        X=df_X_result.reset_index()

        #Custom Grid Search

        for regul in ['l1','l2','l1+l2']:
        #best score
            
        for alpha_num in [0.001,0.01,0.1,1,10,100]:
        #alpha loop
        
        for pca_num in range(7):
        #pca loop

            #Split data into test and train based on census tracts
            #5-fold train/test split
 
            geo_set=set(X['Census_Tract'])
            test_len=int(len(geo_set)*0.25)

            geo_test=random.sample(list(geo_set),k=test_len)
            test_mask=X['Census_Tract'].isin(geo_test)

            y_test=y[test_mask]
            y_train=y[~test_mask]

            X_test=X[test_mask]
            X_train=X[~test_mask]

        scores_df.loc[i]=[n,t,train_score,test_score,r2]   
        i+=1
'''

IndentationError: expected an indented block (2883003383.py, line 48)

In [None]:
'''def lin_reg(X_train,X_test,y_train,y_test,pca_num,alpha_num,regularization):
    scaler=MinMaxScaler()
    X_train_scaled=scaler.fit_transform(X_train)
    X_test_scaled=scaler.transform(X_test)

    my_pca=PCA(n_components=pca_num)
    X_pca_train=my_pca.fit_transform(X_train_scaled)
    X_pca_test=my_pca.transform(X_test_scaled)

    if regularization=='l1':
        my_linreg=Ridge(alpha=alpha_num)
    elif regularization=='l2':
        my_linreg=Lasso(alpha=alpha_num)
    elif regularization=='l1+l2':
        my_linreg=ElasticNet(alpha=alpha_num)

    my_linreg.fit(X_pca_train,y_train)
    train_score=my_linreg.score(X_pca_train,y_train)
    test_score=my_linreg.score(X_pca_test,y_test)

    return train_score,test_score,pca_num,alpha'''