In [1]:
!pwd

/Users/okeefe/Box/USF Data Science Practicum/2020-21/Okeefe/Project_1_Policy_Parsing


In [3]:
import pandas as pd
import numpy as np

import torch
from tqdm.notebook import tqdm

from transformers import BertTokenizer
from torch.utils.data import TensorDataset

from transformers import BertForSequenceClassification

from finished_functions import *

# Multi-Class Classifier

- Given the dataset constructed in "Policy_Scraping", create a machine learning algorithm which can predict if a line in a policy document holds relevant information for following attributes:
    - max floor area ratio
    - max dwellings per unit area
    - building height
    - minimum lot area (square feet)
    - units per lot
    
 
## Data:

- Will be using the records from the policy_scraper phase to train and test the model
- Two model circumstances:
    1. All the data scraped
    2. Only scraped data with non-nan zone candidates
    
- Features: Vectorized centroid of "Context" (using spacy)
- Label: "Attribute"

In [3]:
data = pd.read_csv('City_Zoning_Attributes_with_Zones.csv', index_col="Unnamed: 0")
#data = data[~data['Zone_Candidates'].isnull()]

In [4]:
data

Unnamed: 0,City,Attribute,Values,Context,Policy Subsection,Line,Fraction,Zone_Candidates
0,oakland,max_dua,55.0,Liquids: Capacity of an individual vessel exce...,Title 15 - BUILDINGS AND CONSTRUCTION,6287,56.467840,
1,oakland,max_dua,1.0,A vehicle with one or more chassis-mounted tan...,Title 15 - BUILDINGS AND CONSTRUCTION,6347,57.006827,
2,oakland,max_dua,3.0,Service of Notice and Order. The notice and or...,Title 15 - BUILDINGS AND CONSTRUCTION,7449,66.906216,
3,oakland,max_dua,5.0,The penalties imposed pursuant to this Chapter...,Title 15 - BUILDINGS AND CONSTRUCTION,7528,67.615882,
4,oakland,max_dua,30.0,The penalties imposed on the building owner sh...,Title 15 - BUILDINGS AND CONSTRUCTION,8301,74.559828,
...,...,...,...,...,...,...,...,...
19035,mountain_view,minimum_lot_sqft,25000.0,All new commercial buildings or groups of new ...,CHAPTER 8 - BUILDINGS,1915,57.982430,
19036,mountain_view,minimum_lot_sqft,"1985.0, 900.0",Exception: This chapter shall not apply to Gro...,CHAPTER 8 - BUILDINGS,2014,60.981521,"A105, A102, A109, A1-2, A107, A1-F, A106"
19037,mountain_view,minimum_lot_sqft,1500.0,"""2. Number and location of tests. The minimum ...",CHAPTER 8 - BUILDINGS,2051,62.102393,"A1-H, A108, A107, A1-I, A1-F, A106"
19038,mountain_view,minimum_lot_sqft,"1985.0, 900.0, 1933.0",The provisions of this article shall apply to ...,CHAPTER 8 - BUILDINGS,2960,89.639503,


## EDA to prepare Train/Validate/Test

### Question: 

- Given the collected values from regular expressions, can we create a Multi-Class Classifier which more effectively and accurately identifies attributes?

###  Classes:
    1. max_dua
    2. minimum_lot_sqft
    3. units_per_lot
    4. max_far
    5. none
   
$$\hspace{2mm}$$
   
### Metrics: 
    1. F1 score - Validation/Test
    2. Collection counts for each attribute of a novel policy document (the jurisdiction will be omitted from the train/validate set)

In [5]:
data['Attribute'].value_counts(normalize=True)

max_dua             0.433718
minimum_lot_sqft    0.314286
building_height     0.178099
units_per_lot       0.058403
max_far             0.015494
Name: Attribute, dtype: float64

- Very unbalanced in terms of max_far

In [6]:
median = data['City'].value_counts(normalize=True).median()

In [7]:
for key, val in data['City'].value_counts(normalize=True).items():
    if val > median:
        print(key, val)

brentwood 0.08508403361344538
sonoma_county 0.04543067226890756
hayward 0.039285714285714285
woodside 0.03335084033613445
pacifica 0.02804621848739496
alameda 0.025577731092436974
milpitas 0.02542016806722689
berkeley 0.024002100840336136
santa_clara_county 0.021586134453781512
burlingame 0.020640756302521008
san_jose 0.02001050420168067
marin_county 0.019852941176470587
south san francisco 0.019012605042016805
morgan_hill 0.018067226890756304
vacaville 0.017489495798319328
el_cerrito 0.017226890756302522
san_rafael 0.016964285714285713
sunnyvale 0.016911764705882352
contra_costa_county 0.01638655462184874
novato 0.015756302521008403
santa rosa 0.015703781512605042
mountain_view 0.015546218487394958
los_gatos 0.015073529411764706
richmond 0.014548319327731093
los_altos 0.014180672268907563
alameda_county 0.013865546218487394
tiburon 0.012762605042016806
union city 0.012657563025210084
brisbane 0.0125
sonoma 0.012394957983193277
vallejo 0.012342436974789915
napa 0.011764705882352941
ple

# Test Set

- Because Calistoga contains the median value counts, we will use it as our final test.
    - Rationale: Not small enough to be inconsequential, yet also not large enough to be a significant count of the dataset

In [8]:
data['City'].value_counts()['calistoga']

199

### Finding values for the Train/Validate/Test for lines where no attributes are found

- For each city, parse through the documents and assign "legitimate" lines (greater than 4 words in length) the attribute "none"
    - Rationale: 
        - We want the training set to reflect the real outcomes as accurately as possible! This means using all useful lines that can give insight into what distinguishes a "none" class from an attribute class
    - Comments: 
        - This will lend to an extremely unbalanced data set (predominantly "None") and therefore we will eventually need to introduce some upsampling techniques
        - We want to omit any "empty" or "nonsensical" lines which introduce no ambiguity (e.g. a line only containing empty quotes or brackets will not add anything to the classifier and thus should be omitted)

In [9]:
none_class_lines = []

paths = get_policies(whitelist=whitelist, city='calistoga')

for path in paths:
        with open(path, 'r', encoding="utf-8") as f:
            file = f.read().split('\n')
        subsection = path.split("/")[-1].split(".")[0]
        policy_data = data[(data['City']=='calistoga') & (data['Policy Subsection']==subsection)]
        attrib_idx = policy_data.Line.unique()
        if len(attrib_idx) > 0:                                        # Don't bother with policy subsections where nothing was found!
            for index in sorted(attrib_idx, reverse=True):             # Delete the lines containing found attributes
                del file[index-1]                                      # Correction for the index 0 start
            
            # Drop all funky looking symbols so that there is a better representation of context
            edited_contents = [line.replace('\xa0', ' ') for line in file if len(line.split()) > 4]
            none_class_lines.extend(edited_contents)
            
none_class_df = pd.DataFrame({'City': 'calistoga', 'Attribute': 'none', 'Context': none_class_lines})
city_data = data[data['City']=='calistoga'][['City', 'Attribute', 'Context']]
city_data_median = int(city_data['Attribute'].value_counts().median())
total_none = len(none_class_df)
sample_ratio = city_data_median / total_none
none_class_df = none_class_df.sample(frac=sample_ratio)
test_set = city_data.append(none_class_df)
test_set['Attribute'].value_counts()

max_dua             75
minimum_lot_sqft    65
none                49
building_height     49
units_per_lot        9
max_far              1
Name: Attribute, dtype: int64

# Train/Validate Set

- Use the same procedure from above to find all "non-attribute" lines for every city in the train/validate set
- To avoid the imbalance, the train/validation set will be split by class and then reappended so that the proportions remain similar

In [10]:
def add_none_attributes(city: str, df: pd.DataFrame) -> pd.DataFrame:
    """Appends all none attribute lines in policy subsections to prepare for classification model"""
    none_class_lines = []

    paths = get_policies(whitelist=whitelist, city=city, most_recent=True)

    for path in paths:
            with open(path, 'r', encoding="utf-8") as f:
                file = f.read().split('\n')
            subsection = path.split("/")[-1].split(".")[0]
            policy_data = df[(df['City']==city) & (df['Policy Subsection']==subsection)]
            
            attrib_idx = policy_data.Line.unique()
            #print(policy_data)
            if len(attrib_idx) > 0:                                        # Don't bother with policy subsections where nothing was found!
                for index in sorted(attrib_idx, reverse=True):             # Delete the lines containing found attributes
                    try:
                        del file[index-1]                                  # Correction for the index 0 start
                    except:                                                # Debug hints for stubborn policies
                        print(path)
                        print(file)
                        print(index)
                # Drop all funky looking symbols so that there is a better representation of context
                edited_contents = [line.replace('\xa0', ' ') for line in file if len(line.split()) > 4]
                none_class_lines.extend(edited_contents)

    none_class_df = pd.DataFrame({'City': city, 'Attribute': 'none', 'Context': none_class_lines})
    city_data = df[df['City']==city][['City', 'Attribute', 'Context']]
    
    # Find "none" sampling size by setting it equal to the sum of the known classes
    # total_known = len(city_data)
    city_median = city_data['Attribute'].value_counts().median()
    total_none = len(none_class_df)
    
    sample_ratio = city_median / total_none
    print(sample_ratio)
    print(city_data['Attribute'].value_counts())
    none_class_df = none_class_df.sample(frac=sample_ratio)
    
    appended_df = city_data.append(none_class_df).set_index('City')
    return appended_df

In [11]:
train_valid = data[data['City'] != 'calistoga']
cities = train_valid['City'].unique()

new_df = pd.DataFrame(columns = ['City', 'Attribute', 'Context'])

for i, city in enumerate(cities):
    print(f"{city}: {i}/{len(cities)}")
    city_df = add_none_attributes(city, data)
    new_df = new_df.append(city_df)

oakland: 0/77
0.0022016398420892803
max_dua             47
minimum_lot_sqft    24
building_height      5
units_per_lot        2
Name: Attribute, dtype: int64
alameda: 1/77
0.010465262892447359
max_dua             169
minimum_lot_sqft    157
building_height      83
units_per_lot        76
max_far               2
Name: Attribute, dtype: int64
orinda: 2/77
0.004024621212121212
max_dua             51
minimum_lot_sqft    42
building_height     17
units_per_lot        5
max_far              2
Name: Attribute, dtype: int64
hillsborough: 3/77
0.0037067545304777594
minimum_lot_sqft    26
max_dua             13
max_far              9
building_height      7
units_per_lot        6
Name: Attribute, dtype: int64
dublin: 4/77
0.005964214711729622
max_dua             70
minimum_lot_sqft    64
building_height     24
units_per_lot        8
max_far              2
Name: Attribute, dtype: int64
menlo park: 5/77
0.010842368640533779
minimum_lot_sqft    16
max_dua             13
building_height      6
Name: 

In [12]:
new_df['Attribute'].value_counts()

max_dua             8183
minimum_lot_sqft    5919
building_height     3342
none                2909
units_per_lot       1103
max_far              294
Name: Attribute, dtype: int64

In [13]:
# Upsampling minority data points

df_majority = new_df[~(new_df['Attribute'] == 'max_far') & ~(new_df['Attribute'] == 'units_per_lot')]
df_majority_median = int(df_majority['Attribute'].value_counts().median())
df_max_far = new_df[new_df['Attribute'] == 'max_far']
df_units_per_lot = new_df[new_df['Attribute'] == 'units_per_lot']

ratio_max_far = int(df_majority_median / len(df_max_far))
ratio_units_per_lot = int(df_majority_median / len(df_units_per_lot))

usamp_max_far = df_max_far.sample(len(df_max_far)*10, replace=True)
usamp_units_per_lot = df_units_per_lot.sample(len(df_units_per_lot)*2, replace=True)

usamp_df = pd.concat([df_majority, usamp_max_far], axis=0)
usamp_df = pd.concat([usamp_df, usamp_units_per_lot], axis=0)

In [14]:
usamp_df['Attribute'].value_counts()

max_dua             8183
minimum_lot_sqft    5919
building_height     3342
max_far             2940
none                2909
units_per_lot       2206
Name: Attribute, dtype: int64

### Evenly splitting the new dataframe 

- Take each attribute of df and split it 80/20

In [16]:
train['Attribute'].value_counts()

max_dua             6546
minimum_lot_sqft    4735
building_height     2674
max_far             2352
none                2327
units_per_lot       1765
Name: Attribute, dtype: int64

In [17]:
valid['Attribute'].value_counts()

max_dua             1637
minimum_lot_sqft    1184
building_height      668
max_far              588
none                 582
units_per_lot        441
Name: Attribute, dtype: int64

In [18]:
assert len(train) + len(valid) == len(usamp_df)

In [44]:
X_train = train[[i for i in train.columns if i != 'Attribute']]
X_valid = valid[[i for i in valid.columns if i != 'Attribute']]
y_train = train['Attribute']
y_valid = valid['Attribute']

## Feature Engineering

- Tokenize context using spacy_tokenizer, calculate char/word count and average word length

In [3]:
import numpy as np
import pandas as pd
import spacy
from sklearn.pipeline                import Pipeline, FeatureUnion
from sklearn.svm                     import LinearSVC      # baseline
from sklearn.feature_extraction.text import CountVectorizer
from   sklearn.ensemble              import RandomForestClassifier
from sklearn.preprocessing           import *
from sklearn.impute                  import SimpleImputer
from sklearn.compose                 import ColumnTransformer
nlp = spacy.load("en_core_web_lg")

def spacy_tokenizer(string: str) -> str:
    doc = nlp(string)
    new_string = " ".join([token.lemma_ for token in doc if not token.is_stop])
    return new_string

def numerical_features(X_df: pd.DataFrame) -> None:
    tokens = X_df['Context'].apply(lambda x: re.sub(r'[^\w\s]', '', spacy_tokenizer(x).strip()))
    X_df['Char Count'] = tokens.apply(lambda x: len(x))
    X_df['Word Count'] = tokens.apply(lambda x: len(x.split()))
    X_df['Avg Word Length'] = X['Char Count'] / X['Word Count']
    
def encode_cities_mean_frequency(X_df: pd.DataFrame) -> None:
    keys = X_df['City'].value_counts().index.values
    vals = (X_df['City'].value_counts() / len(X_df)).values
    encode_cities = dict(zip(keys, vals))

    X_df['City'] = X_df['City'].map(lambda x: encode_cities[x])
    
def encode_label(y_df: pd.DataFrame) -> None:
    encode_labels = {'max_dua'          : 0,
                     'minimum_lot_sqft' : 1,
                     'building_height'  : 2,
                     'units_per_lot'    : 3,
                     'max_far'          : 4, 
                     'none'             : 5
    }

    y_df = y_df.map(lambda y: encode_labels[y])
    
def preprocess_pipeline(X_df, y_df):
    # Tokenize the Context column

    # Update X_df with number variables
    numerical_features(X_df)
    
    # Tokenize the Context column into a sparse matrix
    vectorizer = CountVectorizer(tokenizer = spacy_tokenizer, ngram_range=(1,1))
    sparse = vectorizer.fit_transform(X_df['Context'])    
    
    #encode the "cities" feature
    encode_cities_mean_frequency(X_df)
    
    # transform sparse CV matrix such that each dimension is given its own column
    # drop context and join X_df with sparse (dataframe)
    X_df = X_df.join(pd.DataFrame(sparse.todense())).drop(['Context'], axis=1)
    
    #encode the labels
    encode_label(y_df)
    
    return X_df, y_df

    
    
classifier = LinearSVC()

In [4]:
X_train= pd.read_csv('ML-Modeling-Data/X_train.csv', index_col='Unnamed: 0')
y_train= pd.read_csv('ML-Modeling-Data/y_train.csv', index_col='Unnamed: 0')
X_valid= pd.read_csv('ML-Modeling-Data/X_valid.csv', index_col='Unnamed: 0')
y_valid= pd.read_csv('ML-Modeling-Data/y_valid.csv', index_col='Unnamed: 0')

In [6]:
X_df = X_train.append(X_valid)

In [11]:
X_df = X_df.drop(['City', 'Char Count', 'Word Count', 'Avg Word Length'], axis=1).rename(columns={'index':'City'})
X_df.to_csv('X_from_pipeline.csv')

In [18]:
y_df

Unnamed: 0,Attribute
656,0
5374,0
7544,0
2570,0
4772,0
...,...
2189,3
2191,3
2200,3
2202,3


In [20]:
y_df = y_train.append(y_valid)

decode = {0 : 'max_dua',
          1 : 'minimum_lot_sqft',
          2 : 'building_height',
          3 : 'units_per_lot',
          4 : 'max_far',
          5 : 'none'  
}

y_df = y_df['Attribute'].map(lambda x: decode[x])

y_df.to_csv('y_from_pipeline.csv')

In [2]:
def numerical_features(X: pd.DataFrame) -> None:
    tokens = X['Context'].apply(lambda x: re.sub(r'[^\w\s]', '', spacy_tokenizer(x).strip()))
    X['Char Count'] = tokens.apply(lambda x: len(x))
    X['Word Count'] = tokens.apply(lambda x: len(x.split()))
    X['Avg Word Length'] = X['Char Count'] / X['Word Count']
    
numerical_features(X_train)
numerical_features(X_valid)

NameError: name 'pd' is not defined

In [47]:
X_train.to_csv('ML-Modeling-Data/X_train.csv')
X_valid.to_csv('ML-Modeling-Data/X_valid.csv')

# Checkpoint

In [4]:
X_train = pd.read_csv('ML-Modeling-Data/X_train.csv')
X_valid = pd.read_csv('ML-Modeling-Data/X_valid.csv')
y_train = pd.read_csv('ML-Modeling-Data/y_train.csv')
y_valid = pd.read_csv('ML-Modeling-Data/y_valid.csv')

In [5]:
X_train.drop(['City', 'Unnamed: 0'], axis=1, inplace=True)
X_train.rename(columns={'index':'City'}, inplace=True)
X_train.shape

(20399, 5)

In [6]:
X_valid.drop(['City', 'Unnamed: 0'], axis=1, inplace=True)
X_valid.rename(columns={'index':'City'}, inplace=True)
X_valid.shape

(5100, 5)

In [7]:
X_train

Unnamed: 0,Context,City,Char Count,Word Count,Avg Word Length
0,F. Occupancy of an individually partitioned...,dublin,132,18,7.333333
1,2. Any portion of the lot having a slope o...,napa,80,14,5.714286
2,B. Applicant is the bona fide owner of the pre...,pittsburg,279,37,7.540541
3,Where permitted or conditionally permitted by ...,alameda,108,14,7.714286
4,The review authority may grant a detached two-...,tiburon,225,33,6.818182
...,...,...,...,...,...
20394,A. Only one other residential unit shall...,pleasanton,456,62,7.354839
20395,"R-6; One unit per 2,000 square feet of lot area.",alameda,35,7,5.000000
20396,Common Open Space. Developments which have eit...,morgan_hill,639,89,7.179775
20397,Live/Work Unit 1 space per unit for each unit ...,el_cerrito,124,24,5.166667


In [8]:
X_df = X_train.append(X_valid)
X_df.head()

Unnamed: 0,Context,City,Char Count,Word Count,Avg Word Length
0,F. Occupancy of an individually partitioned...,dublin,132,18,7.333333
1,2. Any portion of the lot having a slope o...,napa,80,14,5.714286
2,B. Applicant is the bona fide owner of the pre...,pittsburg,279,37,7.540541
3,Where permitted or conditionally permitted by ...,alameda,108,14,7.714286
4,The review authority may grant a detached two-...,tiburon,225,33,6.818182


- Count-Vectorize the context column

In [9]:
vectorizer = CountVectorizer(tokenizer = spacy_tokenizer, ngram_range=(1,1))
#X_train['Token_Count']
sparse = vectorizer.fit_transform(X_df['Context'])
# sparse_valid = vectorizer.fit_transform(X_valid['Context'])

In [10]:
X_train.shape, X_valid.shape

((20399, 5), (5100, 5))

- Encode the cities into their mean frequency:

In [11]:
keys = X_df['City'].value_counts().index.values
vals = (X_df['City'].value_counts() / len(X_df)).values
encode_cities = dict(zip(keys, vals))

X_df['City'] = X_df['City'].map(lambda x: encode_cities[x])
# X_valid['City'] = X_valid['City'].map(lambda x: encode_cities[x])

In [12]:
# X_train = X_train.join(pd.DataFrame(sparse_train.todense())).drop(['Context'], axis=1)
# X_valid = X_valid.join(pd.DataFrame(sparse_valid.todense())).drop(['Context'], axis=1)
X_df = X_df.join(pd.DataFrame(sparse.todense())).drop(['Context'], axis=1)

In [55]:
# X_train.drop(['Tokens'], axis=1, inplace = True)
# X_valid.drop(['Tokens'], axis=1, inplace = True)

- Encode the labels as numbers

In [81]:
encode_labels = {'max_dua'          : 0,
                 'minimum_lot_sqft' : 1,
                 'building_height'  : 2,
                 'units_per_lot'    : 3,
                 'max_far'          : 4, 
                 'none'             : 5
}

y_train = y_train.map(lambda x: encode_labels[x])
y_valid = y_valid.map(lambda x: encode_labels[x])

KeyError: 0

In [13]:
y_df = y_train.append(y_valid)

In [14]:
X_df.head()

Unnamed: 0,City,Char Count,Word Count,Avg Word Length,0,1,2,3,4,5,...,88,89,90,91,92,93,94,95,96,97
0,0.008745,132,18,7.333333,0,20,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0,0.003726,126,19,6.631579,0,20,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0.010432,80,14,5.714286,0,22,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0.003726,1309,175,7.48,0,22,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0.003608,279,37,7.540541,0,53,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [87]:
y_df.head()

290     0
1828    0
4984    0
189     0
5731    0
Name: Attribute, dtype: int64

In [15]:
X_df.to_csv('ML-Modeling-Data/X.csv')
y_df.to_csv('ML-Modeling-Data/y.csv')

# Checkpoint

In [16]:
X_df = pd.read_csv('ML-Modeling-Data/X.csv')
y_df = pd.read_csv('ML-Modeling-Data/y.csv')

In [98]:
y_df.unique()

array([0, 2, 1, 5, 4, 3])

In [19]:
y_df = y_df['Attribute']

In [23]:
y_df.unique()

array([0, 2, 1, 5, 4, 3])

In [36]:
y_df

0        0
1        0
2        0
3        0
4        0
        ..
25494    3
25495    3
25496    3
25497    3
25498    3
Name: Attribute, Length: 25499, dtype: int64

In [42]:
X_train = pd.DataFrame(columns=X_df.columns)
X_valid = pd.DataFrame(columns=X_df.columns)
y_train = pd.Series(name = y_df.name, dtype=int)
y_valid = pd.Series(name = y_df.name, dtype=int)

for attr in y_df.unique():
    subset_X = X_df[y_df == attr].reset_index()
    Xtrain_split = subset_X.sample(frac=0.8)
    Xvalid_split = subset_X.loc[~subset_X.index.isin(Xtrain_split.index)]
    X_train = X_train.append(Xtrain_split)
    X_valid = X_valid.append(Xvalid_split)
    
    subset_y = y_df[y_df == attr].reset_index()
    ytrain_split = subset_y.sample(frac=0.8)['Attribute']
    yvalid_split = subset_y.loc[~subset_y.index.isin(ytrain_split.index)]['Attribute']
    y_train = y_train.append(ytrain_split)
    y_valid = y_valid.append(yvalid_split)

In [44]:
assert len(X_train) + len(X_valid) == len(X_df)
assert len(y_train) + len(y_valid) == len(y_df)

- Convert to numpy array for machine learning

In [54]:
X_train.to_csv('ML-Modeling-Data/X_train_pp.csv')
y_train.to_csv('ML-Modeling-Data/y_train_pp.csv')
X_valid.to_csv('ML-Modeling-Data/X_valid_pp.csv')
y_valid.to_csv('ML-Modeling-Data/y_valid_pp.csv')

In [50]:
del X_train['Unnamed: 0']
del X_train['index']

In [52]:
del X_valid['Unnamed: 0']
del X_valid['index']

# Machine Learning

In [55]:
X_train= pd.read_csv('ML-Modeling-Data/X_train_pp.csv', index_col='Unnamed: 0')
y_train= pd.read_csv('ML-Modeling-Data/y_train_pp.csv', index_col='Unnamed: 0')
X_valid= pd.read_csv('ML-Modeling-Data/X_valid_pp.csv', index_col='Unnamed: 0')
y_valid= pd.read_csv('ML-Modeling-Data/y_valid_pp.csv', index_col='Unnamed: 0')

In [75]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_valid = scaler.fit_transform(X_valid)

In [78]:
tree = RandomForestClassifier()
tree.fit(X_train, y_train)
y_pred = tree.predict(X_valid)

  tree.fit(X_train, y_train)


In [88]:
filename = 'Models/attrib_classifier_rf.sav'
pickle.dump(best_model, open(filename, 'wb'))

In [65]:
# Baseline

In [79]:
from sklearn.metrics import accuracy_score, classification_report

print(classification_report(y_pred, y_valid))

              precision    recall  f1-score   support

           0       0.74      0.56      0.64      2140
           1       0.55      0.57      0.56      1159
           2       0.27      0.46      0.34       395
           3       0.19      0.38      0.25       216
           4       0.49      0.42      0.45       678
           5       0.27      0.31      0.29       512

    accuracy                           0.50      5100
   macro avg       0.42      0.45      0.42      5100
weighted avg       0.56      0.50      0.52      5100



In [83]:
from sklearn.ensemble import GradientBoostingClassifier

GBC = GradientBoostingClassifier()
GBC.fit(X_train, y_train)
y_pred = GBC.predict(X_valid)

encode_labels = {'max_dua'          : 0,
                 'minimum_lot_sqft' : 1,
                 'building_height'  : 2,
                 'units_per_lot'    : 3,
                 'max_far'          : 4, 
                 'none'             : 5
}


print(classification_report(y_pred, y_valid, target_names=encode_labels.keys()))

  return f(*args, **kwargs)


                  precision    recall  f1-score   support

         max_dua       0.81      0.49      0.61      2732
minimum_lot_sqft       0.51      0.56      0.53      1062
 building_height       0.05      0.49      0.09        71
   units_per_lot       0.17      0.50      0.25       151
         max_far       0.48      0.40      0.44       714
            none       0.22      0.34      0.27       370

        accuracy                           0.48      5100
       macro avg       0.37      0.46      0.37      5100
    weighted avg       0.63      0.48      0.53      5100



In [84]:
from sklearn.model_selection import RandomizedSearchCV

search_space = {'learning_rate'    : [0.1, 0.001, 0.0001, 0.00001],
                'max_depth'        : [2, 3, 4, 5],
                'min_samples_leaf' : [1, 2, 4, 6],
                'n_estimators'     : [10, 20, 50, 100, 150, 200, 500],
                'subsample'        : [0.2, 0.4, 0.5, 0.6, 0.8, 0.9]
                }

clf_random = RandomizedSearchCV(estimator=GradientBoostingClassifier(),
                                param_distributions=search_space,
                                n_iter=50,
                                cv=5,
                                n_jobs=-1,
                                verbose=1)

best_model = clf_random.fit(X_train, y_train)
best_model.best_estimator_.get_params()

Fitting 5 folds for each of 50 candidates, totalling 250 fits


  return f(*args, **kwargs)


{'ccp_alpha': 0.0,
 'criterion': 'friedman_mse',
 'init': None,
 'learning_rate': 0.1,
 'loss': 'deviance',
 'max_depth': 4,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 4,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 500,
 'n_iter_no_change': None,
 'random_state': None,
 'subsample': 0.5,
 'tol': 0.0001,
 'validation_fraction': 0.1,
 'verbose': 0,
 'warm_start': False}

In [86]:
y_pred = best_model.predict(X_valid)

print(classification_report(y_pred, y_valid, target_names=encode_labels.keys()))

                  precision    recall  f1-score   support

         max_dua       0.75      0.54      0.62      2270
minimum_lot_sqft       0.55      0.56      0.55      1165
 building_height       0.17      0.44      0.25       257
   units_per_lot       0.25      0.43      0.31       254
         max_far       0.47      0.43      0.45       646
            none       0.29      0.33      0.31       508

        accuracy                           0.50      5100
       macro avg       0.41      0.46      0.42      5100
    weighted avg       0.57      0.50      0.52      5100



In [87]:
import pickle

filename = 'Models/attrib_classifier_gbcv.sav'
pickle.dump(best_model, open(filename, 'wb'))

In [None]:
def preprocess_pipeline(X_df, y_df):
    # Update X_df with number variables
    numerical_features(X_df)
    
    # Tokenize the spacy goods

# AutoML

In [85]:
from pyspark.sql import SparkSession
from pysparkling import *
import h2o
from h2o.automl import H2OAutoML

ss = SparkSession.builder.getOrCreate()
hc = H2OContext.getOrCreate()

train = X_train
train['label'] = y_train
valid = X_valid
valid['label'] = y_valid

df_train = ss.createDataFrame(train)
df_train_h2o = hc.asH2OFrame(df_train, "train")
adult_train_h2o["label"] = adult_train_h2o["label"].asfactor()

df_valid = ss.createDataFrame(valid)
df_valid_h2o = hc.asH2OFrame(df_valid, "valid")
df_valid_h2o["label"] = df_valid_h2o["label"].asfactor()

predictors = df_valid_h2o.names[:]
response = 'label'
predictors.remove(response)

model_automl = H2OAutoML(max_models=3, nfolds=5)
model_automl.train(x=predictors,
                   y=response,
                   training_frame=df_train_h2o)

Connecting to H2O server at http://10.0.0.157:54323 ... successful.


0,1
H2O_cluster_uptime:,08 secs
H2O_cluster_timezone:,America/Los_Angeles
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.32.0.4
H2O_cluster_version_age:,2 months and 18 days
H2O_cluster_name:,sparkling-water-okeefe_local-1618891679901
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,796 Mb
H2O_cluster_total_cores:,16
H2O_cluster_allowed_cores:,16



Sparkling Water Context:
 * Sparkling Water Version: 3.32.0.4-1-3.0
 * H2O name: sparkling-water-okeefe_local-1618891679901
 * cluster size: 1
 * list of used nodes:
  (executorId, host, port)
  ------------------------
  (0,10.0.0.157,54321)
  ------------------------

  Open H2O Flow in browser: http://10.0.0.157:54323 (CMD + click in Mac OSX)

    


IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

In [None]:
model_automl.leaderboard

In [None]:
h2o.get_model("StackedEnsemble_AllModels_AutoML_20210307_183027")

In [None]:
model_automl.explain(df_train_h2o)

In [None]:
ss.stop()

In [None]:
# Model #1:
# Results for dataset without upsampling the minority values.
#     "none" attribute was downsampled using the median of the known class counts

#                      precision    recall  f1-score   support

#  building_height       0.78      0.88      0.83       591
#          max_dua       0.68      0.81      0.74      1373
#          max_far       0.07      0.17      0.10        24
# minimum_lot_sqft       0.92      0.68      0.78      1610
#             none       0.68      0.64      0.66       619
#    units_per_lot       0.05      0.09      0.07       133

#         accuracy                           0.72      4350
#        macro avg       0.53      0.55      0.53      4350
#     weighted avg       0.76      0.72      0.73      4350



# Model #2:
# Upsampled minority values to the median of the dataset
#    "none" attribute downsampled using the median of the known class counts

#                   precision    recall  f1-score   support

#  building_height       0.71      0.86      0.78       555
#          max_dua       0.96      0.41      0.57      3865
#          max_far       0.33      0.72      0.45       421
# minimum_lot_sqft       0.54      0.86      0.66       740
#             none       0.42      0.84      0.56       288
#    units_per_lot       0.04      0.67      0.07        54

#         accuracy                           0.55      5923
#        macro avg       0.50      0.73      0.52      5923
#     weighted avg       0.80      0.55      0.59      5923



# Model #3:
# Upsampling minority values
#     "none" attribute downsampled using the sum of the the known class counts

#                   precision    recall  f1-score   support

#  building_height       0.78      0.62      0.69       840
#          max_dua       0.22      0.53      0.31       679
#          max_far       0.47      0.69      0.56       958
# minimum_lot_sqft       0.66      0.59      0.62      1315
#             none       0.94      0.60      0.73      5951
#    units_per_lot       0.17      0.71      0.27       334

#         accuracy                           0.61     10077
#        macro avg       0.54      0.62      0.53     10077
#     weighted avg       0.77      0.61      0.65     10077



# Model #4:
# Upsampled minority values to be: max_far: *10, units_per_lot: *2
#    "none" attribute downsampled using the median of the known class counts

#                   precision    recall  f1-score   support

#  building_height       0.81      0.80      0.81       679
#          max_dua       0.83      0.64      0.72      2132
#          max_far       0.59      0.54      0.57       640
# minimum_lot_sqft       0.77      0.75      0.76      1225
#             none       0.56      0.78      0.65       418
#    units_per_lot       0.01      0.67      0.02         6

#         accuracy                           0.68      5100
#        macro avg       0.59      0.69      0.59      5100
#     weighted avg       0.76      0.68      0.71      5100

# Conclusions

- Since we are looking for high levels of recall, model #1 seems to be the way to go! 
- Model #1 also had the best f1 score, though both models #2/#3 have a higher precision.
- While max_far benefitted from upsampling (ESPECIALLY in recall!), units_per_lot didn't quite budge throughout
    - Further research shows that upsampling max_far via the median value only increased by a factor of 2.

# Saving Model

In [None]:
import pickle

filename = 'attrib_classifier_model1.sav'
pickle.dump(model, open(filename, 'wb'))

In [58]:
# # load the model from disk
# loaded_model = pickle.load(open(filename, 'rb'))
# result = loaded_model.score(X_test, Y_test)
# print(result)