# Attribute Classifier

### Goals:

- Create a classification model that, when given a raw document, finds the probability that a given line holds relevance to a land use attribute.

- Provide MTC with yet another metric-drive analytical tool to determine the standardization in the structure of policy documents from any jurisdiction.

### Procedure:

1. Preprocessing:
    - Transform raw text version of policy document into a table
    - Each row represents a line of the document.
    - Initial features - city, line of the policy


2. Feature Selection:
    - Tokenize each row using spaCy and lemmatize all tokens
    - Extract character count, character count, and average word length as numerical features
    - Encode cities via mean frequency
    - Vectorize the lemmatized tokens such that the model can interpret them during training/testing
    - Encode labels in the training/test set
    
### Production Model:

- Provides the probability of any given line in a document belonging to each of the assumed land use classes
- Necessary Information Format:
    1. Raw text version of policy zoning document
    2. city_frequency.json - information of the count frequency of documents used during training stage
    3. decode_labels.json - connection between integer classes and their corresponding land use attribute

In [2]:
from sklearn.model_selection import StratifiedShuffleSplit
import json
import re
import numpy as np
import pandas as pd
import spacy
from sklearn.pipeline                import Pipeline, FeatureUnion
from sklearn.svm                     import LinearSVC      # baseline
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer
from   sklearn.ensemble              import RandomForestClassifier
from sklearn.preprocessing           import *
from sklearn.impute                  import SimpleImputer
from sklearn.compose                 import ColumnTransformer
nlp = spacy.load("en_core_web_lg")

def spacy_tokenizer(string: str) -> str:
    doc = nlp(string)
    new_string = " ".join([token.lemma_ for token in doc if not token.is_stop])
    return new_string

def numerical_features(X_df: pd.DataFrame) -> None:
    tokens = X_df['Context'].apply(lambda x: re.sub(r'[^\w\s]', '', spacy_tokenizer(x).strip()))
    X_df['Char Count'] = tokens.apply(lambda x: len(x))
    X_df['Word Count'] = tokens.apply(lambda x: len(x.split()))
    X_df['Avg Word Length'] = X_df['Char Count'] / X_df['Word Count']
    
def encode_cities_mean_frequency(X_df: pd.DataFrame) -> None:
    keys = X_df['City'].value_counts().index.values
    vals = (X_df['City'].value_counts() / len(X_df)).values
    encode_cities = dict(zip(keys, vals))
    # For production use, will need to save the frequency of cities in the case of new cities being included
    # when used.
    city_frequency = dict(zip(keys, vals * len(X_df)))
    

    X_df['City'] = X_df['City'].map(lambda x: encode_cities[x])
    
    return city_frequency
    
def encode_label(y_df: pd.DataFrame) -> None:
    encode_labels = {'max_dua'          : 0,
                     'minimum_lot_sqft' : 1,
                     'building_height'  : 2,
                     'units_per_lot'    : 3,
                     'max_far'          : 4, 
                     'none'             : 5
    }

    y_df['Attribute'] = y_df['Attribute'].map(lambda y: encode_labels[y])
    
def preprocess_pipeline(X_df, y_df):
    print("Context Numerical Analysis")
    # Update X_df with number variable analysis
    numerical_features(X_df)
    
    print("Hash Vectorizing...")
    # Tokenize the Context column into a sparse matrix
    vectorizer = HashingVectorizer(tokenizer = spacy_tokenizer, ngram_range=(1,1))
#     vectorizer = CountVectorizer(tokenizer = spacy_tokenizer, ngram_range=(1,1))
    sparse = vectorizer.fit_transform(X_df['Context'])    
    
    print("Encoding cities...")
    #encode the "cities" feature
    city_frequency = encode_cities_mean_frequency(X_df)
    
    print("Transforming sparse matrix...")
    # transform sparse CV matrix such that each dimension is given its own column
    # drop context and join X_df with sparse (dataframe)
    X_df = X_df.join(pd.DataFrame(sparse.todense())).drop(['Context'], axis=1)
    
    print("Encoding the labels...")
    #encode the labels
    encode_label(y_df)
    
    return X_df, y_df, city_frequency

def train_test_split(X_df, y_df, frac):
    X = np.array(X_df)
    y = np.array(y_df)
    skf = StratifiedShuffleSplit(n_splits=2, test_size=(1-frac))
    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        
    return X_train, X_test, y_train, y_test

In [3]:
X_from_pipeline = pd.read_csv('ML-Modeling-Data/X_from_pipeline.csv', index_col='Unnamed: 0')
y_from_pipeline = pd.read_csv('ML-Modeling-Data/y_from_pipeline.csv', index_col='Unnamed: 0')

In [4]:
assert X_from_pipeline.shape[0] == y_from_pipeline.shape[0]

In [None]:
X_df, y_df, city_frequency = preprocess_pipeline(X_from_pipeline, y_from_pipeline)

with open('ML-Modeling-Data/city_frequency.json', 'w') as outfile:
    json.dump(city_frequency, outfile)

Context Numerical Analysis
Hash Vectorizing...




Encoding cities...
Transforming sparse matrix...


In [None]:
X_train, X_val, y_train, y_val = train_test_split(X_df, y_df, frac=0.8)

In [105]:
assert X_train.shape[0] + X_val.shape[0] == X_df.shape[0]
assert y_train.shape[0] + y_val.shape[0] == y_df.shape[0]

In [121]:
from sklearn.ensemble                import GradientBoostingClassifier
from sklearn.model_selection         import RandomizedSearchCV
from sklearn.svm                     import LinearSVC
from sklearn                         import metrics
import matplotlib.pyplot             as plt


search_space = {'learning_rate'    : [0.1, 0.001, 0.0001, 0.00001],
                'max_depth'        : [2, 3, 4, 5],
                'min_samples_leaf' : [1, 2, 4, 6],
                'n_estimators'     : [10, 20, 50, 100, 150, 200, 500],
                'subsample'        : [0.2, 0.4, 0.5, 0.6, 0.8, 0.9]
                }

clf_random = RandomizedSearchCV(estimator=GradientBoostingClassifier(),
                                param_distributions=search_space,
                                n_iter=50,
                                cv=5,
                                n_jobs=-1,
                                verbose=1)

best_model = clf_random.fit(X_train, y_train)
best_model.best_estimator_.get_params()

Fitting 5 folds for each of 50 candidates, totalling 250 fits


  return f(*args, **kwargs)


{'ccp_alpha': 0.0,
 'criterion': 'friedman_mse',
 'init': None,
 'learning_rate': 0.1,
 'loss': 'deviance',
 'max_depth': 5,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 500,
 'n_iter_no_change': None,
 'random_state': None,
 'subsample': 0.5,
 'tol': 0.0001,
 'validation_fraction': 0.1,
 'verbose': 0,
 'warm_start': False}

In [122]:
y_pred = best_model.predict(X_val)

encode_labels = {'max_dua'          : 0,
                 'minimum_lot_sqft' : 1,
                 'building_height'  : 2,
                 'units_per_lot'    : 3,
                 'max_far'          : 4, 
                 'none'             : 5
}

print(metrics.classification_report(y_pred, y_val, target_names=encode_labels.keys()))

                  precision    recall  f1-score   support

         max_dua       0.88      0.80      0.83      1801
minimum_lot_sqft       0.83      0.75      0.79      1295
 building_height       0.74      0.79      0.76       622
   units_per_lot       0.63      0.76      0.69       366
         max_far       0.65      0.80      0.72       481
            none       0.75      0.82      0.79       535

        accuracy                           0.78      5100
       macro avg       0.74      0.79      0.76      5100
    weighted avg       0.79      0.78      0.79      5100



In [1]:
# Classification report using CountVectorizer:

#                   precision    recall  f1-score   support

#          max_dua       0.88      0.80      0.83      1801
# minimum_lot_sqft       0.83      0.75      0.79      1295
#  building_height       0.74      0.79      0.76       622
#    units_per_lot       0.63      0.76      0.69       366
#          max_far       0.65      0.80      0.72       481
#             none       0.75      0.82      0.79       535

#         accuracy                           0.78      5100
#        macro avg       0.74      0.79      0.76      5100
#     weighted avg       0.79      0.78      0.79      5100

In [124]:
import pickle

filename = 'attrib_classifier_model_cvalidated.sav'
pickle.dump(best_model, open(filename, 'wb'))

# Production Model Procedure/ Use Case:

- Our use case will be uploading the policy information for Calistoga. See the "Attribute_Classifier_RD" notebook for more information on rationale.