In [1]:
import pandas as pd
import numpy as np

def blight_model():
    
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import roc_auc_score
    from sklearn.ensemble import GradientBoostingClassifier
    from sklearn.naive_bayes import GaussianNB
    from sklearn.svm import SVC
    from sklearn.model_selection import GridSearchCV
    
    # Read in the data sets
    train = pd.read_csv('train.csv', encoding = "ISO-8859-1")
    test = pd.read_csv('test.csv', encoding = "ISO-8859-1")
    addresses = pd.read_csv('addresses.csv', encoding = "ISO-8859-1")
    
    # Merge the addresses and lat/lons into the train and test DataFrames
    train = pd.merge(train, addresses, how='inner', left_on='ticket_id', right_on='ticket_id')
    test = pd.merge(test, addresses, how='inner', left_on='ticket_id', right_on='ticket_id')
    
    # Remove train data with 'compliance' == NaN
    train = train.dropna(subset=['compliance'])
    train['compliance'] = train['compliance'].astype(int)
    
    
    # Convert NaN to "NA" in columns to convert to type category
    convert_columns={'country': 'category',
                     'non_us_str_code': 'category',
                     'compliance': 'category',
                     'state': 'category',
                     'zip_code': 'category'
                    }
    
    for df in [test, train]:
        for col, col_type in convert_columns.items():
            if col in df:
                if col_type == 'category':
                    df[col] = df[col].replace(np.nan, "NA", regex=True).astype(col_type)
                elif col_type == 'int':
                    df[col] = df[col].replace(np.nan, 0, regex=True).astype(col_type)
    
   
    # Remove unneeded columns from X sets    
    common_cols_to_drop = ['agency_name', 'inspector_name', 'mailing_address_str_number',
                           'violator_name', 'violation_street_number', 'violation_street_name',
                           'mailing_address_str_name', 'address', 'admin_fee', 'violation_zip_code',
                           'state_fee', 'late_fee', 'ticket_issued_date', 'hearing_date', 'violation_description',
                           'fine_amount', 'clean_up_cost', 'disposition', 'grafitti_status',
                           'violation_code', 'city']
    train_cols_to_drop = ['payment_status', 'payment_date', 'balance_due', 'payment_amount'] + common_cols_to_drop
    train = train.drop(train_cols_to_drop, axis=1).set_index('ticket_id')
    test = test.drop(common_cols_to_drop, axis=1).set_index('ticket_id')
    y_train = train['compliance']  
    X_train_cols_to_drop = ['compliance', 'compliance_detail', 'collection_status']
    train = train.drop(X_train_cols_to_drop, axis=1)
    
    # Convert cetegory columns to integers
    cat_columns = train.select_dtypes(['category']).columns
    for df in [test, train]:
        df[cat_columns] = df[cat_columns].apply(lambda x: x.cat.codes)
        
    # Create X_train data

    X_train = train.copy()
    
    grid_values = {'learning_rate': [0.01, 0.1, 1], 'max_depth': [3, 4, 5]}
    clf = GradientBoostingClassifier(random_state = 0)
    grid_clf_auc = GridSearchCV(clf, param_grid = grid_values, scoring = 'roc_auc')
    grid_clf_auc.fit(X_train, y_train)
    probs = grid_clf_auc.predict_proba(test)[:, 1]
    result = pd.Series(probs, index=test.index)
    scorer=grid_clf_auc.best_score_
    return result,scorer

predictions,score=blight_model()

  if self.run_code(code, result):


In [2]:
predictions.head()

ticket_id
284932    0.205176
285362    0.094864
285361    0.228263
285338    0.205176
285346    0.228263
dtype: float64

In [3]:
score

0.7523577544339461