In [1]:
import os
import numpy as np
import pandas as pd

In [2]:
# Root S3 location under which the staged datasets read by this notebook live.
STAGING_BUCKET_URL = 's3://udacity-capstone-staging-data'

In [4]:
# Load the "ml_data" parquet dataset from the staging bucket into a DataFrame.
pandas_df = pd.read_parquet(path="/".join((STAGING_BUCKET_URL, "ml_data")))

In [5]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import ColumnTransformer

In [6]:
# Feature groups; each group is routed to its own preprocessing branch below.
numeric_features = [
    'country_citizenship_gdp',
    'country_residence_gdp',
    'num_previous_stays'
]
categorical_features = [
    'country_citizenship',
    'country_residence',
    'gender',
    'visa_type',
    'destination_state'
]

ordinal_features = [
    'age',
    'month'
]

all_features = numeric_features + categorical_features + ordinal_features


# Column-wise preprocessing:
#   * numeric     -> fill missing with 0, then standardise
#   * ordinal     -> fill missing with 0, otherwise passed through unchanged
#   * categorical -> fill missing with the 'MISSING' token, then one-hot encode
# The output columns follow the order of the branches below (numeric, ordinal,
# categorical), not the order of `all_features`.
preprocessing_pipeline = ColumnTransformer([
    (
        "numerical_features",
        Pipeline(steps=[
            # most of the countries that don't have a GDP are small countries, so 0 is a more logical choice
            ('imp', SimpleImputer(strategy='constant', fill_value=0)),
            ('sts', StandardScaler()),
        ]),
        numeric_features
    ),
    (
        "ordinal_features",
        Pipeline(steps=[
            ('imp', SimpleImputer(strategy='constant', fill_value=0)),
        ]),
        ordinal_features
    ),
    (
        "categorical_features",
        Pipeline(steps=[
            # we use missing_values=None because that's the representation we get when reading from parquet, not np.nan
            ('imp', SimpleImputer(strategy='constant', fill_value='MISSING', missing_values=None)),
            # Sparse output is the encoder's default, so it is not requested
            # explicitly: the `sparse` keyword was renamed to `sparse_output` in
            # scikit-learn 1.2 and removed in 1.4, where passing it raises TypeError.
            ('one_hot_encoder', OneHotEncoder(dtype=int)),
        ]),
        categorical_features
    )
])

TARGET_COLUMN = 'is_overstay'
# NOTE(review): the transformer is fitted on every row here, so the scaler
# statistics and one-hot categories are learned before any cross-validation
# split is made downstream -- confirm this is acceptable for the reported scores.
X = preprocessing_pipeline.fit_transform(pandas_df[all_features])
Y = pandas_df[TARGET_COLUMN].values

In [7]:
# Show the repr of the transformed feature matrix built by the preprocessing step.
X

<35743097x1244 sparse matrix of type '<class 'numpy.float64'>'
	with 357392378 stored elements in Compressed Sparse Row format>

In [None]:
# Fixed seed for the forest's bootstrap sampling and per-split feature
# sub-sampling, so that re-running this cell reproduces the same scores.
RANDOM_SEED = 42

# Hyper-parameter grid for the random forest. Every searched dimension currently
# holds a single value, so the search evaluates exactly one candidate.
param_grid = {
    "n_estimators": [10],
    "max_depth": [20],
    "max_features": ["sqrt"],
    "min_samples_leaf": [100],
    "criterion": ["gini"],
    # Dimensions that are currently disabled:
    #"min_samples_split": [2, 3, 10],
    #"bootstrap"        : [True, False],
    #"class_weight"        : ["balanced", None],
}

# Both the forest and the grid search request 2 workers.
clf = RandomForestClassifier(n_jobs=2, random_state=RANDOM_SEED)
# Candidates are ranked by cross-validated ROC AUC; verbose=2 logs each fit.
gs = GridSearchCV(clf, param_grid=param_grid, n_jobs=2, scoring='roc_auc', verbose=2)
gs.fit(X, Y)


Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] criterion=gini, max_depth=50, max_features=sqrt, min_samples_leaf=100, n_estimators=10 
[CV]  criterion=gini, max_depth=50, max_features=sqrt, min_samples_leaf=100, n_estimators=10, total=24.7min
[CV] criterion=gini, max_depth=50, max_features=sqrt, min_samples_leaf=100, n_estimators=10 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed: 24.7min remaining:    0.0s
