# Import libraries

In [1]:
import pandas as pd
import numpy as np
import time
from datetime import date

## Preprocessing

In [2]:
from sklearn.preprocessing import Normalizer, MinMaxScaler, OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline, make_pipeline
from imblearn.pipeline import Pipeline as imbPipeline

## Models

In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.neighbors import KNeighborsClassifier, LocalOutlierFactor
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, IsolationForest, \
    GradientBoostingClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis, LinearDiscriminantAnalysis

## Cross validation

In [4]:
from sklearn.model_selection import train_test_split, KFold, cross_val_score, \
    cross_validate, cross_val_predict, GridSearchCV, ParameterGrid, RepeatedStratifiedKFold

## Feature selection

In [5]:
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, chi2, RFE

## Performance measures

In [6]:
from sklearn.metrics import accuracy_score, auc, roc_curve, roc_auc_score as auc_score, confusion_matrix, \
    classification_report

# Loading data

In [7]:
train = pd.read_csv("autos_training_final.csv", sep="|")
samp = train.sample(frac=0.25).copy() # For performance reasons we might want to try out the code only on a sample

test = pd.read_csv("autos_testing_final.csv", sep="|")
sample_submission = pd.read_csv("autos_submission.csv", sep=",")

# Defining column groups

In [8]:
# Special columns
spec_cols = ['id', 'label', 'dateCrawled']

# Redundant columns
# These columns have either the same value across the dataset or contain only very little variation
redundant_cols = ['seller', 'offerType', 'nrOfPictures'] 

# relevant columns with DateTime type
date_cols = ['dateCreated', 'lastSeen']

# Numerical columns
num_cols = train.loc[:,~train.columns.isin(spec_cols + redundant_cols + date_cols)] \
                .select_dtypes(include=[int, float]) \
                .columns

# Categorical columns
cat_cols = train.select_dtypes(include=object).columns

# Attribute specific missing value codes

In [9]:
# The following columns have their own 'missing values' codes
col_nanval = [['yearOfRegistration', 1000],
              ['powerPS', 0],
              ['monthOfRegistration', 0]]

# Data cleaning functions

## Handling native missing data codes

In [10]:
def rep_nanvals(data, collist, inplace=False):
    """Replaces missing value codes with a random value from among the rest of values in the column.
    Takes the data and a list of column-missing code pairs.
    """
    start = time.time()
    print("\nReplacing data-specific missing codes...")  
    
    for col_nan in collist:
        col = data.loc[:, col_nan[0]]
        nanval = col_nan[1]
        col.replace(to_replace=nanval,
                    value=np.random.choice(col[col != nanval]),
                    inplace=inplace)
    
    end = time.time()
    print("Duration:{}".format(end - start))  
    
# Possible development: use sklearn's Impute() instead, predict missing values based on already existing data

## Missing values

In [11]:
def rep_nas(data, inplace=False):
    """Replaces NaN values with a random value from among the rest of values in the column.
    """
    start = time.time()
    print("\nReplacing NaN values...")    
    
    for col in data.isna().mean()[data.isna().any() == True].index:
        dcol = data.loc[:,col]
        dcol.fillna(value=np.random.choice(dcol[dcol.isna() == False]),
                               inplace=inplace)

    end = time.time()
    print("Duration:{}".format(end - start))        

# Possible development: use sklearn's Impute() instead, predict missing values based on already existing data

## Merging the registration date columns into a single registration date

In [12]:
def yr_mth(data, yrcol, mtcol, inplace=False):
    """Takes the data, and the year and month columns and merge them into a single "yr_mth" column in the
    YYYY.MM format.
    """
    data.loc[:,'yr_mth'] = data.loc[:,[yrcol, mtcol]] \
        .apply(lambda x: str(x[0])[:4] + "." + str(x[1])[:-2], axis=1)
    data.drop(columns=[yrcol, mtcol], inplace=inplace)
    
    #print(data.yr_mth)

## Splitting the datatype columns into separate date/time columns

In [13]:
def split_datetype(data, datcols, inplace=False):
    """Splitting datetype columns into separated year, month and day columns
    Takes the dataframe and the list of column names.
    Returns the dataframes with the new and without the old columns.
    """
    start = time.time()
    print("\nSplitting date columns...")

    for col in datcols:
        datcol = data.loc[:,col]
        data.loc[:,col + "_yr"] = datcol.apply(lambda x: int(str(x)[0:4]))
        data.loc[:,col + "_mt"] = datcol.apply(lambda x: int(str(x)[5:7]))
        data.loc[:,col + "_dy"] = datcol.apply(lambda x: int(str(x)[8:10]))
        #data.loc[:,col + "_hr"] = datcol.apply(lambda x: int(str(x)[11:13]))

    def calc_date(row):
        duration = date(row.lastSeen_yr,
                        row.lastSeen_mt,
                        row.lastSeen_dy) - date(row.dateCreated_yr,
                                                row.dateCreated_mt,
                                                row.dateCreated_dy)
        return duration.days

    dur = data.apply(lambda x: calc_date(x), axis=1)
    data.loc[:,'visiblePeriod'] = dur
    data.drop(columns=datcols, inplace=inplace)

    end = time.time()
    print("Duration:{}".format(end - start))

## Transforming categorical values into dummies

In [14]:
def cat_dummies(data):
    """Transforms all categorical attributes into dummy attributes.
    Takes a DataFrame.
    
    Returns the original DataFrame's without its categorical ('object' datatype) attributes.
    """
    start = time.time()
    print("\nCreating dummy variables...")
    dum_cols = data.loc[:,cat_cols] \
                   .columns.difference(spec_cols
                                       + redundant_cols
                                       + date_cols).drop('name')

    #print("data.columns:{}".format(data.columns))
    #print("dum_cols:{}".format(dum_cols))
    #print("Columns transformed to dummies: {}".format(dum_cols))

    dummies = pd.get_dummies(data.loc[:,dum_cols])
    #print("New dummy columns: {}".format(dummies.columns))
    w_dummies = pd.concat([data, dummies], axis=1)
    w_dummies = w_dummies.select_dtypes(exclude=[object]) \
                         .dropna(axis=1)

    end = time.time()
    print("{} seconds".format(end - start))
        
    return w_dummies

## Handling outliers

In [15]:
def drop_outliers(data):
    """Uses IsolationForest to find outliers in the data and then drops them.
    Takes a DataFrame.
    
    Returns the DataFrame without the outliers.
    """
    start = time.time()
    print("\nHandling outliers...")

    clf = IsolationForest(max_samples='auto',
                          random_state=2425,
                          contamination=0.01,
                          verbose=False,
                          n_jobs=-1)

    clf.fit(data)
    isof = clf.predict(data)

    data.loc[:,'Outlier'] = pd.Series(isof)
    outl_rows = data[data.Outlier == -1].index

    data.drop(outl_rows, inplace=True)
    data.drop(columns='Outlier', inplace=True)

    end = time.time()
    print("{} seconds".format(end-start))

## Handling correlated attributes

In [16]:
def drop_cors(data):
    """Drops correlated attributes from a DataFrame.
    
    Takes a DataFrame.
    
    Returns the DataFrame without the correlated attributes.
    """
    start = time.time()
    print("\nChecking for attribute correlations...")

    # Solution from Chris Albon
    corrs = data.corr().abs()
    upper = corrs.where(np.triu(np.ones(corrs.shape), k=1).astype(np.bool))

    to_drop = [column for column in upper.columns if any(upper[column] > 0.8)]
    print("Correlated attributes to drop:{}".format(to_drop))

    data.drop(data.loc[:,to_drop], axis=1, inplace=True)

    end = time.time()
    print("{} seconds".format(end-start))
    
    return to_drop

## Automatic Feature Selection

In [17]:
def fsel(X, y, test_data, method='univarv', model=RandomForestClassifier, k=5):
    """Selects features from from the dataset based on various methods.
    Parameters
        X: DataFrame
            The predictor attributes
        
        y: DataFrame or Series
            The label attribute to predict
        
        test_data: DataFrame
            The test data
        
        method='univar'
            The method to identify the selected features:
                'univar': Univariate feature selection based on chi-squared test.
                'rfe': Recursive Feature Elimination
                'pca': Principal Component Analysis            
                'fimp': Feature importance
        
        model=RandomForestClassifier
            Predictor model (applicable for 'rfe' and 'fimp')
        
        k=5
            Depending on the chosen method:
                'univar', 'rfe', 'fimp': The number of best features selected.
                'pca': The number of components to create from the attributes.
    
    Returns
        The transformed training (X) and test datasets (with the best attributes or with the new components).
    """
    start = time.time()
    print("\nAutomatic feature selection...")
    
    if method == 'univar':
        fsel_mod = SelectKBest(score_func=chi2, k=k)
        fsel_test = fsel_mod.fit(X, y)

        #print("Feature selection test scores:{}".format(fsel_test.scores_))
        features = fsel_test.transform(X)
        fnames = pd.DataFrame(data={'attribute': X.columns,
                                    'chi2': fsel_test.scores_}) \
                                    .sort_values(by='chi2', ascending=False) \
                                    .head(k).attribute.values
        print("\nSelected Features:\n{}".format(fnames))
        
        X_train = features
        test = fsel_test.transform(test_data)

    elif method == 'rfe':
        fsel_mod = RFE(model(verbose=1, n_jobs=-1), k)
        fsel_test = fsel_mod.fit(X, y)
        fnames = X.columns[fsel_test.support_]
        print("Selected Features:{}".format(fnames))
        
        X_train = X.loc[:,fnames].as_matrix()
        test = fsel_test.transform(test_data)

    elif method == 'pca':
        fsel_mod = PCA(n_components=k)
        fsel_test = fsel_mod.fit(X)
        print("Explained Variance:{}".format(fsel_test.explained_variance_ratio_))
        #print("Fit components:{}".format(fsel_test.components_))
        
        X_train = fsel_test.transform(X)
        test = fsel_test.transform(test_data)

    elif method == 'fimp':
        fsel_mod = model()
        fsel_test = fsel_mod.fit(X, y)
        fnames = pd.DataFrame(data={'attribute': X.columns,
                                    'fimp': fsel_test.feature_importances_}) \
                                    .sort_values(by='fimp', ascending=False) \
                                    .head(k).attribute.values
        print("Selected Features:{}".format(fnames))
        
        X_train = X.loc[:,fnames].as_matrix()
        test = test_data.loc[:,fnames].as_matrix()

    else:
        print("The {} method does not exist!".format(method))
    
    end = time.time()
    print("{} seconds".format(end - start))
    
    return X_train, test

## Standardization

In [18]:
def standardize(train, test):
    """\nStandardize the traning and test datasets.
    """
    start = time.time()
    print("\nStandardization...")
    
    scaler = StandardScaler()
    scaler.fit(train)

    train = scaler.transform(train)
    test = scaler.transform(test)

    end = time.time()
    print("{} seconds".format(end - start))
    
    return train, test

## Normalization

In [19]:
def normalize(train, test):
    """Normalize the training and test datasets."""
    start = time.time()
    print("\nNormalization...")
    
    scaler = MinMaxScaler()
    scaler.fit(train)

    train = scaler.transform(train)
    test = scaler.transform(test)

    print("{} seconds".format(time.time() - start))
    return train, test

## The preprocessing function tying all the above together

In [20]:
def preprocess(train, test,
               dropcors=True,
               outl_drop=True,
               repnas=False,
               repnanvals=True,
               #oneregdate=False,
               usedatecols=True,
               usecats=True,
               featsel=False,
               stand=True,
               norm=False):
    """Runs preprocessing steps on both the train and test data.
    Takes the train and test data as DataFrames and the following parameters:
    
        dropcors=True,
            Drop correlated attributes.

        outl_drop=True,
            Drop outliers.

        repnas=False,
            Replace missing values of an attribute with a random value from the same column.

        repnanvals=True,
            Replace missing value codes of an attribute with a random value from the same column.

        #oneregdate=False,
        #    Merges registration date columns into a single attributes.

        usedatecols=True,
            Split DateTime columns into separate year and month attributes. If "False", drops them.

        usecats=True
            Transform categorical values into dummy attributes. If "False", drops them.
        
        featsel=False
            The automatic feature selection method:
            
            'univar': Univariate feature selection based on chi-squared test.
            'rfe': Recursive Feature Elimination
            'pca': Principal Component Analysis
            'fimp': Feature importance
        
        stand=True
            Strandardize the train and test datasets.
        
        norm=False
            Normalize the train and test datasets.

    Returns the following items:
    X_train
        The preprocessed training dataset

    y_train
        The target data for X_train

    X_test
        The preprocessed test dataset
    """
    
    cord_cols = []
    datasets = ['tr', 'te']
    training_rows = []

    X_train = np.array([])
    X_test = np.array([])
    
    y_train =  train.loc[:,['label']].copy()
    train = train.loc[:,train.columns.difference(['id', 'label'])]
    test = test.loc[:,test.columns.difference(['id'])]    
    
    
    for data in (train, test):
        #print("\nOriginal:{}".format(data.shape))
        #print("columns:\n{}\n".format(data.columns))

        if repnanvals == True: # Replacing data-specific missing codes
            rep_nanvals(data, col_nanval, inplace=True)
            #print("nanvals:{}".format(data.shape))
            #print("columns:\n{}\n".format(data.columns))

        if repnas == True: # Replacing NaN values
            rep_nas(data, inplace=True)
            #print("repnas:{}".format(data.shape))
            #print("columns:\n{}\n".format(data.columns))

        #if oneregdate == True: # Merging the registration date attributes into a single one
        #    yr_mth(data, 'yearOfRegistration', 'monthOfRegistration', inplace=True)
        #    #print("yr_mth:{}".format(data.shape))
        #    #print("columns:\n{}\n".format(data.columns))

        if usedatecols == True: # Splitting up the attributes with DateTime data type
            split_datetype(data, date_cols, inplace=True)
            #print("split_datetype:{}".format(data.shape))
            #print("columns:\n{}\n".format(data.columns))
            #print(data.dtypes)

        if usecats == True: # Using category attributes
            data = cat_dummies(data)
            #print("cat_dummies:{}".format(data.shape))
            #print("columns:\n{}\n".format(data.columns))
        else:
            num_cols = data.select_dtypes(include=np.number).columns
            data = data.loc[:,num_cols]

        if datasets.pop(0) == 'tr': # Steps for the training dataset
            if outl_drop == True:
                drop_outliers(data)
                #print("drop_outliers:{}".format(data.shape))

            if dropcors == True:
                cord_cols = drop_cors(data)
                #print("drop_cors:{}".format(data.shape))

            X_train = data
            train_idx = X_train.index
            #print("X_train columns:{}".format(data.columns))
            
        else: # Steps for the test dataset
            #if dropcors == True:
            #    data.drop(data.loc[:,cord_cols], axis=1, inplace=True)
            
            #print("drop_cors:{}".format(data.shape))
            #print("X_test columns:{}".format(data.columns))

            X_test = data

    # Dropping attributes from the train dataset which are not in the training data set
    X_train.drop(columns=X_train.columns[X_train.columns.isin(X_test.columns) == False], inplace=True)
    # X_test.drop(columns=X_test.columns[X_test.columns.isin(X_train.columns) == False], inplace=True)

    print("X_train.shape:{}".format(X_train.shape))
    print("X_test.shape:{}".format(X_test.shape))
    
    
    # Defining the training targets for the existing rows
    y_train = y_train.loc[train_idx, :].as_matrix().ravel()
    
    ## Feature selection
    if featsel in ('univar', 'rfe', 'pca', 'fimp'):
        X_train, X_test = fsel(X_train,
                               y_train,
                               test_data=X_test,
                               method='fimp',
                               model=RandomForestClassifier,
                               k=90)

    ## Standardization
    if stand == True:
        X_train, X_test = standardize(X_train, X_test)

    ## Normalization
    if norm == True:
        X_train, X_test = nstandormalize(X_train, X_test)
    
    return X_train, y_train, X_test
    
    # Uses or results:
    # y_train = train.loc[train_idx, 'label'].copy()
    # results = eval_models(models, X_train, y_train)
    # rf_gridCV_2 = param_search(X_train, y_train, RandomForestClassifier, rf_params)
    # res = rep_cv(X_train, y_train, rf_gridCV_2.best_estimator_)
    # get_1confs(X_train, y_train, X_test, rf_gridCV_2.best_estimator_)

# Modeling
## Models to try
A list of models (sometimes with additional parameters) which the `eval_models` function test on the data.

In [21]:
models = {"RandomForestClassifier": RandomForestClassifier(max_features=0.25, criterion="entropy"), # n_estimators=100
          "Gradient Boosting Classifier": GradientBoostingClassifier(max_features='log2', n_estimators=500),
          "AdaBoost Classifier": AdaBoostClassifier(),
          "LinearDiscriminantAnalysis": LinearDiscriminantAnalysis(),
          "LogisticRegression": LogisticRegression(C=1.5, penalty='l1'),
          "GaussianNB": GaussianNB(),
          "Decision Tree Cl - Gini": DecisionTreeClassifier(),
          #"Extra Tree Classifier": ExtraTreeClassifier(criterion='entropy', max_features='log2'),
          #"MLPClassifier": MLPClassifier(),
          #"K-NN_3 Classifier ": KNeighborsClassifier(3),
          #"QuadraticDiscriminantAnalysis": QuadraticDiscriminantAnalysis(), # very bad performance
          #"Linear SVC": LinearSVC(),
          }

# Execution
## Preprocessing and train/test split

First we process the sample

In [22]:
X_train, y_train, X_test = preprocess(samp, test,
                                      dropcors=True,
                                      outl_drop=True,
                                      repnas=False,
                                      repnanvals=True,
                                      #oneregdate=False,
                                      usedatecols=True,
                                      usecats=True,
                                      featsel='fimp',
                                      stand=True,
                                      norm=False)


Replacing data-specific missing codes...
Duration:0.04809141159057617

Splitting date columns...
Duration:9.843006372451782

Creating dummy variables...


Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  if __name__ == '__main__':
Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return self._getitem_tuple(key)


1.260565996170044 seconds

Handling outliers...
73.38892340660095 seconds

Checking for attribute correlations...
Correlated attributes to drop:['lastSeen_dy', 'gearbox_manuell', 'model_601', 'model_cooper', 'model_fortwo', 'model_niva', 'model_ypsilon']
17.810032606124878 seconds

Replacing data-specific missing codes...
Duration:0.014958620071411133

Splitting date columns...
Duration:15.263983011245728

Creating dummy variables...


Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  if __name__ == '__main__':
Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return self._getitem_tuple(key)


2.252896547317505 seconds
X_train.shape:(62386, 298)
X_test.shape:(107208, 306)

Automatic feature selection...
Selected Features:['postalCode' 'powerPS' 'yearOfRegistration' 'visiblePeriod'
 'dateCreated_dy' 'monthOfRegistration' 'kilometer'
 'notRepairedDamage_nein' 'notRepairedDamage_ja' 'lastSeen_mt'
 'gearbox_automatik' 'fuelType_benzin' 'vehicleType_limousine'
 'fuelType_diesel' 'vehicleType_cabrio' 'vehicleType_kombi'
 'dateCreated_mt' 'model_andere' 'brand_volkswagen'
 'vehicleType_kleinwagen' 'brand_bmw' 'brand_opel' 'brand_mercedes_benz'
 'model_golf' 'brand_audi' 'brand_ford' 'model_3er' 'vehicleType_coupe'
 'vehicleType_bus' 'brand_renault' 'vehicleType_suv' 'model_c_klasse'
 'brand_peugeot' 'brand_sonstige_autos' 'model_astra' 'model_passat'
 'model_a4' 'brand_fiat' 'model_corsa' 'model_a3' 'model_e_klasse'
 'model_polo' 'brand_mazda' 'fuelType_lpg' 'brand_citroen' 'brand_seat'
 'model_focus' 'model_5er' 'brand_skoda' 'model_transporter'
 'model_fiesta' 'model_a6' 'brand_s

# Modeling

## Evaluating models

On the sample we evaluate the models.

In [23]:
def eval_models(models, X, y):
    """Evaluates selected model's prediction power on the cross-validated training datasets.
    Takes
        models: Dictionary of "model_name": model() pairs.
        X: predictor attributes
        y: target attribute
    """
    results = []
    for model in models:
        #print("Running {}...".format(model))
        #start = time.time()

        result = []
        result.append(model)

        model_score = cross_validate(models[model],
                                    X,
                                    y,
                                    scoring=['accuracy', # Evaluation metrics
                                             'f1_micro',
                                             'f1_macro',
                                             'roc_auc'],
                                    cv=kfold, # Cross-validation method
                                    n_jobs=-1,
                                    verbose=0,
                                    return_train_score=False)

        acc_mean = model_score['test_accuracy'].mean()
        acc_std = model_score['test_accuracy'].std()
        auc_mean = model_score['test_roc_auc'].mean()
        auc_std = model_score['test_roc_auc'].std()

        print("\n{}:\n\tAccuracy: {} ({})".format(model, \
                                                  acc_mean, \
                                                  auc_std))
        print("\tROC AUC: {} ({})".format(auc_mean, auc_std))

        #if model != "Gradient Boosting Classifier":
        f1_micro_mean = model_score['test_f1_micro'].mean()
        f1_micro_std = model_score['test_f1_micro'].std()
        f1_macro_mean = model_score['test_f1_macro'].mean()
        f1_macro_std = model_score['test_f1_macro'].std()
        print("\tF1 micro: {} ({})".format(f1_micro_mean, f1_micro_std))
        print("\tF1 macro: {} ({})".format(f1_macro_mean, f1_macro_std))

        #result = result + [acc_mean, acc_std, auc_mean, auc_std]

        dur = model_score['fit_time'].sum() + model_score['score_time'].sum()

        print("\tduration:{}\n".format(dur))
        #result.append(dur)

        #results.append(result)

In [24]:
seed = np.random.randint(1000)
print("\nseed:{}".format(seed))
kfold = KFold(n_splits=5, random_state=seed)


seed:258


In [25]:
results = eval_models(models, X_train, y_train)
#print("results:{}".format(results))


RandomForestClassifier:
	Accuracy: 0.7010547170859134 (0.0038141402971097224)
	ROC AUC: 0.752308720427382 (0.0038141402971097224)
	F1 micro: 0.7010547170859134 (0.003005916258065509)
	F1 macro: 0.6737741586005092 (0.00224231619875556)
	duration:20.35589027404785


Gradient Boosting Classifier:
	Accuracy: 0.7158337116861783 (0.004818091392073195)
	ROC AUC: 0.7789780838631672 (0.004818091392073195)
	F1 micro: 0.7158337116861784 (0.0042321057946571984)
	F1 macro: 0.694464501482503 (0.004298639013616878)
	duration:183.16096210479736


AdaBoost Classifier:
	Accuracy: 0.6993717139649152 (0.004817365667097355)
	ROC AUC: 0.7578757958070237 (0.004817365667097355)
	F1 micro: 0.6993717139649152 (0.003797241848309439)
	F1 macro: 0.6766020002702782 (0.003457388945043552)
	duration:52.436386823654175


LinearDiscriminantAnalysis:
	Accuracy: 0.6737088661794537 (0.003713448881459202)
	ROC AUC: 0.7259069047881604 (0.003713448881459202)
	F1 micro: 0.6737088661794537 (0.0027735838029534607)
	F1 macro: 0

# Grid search
After a couple of evaluation rounds we finally choose the Random Forest Classifier. It seems that with the increase of training data it tends to give the best results.

Accordingly we ran GridSearch on the whole dataset with various parameters of the Random Forest Classifier.

Based on the previous runs, increasing the number of estimators creates better results but also becomes very time consuming. For the sake of demonstration I set this parameter to 200.

## Preprocessing the whole dataset:

In [26]:
X_train, y_train, X_test = preprocess(train, test,
                                      dropcors=True,
                                      outl_drop=True,
                                      repnas=False,
                                      repnanvals=True,
                                      #oneregdate=False,
                                      usedatecols=True,
                                      usecats=True,
                                      featsel='fimp',
                                      stand=True,
                                      norm=False)


Replacing data-specific missing codes...
Duration:0.03609895706176758

Splitting date columns...
Duration:35.71002531051636

Creating dummy variables...


Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  if __name__ == '__main__':
Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return self._getitem_tuple(key)


5.280261278152466 seconds

Handling outliers...
271.8466818332672 seconds

Checking for attribute correlations...
Correlated attributes to drop:['lastSeen_dy', 'gearbox_manuell', 'model_601', 'model_cooper', 'model_cuore', 'model_fortwo', 'model_niva', 'model_ypsilon']
71.99041891098022 seconds

Replacing data-specific missing codes...
Duration:0.008980512619018555

Splitting date columns...
Duration:15.24856972694397

Creating dummy variables...
2.1421334743499756 seconds
X_train.shape:(247650, 298)
X_test.shape:(107208, 306)

Automatic feature selection...
Selected Features:['postalCode' 'powerPS' 'yearOfRegistration' 'dateCreated_dy'
 'visiblePeriod' 'monthOfRegistration' 'kilometer'
 'notRepairedDamage_nein' 'notRepairedDamage_ja' 'lastSeen_mt'
 'gearbox_automatik' 'vehicleType_cabrio' 'fuelType_benzin'
 'vehicleType_limousine' 'fuelType_diesel' 'vehicleType_kombi'
 'vehicleType_kleinwagen' 'model_andere' 'brand_volkswagen'
 'dateCreated_mt' 'model_golf' 'brand_bmw' 'brand_opel'
 '

In [27]:
def param_search(train, target, model, pars):
    """Runs a grid search on the data.
    Takes
        train: The predictor attributes
        
        target: The target attribute
        
        model: the used model
        
        pars: The parameters on which it runs the grid search.
    """
    start = time.time()

    print("Starting grid search for parameters...")
    grid_m = GridSearchCV(
        estimator=model(),
        param_grid=pars.param_grid,
        scoring=['accuracy', # Metrics to measure
                 'f1_micro',
                 'f1_macro',
                 'roc_auc'],
        n_jobs=-1,
        verbose=2,
        refit='roc_auc', # The chosen 'primary' metric, which it uses to refit.
        iid=False)
    grid_m.fit(train, target)
    print(grid_m.cv_results_)

    return grid_m
    print("Duration:{}".format(time.time() - start))

In [28]:
rf_params = ParameterGrid({
    #'criterion': ['gini', 'entropy'],
    #'max_depth': [None], # , 5, 4, 3
    'max_features': [0.35, 0.3], #'sqrt', 'log2'
    'min_samples_split': [3, 4], #0.01
    'n_estimators': [200], # 1000
    # 'min_samples_leaf': [0.01],
    #'min_weight_fraction_leaf': [0.01],
    #'min_impurtity_decrease': [0, 0.1, 0.2, 0.3],
    'n_jobs': [-1],
    'random_state': [seed],
    'verbose': [0],
    #'class_weight': [],
    })

In [29]:
rf_gridCV_2 = param_search(X_train, y_train, RandomForestClassifier, rf_params)
print(rf_gridCV_2.best_score_)

Starting grid search for parameters...
Fitting 3 folds for each of 4 candidates, totalling 12 fits
[CV] max_features=0.35, min_samples_split=3, n_estimators=200, n_jobs=-1, random_state=258, verbose=0 
[CV] max_features=0.35, min_samples_split=3, n_estimators=200, n_jobs=-1, random_state=258, verbose=0 
[CV]  max_features=0.35, min_samples_split=3, n_estimators=200, n_jobs=-1, random_state=258, verbose=0, total= 6.3min
[CV] max_features=0.35, min_samples_split=3, n_estimators=200, n_jobs=-1, random_state=258, verbose=0 
[CV]  max_features=0.35, min_samples_split=3, n_estimators=200, n_jobs=-1, random_state=258, verbose=0, total= 6.4min
[CV] max_features=0.35, min_samples_split=4, n_estimators=200, n_jobs=-1, random_state=258, verbose=0 
[CV]  max_features=0.35, min_samples_split=3, n_estimators=200, n_jobs=-1, random_state=258, verbose=0, total= 6.4min
[CV] max_features=0.35, min_samples_split=4, n_estimators=200, n_jobs=-1, random_state=258, verbose=0 
[CV]  max_features=0.35, min_sam

[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed: 46.1min finished


{'mean_fit_time': array([347.59467467, 346.38253935, 307.64177672, 372.41725794]), 'std_fit_time': array([ 2.44666275,  3.15953105,  0.80083965, 45.35246728]), 'mean_score_time': array([36.26458557, 35.13813106, 37.26083771, 42.07505377]), 'std_score_time': array([0.51741804, 0.24752135, 0.49895347, 4.34395885]), 'param_max_features': masked_array(data=[0.35, 0.35, 0.3, 0.3],
             mask=[False, False, False, False],
       fill_value='?',
            dtype=object), 'param_min_samples_split': masked_array(data=[3, 4, 3, 4],
             mask=[False, False, False, False],
       fill_value='?',
            dtype=object), 'param_n_estimators': masked_array(data=[200, 200, 200, 200],
             mask=[False, False, False, False],
       fill_value='?',
            dtype=object), 'param_n_jobs': masked_array(data=[-1, -1, -1, -1],
             mask=[False, False, False, False],
       fill_value='?',
            dtype=object), 'param_random_state': masked_array(data=[258, 258, 258, 

We can save the results into csv.

In [30]:
pd.DataFrame(rf_gridCV_2.cv_results_).to_csv("GridS_test_results.csv")



### Repeated cross validation
We use RepeatedStratifiedKFold to try to safeguard against instroducting bias thanks to the data structure.

In [31]:
def rep_cv(X_train, y_train, model):
    start = time.time()
    print("Running model...")
    model = model
    kfold = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=seed)

    rep_cv_res = cross_validate(model,
                                X_train,
                                y_train,
                                scoring=['accuracy', 'roc_auc'],
                                cv=kfold,
                                n_jobs=-1,
                                verbose=2,
                                return_train_score=False)

    print("Test accuracy:{}".format(rep_cv_res['test_accuracy'].mean()))
    print("Test ROC AUC:{}".format(rep_cv_res['test_roc_auc'].mean()))
    end = time.time()
    print("Duration:{}".format(end - start))

    return rep_cv_res

In [32]:
res = rep_cv(X_train, y_train, rf_gridCV_2.best_estimator_)

Running model...
[CV]  ................................................................
[CV]  ................................................................
[CV] ................................................. , total= 9.9min
[CV]  ................................................................
[CV] ................................................. , total= 9.9min
[CV]  ................................................................
[CV] ................................................. , total= 9.7min
[CV]  ................................................................
[CV] ................................................. , total= 9.9min
[CV]  ................................................................
[CV] ................................................. , total= 9.0min
[CV]  ................................................................
[CV] ................................................. , total= 9.0min
[CV]  ......................................................

[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed: 72.7min finished


Test accuracy:0.7579823654024065
Test ROC AUC:0.8293120846617436
Duration:4364.496382236481


We can save the results of the Repeated cross validaton into csv.

In [33]:
pd.DataFrame(res).to_csv("rep_cv_res_test.csv")

# Getting the 1 confidences

In [34]:
def get_1confs(X_train, y_train, X_test, model):
    """Gets the confidences of 1 values as predicted by the model on the test dataset.
    Takes
        X_train: The predictor attributes of the training dataset
        y_train: The target attributes of the training dataset
        X_test: The predictor attributes of the test dataset
        model: the used model
        
    It writes the results into a csv.
    """
    print("X_test.shape:{}".format(X_test.shape))
    model = model
    
    model.fit(X_train, y_train)

    confs = model.predict_proba(X_test)[:,1]
    confs = pd.DataFrame([test.id, confs.reshape(-1,)]).transpose()
    confs.rename(columns={"Unnamed 0": 'label'}, inplace=True)
    confs.to_csv("late.csv", index=False)

In [35]:
#get_1confs(X_train, y_train, X_test, rf_gridCV_2.best_estimator_)