In [1]:
import pandas as pd
import sklearn
import sklearn.model_selection
import sklearn.linear_model
import sklearn.metrics
import sklearn.svm
import xgboost

In [2]:
# Read the labelled training features and the unlabelled test features
# from CSV files in the working directory.
data, data_test = (
    pd.read_csv(csv_name)
    for csv_name in ('train_features.csv', 'test_features.csv')
)

In [4]:
# One fixed seed, reused for the split below and for the models in later cells.
SEED = 63467

# Keep 10% of the labelled rows aside as a validation split.
data_train, data_valid = sklearn.model_selection.train_test_split(
    data,
    test_size=0.1,
    random_state=SEED,
)

Apart from the first column, every column is either a target or a feature.

In [5]:
# Columns are selected by position: 1-6 are the targets, 7 onwards are the
# features. Column 0 is skipped (presumably the row id — verify against the CSV).
target_cols, feature_cols = data_train.columns[1:7], data_train.columns[7:]

Let's try some classifiers

In [None]:
# Fit one classifier for a single target on the training split.
# Flip `use_xgboost` to compare boosted trees with a logistic-regression baseline.
use_xgboost = True

target_name = 'toxic'
Y = data_train[target_name]
X = data_train[feature_cols]

if use_xgboost:
    model = xgboost.XGBClassifier(
        n_jobs = 32,
        # `random_state` is the supported name; `seed` is its deprecated alias.
        random_state = SEED,
        max_depth = 5,
        n_estimators = 600
    )
else:
    # Seed the baseline too, so both branches are equally repeatable.
    model = sklearn.linear_model.LogisticRegression(random_state = SEED)

# `X`, `Y`, `target_name` and `model` are read again by the cells that follow.
model.fit(X, Y)

In [None]:
# ROC AUC of the fitted model on the held-out validation split for `target_name`.
# Column 1 of predict_proba is the second class (presumably the positive label).
actual = data_valid[target_name]
predictions = model.predict_proba(data_valid[feature_cols])[:, 1]
cv_score = sklearn.metrics.roc_auc_score(y_true=actual, y_score=predictions)
cv_score

In [None]:
# Coarse sweep over tree depth x number of trees for XGBoost.
# Each setting is fitted on the training split and scored by ROC AUC on the
# validation split; one line is printed per setting.
for max_depth, n_estimators in [
    (depth, rounds) for depth in (3, 6, 9) for rounds in (250, 500, 750)
]:
    model = xgboost.XGBClassifier(
        max_depth=max_depth,
        n_estimators=n_estimators,
        n_jobs=32,
        seed=SEED,
    )
    model.fit(X, Y)
    actual = data_valid[target_name]
    predictions = model.predict_proba(data_valid[feature_cols])[:, 1]
    cv_score = sklearn.metrics.roc_auc_score(actual, predictions)
    print(max_depth, n_estimators, cv_score)

In [None]:
# Follow-up sweep over hand-picked (max_depth, n_estimators) pairs with deeper
# trees and more estimators, scored the same way on the validation split.
for max_depth, n_estimators in zip((10, 11, 12, 15), (850, 950, 1000, 1300)):
    model = xgboost.XGBClassifier(
        max_depth=max_depth,
        n_estimators=n_estimators,
        n_jobs=32,
        seed=SEED,
    )
    model.fit(X, Y)
    actual = data_valid[target_name]
    predictions = model.predict_proba(data_valid[feature_cols])[:, 1]
    cv_score = sklearn.metrics.roc_auc_score(actual, predictions)
    print(max_depth, n_estimators, cv_score)

Refit on all the labelled data, predict on the test set, and build the submission

In [None]:
# Build the submission: for every target column, refit XGBoost on ALL labelled
# rows (training + validation) and store the predicted probability from
# column 1 of predict_proba for each test row.
#
# This cell deliberately uses its own names (`final_*`, `target`) instead of
# rebinding `X`, `Y`, `target_name` and `model`. Those globals belong to the
# training/validation cells; overwriting them with full-data values would make
# any later re-run of a validation cell fit on rows that include the
# validation split and score the wrong target.
submission = pd.DataFrame()
submission['id'] = data_test['id']

# The feature matrices do not depend on the target, so slice them once.
final_X = data[feature_cols]
final_X_test = data_test[feature_cols]

for target in target_cols:
    print(target)
    final_model = xgboost.XGBClassifier(
        n_jobs = 32,
        # `random_state` is the supported name; `seed` is its deprecated alias.
        random_state = SEED,
        # NOTE(review): newer xgboost releases replace `silent` with
        # `verbosity` — confirm against the installed version before changing.
        silent = False,
        max_depth = 12,
        n_estimators = 1000
    )
    final_model.fit(final_X, data[target])
    submission[target] = final_model.predict_proba(final_X_test)[:,1]

toxic


In [None]:
# Write the predictions to disk without the DataFrame index column.
submission.to_csv(path_or_buf='submission.csv', index=False)