In [1]:
import pandas as pd
import numpy as np

import warnings
# Install a blanket 'ignore' filter: every warning category is silenced from
# here on. This keeps cell output clean, but also hides deprecation and
# numerical warnings raised by the libraries used below.
warnings.filterwarnings('ignore')

In [2]:
# Locations of the pre-processed splits, relative to the working directory.
TRAIN_CSV = 'data/train_processed.csv'
TEST_CSV = 'data/test_processed.csv'

# Read both splits into DataFrames (training first, then test).
train = pd.read_csv(TRAIN_CSV)
test = pd.read_csv(TEST_CSV)

In [3]:
# preprocess skills feature
#
# Each value of the `skills` column is a '|'-separated list of skill names.
# It is expanded into one 0/1 indicator column per distinct skill.

# Split every row exactly once. The resulting sets are reused both to discover
# the vocabulary and to fill each indicator column, instead of re-splitting
# the same string once per skill.
train_skill_sets = [set(entry.split('|')) for entry in train.skills]

# determine unique skills (sorted so the column order is deterministic)
skills = sorted(set().union(*train_skill_sets))

# make a column for each skill: 1 if the row lists that skill, else 0
for skill in skills:
    train[skill] = [int(skill in row_skills) for row_skills in train_skill_sets]

In [4]:
# do same calculation for test set as well
#
# The indicator columns are driven by `skills`, the vocabulary collected from
# the training set, so a skill that appears only in the test set gets no
# column and both frames end up with the same skill columns.

# Split every test row once up front rather than once per skill.
test_skill_sets = [set(entry.split('|')) for entry in test.skills]

for skill in skills:
    test[skill] = [int(skill in row_skills) for row_skills in test_skill_sets]

In [5]:
# Target column that the models are fitted against.
y = train['solved_status']

## Prepare a very simple model based on just the accuracy of the problem

In [6]:
# Execute the project's helper scripts; names they define become available in
# this notebook's namespace once each %run finishes.
# NOTE(review): the helpers used below (build_extreme_gradient_boosting,
# build_random_forest_classifier, eval_models, analyze_output, majority_voting)
# are not defined in this notebook, so they presumably come from these three
# scripts - confirm which script provides which.
# Keep comments on their own lines: text after a %run path is passed to the
# script as arguments.
%run scripts/features.py
%run scripts/models.py
%run scripts/eval.py

In [7]:
# Build the two ensemble members. Each builder receives both the training and
# the test frame.
# NOTE(review): the builders are defined outside this notebook; what they do
# with `test` (e.g. fitting encoders on the combined data) should be confirmed
# in the script that defines them.
model1 = build_extreme_gradient_boosting(train, test)
model2 = build_random_forest_classifier(train, test)
# Logistic-regression alternative, currently disabled:
# model = build_logistic_regression_model(train, test)

In [8]:
# Evaluate both models against the training data and keep the returned scores.
# The captured output below shows two "accuracy score" lines followed by one
# "combined score" line, repeated five times.
# NOTE(review): presumably one group per cross-validation fold, with the
# combined score coming from the two models' merged predictions - confirm in
# the script that defines eval_models.
scores = eval_models([model1, model2], train, y)

accuracy score: 0.859033
accuracy score: 0.852700
combined score: 0.853833
accuracy score: 0.861467
accuracy score: 0.855900
combined score: 0.857733
accuracy score: 0.862033
accuracy score: 0.855967
combined score: 0.857433
accuracy score: 0.860233
accuracy score: 0.854967
combined score: 0.856133
accuracy score: 0.863533
accuracy score: 0.854933
combined score: 0.857300



In [None]:
# NOTE(review): analyze_output is not defined in this notebook (presumably it
# comes from one of the %run scripts) and this cell has no execution count, so
# it was not run in the saved session - confirm it is still needed.
analyze_output(train)

In [9]:
# train model
import time

# Fit both ensemble members on the full training frame and measure the
# wall-clock time for the two fits together.
# NOTE(review): `train` is passed whole, including the raw `skills` string and
# the `solved_status` target column; presumably the models built by the helper
# scripts select their own feature columns - confirm the target is excluded.
start_time = time.time()
model1.fit(train, y)
model2.fit(train, y)
elapsed_time = time.time() - start_time

# A single parenthesised argument prints the same text under both the Python 2
# print statement and the Python 3 print function, so the cell now runs on
# either interpreter.
print('It took %f seconds to train the model ' % elapsed_time)

It took 110.416000 seconds to train the model 


In [10]:
# Score the test set with each fitted model.
predictions1 = model1.predict(test)
predictions2 = model2.predict(test)

# Merge the per-model predictions with the project's majority_voting helper
# and store the result as a NumPy array for the submission step.
votes = majority_voting([predictions1, predictions2])
ensemble_preds = np.array(votes)

In [None]:
# submissions
submission_df = pd.DataFrame({'Id': test.id.values, 'solved_status': ensemble_preds.astype(int)})
submission_df.to_csv('submissions/seventeen_xgb.csv', index=False)