In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

import seaborn as sns

%matplotlib inline

import warnings
warnings.filterwarnings('ignore')



In [18]:
# load training and test data
train = pd.read_csv('data/train_processed_new.csv')
test = pd.read_csv('data/test_processed_new.csv')

In [19]:
# balanced dataset
%run scripts/helper.py
train = balanced_sample(train)

In [20]:
# response variable ( sample )
y = train.solved_status

In [21]:
# preprocess skills feature

# determine unique skills
skills = set()
for m in train.skills:
    skills.update(g for g in m.split('|'))
skills = sorted(skills)

#make a column for each skill
for skill in skills:
    train[skill] = [int(skill in s.split('|')) for s in train.skills]

In [22]:
# do same calculation for test set as well
for skill in skills:
    test[skill] = [int(skill in s.split('|')) for s in test.skills]

In [23]:
# determine problem tags
problem_tags = set()
for tag1, tag2, tag3, tag4, tag5 in zip(train.tag1, train.tag2, train.tag3, train.tag4, train.tag5):
    problem_tags.update([tag1, tag2, tag3, tag4, tag5])
problem_tags = sorted(problem_tags)

In [24]:
problem_tags[0] = 'Not Specified'

In [25]:
# make a column for each problem type
for problem_tag in problem_tags:
    train[problem_tag] = [int(problem_tag in tags) for tags in train[['tag1', 'tag2', 'tag3', 'tag4', 'tag5']].values]

In [26]:
for problem_tag in problem_tags:
    test[problem_tag] = [int(problem_tag in tags) for tags in test[['tag1', 'tag2', 'tag3', 'tag4', 'tag5']].values]

## Modelling

In [27]:
%run scripts/features.py
%run scripts/models.py
%run scripts/eval.py

In [28]:
model1 = build_extreme_gradient_boosting(train, test)
# model2 = build_logistic_regression_model(train, test)
# model3 = build_random_forest_classifier(train, test)
# model = build_logistic_regression_model(train, test)
# model = build_knn_classifier(train, test)
# model = build_extra_trees_classifier(train, test)

In [29]:
scores = eval_models([model1], train, y)

accuracy score: 0.820626
combined score: 0.820626
accuracy score: 0.814366
combined score: 0.814366
accuracy score: 0.797913
combined score: 0.797913
accuracy score: 0.818058
combined score: 0.818058
accuracy score: 0.817095
combined score: 0.817095



In [30]:
print 'Mean: %f and Standard Deviation: %f' %(np.mean(scores), np.std(scores))

Mean: 0.813612 and Standard Deviation: 0.008101


In [31]:
predsTrain, predsTest, y_train, y_test = analyze_output(train)

Accuracy on the training set              precision    recall  f1-score   support

        0.0       0.83      0.83      0.83     33274
        1.0       0.83      0.83      0.83     33180

avg / total       0.83      0.83      0.83     66454
 

Accuracy on the test set              precision    recall  f1-score   support

        0.0       0.82      0.82      0.82      8260
        1.0       0.82      0.82      0.82      8354

avg / total       0.82      0.82      0.82     16614
 



In [None]:
# confusion matrix
from sklearn.metrics import confusion_matrix

In [None]:
confusion_matrix(y_test, predsTest)

In [None]:
misclassified = train.ix[(y_test != predsTest).index.values]
misclassified['preds'] = predsTest

In [None]:
misclassified_class_0 = misclassified[misclassified.solved_status == 0]

In [None]:
misclassified_class_0[['accuracy', 'solved_status', 'preds']]

In [None]:
sns.pointplot(x="solved_status", y="accuracy", data=misclassified);

In [89]:
# train model
import time

start_time = time.time()

# model1.fit(train, y)
# model2.fit(train, y)
# model3.fit(train, y)
model1.fit(train_sample, y_sample)

elapsed_time = time.time() - start_time

print 'It took %f seconds to train the model ' %(elapsed_time)

It took 118.402000 seconds to train the model 


In [90]:
# predictions1  = model1.predict(test)
# predictions2 = model2.predict(test)
# predictions3 = model3.predict(test)

# ensemble_preds = np.array(majority_voting([predictions1, predictions2, predictions3]))
predictions = model1.predict(test)

In [91]:
# submissions
submission_df = pd.DataFrame({'Id': test.id.values, 'solved_status': predictions.astype(int)})
submission_df.to_csv('submissions/thirty_ninth_xgb_num_attempts_at_problem.csv', index=False)