In [1]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, chi2

from sklearn.preprocessing import LabelEncoder

from sklearn.metrics import roc_auc_score


In [2]:
train = pd.read_csv('../data/train.csv')
students = pd.read_csv('../data/Student.csv')
internship = pd.read_csv('../data/Internship.csv')
test = pd.read_csv('../data/test.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [38]:
X = train[['Internship_ID', 'Is_Part_Time']]
X.loc[:, 'is_hefty_compensation'] = train.Expected_Stipend.map(lambda x: int(x == '10K+'))
X.loc[:, 'is_6_month_programme'] = train.Minimum_Duration.map(lambda x: int( (x >= 5) and (x <= 6)))

y = train.Is_Shortlisted

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=44) 

## Classifier

In [185]:
clf = GradientBoostingClassifier()
pipeline = Pipeline([('clf', clf)])

In [186]:
pipeline.fit(X_train, y_train)

Pipeline(steps=[('clf', GradientBoostingClassifier(init=None, learning_rate=0.1, loss='deviance',
              max_depth=3, max_features=None, max_leaf_nodes=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              random_state=None, subsample=1.0, verbose=0,
              warm_start=False))])

In [187]:
predsTrain = pipeline.predict_proba(X_train)[:, 1]
predsTest = pipeline.predict_proba(X_test)[:, 1]

In [188]:
print 'AUC score on training set %f ' %(roc_auc_score(y_train, predsTrain))
print 'AUC score on test set %f ' %(roc_auc_score(y_test, predsTest))

AUC score on training set 0.691331 
AUC score on test set 0.680581 


In [189]:
pipeline.fit(X, y)

Pipeline(steps=[('clf', GradientBoostingClassifier(init=None, learning_rate=0.1, loss='deviance',
              max_depth=3, max_features=None, max_leaf_nodes=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              random_state=None, subsample=1.0, verbose=0,
              warm_start=False))])

In [90]:
test_subset = test[['Internship_ID', 'Is_Part_Time']]
test_subset.loc[:, 'is_hefty_compensation'] = test.Expected_Stipend.map(lambda x: int(x == '10K+'))
test_subset.loc[:, 'is_6_month_programme'] = test.Minimum_Duration.map(lambda x: int( (x >= 5) and (x <= 6)))

In [191]:
predictions = pipeline.predict_proba(test_subset)[:, 1]

In [192]:
submission = pd.read_csv('../data/submission.csv')
submission['Internship_ID'] = test.Internship_ID
submission['Student_ID'] = test.Student_ID
submission['Is_Shortlisted'] = predictions

In [193]:
submission.to_csv('../submissions/gbm.csv', index=False)

## Semi-supervised Learning

In [40]:
data = pd.concat([X, test_subset], axis=0)
labels = pd.concat([y, pd.Series([-1] * test_subset.shape[0])], axis=0)

In [81]:
from sklearn.semi_supervised import LabelPropagation, LabelSpreading

label_prop_model = LabelSpreading(n_neighbors=5, gamma=25)
random_unlabeled_points = np.where(np.random.random_integers(0, 1, size=len(data)))

labels_with_missing_values = np.copy(labels)
labels_with_missing_values[random_unlabeled_points] = -1

scaler = StandardScaler()
data = scaler.fit_transform(data.astype(np.float))

label_prop_model.fit(data[:7500], labels_with_missing_values[:7500])

LabelSpreading(alpha=0.2, gamma=25, kernel='rbf', max_iter=30, n_neighbors=5,
        tol=0.001)

In [103]:
test_subset_scaled = scaler.fit_transform(test_subset.astype(np.float))

In [104]:
predictions = label_prop_model.predict_proba(test_subset_scaled)

In [106]:
predictions = predictions[:, 1]

In [107]:
submission = pd.read_csv('../data/submission.csv')
submission['Internship_ID'] = test.Internship_ID
submission['Student_ID'] = test.Student_ID
submission['Is_Shortlisted'] = predictions

In [108]:
submission.to_csv('../submissions/semi_supervised_learning.csv', index=False)