In [109]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, chi2

from sklearn.preprocessing import LabelEncoder

from sklearn.metrics import roc_auc_score

from datetime import datetime

In [2]:
train = pd.read_csv('../data/train.csv')
students = pd.read_csv('../data/Student.csv')
internship = pd.read_csv('../data/Internship.csv')
test = pd.read_csv('../data/test.csv')

  interactivity=interactivity, compiler=compiler, result=result)


## Merge with student details

In [129]:
train_student_merged = pd.merge(train, students, how='left', on='Student_ID')
test_student_merged = pd.merge(test, students, how='left', on='Student_ID')

In [114]:
now = datetime.now()
current_year = now.year

In [119]:
X = train_student_merged[['Internship_ID', 'Is_Part_Time']]
X.loc[:, 'is_hefty_compensation'] = train_student_merged.Expected_Stipend.map(lambda x: int(x == '10K+'))
X.loc[:, 'is_6_month_programme'] = train_student_merged.Minimum_Duration.map(lambda x: int( (x >= 5) and (x <= 6)))
X.loc[:, 'recent_gradutate'] = train_student_merged.Year_of_graduation.map(lambda x: int(abs( x - current_year ) <= 2))

y = train_student_merged.Is_Shortlisted

In [120]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=44) 

## Classifier

In [121]:
clf = RandomForestClassifier()
pipeline = Pipeline([('clf', clf)])

In [122]:
pipeline.fit(X_train, y_train)

Pipeline(steps=[('clf', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))])

In [123]:
predsTrain = pipeline.predict_proba(X_train)[:, 1]
predsTest = pipeline.predict_proba(X_test)[:, 1]

In [124]:
print 'AUC score on training set %f ' %(roc_auc_score(y_train, predsTrain))
print 'AUC score on test set %f ' %(roc_auc_score(y_test, predsTest))

AUC score on training set 0.945218 
AUC score on test set 0.917698 


## Train on full dataset

In [125]:
pipeline.fit(X, y)

Pipeline(steps=[('clf', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))])

In [148]:
test_subset = test_student_merged[['Internship_ID', 'Is_Part_Time', 'Student_ID']]
test_subset.loc[:, 'is_hefty_compensation'] = test_student_merged.Expected_Stipend.map(lambda x: int(x == '10K+'))
test_subset.loc[:, 'is_6_month_programme'] = test_student_merged.Minimum_Duration.map(lambda x: int( (x >= 5) and (x <= 6)))
test_subset.loc[:, 'recent_gradutate'] = test_student_merged.Year_of_graduation.map(lambda x: int(abs( x - current_year ) <= 2))

In [149]:
features = ['Internship_ID', 'Is_Part_Time', 'is_hefty_compensation',
            'is_6_month_programme', 'recent_gradutate']

In [151]:
test_subset.loc[:, 'predictions'] = pipeline.predict_proba(test_subset[features])[:, 1]

In [193]:
predictions = test_subset.groupby(['Internship_ID', 'Student_ID']).predictions.mean()

In [203]:
predictions_subset = []

for intern_id, student_id in zip(test.Internship_ID, test.Student_ID):
    predictions_subset.append(predictions.ix[(intern_id, student_id)])

In [165]:
import matplotlib.pyplot as plt
%matplotlib inline

In [205]:
submission = pd.read_csv('../data/submission.csv')
submission['Internship_ID'] = test.Internship_ID
submission['Student_ID'] = test.Student_ID
submission['Is_Shortlisted'] = predictions_subset

In [206]:
submission.to_csv('../submissions/recent_graduate.csv', index=False)