In [1]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, chi2
from xgboost import XGBClassifier

from sklearn.preprocessing import LabelEncoder

from sklearn.metrics import roc_auc_score

from datetime import datetime

In [2]:
train = pd.read_csv('../data/train.csv')
students = pd.read_csv('../data/Student.csv')
internship = pd.read_csv('../data/Internship.csv')
test = pd.read_csv('../data/test.csv')

  interactivity=interactivity, compiler=compiler, result=result)


## Merge with internship details

In [3]:
train_with_internship = pd.merge(train, internship, how='left', on='Internship_ID')
test_with_internship = pd.merge(test, internship, how='left', on='Internship_ID')

In [4]:
train_with_internship_clean = train_with_internship.dropna(axis=1)
test_with_internship_clean = test_with_internship.dropna(axis=1)

## Merge with student details as well

In [5]:
students_cleaned = students.groupby('Student_ID').first()
students_cleaned = students_cleaned.reset_index()

train_with_student = pd.merge(train_with_internship_clean, students_cleaned, on='Student_ID', how='left')
test_with_student = pd.merge(test_with_internship_clean, students_cleaned, on='Student_ID', how='left')

In [6]:
train_merged = train_with_student.dropna(axis=1)
test_merged = test_with_student.dropna(axis=1)

In [7]:
categorical_features = train_merged.select_dtypes(include=['object']).columns

## Label Encoding Categorical Features

In [12]:
for feature in categorical_features[1:]:
    lbl = LabelEncoder()
    feature_range = pd.concat([train_merged[feature], test_merged[feature]], axis=0)
    
    lbl.fit(feature_range)
    train_merged.loc[:, feature] = lbl.transform(train_merged[feature])
    test_merged.loc[:, feature] = lbl.transform(test_merged[feature])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


## Data Preparation

In [13]:
X = train_merged[train_merged.columns.drop(['Earliest_Start_Date', 'Is_Shortlisted'])]
y = train_merged.Is_Shortlisted

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=44) 

## Classification pipeline

In [16]:
select = SelectKBest(chi2, k=80)
clf = RandomForestClassifier(n_estimators=100, min_samples_split=5, criterion='entropy', n_jobs=-1)
# clf = XGBClassifier(n_estimators=100, learning_rate=0.1, max_depth=4)

pipeline = Pipeline([('select', select), ('clf', clf)])

In [17]:
pipeline.fit(X_train, y_train)

Pipeline(steps=[('select', SelectKBest(k=80, score_func=<function chi2 at 0x0000000015ACA6D8>)), ('clf', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=5,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))])

In [18]:
predsTrain = pipeline.predict_proba(X_train)[:, 1]
predsTest = pipeline.predict_proba(X_test)[:, 1]

In [19]:
print 'AUC score on training set %f ' %(roc_auc_score(y_train, predsTrain))
print 'AUC score on test set %f ' %(roc_auc_score(y_test, predsTest))

AUC score on training set 0.999692 
AUC score on test set 0.880927 


## Train on full dataset

In [20]:
pipeline.fit(X, y)

Pipeline(steps=[('select', SelectKBest(k=80, score_func=<function chi2 at 0x0000000015ACA6D8>)), ('clf', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=5,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))])

## Predict on the test set

In [21]:
test_features = test_merged[test_merged.columns.drop('Earliest_Start_Date')]

In [22]:
predictions = pipeline.predict_proba(test_features)[:, 1]

In [24]:
submission = pd.read_csv('../data/submission.csv')
submission['Internship_ID'] = test.Internship_ID
submission['Student_ID'] = test.Student_ID
submission['Is_Shortlisted'] = predictions

In [25]:
submission.to_csv('../submissions/student_features_rf.csv', index=False)