In [1]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, chi2
from xgboost import XGBClassifier
from sklearn.cross_validation import cross_val_score, KFold

from sklearn.preprocessing import LabelEncoder

from sklearn.metrics import roc_auc_score

from datetime import datetime

In [None]:
train = pd.read_csv('../data/train.csv', low_memory=False, parse_dates=['Earliest_Start_Date'])
students = pd.read_csv('../data/Student.csv')
internship = pd.read_csv('../data/Internship.csv')
test = pd.read_csv('../data/test.csv')

In [None]:
# one_hot_encoded_skill_features = internship.columns[13:]
internship = internship[internship.columns.drop(one_hot_encoded_skill_features)]

## Merge with internship details

In [None]:
train_with_internship = pd.merge(train, internship, how='left', on='Internship_ID')
test_with_internship = pd.merge(test, internship, how='left', on='Internship_ID')

## Merge with student details as well

In [None]:
def feature_engineer(df):
    df['num_experience'] = df.shape[0]
    df['num_exp_in_job'] = (df.Experience_Type  == 'job').sum()
    df['num_awards'] = (df.Experience_Type  == 'award').sum()
    df['num_previous_internships'] = (df.Experience_Type == 'internship').sum()
    
    return df

students_cleaned = students.groupby('Student_ID').apply(feature_engineer)
students_cleaned = students_cleaned.groupby('Student_ID').first()
students_cleaned = students_cleaned.reset_index()

train_merged = pd.merge(train_with_internship, students_cleaned, on='Student_ID', how='left')
test_merged = pd.merge(test_with_internship, students_cleaned, on='Student_ID', how='left')

In [None]:
# save these engineered files so that we don't have to create them every time
train_merged.to_csv('../data/train_merged.csv', index=False)
test_merged.to_csv('../data/test_merged.csv', index=False)

## Load prepared datasets

In [2]:
# date_columns = ['Earliest_Start_Date', 'Start_Date', 'Start Date', 'End Date', 'Internship_deadline']

train_merged = pd.read_csv('../data/train_merged.csv')
test_merged = pd.read_csv('../data/test_merged.csv')

## Missing values

In [3]:
train_merged.loc[:, 'Skills_required'] = train_merged.Skills_required.fillna('-1')
test_merged.loc[:, 'Skills_required'] = test_merged.Skills_required.fillna('-1')

In [4]:
train_merged.Stipend1 = train_merged.Stipend1.fillna(train_merged.Stipend1.mean())

## Feature Engineering

In [5]:
train_merged.loc[:, 'is_stipend_performance_or_unpaid'] = train_merged.Stipend_Type.map(lambda x: int(x in [1, 2]))
test_merged.loc[:, 'is_stipend_performance_or_unpaid'] = test_merged.Stipend_Type.map(lambda x: int(x in [1, 2])) 

In [6]:
train_merged.loc[:, 'Num_Skills_Required'] = train_merged.Skills_required.map(lambda x: len(set(x.split(','))))
test_merged.loc[:, 'Num_Skills_Required'] = train_merged.Skills_required.map(lambda x: len(set(x.split(','))))

In [7]:
def salary_mapping(salary):
    if salary < 2000:
        return 'No Expectations'
    elif salary >= 2000 and salary < 5000:
        return '2-5K'
    elif salary >= 5000 and salary < 10000:
        return '5-10K'
    else:
        return '10K+'

In [8]:
train_merged.loc[:, 'Stipend_level'] = train_merged.Stipend1.map(salary_mapping)
test_merged.loc[:, 'Stipend_level'] = test_merged.Stipend1.map(salary_mapping)

In [15]:
avg_stipend_train = train_merged.Stipend1.mean()
avg_stipend_test = test_merged.Stipend1.mean()

get_stipend_train = lambda x: int((x['Stipend_Type'] in ['fixed', 'performance']) and x['Stipend1'] < avg_stipend_train)
get_stipend_test = lambda x: int((x['Stipend_Type'] in ['fixed', 'performance']) and x['Stipend1'] < avg_stipend_test)

train_merged.loc[:, 'less_than_avg_salary'] = train_merged.apply(get_stipend_train, axis=1)
test_merged.loc[:, 'less_than_avg_salary'] = test_merged.apply(get_stipend_test, axis=1)    

In [19]:
def check_if_expectations_match(row):
    expected_stipend = row['Expected_Stipend']
    stipend_level = row['Stipend_level']
    
    if expected_stipend == 'No Expectations':
        return 1
    elif expected_stipend == '2-5K':
        if stipend_level in ['2-5K', '5-10K', '10K+']:
            return 1
        else:
            return 0
    elif expected_stipend == '5-10K':
        if stipend_level in ['5-10K', '10K+']:
            return 1
        else:
            return 0
    elif expected_stipend == '10K+':
        if stipend_level == '10K+':
            return 1
        else:
            return 0

train_merged.loc[:, 'expectations_match'] = train_merged[['Expected_Stipend', 'Stipend_level']].apply(check_if_expectations_match, axis=1)
test_merged.loc[:, 'expectations_match'] = test_merged[['Expected_Stipend', 'Stipend_level']].apply(check_if_expectations_match, axis=1)

## Label Encoding Categorical Features

In [20]:
categorical_features = train_merged.select_dtypes(include=['object']).columns.drop(['Earliest_Start_Date', 'Start_Date',
                                                                                    'Start Date', 'End Date', 'Internship_deadline'])

for feature in categorical_features:
    lbl = LabelEncoder()
    feature_range = pd.concat([train_merged[feature], test_merged[feature]], axis=0)
    
    lbl.fit(feature_range)
    train_merged.loc[:, feature] = lbl.transform(train_merged[feature])
    test_merged.loc[:, feature] = lbl.transform(test_merged[feature])

  flag = np.concatenate(([True], aux[1:] != aux[:-1]))
  return aux[:-1][aux[1:] == aux[:-1]]


In [21]:
train_merged = train_merged.fillna(999)
test_merged = test_merged.fillna(999)

In [None]:
# train_merged = train_merged.replace(-999, 999)
# test_merged = test_merged.replace(-999, 999)

## Data Preparation

In [22]:
features_to_drop = ['Earliest_Start_Date', 'Start_Date', 'Start Date', 'End Date',
                                      'Is_Shortlisted', 'Internship_deadline', 'Internship_Profile',
                                      'Profile', 'Stipend2', 'PG_scale', 'UG_Scale']

features_to_drop.extend(list(train_merged.columns[20:293]))

features = train_merged.columns.drop(features_to_drop)

In [23]:
X = train_merged[features]
y = train_merged.Is_Shortlisted

In [24]:
random_sample = np.random.randint(0, X.shape[0], size=15000)

In [25]:
X_train = X.iloc[random_sample]
y_train = y.iloc[random_sample]

In [26]:
test_index = X[~X.Internship_ID.isin(X_train.Internship_ID)].index

In [27]:
X_test = X.ix[test_index]
y_test = y.ix[test_index]

In [28]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=44) 

In [29]:
print X_train.shape, X_test.shape, y_train.shape, y_test.shape

(15000, 36) (10989, 36) (15000L,) (10989L,)


## Classification pipeline

In [39]:
select = SelectKBest(chi2, k=25)
# scaler = StandardScaler()

# clf = LogisticRegression(C=1., penalty='l1', class_weight='auto')
# clf = RandomForestClassifier(n_estimators=150, max_depth=6, min_samples_split=3, n_jobs=-1)
# clf = GradientBoostingClassifier()
clf = XGBClassifier()

# pipeline = Pipeline([('scaler', scaler), ('clf', clf)])
pipeline = Pipeline([('clf', clf)])
# pipeline = Pipeline([('select', select), ('clf', clf)])

In [40]:
pipeline.fit(X_train, y_train)

Pipeline(steps=[('clf', XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1))])

## Cross validation score

In [None]:
scores = cross_val_score(pipeline, X_train, y_train, cv=3, scoring='roc_auc')

In [None]:
print 'mean score %.2f and std %.2f ' %(scores.mean(), scores.std())

In [None]:
train_merged.Skills_required.head()

## Feature Importance

In [32]:
# RF feature importance
forest = pipeline.get_params()['clf']
importances = forest.feature_importances_ 
indices = np.argsort(importances)[::-1] 

for f in range(X_train.shape[1]):
    print("%2d) %-*s %f" % (f + 1, 30, X_train.columns[f], importances[indices[f]])) 

 1) Internship_ID                  0.100920
 2) Student_ID                     0.100140
 3) Expected_Stipend               0.062676
 4) Minimum_Duration               0.060714
 5) Preferred_location             0.050570
 6) Is_Part_Time                   0.049225
 7) Skills_required                0.040464
 8) Internship_Type                0.040312
 9) Internship_Location            0.038752
10) Internship_category            0.038656
11) No_of_openings                 0.035086
12) Stipend_Type                   0.034022
13) Stipend1                       0.033303
14) Internship_Duration(Months)    0.032804
15) Institute_Category             0.028950
16) Institute_location             0.022588
17) hometown                       0.022531
18) Degree                         0.022338
19) Stream                         0.021629
20) Current_year                   0.019678
21) Year_of_graduation             0.018295
22) Performance_PG                 0.016937
23) Performance_UG              

In [None]:
# GBM feature importance
forest = pipeline.get_params()['clf']
importances = forest.feature_importances_ 
indices = np.argsort(importances)[::-1] 

for f in range(X_train.shape[1]):
    print("%2d) %-*s %f" % (f + 1, 30, X_train.columns[f], importances[indices[f]])) 

## Predictions

In [41]:
predsTrain = pipeline.predict_proba(X_train)[:, 1]
predsTest = pipeline.predict_proba(X_test)[:, 1]

In [42]:
print 'AUC score on training set %f ' %(roc_auc_score(y_train, predsTrain))
print 'AUC score on test set %f ' %(roc_auc_score(y_test, predsTest))

AUC score on training set 0.776711 
AUC score on test set 0.581524 


## Train on full dataset

In [None]:
pipeline.fit(X, y)

## Predict on the test set

In [None]:
features_to_drop = ['Earliest_Start_Date', 'Start_Date', 'Start Date', 'End Date',
                    'Internship_deadline', 'Internship_Profile',
                    'Profile', 'Stipend2', 'PG_scale', 'UG_Scale']

features_to_drop.extend(list(train_merged.columns[20:293]))
features = test_merged.columns.drop(features_to_drop)

In [None]:
test_features = test_merged[features]

In [None]:
predictions = pipeline.predict_proba(test_features)[:, 1]

In [None]:
submission = pd.read_csv('../data/submission.csv')
submission['Internship_ID'] = test_merged.Internship_ID
submission['Student_ID'] = test_merged.Student_ID
submission['Is_Shortlisted'] = predictions

In [None]:
submission.to_csv('../submissions/redate_submission_second.csv', index=False)