In [1]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, chi2
from xgboost import XGBClassifier
from sklearn.cross_validation import cross_val_score, KFold

from sklearn.preprocessing import LabelEncoder

from sklearn.metrics import roc_auc_score

from datetime import datetime

In [None]:
train = pd.read_csv('../data/train.csv', low_memory=False, parse_dates=['Earliest_Start_Date'])
students = pd.read_csv('../data/Student.csv')
internship = pd.read_csv('../data/Internship.csv')
test = pd.read_csv('../data/test.csv')

In [None]:
# one_hot_encoded_skill_features = internship.columns[13:]
internship = internship[internship.columns.drop(one_hot_encoded_skill_features)]

## Merge with internship details

In [None]:
train_with_internship = pd.merge(train, internship, how='left', on='Internship_ID')
test_with_internship = pd.merge(test, internship, how='left', on='Internship_ID')

## Merge with student details as well

In [None]:
def feature_engineer(df):
    df['num_experience'] = df.shape[0]
    df['num_exp_in_job'] = (df.Experience_Type  == 'job').sum()
    df['num_awards'] = (df.Experience_Type  == 'award').sum()
    df['num_previous_internships'] = (df.Experience_Type == 'internship').sum()
    
    return df

students_cleaned = students.groupby('Student_ID').apply(feature_engineer)
students_cleaned = students_cleaned.groupby('Student_ID').first()
students_cleaned = students_cleaned.reset_index()

train_merged = pd.merge(train_with_internship, students_cleaned, on='Student_ID', how='left')
test_merged = pd.merge(test_with_internship, students_cleaned, on='Student_ID', how='left')

In [None]:
# save these engineered files so that we don't have to create them every time
train_merged.to_csv('../data/train_merged.csv', index=False)
test_merged.to_csv('../data/test_merged.csv', index=False)

## Load prepared datasets

In [2]:
# date_columns = ['Earliest_Start_Date', 'Start_Date', 'Start Date', 'End Date', 'Internship_deadline']

train_merged = pd.read_csv('../data/train_merged.csv')
test_merged = pd.read_csv('../data/test_merged.csv')

## Missing values

In [3]:
train_merged.loc[:, 'Skills_required'] = train_merged.Skills_required.fillna('-1')
test_merged.loc[:, 'Skills_required'] = test_merged.Skills_required.fillna('-1')

In [4]:
train_merged.Stipend1 = train_merged.Stipend1.fillna(train_merged.Stipend1.mean())

## Feature Engineering

In [6]:
train_merged.loc[:, 'Num_Skills_Required'] = train_merged.Skills_required.map(lambda x: len(set(x.split(','))))
test_merged.loc[:, 'Num_Skills_Required'] = train_merged.Skills_required.map(lambda x: len(set(x.split(','))))

In [7]:
def salary_mapping(salary):
    if salary < 2000:
        return 'No Expectations'
    elif salary >= 2000 and salary < 5000:
        return '2-5K'
    elif salary >= 5000 and salary < 10000:
        return '5-10K'
    else:
        return '10K+'

In [8]:
train_merged.loc[:, 'Stipend_level'] = train_merged.Stipend1.map(salary_mapping)
test_merged.loc[:, 'Stipend_level'] = test_merged.Stipend1.map(salary_mapping)

In [9]:
avg_stipend_train = train_merged.Stipend1.mean()
avg_stipend_test = test_merged.Stipend1.mean()

train_merged.loc[:, 'less_than_avg_salary'] = train_merged.Stipend1.map(lambda x: int(x < avg_stipend_train))
test_merged.loc[:, 'less_than_avg_salary'] = test_merged.Stipend1.map(lambda x: int(x < avg_stipend_test))    

In [10]:
def check_if_expectations_match(row):
    expected_stipend = row['Expected_Stipend']
    stipend_level = row['Stipend_level']
    
    if expected_stipend == 'No Expectations':
        return 1
    elif expected_stipend == '2-5K':
        if stipend_level in ['2-5K', '5-10K', '10K+']:
            return 1
        else:
            return 0
    elif expected_stipend == '5-10K':
        if stipend_level in ['5-10K', '10K+']:
            return 1
        else:
            return 0
    elif stipend_level == '10K+':
        if stipend_level == '10K+':
            return 1
        else:
            return 0

train_merged.loc[:, 'expectations_match'] = train_merged[['Expected_Stipend', 'Stipend_level']].apply(check_if_expectations_match, axis=1)
test_merged.loc[:, 'expectations_match'] = test_merged[['Expected_Stipend', 'Stipend_level']].apply(check_if_expectations_match, axis=1)

## Label Encoding Categorical Features

In [12]:
categorical_features = train_merged.select_dtypes(include=['object']).columns.drop(['Earliest_Start_Date', 'Start_Date',
                                                                                    'Start Date', 'End Date', 'Internship_deadline'])

for feature in categorical_features:
    lbl = LabelEncoder()
    feature_range = pd.concat([train_merged[feature], test_merged[feature]], axis=0)
    
    lbl.fit(feature_range)
    train_merged.loc[:, feature] = lbl.transform(train_merged[feature])
    test_merged.loc[:, feature] = lbl.transform(test_merged[feature])

  flag = np.concatenate(([True], aux[1:] != aux[:-1]))
  return aux[:-1][aux[1:] == aux[:-1]]


In [13]:
train_merged = train_merged.fillna(999)
test_merged = test_merged.fillna(999)

In [None]:
# train_merged = train_merged.replace(-999, 999)
# test_merged = test_merged.replace(-999, 999)

## Data Preparation

In [14]:
features = train_merged.columns.drop(['Earliest_Start_Date', 'Start_Date', 'Start Date', 'End Date',
                                      'Is_Shortlisted', 'Internship_deadline', 'Internship_Profile',
                                      'Profile', 'Stipend2'])

In [15]:
X = train_merged[features]
y = train_merged.Is_Shortlisted

In [16]:
random_sample = np.random.randint(0, X.shape[0], size=15000)

In [17]:
X_train = X.iloc[random_sample]
y_train = y.iloc[random_sample]

In [18]:
test_index = X[~X.Internship_ID.isin(X_train.Internship_ID)].index

In [19]:
X_test = X.ix[test_index]
y_test = y.ix[test_index]

In [None]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=44) 

In [20]:
print X_train.shape, X_test.shape, y_train.shape, y_test.shape

(15000, 310) (10962, 310) (15000L,) (10962L,)


## Classification pipeline

In [76]:
select = SelectKBest(chi2, k=25)
# scaler = StandardScaler()

# clf = LogisticRegression(C=1., penalty='l1', class_weight='auto')
clf = RandomForestClassifier(n_estimators=500, max_depth=4, min_samples_split=3, n_jobs=-1)
# clf = GradientBoostingClassifier()
# clf = XGBClassifier(n_estimators=300, learning_rate=0.008, gamma=5, min_child_weight=5, subsample=0.6, colsample_bytree=0.7)

# pipeline = Pipeline([('scaler', scaler), ('clf', clf)])
# pipeline = Pipeline([('clf', clf)])
pipeline = Pipeline([('select', select), ('clf', clf)])

In [77]:
pipeline.fit(X_train, y_train)

Pipeline(steps=[('select', SelectKBest(k=25, score_func=<function chi2 at 0x0000000015B246D8>)), ('clf', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=4, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=3,
            min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))])

## Cross validation score

In [None]:
scores = cross_val_score(pipeline, X_train, y_train, cv=3, scoring='roc_auc')

In [None]:
print 'mean score %.2f and std %.2f ' %(scores.mean(), scores.std())

In [None]:
train_merged.Skills_required.head()

## Feature Importance

In [29]:
# RF feature importance
forest = pipeline.get_params()['clf']
importances = forest.feature_importances_ 
indices = np.argsort(importances)[::-1] 

for f in range(X_train.shape[1]):
    print("%2d) %-*s %f" % (f + 1, 30, X_train.columns[f], importances[indices[f]])) 

 1) Internship_ID                  0.092016
 2) Student_ID                     0.089282
 3) Expected_Stipend               0.066643
 4) Minimum_Duration               0.053414
 5) Preferred_location             0.034551
 6) Is_Part_Time                   0.034303
 7) Skills_required                0.034126
 8) Internship_Type                0.033592
 9) Internship_Location            0.033471
10) Internship_category            0.032111
11) No_of_openings                 0.030226
12) Stipend_Type                   0.028327
13) Stipend1                       0.027475
14) Internship_Duration(Months)    0.026960
15) PR                             0.023193
16) UI                             0.023133
17) Marketing                      0.022791
18) Media                          0.021317
19) Social                         0.021157
20) Design                         0.021086
21) Web                            0.019215
22) Development                    0.019011
23) Business                    

IndexError: index 50 is out of bounds for axis 0 with size 50

In [None]:
# GBM feature importance
forest = pipeline.get_params()['clf']
importances = forest.feature_importances_ 
indices = np.argsort(importances)[::-1] 

for f in range(X_train.shape[1]):
    print("%2d) %-*s %f" % (f + 1, 30, X_train.columns[f], importances[indices[f]])) 

## Predictions

In [78]:
predsTrain = pipeline.predict_proba(X_train)[:, 1]
predsTest = pipeline.predict_proba(X_test)[:, 1]

In [79]:
print 'AUC score on training set %f ' %(roc_auc_score(y_train, predsTrain))
print 'AUC score on test set %f ' %(roc_auc_score(y_test, predsTest))

AUC score on training set 0.691441 
AUC score on test set 0.582487 


## Train on full dataset

In [80]:
pipeline.fit(X, y)

Pipeline(steps=[('select', SelectKBest(k=25, score_func=<function chi2 at 0x0000000015B246D8>)), ('clf', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=4, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=3,
            min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))])

## Predict on the test set

In [81]:
test_features_columns = test_merged.columns.drop(['Earliest_Start_Date', 'Start_Date', 'Start Date', 'End Date',
                                                  'Internship_deadline', 'Internship_Profile',
                                                  'Profile', 'Stipend2'])

In [82]:
test_features = test_merged[test_features_columns]

In [83]:
predictions = pipeline.predict_proba(test_features)[:, 1]

In [84]:
submission = pd.read_csv('../data/submission.csv')
submission['Internship_ID'] = test_merged.Internship_ID
submission['Student_ID'] = test_merged.Student_ID
submission['Is_Shortlisted'] = predictions

In [85]:
submission.to_csv('../submissions/redate_submission_first.csv', index=False)