In [88]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, chi2
from xgboost import XGBClassifier
from sklearn.cross_validation import cross_val_score, KFold

from sklearn.preprocessing import LabelEncoder

from sklearn.metrics import roc_auc_score

from datetime import datetime

In [None]:
train = pd.read_csv('../data/train.csv', low_memory=False, parse_dates=['Earliest_Start_Date'])
students = pd.read_csv('../data/Student.csv')
internship = pd.read_csv('../data/Internship.csv')
test = pd.read_csv('../data/test.csv')

In [None]:
# one_hot_encoded_skill_features = internship.columns[13:]
internship = internship[internship.columns.drop(one_hot_encoded_skill_features)]

## Merge with internship details

In [None]:
train_with_internship = pd.merge(train, internship, how='left', on='Internship_ID')
test_with_internship = pd.merge(test, internship, how='left', on='Internship_ID')

## Merge with student details as well

In [None]:
def feature_engineer(df):
    df['num_experience'] = df.shape[0]
    df['num_exp_in_job'] = (df.Experience_Type  == 'job').sum()
    df['num_awards'] = (df.Experience_Type  == 'award').sum()
    df['num_previous_internships'] = (df.Experience_Type == 'internship').sum()
    
    return df

students_cleaned = students.groupby('Student_ID').apply(feature_engineer)
students_cleaned = students_cleaned.groupby('Student_ID').first()
students_cleaned = students_cleaned.reset_index()

train_merged = pd.merge(train_with_internship, students_cleaned, on='Student_ID', how='left')
test_merged = pd.merge(test_with_internship, students_cleaned, on='Student_ID', how='left')

In [None]:
# save these engineered files so that we don't have to create them every time
train_merged.to_csv('../data/train_merged.csv', index=False)
test_merged.to_csv('../data/test_merged.csv', index=False)

## Load prepared datasets

In [184]:
date_columns = ['Earliest_Start_Date', 'Start_Date', 'Start Date', 'End Date', 'Internship_deadline']

train_merged = pd.read_csv('../data/train_merged.csv', parse_dates=date_columns)
test_merged = pd.read_csv('../data/test_merged.csv', parse_dates=date_columns)

## Missing values

In [185]:
train_merged.loc[:, 'Skills_required'] = train_merged.Skills_required.fillna('-1')
test_merged.loc[:, 'Skills_required'] = test_merged.Skills_required.fillna('-1')

In [186]:
train_merged.Stipend1 = train_merged.Stipend1.fillna(train_merged.Stipend1.mean())

## Feature Engineering

In [187]:
train_merged.loc[:, 'is_stipend_performance_or_unpaid'] = train_merged.Stipend_Type.map(lambda x: int(x in [1, 2]))
test_merged.loc[:, 'is_stipend_performance_or_unpaid'] = test_merged.Stipend_Type.map(lambda x: int(x in [1, 2])) 

In [188]:
train_merged.loc[:, 'Num_Skills_Required'] = train_merged.Skills_required.map(lambda x: len(set(x.split(','))))
test_merged.loc[:, 'Num_Skills_Required'] = train_merged.Skills_required.map(lambda x: len(set(x.split(','))))

In [189]:
def salary_mapping(salary):
    if salary < 2000:
        return 'No Expectations'
    elif salary >= 2000 and salary < 5000:
        return '2-5K'
    elif salary >= 5000 and salary < 10000:
        return '5-10K'
    else:
        return '10K+'

In [190]:
train_merged.loc[:, 'Stipend_level'] = train_merged.Stipend1.map(salary_mapping)
test_merged.loc[:, 'Stipend_level'] = test_merged.Stipend1.map(salary_mapping)

In [191]:
avg_stipend_train = train_merged.Stipend1.mean()
avg_stipend_test = test_merged.Stipend1.mean()

get_stipend_train = lambda x: int((x['Stipend_Type'] in ['fixed', 'performance']) and x['Stipend1'] < avg_stipend_train)
get_stipend_test = lambda x: int((x['Stipend_Type'] in ['fixed', 'performance']) and x['Stipend1'] < avg_stipend_test)

train_merged.loc[:, 'less_than_avg_salary'] = train_merged.apply(get_stipend_train, axis=1)
test_merged.loc[:, 'less_than_avg_salary'] = test_merged.apply(get_stipend_test, axis=1)    

In [256]:
train_merged.Year_of_graduation.unique()

array([2015, 2017, 2012, 2014, 2016, 2013, 2011, 2019, 2018, 2009, 2010,
       2001, 2020], dtype=int64)

In [257]:
test_merged.Year_of_graduation.unique()

array([2016, 2015, 2014, 2018, 2013, 2012, 2017, 2011, 2010, 2009, 2019,
       2001, 2020], dtype=int64)

In [260]:
def get_mean_stipend(df):
    year_of_grad = df['Year_of_graduation'].unique()
    mapping = {}
    
    for year in year_of_grad:
        mapping[year] = df[df['Year_of_graduation'] == year].Stipend1.mean()
    
    return mapping

train_map = get_mean_stipend(train_merged)
test_map = get_mean_stipend(test_merged)

In [262]:
def calculate_expectation_level_train(row):
    year_of_grad = row['Year_of_graduation']
    mean_stipend = train_map[year_of_grad]
    stipend1 = row['Stipend1']
    
    return int ( stipend1 < mean_stipend ) 

def calculate_expectation_level_test(row):
    year_of_grad = row['Year_of_graduation']
    mean_stipend = test_map[year_of_grad]
    stipend1 = row['Stipend1']
    
    return int ( stipend1 < mean_stipend ) 


train_merged.loc[:, 'stipend_expectation_by_exp'] = train_merged.apply(calculate_expectation_level_train, axis=1)
test_merged.loc[:, 'stipend_expectation_by_exp'] = test_merged.apply(calculate_expectation_level_test, axis=1)

In [192]:
def check_if_expectations_match(row):
    expected_stipend = row['Expected_Stipend']
    stipend_level = row['Stipend_level']
    
    if expected_stipend == 'No Expectations':
        return 1
    elif expected_stipend == '2-5K':
        if stipend_level in ['2-5K', '5-10K', '10K+']:
            return 1
        else:
            return 0
    elif expected_stipend == '5-10K':
        if stipend_level in ['5-10K', '10K+']:
            return 1
        else:
            return 0
    elif expected_stipend == '10K+':
        if stipend_level == '10K+':
            return 1
        else:
            return 0

train_merged.loc[:, 'expectations_match'] = train_merged[['Expected_Stipend', 'Stipend_level']].apply(check_if_expectations_match, axis=1)
test_merged.loc[:, 'expectations_match'] = test_merged[['Expected_Stipend', 'Stipend_level']].apply(check_if_expectations_match, axis=1)

In [193]:
train_merged.loc[:, 'Earliest_Start_Date_month'] = train_merged.Earliest_Start_Date.dt.month
test_merged.loc[:, 'Earliest_Start_Date_month'] = test_merged.Earliest_Start_Date.dt.month

train_merged.loc[:, 'is_Dec_or_Jan'] = train_merged.Earliest_Start_Date_month.map(lambda x: int(x == 12 or x == 1))
test_merged.loc[:, 'is_Dec_or_Jan'] = test_merged.Earliest_Start_Date_month.map(lambda x: int(x == 12 or x == 1))

## Label Encoding Categorical Features

In [195]:
categorical_features = train_merged.select_dtypes(include=['object']).columns.drop(['Skills_required', 'Profile', 'Degree'])

for feature in categorical_features:
    lbl = LabelEncoder()
    feature_range = pd.concat([train_merged[feature], test_merged[feature]], axis=0)
    
    lbl.fit(feature_range)
    train_merged.loc[:, feature] = lbl.transform(train_merged[feature])
    test_merged.loc[:, feature] = lbl.transform(test_merged[feature])

In [196]:
train_merged = train_merged.fillna(999)
test_merged = test_merged.fillna(999)

In [None]:
# train_merged = train_merged.replace(-999, 999)
# test_merged = test_merged.replace(-999, 999)

In [253]:
train_merged.is_Dec_or_Jan.value_counts()

1    170208
0     22374
Name: is_Dec_or_Jan, dtype: int64

## Data Preparation

In [263]:
features_to_drop = ['Earliest_Start_Date', 'Start_Date', 'Start Date', 'End Date',
                    'Is_Shortlisted', 'Internship_deadline', 'Internship_Profile',
                    'Profile', 'Stipend2', 'PG_scale', 'UG_Scale', 'Skills_required',
                    'Degree']

features_to_drop.extend(list(train_merged.columns[20:293]))
features = train_merged.columns.drop(features_to_drop)

In [264]:
X = train_merged[features]
y = train_merged.Is_Shortlisted

In [265]:
random_sample = np.random.randint(0, X.shape[0], size=15000)

In [266]:
X_train = X.iloc[random_sample]
y_train = y.iloc[random_sample]

In [267]:
test_index = X[~X.Internship_ID.isin(X_train.Internship_ID)].index

In [268]:
X_test = X.ix[test_index]
y_test = y.ix[test_index]

In [269]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=44) 

In [271]:
print X_train.shape, X_test.shape, y_train.shape, y_test.shape

(15000, 37) (10656, 37) (15000L,) (10656L,)


## Classification pipeline

In [291]:
select = SelectKBest(chi2, k=25)
# scaler = StandardScaler()

# clf = LogisticRegression(C=1., penalty='l1', class_weight='auto')
clf = RandomForestClassifier(n_estimators=500, max_depth=3, min_samples_split=3, n_jobs=-1)
# clf = ExtraTreesClassifier(n_estimators=350, min_samples_split=5, class_weight='auto', max_depth=3, n_jobs=-1)
# clf = GradientBoostingClassifier()
# clf = XGBClassifier()

# pipeline = Pipeline([('scaler', scaler), ('clf', clf)])
pipeline = Pipeline([('clf', clf)])

# pipeline = Pipeline([('select', select), ('clf', clf)])

In [288]:
pipeline.fit(X_train, y_train)

Pipeline(steps=[('clf', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=3, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=3,
            min_weight_fraction_leaf=0.0, n_estimators=750, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))])

## Cross validation score

In [None]:
scores = cross_val_score(pipeline, X_train, y_train, cv=3, scoring='roc_auc')

In [None]:
print 'mean score %.2f and std %.2f ' %(scores.mean(), scores.std())

In [None]:
train_merged.Skills_required.head()

## Feature Importance

In [274]:
# RF feature importance
forest = pipeline.get_params()['clf']
importances = forest.feature_importances_ 
indices = np.argsort(importances)[::-1] 

for f in range(X_train.shape[1]):
    print("%2d) %-*s %f" % (f + 1, 30, X_train.columns[f], importances[indices[f]])) 

 1) Internship_ID                  0.164546
 2) Student_ID                     0.141308
 3) Expected_Stipend               0.139328
 4) Minimum_Duration               0.055127
 5) Preferred_location             0.048159
 6) Is_Part_Time                   0.045961
 7) Internship_Type                0.043041
 8) Internship_Location            0.042725
 9) Internship_category            0.035343
10) No_of_openings                 0.033308
11) Stipend_Type                   0.031789
12) Stipend1                       0.027245
13) Internship_Duration(Months)    0.021694
14) Institute_Category             0.020378
15) Institute_location             0.017301
16) hometown                       0.014464
17) Stream                         0.013980
18) Current_year                   0.013622
19) Year_of_graduation             0.013281
20) Performance_PG                 0.011027
21) Performance_UG                 0.009370
22) Performance_12th               0.008996
23) Performance_10th            

In [214]:
# GBM feature importance
forest = pipeline.get_params()['clf']
importances = forest.feature_importances_ 
indices = np.argsort(importances)[::-1] 

for f in range(X_train.shape[1]):
    print("%2d) %-*s %f" % (f + 1, 30, X_train.columns[f], importances[indices[f]])) 

 1) Internship_ID                  0.112838
 2) Student_ID                     0.110488
 3) Expected_Stipend               0.081714
 4) Minimum_Duration               0.073927
 5) Preferred_location             0.049659
 6) Is_Part_Time                   0.047065
 7) Internship_Type                0.040350
 8) Internship_Location            0.039712
 9) Internship_category            0.039164
10) No_of_openings                 0.038776
11) Stipend_Type                   0.037496
12) Stipend1                       0.029431
13) Internship_Duration(Months)    0.027383
14) Institute_Category             0.023922
15) Institute_location             0.021115
16) hometown                       0.020669
17) Stream                         0.019968
18) Current_year                   0.019587
19) Year_of_graduation             0.019051
20) Performance_PG                 0.018515
21) Performance_UG                 0.017904
22) Performance_12th               0.015356
23) Performance_10th            

## Predictions

In [289]:
predsTrain = (pipeline.predict_proba(X_train)[:, 1])
predsTest = (pipeline.predict_proba(X_test)[:, 1])

In [290]:
print 'AUC score on training set %f ' %(roc_auc_score(y_train, predsTrain))
print 'AUC score on test set %f ' %(roc_auc_score(y_test, predsTest))

AUC score on training set 0.670004 
AUC score on test set 0.621218 


## Train on full dataset

In [292]:
pipeline.fit(X, y)

Pipeline(steps=[('clf', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=3, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=3,
            min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))])

## Predict on the test set

In [293]:
features_to_drop = ['Earliest_Start_Date', 'Start_Date', 'Start Date', 'End Date',
                    'Internship_deadline', 'Internship_Profile',
                    'Profile', 'Stipend2', 'PG_scale', 'UG_Scale', 'Skills_required',
                    'Degree']

features_to_drop.extend(list(train_merged.columns[20:293]))
features = test_merged.columns.drop(features_to_drop)

In [294]:
test_features = test_merged[features]

In [295]:
predictions = (pipeline.predict_proba(test_features)[:, 1])

In [296]:
submission = pd.read_csv('../data/submission.csv')
submission['Internship_ID'] = test_merged.Internship_ID
submission['Student_ID'] = test_merged.Student_ID
submission['Is_Shortlisted'] = predictions

In [297]:
submission.to_csv('../submissions/redate_salary_expectation_increased_estimators.csv', index=False)