In [1]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, chi2
from xgboost import XGBClassifier

from sklearn.preprocessing import LabelEncoder

from sklearn.metrics import roc_auc_score

from datetime import datetime

In [4]:
# Raw input tables. Only the training file has 'Earliest_Start_Date' parsed as a date.
train = pd.read_csv(
    '../data/train.csv',
    parse_dates=['Earliest_Start_Date'],
    low_memory=False,
)
test = pd.read_csv('../data/test.csv')
students = pd.read_csv('../data/Student.csv')
internship = pd.read_csv('../data/Internship.csv')

In [8]:
# Columns from position 13 onwards are removed from the internship table;
# the variable name indicates they are one-hot encoded skill flags.
one_hot_encoded_skill_features = internship.columns[13:]
internship = internship.drop(one_hot_encoded_skill_features, axis=1)

## Merge with internship details

In [9]:
# Left joins keep every application row even when no internship record matches.
train_with_internship = train.merge(internship, on='Internship_ID', how='left')
test_with_internship = test.merge(internship, on='Internship_ID', how='left')

## Merge with student details as well

In [11]:
# Collapse the student table to a single row per Student_ID (GroupBy.first),
# then restore Student_ID as a regular column so it can be used as a join key.
students_cleaned = students.groupby('Student_ID').first().reset_index()

# Left joins keep every application row even when no student record matches.
train_merged = train_with_internship.merge(students_cleaned, how='left', on='Student_ID')
test_merged = test_with_internship.merge(students_cleaned, how='left', on='Student_ID')

## Missing values

In [12]:
# Missing skill lists become the string placeholder '-1' so later string
# operations on this column never see NaN.
train_merged.loc[:, 'Skills_required'] = train_merged['Skills_required'].fillna(value='-1')
test_merged.loc[:, 'Skills_required'] = test_merged['Skills_required'].fillna(value='-1')

In [13]:
# Missing student profiles get the same '-1' string placeholder.
train_merged.loc[:, 'Profile'] = train_merged['Profile'].fillna(value='-1')
test_merged.loc[:, 'Profile'] = test_merged['Profile'].fillna(value='-1')

In [14]:
def check_profile(row):
    """Return 1 if the student's profile text occurs inside the internship profile.

    `row` must expose string values under the keys 'Profile' and
    'Internship_Profile'. The comparison is a case-insensitive substring
    test; the result is 0 when 'Profile' is not contained in
    'Internship_Profile'.
    """
    student_profile = row['Profile'].lower()
    internship_profile = row['Internship_Profile'].lower()
    return int(student_profile in internship_profile)
    
# Row-wise flag: 1 when the student's Profile text appears in the Internship_Profile text.
profile_columns = ['Internship_Profile', 'Profile']
train_merged.loc[:, 'profile_match'] = train_merged[profile_columns].apply(check_profile, axis=1)
test_merged.loc[:, 'profile_match'] = test_merged[profile_columns].apply(check_profile, axis=1)

## Feature Engineering

In [15]:
# Number of comma-separated entries in Skills_required.
# Each frame must be computed from its own Skills_required column: deriving the
# test column from the training frame would copy training values by index.
# Note: rows whose skills were missing hold the '-1' placeholder and count as 1.
train_merged.loc[:, 'Num_Skills_Required'] = train_merged.Skills_required.map(lambda x: len(x.split(',')))
test_merged.loc[:, 'Num_Skills_Required'] = test_merged.Skills_required.map(lambda x: len(x.split(',')))

In [16]:
# Binary flag: 1 where Current_year equals 3, otherwise 0 (missing values give 0).
train_merged.loc[:, 'in_third_year'] = train_merged['Current_year'].eq(3).astype('int64')
test_merged.loc[:, 'in_third_year'] = test_merged['Current_year'].eq(3).astype('int64')

In [17]:
def performance_scale(percentage):
    """Bucket a percentage score into a coarse performance label.

    Returns 'poor' for values up to 40, 'good' for values above 40 up to 60,
    'very good' for values above 60 up to 80, and 'excellent' otherwise.
    Values that fail every comparison (e.g. NaN) also end up as 'excellent'.
    """
    # Upper bounds are inclusive and checked in ascending order.
    upper_bounds = ((40.0, 'poor'), (60.0, 'good'), (80.0, 'very good'))
    for upper_bound, label in upper_bounds:
        if percentage <= upper_bound:
            return label
    return 'excellent'

# Replace the 10th/12th percentage columns with their performance labels.
# The columns are overwritten in place, so re-running this cell would feed the
# labels themselves back through performance_scale.
for frame in (train_merged, test_merged):
    for column in ('Performance_12th', 'Performance_10th'):
        frame.loc[:, column] = frame[column].map(performance_scale)

## Label Encoding Categorical Features

In [19]:
# Label-encode every object-dtype column of the training frame, except the raw
# date strings and the free-text columns listed below.
categorical_features = train_merged.select_dtypes(include=['object']).columns.drop(
    ['Start_Date', 'Start Date', 'End Date', 'Skills_required', 'Profile'])

for feature in categorical_features:
    # Missing values are still NaN at this point. Give them the same '-1'
    # placeholder used for the text columns and cast everything to str so the
    # encoder sees one comparable type and assigns NaN a single, stable code.
    train_values = train_merged[feature].fillna('-1').astype(str)
    test_values = test_merged[feature].fillna('-1').astype(str)

    # Fit on train and test together so every category has a code in both frames.
    feature_range = pd.concat([train_values, test_values], axis=0)
    lbl = LabelEncoder()
    lbl.fit(feature_range)

    # Plain column assignment replaces the column, so it takes the integer dtype
    # of the encoded values.
    train_merged[feature] = lbl.transform(train_values)
    test_merged[feature] = lbl.transform(test_values)

  flag = np.concatenate(([True], aux[1:] != aux[:-1]))
  return aux[:-1][aux[1:] == aux[:-1]]


In [20]:
# Replace every remaining missing value with the sentinel -999 in both frames.
# The result must be bound back to `train_merged`; assigning it to a differently
# spelled name leaves the training frame unfilled.
# NOTE(review): the SelectKBest(chi2) step defined later expects non-negative
# inputs, so this negative sentinel may need revisiting -- confirm.
train_merged = train_merged.fillna(-999)
test_merged = test_merged.fillna(-999)

## Data Preparation

In [None]:
# Earlier hand-picked subset, kept for reference:
# features = ['Internship_ID', 'Student_ID', 'No_of_openings', 'Num_Skills_Required', 'in_third_year',
#             'Expected_Stipend', 'Internship_Profile']

# Model inputs: every merged column except the target, the parsed start date,
# the free-text columns and 'Internship_Profile'.
# 'Start_Date', 'Start Date' and 'End Date' are also excluded: they are
# object-dtype columns that the label-encoding step skips, so they would reach
# the estimators as raw strings.
features = train_merged.columns.drop(['Earliest_Start_Date', 'Is_Shortlisted',
                                      'Skills_required', 'Profile', 'Internship_Profile',
                                      'Start_Date', 'Start Date', 'End Date'])

In [None]:
# Feature matrix and target vector for model fitting.
y = train_merged['Is_Shortlisted']
X = train_merged.loc[:, features]

In [None]:
# Hold out 20% of the labelled rows for validation; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=44,
)

In [None]:
# Shapes of the training and hold-out feature matrices, separated by three spaces.
print('%s   %s' % (X_train.shape, X_test.shape))

## Classification pipeline

In [None]:
# Step 1: keep the 75 features with the highest chi-squared score.
select = SelectKBest(chi2, k=75)
# Step 2: random forest on the selected features. It is seeded with the same
# value as the train/validation split so that re-running the notebook
# reproduces the same model and scores.
clf = RandomForestClassifier(n_estimators=200, n_jobs=-1, random_state=44)
# Alternatives tried earlier:
# clf = XGBClassifier(n_estimators=1000, learning_rate=0.08, max_depth=8, min_child_weight=5, colsample_bytree=0.8, subsample=0.8)
# pipeline = Pipeline([('clf', clf)])
pipeline = Pipeline([('select', select), ('clf', clf)])

In [None]:
# Fit feature selection and the classifier on the training split only.
pipeline.fit(X_train, y=y_train)

## Feature Importance

In [None]:
# The classifier only sees the columns kept by the 'select' step, so its
# importances must be paired with that subset of column names. Pairing them
# with all of X_train.columns mislabels every importance after the first
# dropped column.
selected_columns = X_train.columns[pipeline.named_steps['select'].get_support()]
importances = pipeline.named_steps['clf'].feature_importances_

# (importance, column) pairs, most important first, rounded to 4 decimals.
print(sorted(zip([round(score, 4) for score in importances], selected_columns),
             reverse=True))

## Predictions

In [None]:
# Keep only the second probability column of predict_proba
# (presumably the Is_Shortlisted == 1 class -- confirm against classes_).
train_proba = pipeline.predict_proba(X_train)
test_proba = pipeline.predict_proba(X_test)
predsTrain = train_proba[:, 1]
predsTest = test_proba[:, 1]

In [None]:
# ROC AUC on the data the model was fitted on versus the held-out 20%.
print('AUC score on training set %f ' % roc_auc_score(y_train, predsTrain))
print('AUC score on test set %f ' % roc_auc_score(y_test, predsTest))

## Train on full dataset

In [None]:
# Refit the same pipeline on every labelled row before scoring the test file.
pipeline.fit(X, y=y)

## Predict on the test set

In [None]:
# Score the test frame on exactly the columns (and column order) the pipeline
# was fitted on. Dropping only 'Earliest_Start_Date' would leave in columns
# that were excluded from training.
test_features = test_merged[features]

In [None]:
# Second probability column of predict_proba for every test row.
submission_proba = pipeline.predict_proba(test_features)
predictions = submission_proba[:, 1]

In [None]:
# Start from the provided submission file and overwrite its id and score columns.
submission = pd.read_csv('../data/submission.csv')
# Ids come from the raw test file; the Series are aligned to `submission` by index.
submission['Internship_ID'] = test['Internship_ID']
submission['Student_ID'] = test['Student_ID']
# Predicted probabilities are assigned positionally.
submission['Is_Shortlisted'] = predictions

In [None]:
# Write the submission without the DataFrame index column.
submission.to_csv(path_or_buf='../submissions/student_features_rf.csv', index=False)