In [1]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, chi2
from xgboost import XGBClassifier

from sklearn.preprocessing import LabelEncoder

from sklearn.metrics import roc_auc_score

from datetime import datetime

In [2]:
train = pd.read_csv('../data/train.csv', low_memory=False, parse_dates=['Earliest_Start_Date'])
students = pd.read_csv('../data/Student.csv')
internship = pd.read_csv('../data/Internship.csv')
test = pd.read_csv('../data/test.csv')

In [3]:
one_hot_encoded_skill_features = internship.columns[13:]
internship = internship[internship.columns.drop(one_hot_encoded_skill_features)]

## Merge with internship details

In [4]:
train_with_internship = pd.merge(train, internship, how='left', on='Internship_ID')
test_with_internship = pd.merge(test, internship, how='left', on='Internship_ID')

## Merge with student details as well

In [5]:
students_cleaned = students.groupby('Student_ID').first()
students_cleaned = students_cleaned.reset_index()

train_merged = pd.merge(train_with_internship, students_cleaned, on='Student_ID', how='left')
test_merged = pd.merge(test_with_internship, students_cleaned, on='Student_ID', how='left')

## Missing values

In [6]:
train_merged.loc[:, 'Skills_required'] = train_merged.Skills_required.fillna('-1')
test_merged.loc[:, 'Skills_required'] = test_merged.Skills_required.fillna('-1')

In [7]:
train_merged.loc[:, 'Profile'] = train_merged.Profile.fillna('-1')
test_merged.loc[:, 'Profile'] = test_merged.Profile.fillna('-1')

## Feature Engineering

In [8]:
train_merged.loc[:, 'Num_Skills_Required'] = train_merged.Skills_required.map(lambda x: len(x.split(',')))
test_merged.loc[:, 'Num_Skills_Required'] = train_merged.Skills_required.map(lambda x: len(x.split(',')))

In [9]:
def check_profile(row):
    if row['Profile'].lower() in row['Internship_Profile'].lower():
        return 1
    else:
        return 0
    
train_merged.loc[:, 'profile_match'] = train_merged[['Internship_Profile', 'Profile']].apply(check_profile, axis=1)
test_merged.loc[:, 'profile_match'] = test_merged[['Internship_Profile', 'Profile']].apply(check_profile, axis=1)

In [10]:
def performance_scale(percentage):
    if percentage <= 40.0:
        return 'poor'
    elif percentage > 40.0 and percentage <= 60.0:
        return 'good'
    elif percentage > 60.0 and percentage <= 80.0:
        return 'very good'
    else:
        return 'excellent'

train_merged.loc[:, 'Performance_12th'] = train_merged.Performance_12th.map(performance_scale)
train_merged.loc[:, 'Performance_10th'] = train_merged.Performance_10th.map(performance_scale)

test_merged.loc[:, 'Performance_12th'] = test_merged.Performance_12th.map(performance_scale)
test_merged.loc[:, 'Performance_10th'] = test_merged.Performance_10th.map(performance_scale)

In [13]:
def fix_scale(row):
    scale = row['UG_Scale']
    score = row['Performance_UG']
    
    if scale == 100:
        return score / 10.
    else:
        return score

train_merged.loc[:, 'Performance_UG'] = train_merged[['Performance_UG', 'UG_Scale']].apply(fix_scale, axis=1)
test_merged.loc[:, 'Performance_UG'] = test_merged[['Performance_UG', 'UG_Scale']].apply(fix_scale, axis=1)

## Label Encoding Categorical Features

In [14]:
categorical_features = train_merged.select_dtypes(include=['object']).columns.drop(['Start_Date', 'Start Date', 'End Date',
                                                                                    'Skills_required', 'Profile'])
for feature in categorical_features:
    lbl = LabelEncoder()
    feature_range = pd.concat([train_merged[feature], test_merged[feature]], axis=0)
    
    lbl.fit(feature_range)
    train_merged.loc[:, feature] = lbl.transform(train_merged[feature])
    test_merged.loc[:, feature] = lbl.transform(test_merged[feature])

  flag = np.concatenate(([True], aux[1:] != aux[:-1]))
  return aux[:-1][aux[1:] == aux[:-1]]


In [15]:
train_merged = train_merged.fillna(-999)
test_merged = test_merged.fillna(-999)

## Data Preparation

In [30]:
features = train_merged.columns.drop(['Earliest_Start_Date', 'Is_Shortlisted',
                                      'Skills_required', 'Profile', 'UG_Scale',
                                     'PG_scale', 'Start Date', 'End Date', 'Start_Date',
                                     'Internship_deadline', 'Preferred_location',
                                     'Internship_ID', 'Student_ID',
                                     'Year_of_graduation', 'profile_match',
                                     'Stipend1', 'Stipend2',
                                     'Current_year'
                                     ])

In [31]:
X = train_merged[features]
y = train_merged.Is_Shortlisted

In [32]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=44) 

In [33]:
print X_train.shape, ' ', X_test.shape

(154065, 22)   (38517, 22)


## Classification pipeline

In [45]:
# select = SelectKBest(chi2, k=75)
scaler = StandardScaler()

clf = LogisticRegression(C=1., class_weight='auto')
# clf = RandomForestClassifier(n_estimators=10, n_jobs=-1)
# clf = XGBClassifier(n_estimators=1000, learning_rate=0.08, max_depth=8, min_child_weight=5, colsample_bytree=0.8, subsample=0.8)

pipeline = Pipeline([('scaler', scaler), ('clf', clf)])
# pipeline = Pipeline([('clf', clf)])
# pipeline = Pipeline([('select', select), ('clf', clf)])

In [46]:
pipeline.fit(X_train, y_train)

Pipeline(steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('clf', LogisticRegression(C=1.0, class_weight='auto', dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr',
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0))])

## Feature Importance

In [36]:
print sorted(zip(map(lambda x: round(x, 4), pipeline.get_params()['clf'].feature_importances_), X_train.columns), 
             reverse=True)

[(0.1255, 'Internship_Profile'), (0.0895, 'Performance_UG'), (0.0741, 'Stream'), (0.0732, 'No_of_openings'), (0.0687, 'hometown'), (0.0643, 'Experience_Type'), (0.0616, 'Degree'), (0.0605, 'Institute_location'), (0.0477, 'Minimum_Duration'), (0.0465, 'Internship_Location'), (0.0441, 'Internship_Duration(Months)'), (0.0437, 'Location'), (0.0366, 'Expected_Stipend'), (0.0297, 'Stipend_Type'), (0.0252, 'Performance_PG'), (0.0214, 'Num_Skills_Required'), (0.0211, 'Performance_10th'), (0.0192, 'Performance_12th'), (0.0149, 'Internship_category'), (0.0128, 'Is_Part_Time'), (0.0109, 'Internship_Type'), (0.009, 'Institute_Category')]


## Predictions

In [47]:
predsTrain = pipeline.predict_proba(X_train)[:, 1]
predsTest = pipeline.predict_proba(X_test)[:, 1]

In [48]:
print 'AUC score on training set %f ' %(roc_auc_score(y_train, predsTrain))
print 'AUC score on test set %f ' %(roc_auc_score(y_test, predsTest))

AUC score on training set 0.628239 
AUC score on test set 0.615047 


## Train on full dataset

In [49]:
pipeline.fit(X, y)

Pipeline(steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('clf', LogisticRegression(C=1.0, class_weight='auto', dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr',
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0))])

## Predict on the test set

In [60]:
test_features_columns = test_merged.columns.drop(['Earliest_Start_Date', 'Skills_required', 
                 'Profile', 'UG_Scale',
                 'PG_scale', 'Start Date', 
                 'End Date', 'Start_Date',
                 'Internship_deadline', 'Preferred_location',
                 'Internship_ID', 'Student_ID',
                 'Year_of_graduation', 'profile_match',
                 'Stipend1', 'Stipend2',
                 'Current_year'])

In [61]:
test_features = test_merged[test_features_columns]

In [65]:
predictions = pipeline.predict_proba(test_features)[:, 1]

In [67]:
submission = pd.read_csv('../data/submission.csv')
submission['Internship_ID'] = test.Internship_ID
submission['Student_ID'] = test.Student_ID
submission['Is_Shortlisted'] = predictions

In [68]:
submission.to_csv('../submissions/linear_model_selected_features.csv', index=False)