In [1]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, chi2
from xgboost import XGBClassifier
from sklearn.cross_validation import cross_val_score, KFold

from sklearn.preprocessing import LabelEncoder

from sklearn.metrics import roc_auc_score

from datetime import datetime

In [None]:
train = pd.read_csv('../data/train.csv', low_memory=False, parse_dates=['Earliest_Start_Date'])
students = pd.read_csv('../data/Student.csv')
internship = pd.read_csv('../data/Internship.csv')
test = pd.read_csv('../data/test.csv')

In [None]:
# one_hot_encoded_skill_features = internship.columns[13:]
# internship = internship[internship.columns.drop(one_hot_encoded_skill_features)]

## Merge with internship details

In [None]:
train_with_internship = pd.merge(train, internship, how='left', on='Internship_ID')
test_with_internship = pd.merge(test, internship, how='left', on='Internship_ID')

## Merge with student details as well

In [None]:
def feature_engineer(df):
    df['num_experience'] = df.shape[0]
    df['num_exp_in_job'] = (df.Experience_Type  == 'job').sum()
    df['num_awards'] = (df.Experience_Type  == 'award').sum()
    df['num_previous_internships'] = (df.Experience_Type == 'internship').sum()
    
    return df

students_cleaned = students.groupby('Student_ID').apply(feature_engineer)
students_cleaned = students_cleaned.groupby('Student_ID').first()
students_cleaned = students_cleaned.reset_index()

train_merged = pd.merge(train_with_internship, students_cleaned, on='Student_ID', how='left')
test_merged = pd.merge(test_with_internship, students_cleaned, on='Student_ID', how='left')

In [None]:
# save these engineered files so that we don't have to create them every time
train_merged.to_csv('../data/train_merged.csv', index=False)
test_merged.to_csv('../data/test_merged.csv', index=False)

## Load prepared datasets

In [3]:
date_columns = ['Earliest_Start_Date', 'Start_Date', 'Start Date', 'End Date', 'Internship_deadline']
train_merged = pd.read_csv('../data/train_merged.csv', parse_dates=date_columns)
test_merged = pd.read_csv('../data/test_merged.csv', parse_dates=date_columns)

## Missing values

In [7]:
train_merged.loc[:, 'Skills_required'] = train_merged.Skills_required.fillna('-1')
test_merged.loc[:, 'Skills_required'] = test_merged.Skills_required.fillna('-1')

In [8]:
train_merged.loc[:, 'Profile'] = train_merged.Profile.fillna('-1')
test_merged.loc[:, 'Profile'] = test_merged.Profile.fillna('-1')

## Feature Engineering

In [9]:
train_merged.loc[:, 'Earliest_Start_Date_year'] = train_merged.Earliest_Start_Date.dt.year
test_merged.loc[:, 'Earliest_Start_Date_year'] = test_merged.Earliest_Start_Date.dt.year

train_merged.loc[:, 'Earliest_Start_Date_month'] = train_merged.Earliest_Start_Date.dt.month
test_merged.loc[:, 'Earliest_Start_Date_month'] = test_merged.Earliest_Start_Date.dt.month

In [10]:
train_merged.loc[:, 'Internship_deadline_year'] = train_merged.Internship_deadline.dt.year
test_merged.loc[:, 'Internship_deadline_year'] = test_merged.Internship_deadline.dt.year

train_merged.loc[:, 'Internship_deadline_month'] = train_merged.Internship_deadline.dt.month
test_merged.loc[:, 'Internship_deadline_month'] = test_merged.Internship_deadline.dt.month

In [11]:
train_merged.loc[:, 'Num_Skills_Required'] = train_merged.Skills_required.map(lambda x: len(x.split(',')))
test_merged.loc[:, 'Num_Skills_Required'] = train_merged.Skills_required.map(lambda x: len(x.split(',')))

In [12]:
def check_profile(row):
    if row['Profile'].lower() in row['Internship_Profile'].lower():
        return 1
    else:
        return 0
    
train_merged.loc[:, 'profile_match'] = train_merged[['Internship_Profile', 'Profile']].apply(check_profile, axis=1)
test_merged.loc[:, 'profile_match'] = test_merged[['Internship_Profile', 'Profile']].apply(check_profile, axis=1)

In [13]:
def performance_scale(percentage):
    if percentage <= 40.0:
        return 'poor'
    elif percentage > 40.0 and percentage <= 60.0:
        return 'good'
    elif percentage > 60.0 and percentage <= 80.0:
        return 'very good'
    else:
        return 'excellent'

train_merged.loc[:, 'Performance_12th'] = train_merged.Performance_12th.map(performance_scale)
train_merged.loc[:, 'Performance_10th'] = train_merged.Performance_10th.map(performance_scale)

test_merged.loc[:, 'Performance_12th'] = test_merged.Performance_12th.map(performance_scale)
test_merged.loc[:, 'Performance_10th'] = test_merged.Performance_10th.map(performance_scale)

In [15]:
def fix_scale(row):
    scale = row['UG_Scale']
    score = row['Performance_UG']
    
    if scale == 100:
        return score / 10.
    else:
        return score

train_merged.loc[:, 'Performance_UG'] = train_merged[['Performance_UG', 'UG_Scale']].apply(fix_scale, axis=1)
test_merged.loc[:, 'Performance_UG'] = test_merged[['Performance_UG', 'UG_Scale']].apply(fix_scale, axis=1)

In [16]:
def is_web_technology(profile):
    profile = profile.lower()
    
    if 'web' in profile:
        return 1
    else:
        return 0

train_merged.loc[:, 'web_technology'] = train_merged.Internship_Profile.map(is_web_technology)
test_merged.loc[:, 'web_technology'] = test_merged.Internship_Profile.map(is_web_technology)

In [18]:
def fix_scale_PG(row):
    scale = row['PG_scale']
    score = row['Performance_PG']
    
    if scale == 100:
        return score / 10.
    else:
        return score

train_merged.loc[:, 'Performance_PG'] = train_merged[['Performance_PG', 'PG_scale']].apply(fix_scale_PG, axis=1)
test_merged.loc[:, 'Performance_PG'] = test_merged[['Performance_PG', 'PG_scale']].apply(fix_scale_PG, axis=1)

In [19]:
def has_masters_degree(pg_score):
    if pg_score > 0.0:
        return 1
    else:
        return 0

train_merged.loc[:, 'has_masters_degree'] = train_merged.Performance_PG.map(has_masters_degree)
test_merged.loc[:, 'has_masters_degree'] = test_merged.Performance_PG.map(has_masters_degree)   

## Label Encoding Categorical Features

In [21]:
categorical_features = train_merged.select_dtypes(include=['object']).columns

for feature in categorical_features:
    lbl = LabelEncoder()
    feature_range = pd.concat([train_merged[feature], test_merged[feature]], axis=0)
    
    lbl.fit(feature_range)
    train_merged.loc[:, feature] = lbl.transform(train_merged[feature])
    test_merged.loc[:, feature] = lbl.transform(test_merged[feature])

  flag = np.concatenate(([True], aux[1:] != aux[:-1]))
  return aux[:-1][aux[1:] == aux[:-1]]


In [22]:
train_merged = train_merged.fillna(999)
test_merged = test_merged.fillna(999)

In [60]:
train_merged = train_merged.replace(-999, 999)
test_merged = test_merged.replace(-999, 999)

## Data Preparation

In [176]:
features = train_merged.columns.drop(['Is_Shortlisted', 'Internship_deadline', 'Earliest_Start_Date',
                                      'Start Date', 'End Date', 'Start_Date'])

In [177]:
X = train_merged[features]
y = train_merged.Is_Shortlisted

In [178]:
random_sample = np.random.randint(0, X.shape[0], size=15000)

In [179]:
X_train = X.iloc[random_sample]
y_train = y.iloc[random_sample]

In [182]:
test_index = X[~X.Internship_ID.isin(X_train.Internship_ID)].index

In [183]:
X_test = X.ix[test_index]
y_test = y.ix[test_index]

In [156]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=44) 

## Classification pipeline

In [255]:
select = SelectKBest(chi2, k=70)
# scaler = StandardScaler()

# clf = LogisticRegression(C=1., penalty='l1', class_weight='auto')
clf = RandomForestClassifier(n_estimators=500, max_depth=5, min_samples_split=10, min_samples_leaf=30, n_jobs=-1)
# clf = XGBClassifier(n_estimators=300, learning_rate=0.008, gamma=5, min_child_weight=5, subsample=0.6, colsample_bytree=0.7)

# pipeline = Pipeline([('scaler', scaler), ('clf', clf)])
# pipeline = Pipeline([('clf', clf)])
pipeline = Pipeline([('select', select), ('clf', clf)])

In [252]:
pipeline.fit(X_train, y_train)

Pipeline(steps=[('select', SelectKBest(k=125, score_func=<function chi2 at 0x0000000015ACF6D8>)), ('clf', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=5, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=30, min_samples_split=10,
            min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))])

## Cross validation score

In [None]:
scores = cross_val_score(pipeline, X_train, y_train, cv=3, scoring='roc_auc')

In [None]:
print 'mean score %.2f and std %.2f ' %(scores.mean(), scores.std())

## Feature Importance

In [211]:
important_feat = sorted(zip(map(lambda x: round(x, 4), pipeline.get_params()['clf'].feature_importances_), X_train.columns), 
             reverse=True)

important_feat

[(0.1031, 'Internship_Location'),
 (0.0764, 'Internship_Profile'),
 (0.0747, 'Internship_Type'),
 (0.0562, 'Internship_ID'),
 (0.0526, 'Preferred_location'),
 (0.047, 'Creative'),
 (0.0325, 'No_of_openings'),
 (0.031, 'Skills_required'),
 (0.0308, 'Marketing'),
 (0.0268, 'PHP'),
 (0.0265, 'Is_Part_Time'),
 (0.0262, 'Internship_category'),
 (0.0229, 'PR'),
 (0.0228, 'Database'),
 (0.0225, 'Digital Marketing'),
 (0.0224, 'Plan'),
 (0.0223, 'Market Research'),
 (0.021, 'Internship_Duration(Months)'),
 (0.0207, 'Android'),
 (0.0199, 'Blogs'),
 (0.0187, 'Operations'),
 (0.0176, 'Analysis'),
 (0.0148, 'Expected_Stipend'),
 (0.0141, 'Student_ID'),
 (0.0136, 'Stipend2'),
 (0.0135, 'Facebook'),
 (0.0131, 'Strategy'),
 (0.0125, 'Quality'),
 (0.0122, 'Stipend_Type'),
 (0.0121, 'Designing'),
 (0.0118, 'Social'),
 (0.0097, 'Java'),
 (0.0086, 'Social Media Marketing'),
 (0.0081, 'Presentation'),
 (0.008, 'HTML'),
 (0.0075, 'Process'),
 (0.007, 'Development'),
 (0.0064, 'Stipend1'),
 (0.0055, 'Commun

In [189]:
classifier = pipeline.get_params()['clf']
classifier.coef_

AttributeError: 'RandomForestClassifier' object has no attribute 'coef_'

## Predictions

In [253]:
predsTrain = pipeline.predict_proba(X_train)[:, 1]
predsTest = pipeline.predict_proba(X_test)[:, 1]

In [254]:
print 'AUC score on training set %f ' %(roc_auc_score(y_train, predsTrain))
print 'AUC score on test set %f ' %(roc_auc_score(y_test, predsTest))

AUC score on training set 0.733323 
AUC score on test set 0.613552 


## Train on full dataset

In [136]:
pipeline.fit(X, y)

Pipeline(steps=[('select', SelectKBest(k=75, score_func=<function chi2 at 0x0000000015ACF6D8>)), ('clf', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=5, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=5,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))])

## Predict on the test set

In [137]:
test_features_columns = test_merged.columns.drop(['Earliest_Start_Date',
                                      'UG_Scale','PG_scale', 
                                      'Start Date', 'End Date', 
                                      'Start_Date', 'Internship_deadline'
                                     ])

In [138]:
test_features = test_merged[test_features_columns]

In [139]:
predictions = pipeline.predict_proba(test_features)[:, 1]

In [142]:
submission = pd.read_csv('../data/submission.csv')
submission['Internship_ID'] = test_merged.Internship_ID
submission['Student_ID'] = test_merged.Student_ID
submission['Is_Shortlisted'] = predictions

In [143]:
submission.to_csv('../submissions/final_submission.csv', index=False)