In [1]:
import pandas
import datetime
from fastai.structured import add_datepart
from sklearn.preprocessing import StandardScaler
from sklearn import feature_selection

# Load the raw interview records; 'Interview.csv' is resolved relative to the working directory.
rawData = pandas.read_csv('Interview.csv')


In [2]:

def get_cleaned_date(date):
    """
    Parse a loosely formatted date value into a ``datetime.datetime``.

    The value is converted to text and trimmed. If it contains an ``&``,
    only the part before the first ``&`` is kept. The known layouts are then
    tried in order and the first one that parses is returned.

    Returns ``None`` when none of the layouts match.
    """
    # partition() returns the whole text when there is no '&'
    text = str(date).strip().partition('&')[0].strip()

    # The data mixes many layouts, so every known one is attempted in turn
    known_layouts = (
        '%d.%m.%Y', '%d.%m.%y', '%d-%m-%Y', '%d/%m/%y', '%d/%m/%Y',
        '%d %b %y', '%d-%b -%y', '%d – %b-%y', '%d -%b -%y',
    )

    for layout in known_layouts:
        try:
            parsed = datetime.datetime.strptime(text, layout)
        except ValueError:
            continue
        return parsed

    return None

#Drop the candidate ID column (to reduce overfitting) and the trailing columns that only hold NaNs,
#then drop row 1233, which is also full of NaNs.
cleanData = (
    rawData
    .drop(
        labels=[
            'Name(Cand ID)',
            'Unnamed: 23', 'Unnamed: 24', 'Unnamed: 25', 'Unnamed: 26', 'Unnamed: 27',
        ],
        axis='columns',
    )
    .drop(labels=[1233], axis='index')
)

#Shorter column names for easier reference.
#The assignment is positional, so the order here must match the remaining columns.
cleanData.columns = [
    'date', 'client', 'industry', 'loc', 'pos', 'skillset',
    'interview_type', 'gender',
    'current_loc', 'job_loc', 'venue_loc', 'native_loc',
    'has_permission', 'unscheduled', 'can_call', 'can_have_alt_number',
    'has_cv_print', 'venue', 'call_letter_shared',
    'expected_attendance', 'observed_attendance',
    'marital_status',
]

#Make everything lower case and strip whitespace padding for more uniform data.
#Every column is cast to str first, so all values (including missing ones) become text.
cleanData = pandas.concat(
    [cleanData[c].astype(str).str.lower().str.strip() for c in cleanData.columns],
    axis = 1
)

#Fix date formats (get_cleaned_date yields None for values that match no known format)
cleanData['date'] = cleanData['date'].map(get_cleaned_date)

#We know the date range should be between sep 2014 and jan 2017 so we eliminate all records outside of that range,
#as there are some outliers into the future.
#NOTE: we should try to calculate mean +- 1SD to possibly substitute these outliers and check if we get better results.
inRange = (cleanData['date'] >'2014-09-01') & (cleanData['date'] < '2017-01-31')
#.copy() gives an independent frame: add_datepart below modifies its argument in place,
#and doing that on a filtered selection triggers pandas' SettingWithCopy warning.
cleanData = cleanData[inRange].copy()

#Let's extract features from the date field that are easier to understand by most algorithms
add_datepart(cleanData, 'date', drop=True)
#NOTE: Might possibly want to add days to nearest holiday and days from nearest holiday.
#But that needs some research as to India's holidays and potential usefulness of such features


#Each result is assigned back to its column instead of calling replace(..., inplace=True)
#on a column selection: the chained in-place form does not reliably update the frame
#under pandas Copy-on-Write.

#Merge similar values standard chartered bank and standard chartered bank chennai
#(job location is specified in another column).
#aon hewitt, hewitt and aon hewitt gurgaon seem to be the same company, we merge those too.
cleanData['client'] = (
    cleanData['client']
    .replace('standard chartered bank chennai', 'standard chartered bank')
    .replace(['hewitt', 'aon hewitt gurgaon'], 'aon hewitt')
)

#Merge similar industry items
cleanData['industry'] = cleanData['industry'].replace(['it products and services', 'it services'], 'it')

#Fix interview types
cleanData['interview_type'] = cleanData['interview_type'].replace(
    ['scheduled walk in', 'sceduled walkin'], 'scheduled walkin'
)

#Fix location fields
cleanData['loc'] = cleanData['loc'].replace(['- cochin-', 'gurgaonr'], ['cochin', 'gurgaon'])
for locationColumn in ['current_loc', 'job_loc', 'venue_loc', 'native_loc']:
    cleanData[locationColumn] = cleanData[locationColumn].replace('- cochin-', 'cochin')
cleanData['native_loc'] = cleanData['native_loc'].replace('delhi /ncr', 'delhi')

#We will assume that the time means that the candidate is expected at that time.
#Raw string: '\d' is not a valid Python string escape, so the non-raw form raises a warning
#on recent Python versions; the pattern itself is unchanged.
time_pattern = r'\d?\d[.:]\d\d [ap]m'
cleanData['expected_attendance'] = cleanData['expected_attendance'].replace(time_pattern, 'yes', regex=True)

#Normalize yes/no/na fields
features= ['has_permission', 'unscheduled', 'can_call', 'can_have_alt_number', 'has_cv_print', 'venue', 'call_letter_shared', 'expected_attendance', 'observed_attendance']
no = ['no', 'not yet', 'no dont', 'no- will take it soon', 'no i have only thi number', 'no- i need to check']
na = ['na', 'nan', 'not sure', 'cant say', 'yet to confirm', 'need to check', 'yet to check', 'havent checked', 'uncertain']

#We use 1, 0, -1 for 'yes', 'na', 'no' respectively as such values are easier for most algorithms to make inferences.
#Results are assigned back to the column rather than replaced in place on a column selection,
#which does not reliably update the frame under pandas Copy-on-Write.
for feature in features:
    cleanData[feature] = (
        cleanData[feature]
        .replace(no, -1)
        .replace(na, 0)
        .replace('yes', 1)
    )

#Normalize gender column: male -> 1, female -> -1 (any other value is left as is).
cleanData['gender'] = cleanData['gender'].replace('male', 1).replace('female', -1)


#We drop rows whose skillset looks like a time of day, as that makes no sense in this column.
#.copy() gives an independent frame so the column assignments below do not act on a filtered selection.
filterSeries = cleanData['skillset'].str.contains(time_pattern, regex=True)
cleanData = cleanData[~filterSeries].copy()

#Let's give the field some semblance of a uniform format for easier processing.
#Results are assigned back to the column rather than replaced in place on a column selection,
#which does not reliably update the frame under pandas Copy-on-Write.
cleanData['skillset'] = cleanData['skillset'].replace([
    '/', ' ?, ?', ' developer', 'r & d', 'sccm ?- ?', ' – ra'
], [
    ',', ',', '',  'r&d', 'sccm,',  ''
], regex=True)

#Exact-match rewrites, kept as a mapping so every value sits next to its replacement.
#NOTE(review): 'baseSAS' is the only mixed-case replacement in otherwise lower-cased data -- confirm intended.
skillsetRewrites = {
    'cdd kyc': 'cdd,kyc',
    'java j2ee': 'java,j2ee',
    'oracle plsql': 'plsql',
    'core java': 'java',
    'senior software engineer-mednet': 'senior,developer,mednet',
    'sr automation testing': 'senior,automation,testing',
    'tech lead-mednet': 'tech lead,mednet',
    'ra publishing': 'publishing',
    'java jsf': 'java,jsf',
    'java,j2ee,core java': 'java,j2ee',
    'java tech lead': 'java,tech lead',
    'automation testing java': 'java,automation,testing',
    '- sapbo,informatica': 'sapbo,informatica',
    'production support - sccm': 'sccm,production support',
    'tech lead- mednet': 'tech lead,mednet',
    'technical lead': 'tech lead',
    'senior analyst': 'senior,analyst',
    'sccm – sharepoint': 'sccm,sharepoint',
    'sccm – sql': 'sccm,sql',
    'tl': 'tech lead',
    'sccm,(network,sharepoint,ms exchange)': 'sccm,network,sharepoint,ms_exchange',
    'java-sas': 'java,sas',
    'lcm -manager': 'lcm manager',
    'basesas_program,reporting': 'baseSAS',
}
cleanData['skillset'] = cleanData['skillset'].replace(skillsetRewrites)

cleanData['skillset'] = cleanData['skillset'].replace([
    'lending&liablities',
    'l & l',
    'lending & liability'
], 'lending and liabilities')
cleanData['skillset'] = cleanData['skillset'].replace(['biosimiliars', 'biosimillar'], 'biosimilars')

#Spaces inside a skill name become underscores
cleanData['skillset'] = cleanData['skillset'].replace(' ',  '_', regex=True)

#Let's binarize each item into categories: one row per (record, skill), then one indicator
#column per skill, summed back to one row per record.
skillset = cleanData['skillset'].str.split(',', expand=True).stack()
dummies = pandas.get_dummies(skillset, prefix='skillset').groupby(level=0).sum()

#Attach the indicator columns and drop skillset, as it is now encoded
cleanData = cleanData.join(dummies).drop('skillset', axis=1)

#One-hot encode the remaining categorical columns
features = [
    'loc', 'pos', 'client', 'industry', 'interview_type',
    'current_loc', 'job_loc', 'venue_loc', 'native_loc',
    'marital_status',
]

for feature in features:
    #Build the indicator columns first, then swap them in for the source column
    dummies = pandas.get_dummies(cleanData[feature], prefix=feature)
    cleanData = cleanData.drop(feature, axis=1).join(dummies)


In [33]:
def split_data(data, target_field, frac=0.75, random_state=42):
    """
    Split ``data`` into random training and test sets of features and target.

    Parameters:
        data: DataFrame holding the feature columns and the target column.
        target_field: name of the target column; it is removed from the
            returned feature frames and returned separately.
        frac: fraction of rows sampled into the training set.
        random_state: seed passed to ``DataFrame.sample`` for a reproducible split.

    Returns:
        ``(x_train, y_train, x_test, y_test)``. Features are returned unscaled.
    """
    #Sample the training rows; every row whose index label was not sampled goes to the test set
    train_rows = data.sample(frac=frac, random_state=random_state)
    test_rows = data.loc[~data.index.isin(train_rows.index), :]

    #Extract the target feature (by name, so any target column works)
    y1 = train_rows[target_field]
    x1 = train_rows.drop(target_field, axis=1)
    y2 = test_rows[target_field]
    x2 = test_rows.drop(target_field, axis=1)

    return x1, y1, x2, y2


    

In [10]:
# Test function
def test(model, x1, x2, y1, y2):
    model.fit(xTrain, yTrain)
    print('Train: ', model.score(x1, y1))
    print('Test: ', model.score(x2, y2))

In [34]:
#Split the cleaned data into train/test features and targets, with 'observed_attendance' as the target column
xTrain, yTrain, xTest, yTest = split_data(cleanData, 'observed_attendance')

In [35]:
# Baseline: linear regression on all features, then with cross-validated
# recursive feature elimination (RFECV) wrapped around the same estimator
from sklearn.linear_model import LinearRegression

model = LinearRegression()
print('Try the model as is:')
test(model, x1=xTrain, x2=xTest, y1=yTrain, y2=yTest)

rfecv = feature_selection.RFECV(model)
print('Reducing features...')
test(rfecv, x1=xTrain, x2=xTest, y1=yTrain, y2=yTest)
print("Optimal number of features : %d" % rfecv.n_features_)



Try the model as is:
Train:  0.28114687145336137
Test:  0.08947106869333588
Reducing features...
Train:  0.013744223153244639
Test:  0.0005435161417249867
Optimal number of features : 1


In [36]:
# Logistic regression on all features, then with cross-validated
# recursive feature elimination (RFECV) wrapped around the same estimator
from sklearn.linear_model import LogisticRegression

# random_state is fixed for reproducibility
model = LogisticRegression(random_state=42)
print('Try the model as is:')
test(model, x1=xTrain, x2=xTest, y1=yTrain, y2=yTest)

rfecv = feature_selection.RFECV(model)
print('Reducing features...')
test(rfecv, x1=xTrain, x2=xTest, y1=yTrain, y2=yTest)
print("Optimal number of features : %d" % rfecv.n_features_)



Try the model as is:
Train:  0.6355653128430296
Test:  0.625
Reducing features...
Train:  0.6355653128430296
Test:  0.625
Optimal number of features : 180


In [37]:
#Try decision trees: first on all features, then with cross-validated
#recursive feature elimination (RFECV) wrapped around the same estimator
from sklearn import tree

#random_state is fixed so repeated runs give the same scores
model = tree.DecisionTreeClassifier(random_state=42)

print('Try the model as is:')
test(model, xTrain, xTest, yTrain, yTest)

rfecv = feature_selection.RFECV(estimator=model)

print('Reducing features...')
test(rfecv, xTrain, xTest, yTrain, yTest)
print("Optimal number of features : %d" % rfecv.n_features_)



Try the model as is:
Train:  0.849615806805708
Test:  0.6710526315789473
Reducing features...
Train:  0.7145993413830956
Test:  0.6907894736842105
Optimal number of features : 1


In [38]:
#Try SVM linear classification; we prefer classification over regression because this is a classification problem
from sklearn import svm

#random_state is fixed so repeated runs give the same scores
model = svm.LinearSVC(random_state=42)

print('Try the model as is:')
test(model, xTrain, xTest, yTrain, yTest)

rfecv = feature_selection.RFECV(estimator=model)

print('Reducing features...')
test(rfecv, xTrain, xTest, yTrain, yTest)
print("Optimal number of features : %d" % rfecv.n_features_)



Try the model as is:
Train:  0.36443468715697036
Test:  0.375
Reducing features...
Train:  0.6355653128430296
Test:  0.625
Optimal number of features : 165


In [39]:
#Try K-means
#We only use this because it is required in the challenge, but we do not expect this to solve the actual problem
#because this is just a clustering algorithm.
from sklearn.cluster import KMeans
#n_clusters=2 because we only care about 2 possible outcomes;
#random_state is fixed so the centroid initialisation (and thus the scores) is repeatable.
model = KMeans(n_clusters=2, random_state=42)

test(model, xTrain, xTest, yTrain, yTest)




Train:  -5.511326370532864e+16
Test:  -2.217932067861632e+16
