In [1]:
import pandas as pd
import numpy as np
import re
import datetime
# from fastai.structured import add_datepart
from sklearn import feature_selection, preprocessing
from sklearn.model_selection import train_test_split

rawData = pd.read_csv('Interview.csv')


In [2]:
def add_datepart(df, fldname, drop=True, time=False):
    """add_datepart converts a column of df from a datetime64 to many columns containing
    the information from the date. This applies changes inplace.
    Parameters:
    -----------
    df: A pandas data frame. df gain several new columns.
    fldname: A string that is the name of the date column you wish to expand.
        If it is not a datetime64 series, it will be converted to one with pd.to_datetime.
    drop: If true then the original date column will be removed.
    time: If true time features: Hour, Minute, Second will be added.
    Examples:
    ---------
    >>> df = pd.DataFrame({ 'A' : pd.to_datetime(['3/11/2000', '3/12/2000', '3/13/2000'], infer_datetime_format=False) })
    >>> df
        A
    0   2000-03-11
    1   2000-03-12
    2   2000-03-13
    >>> add_datepart(df, 'A')
    >>> df
        AYear AMonth AWeek ADay ADayofweek ADayofyear AIs_month_end AIs_month_start AIs_quarter_end AIs_quarter_start AIs_year_end AIs_year_start AElapsed
    0   2000  3      10    11   5          71         False         False           False           False             False        False          952732800
    1   2000  3      10    12   6          72         False         False           False           False             False        False          952819200
    2   2000  3      11    13   0          73         False         False           False           False             False        False          952905600
    """
    fld = df[fldname]
    fld_dtype = fld.dtype
    if isinstance(fld_dtype, pd.core.dtypes.dtypes.DatetimeTZDtype):
        fld_dtype = np.datetime64

    if not np.issubdtype(fld_dtype, np.datetime64):
        df[fldname] = fld = pd.to_datetime(fld, infer_datetime_format=True)
    targ_pre = re.sub('[Dd]ate$', '', fldname)
    attr = ['Year', 'Month', 'Week', 'Day', 'Dayofweek', 'Dayofyear',
            'Is_month_end', 'Is_month_start', 'Is_quarter_end', 'Is_quarter_start', 'Is_year_end', 'Is_year_start']
    if time: attr = attr + ['Hour', 'Minute', 'Second']
    for n in attr: df[targ_pre + n] = getattr(fld.dt, n.lower())
    df[targ_pre + 'Elapsed'] = fld.astype(np.int64) // 10 ** 9
    if drop: df.drop(fldname, axis=1, inplace=True)
        
def get_cleaned_date(date):
    """
    Return datetime object from a string
    """
    date = str(date).strip()

    if '&' in date:
        date = date.split('&')[0].strip()

    # Since there are a lot of formats in the data, need to handle all the possible options
    date_formats = [
        '%d.%m.%Y', '%d.%m.%y', '%d.%m.%y', '%d-%m-%Y', '%d/%m/%y', '%d/%m/%Y', '%d %b %y', '%d-%b -%y',
        '%d – %b-%y', '%d -%b -%y'
    ]

    for date_format in date_formats:
        try:
            return datetime.datetime.strptime(date, date_format)
        except ValueError:
            pass

#We remove ID column to reduce overfitting and the final columns filled with NaNs
cleanData = rawData.drop(['Name(Cand ID)', 'Unnamed: 23', 'Unnamed: 24', 'Unnamed: 25', 'Unnamed: 26', 'Unnamed: 27'], axis = 1)

#This row is full of NaNs we should remove it.
cleanData = cleanData.drop(1233)

#Change column names for easier reference
cleanData.columns = [
    'date',
    'client',
    'industry',
    'loc',
    'pos',
    'skillset',
    'interview_type',
    'gender',
    'current_loc',
    'job_loc',
    'venue_loc',
    'native_loc',
    'has_permission',
    'unscheduled',
    'can_call',
    'can_have_alt_number',
    'has_cv_print',
    'venue',
    'call_letter_shared',
    'expected_attendance',
    'observed_attendance',
    'marital_status']

#Make everything lower case and strip whitespace padding for more uniform data
cleanData = pd.concat([cleanData[c].astype(str).str.lower() for c in cleanData.columns], axis = 1)
cleanData = pd.concat([cleanData[c].astype(str).str.strip() for c in cleanData.columns], axis = 1)

#Fix date formats
cleanData['date'] = cleanData['date'].map(get_cleaned_date)

#We know the date range should be between sep 2014 and jan 2017 so we eliminate all records outside of that range.
#As there are some outliers into the future.
#NOTE: we should try to calculate mean +- 1SD to possibly substitute these outliers and check if we get better results.
cleanData = cleanData[(cleanData['date'] >'2014-09-01') & (cleanData['date'] < '2017-01-31')]

#Lets extract features from the date field that are easier to understand by most algorithms
add_datepart(cleanData, 'date', drop=True)
#NOTE: Might possibly want to add days to nearest holiday and days from nearest holiday.
#But that needs some research as to India's holidays and potential usefulness of such features


#Merge similar values standard chartered bank and standard chartered bank chennai.
#job location is specified in another column.
cleanData['client'].replace('standard chartered bank chennai', 'standard chartered bank', inplace = True)

#aon hewitt, hewitt and aon hewitt gurgaon seem to be the same company, we merge those too
cleanData['client'].replace(['hewitt', 'aon hewitt gurgaon'], 'aon hewitt', inplace = True)

#Merge similar industry items
cleanData['industry'].replace(['it products and services', 'it services'], 'it', inplace = True)

#Fix interview types
cleanData['interview_type'].replace(['scheduled walk in', 'sceduled walkin'], 'scheduled walkin', inplace=True)

#Fix location fields
cleanData['loc'].replace(['- cochin-', 'gurgaonr'], ['cochin', 'gurgaon'], inplace = True)
cleanData['current_loc'].replace('- cochin-', 'cochin', inplace=True)
cleanData['job_loc'].replace('- cochin-', 'cochin', inplace=True)
cleanData['venue_loc'].replace('- cochin-', 'cochin', inplace=True)
cleanData['native_loc'].replace('- cochin-', 'cochin', inplace=True)
cleanData['native_loc'].replace('delhi /ncr', 'delhi', inplace=True)

#We will assume that the time means that the candidate is expected at that time
time_pattern = '\d?\d[.:]\d\d [ap]m'
cleanData['expected_attendance'].replace(time_pattern, 'yes', regex=True, inplace=True)

#Normalize yes/no/na fields
features= ['has_permission', 'unscheduled', 'can_call', 'can_have_alt_number', 'has_cv_print', 'venue', 'call_letter_shared', 'expected_attendance', 'observed_attendance']
no = ['no', 'not yet', 'no dont', 'no- will take it soon', 'no i have only thi number', 'no- i need to check']
na = ['na', 'nan', 'not sure', 'cant say', 'yet to confirm', 'need to check', 'yet to check', 'havent checked', 'uncertain']

#We use 1, 0, -1 for 'yes', 'na', 'no' respectively as such values are easier for most algorithms to make inferences
for feature in features:
    cleanData[feature].replace(no, -1, inplace=True)
    cleanData[feature].replace(na, 0, inplace=True)
    cleanData[feature].replace('yes', 1, inplace=True)
    
#Normalize gender column.
#We use a -1,1 range as it is recommended for gender data.
cleanData['gender'].replace('male', 1, inplace=True)
cleanData['gender'].replace('female', -1, inplace=True)


#We drop values that look like dateTime as they make no sense in the skillset column.
filterSeries = cleanData['skillset'].str.contains(time_pattern, regex=True)
cleanData = cleanData[~filterSeries]

#Let's give the field some somblance of an uniform format for easier processing
cleanData['skillset'].replace([
    '/', ' ?, ?', ' developer', 'r & d', 'sccm ?- ?', ' – ra'
], [
    ',', ',', '',  'r&d', 'sccm,',  ''
], inplace=True, regex=True)
cleanData['skillset'].replace([
    'cdd kyc',
    'java j2ee',
    'oracle plsql',
    'core java',
    'senior software engineer-mednet',
    'sr automation testing',
    'tech lead-mednet',
    'ra publishing',
    'java jsf',
    'java,j2ee,core java',
    'java tech lead',
    'automation testing java',
    '- sapbo,informatica',
    'production support - sccm',
    'tech lead- mednet',
    'technical lead',
    'senior analyst',
    'sccm – sharepoint',
    'sccm – sql',
    'tl',
    'sccm,(network,sharepoint,ms exchange)',
    'java-sas',
    'lcm -manager',
    'basesas_program,reporting'
], [
    'cdd,kyc',
    'java,j2ee',
    'plsql',
    'java',
    'senior,developer,mednet',
    'senior,automation,testing',
    'tech lead,mednet',
    'publishing',
    'java,jsf',
    'java,j2ee',
    'java,tech lead',
    'java,automation,testing',
    'sapbo,informatica',
    'sccm,production support',
    'tech lead,mednet',
    'tech lead',
    'senior,analyst',
    'sccm,sharepoint',
    'sccm,sql',
    'tech lead',
    'sccm,network,sharepoint,ms_exchange',
    'java,sas',
    'lcm manager',
    'baseSAS'
    
], inplace=True)
cleanData['skillset'].replace([
    'lending&liablities',
    'l & l',
    'lending & liability'
], 'lending and liabilities', inplace=True)
cleanData['skillset'].replace(['biosimiliars', 'biosimillar'], 'biosimilars', inplace=True)
cleanData['skillset'].replace(' ',  '_', regex=True, inplace=True)

#Lets binarize each item into categories
skillset = cleanData.skillset.str.split(',', expand=True).stack()
dummies = pd.get_dummies(skillset, prefix='skillset').groupby(level=0).sum()

cleanData = cleanData.join(dummies)

#We drop skillset as it is now encoded
cleanData.drop('skillset', axis=1, inplace=True)

#Encode other categorical features using oneHot
features = ['loc', 'pos', 'client', 'industry', 'interview_type', 'current_loc', 'job_loc', 'venue_loc', 'native_loc', 'marital_status']

for feature in features:
    dummies = pd.get_dummies(cleanData[feature], prefix=feature)
    cleanData.drop(feature, axis=1, inplace=True)
    cleanData = cleanData.join(dummies)


In [3]:
def split_data(data, target_field, frac=0.75, random_state=42):
    Y = data.observed_attendance
    X = data.drop(target_field, axis=1)

    #Fit the normalizer with our unseparated data minus the target column
    scaler = preprocessing.StandardScaler()
    scaler.fit_transform(X)
    
    #Separate data into training and test sets
    return train_test_split(X, Y, test_size=frac, random_state=random_state)


    

In [4]:
# Test function
def test(model):
    model.fit(xTrain, yTrain)
    print('Train: ', model.score(xTrain, yTrain))
    print('Test: ', model.score(xTest, yTest))

In [5]:
xTest, xTrain, yTest, yTrain = split_data(cleanData, 'observed_attendance')


In [6]:
# Try linear regression
from sklearn.linear_model import LinearRegression
model = LinearRegression()

print('Try the model as is:')
test(model)

rfecv = feature_selection.RFECV(estimator=model)

print('Reducing features...')
test(rfecv)
print("Optimal number of features : %d" % rfecv.n_features_)
print("Selected features: %s", xTrain.columns[rfecv.support_])


Try the model as is:
Train:  0.27847151452122887
Test:  0.099680483347796
Reducing features...
Train:  0.022734419521123983
Test:  -0.0010158778680553482
Optimal number of features : 6
Selected features: %s Index(['Year', 'Week', 'Dayofweek', 'Dayofyear', 'skillset_cots',
       'industry_telecom'],
      dtype='object')


In [7]:
# Try logistic regression
from sklearn.linear_model import LogisticRegression

# We set random_state for reproceability
model = LogisticRegression(random_state=42)

print('Try the model as is:')
test(model)

rfecv = feature_selection.RFECV(estimator=model)

print('Reducing features...')
test(rfecv)
print("Optimal number of features : %d" % rfecv.n_features_)
print("Selected features: %s", xTrain.columns[rfecv.support_])


Try the model as is:
Train:  0.6348684210526315
Test:  0.6270627062706271
Reducing features...
Train:  0.6348684210526315
Test:  0.6270627062706271
Optimal number of features : 180
Selected features: %s Index(['gender', 'has_permission', 'unscheduled', 'can_call',
       'can_have_alt_number', 'has_cv_print', 'venue', 'call_letter_shared',
       'expected_attendance', 'Year',
       ...
       'native_loc_tirupati', 'native_loc_trichy', 'native_loc_trivandrum',
       'native_loc_tuticorin', 'native_loc_vellore', 'native_loc_vijayawada',
       'native_loc_visakapatinam', 'native_loc_warangal',
       'marital_status_married', 'marital_status_single'],
      dtype='object', length=180)


In [8]:
#Try decision trees
from sklearn import tree

model = tree.DecisionTreeClassifier()

print('Try the model as is:')
test(model)

rfecv = feature_selection.RFECV(estimator=model)

print('Reducing features...')
test(rfecv)
print("Optimal number of features : %d" % rfecv.n_features_)
print("Selected features: %s", xTrain.columns[rfecv.support_])




Try the model as is:
Train:  0.8497807017543859
Test:  0.6666666666666666
Reducing features...
Train:  0.7138157894736842
Test:  0.693069306930693
Optimal number of features : 1
Selected features: %s Index(['expected_attendance'], dtype='object')


In [9]:
#Try SVM Linear Classification, we preffer classification over regression because this is a classification problem
from sklearn import svm

model = svm.LinearSVC()

print('Try the model as is:')
test(model)

rfecv = feature_selection.RFECV(estimator=model)

print('Reducing features...')
test(rfecv)
print("Optimal number of features : %d" % rfecv.n_features_)
print("Selected features: %s", xTrain.columns[rfecv.support_])


Try the model as is:
Train:  0.6348684210526315
Test:  0.6270627062706271
Reducing features...
Train:  0.3651315789473684
Test:  0.37293729372937295
Optimal number of features : 177
Selected features: %s Index(['gender', 'has_permission', 'unscheduled', 'can_call',
       'can_have_alt_number', 'has_cv_print', 'venue', 'call_letter_shared',
       'expected_attendance', 'Year',
       ...
       'native_loc_tirupati', 'native_loc_trichy', 'native_loc_trivandrum',
       'native_loc_tuticorin', 'native_loc_vellore', 'native_loc_vijayawada',
       'native_loc_visakapatinam', 'native_loc_warangal',
       'marital_status_married', 'marital_status_single'],
      dtype='object', length=177)


In [18]:
#Try K-means
from sklearn.cluster import KMeans
model = KMeans(n_clusters=2) #We set n_clusters=2 because we only care about 2 possible outcomes.

#We use the full dataset as indicated in the course content.
X = np.array(cleanData.drop(['observed_attendance'], 1).astype(float))
X = preprocessing.scale(X)
Y = np.array(cleanData['observed_attendance'])
model.fit(X)

correct = 0

for i in range(len(X)):
    predict_me = np.array(X[i].astype(float))
    predict_me = predict_me.reshape(-1, len(predict_me))
    prediction = model.predict(predict_me)
    if prediction[0] == Y[i]:
        correct += 1

print(correct/len(X))

0.4189300411522634


In [21]:
#Try Mean Shift
from sklearn.cluster import MeanShift
model = MeanShift()
model.fit(X)

labels = model.labels_
cluster_centers = model.cluster_centers_

#We make a copy of the dataframe so we can label it with the result of the model
dataCopy = pd.DataFrame.copy(cleanData)

dataCopy['cluster_group'] = np.nan

for i in range(len(X)):
    #We expect a warning from this line, we will ignore it.
    dataCopy['cluster_group'].iloc[i] = labels[i]

attendance_rates = {}

n_clusters = len(np.unique(labels))
for i in range(n_clusters):
    temp_df = dataCopy[(dataCopy['cluster_group']==float(i))]
    attendance_cluster = temp_df[(temp_df['observed_attendance']==1)]
    attendance_rate = len(attendance_cluster)/len(temp_df)
    attendance_rates[i] = attendance_rate

#The results look very confusing, 73 cluster groups.
#We could look deeper into the data and try to find the correlation between specific variables and the observed_attendance
print(attendance_rates)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


{0: 0.6267087276550999, 1: 0.7083333333333334, 2: 0.5555555555555556, 3: 0.8421052631578947, 4: 0.5294117647058824, 5: 0.15384615384615385, 6: 0.5833333333333334, 7: 0.6666666666666666, 8: 0.75, 9: 0.5714285714285714, 10: 0.5, 11: 0.8333333333333334, 12: 1.0, 13: 1.0, 14: 0.4, 15: 0.6, 16: 1.0, 17: 0.0, 18: 0.75, 19: 0.25, 20: 0.75, 21: 0.75, 22: 1.0, 23: 1.0, 24: 1.0, 25: 1.0, 26: 0.6666666666666666, 27: 0.6666666666666666, 28: 0.6666666666666666, 29: 0.5, 30: 0.5, 31: 0.5, 32: 0.5, 33: 1.0, 34: 0.5, 35: 1.0, 36: 0.5, 37: 1.0, 38: 1.0, 39: 1.0, 40: 0.0, 41: 1.0, 42: 1.0, 43: 0.0, 44: 0.0, 45: 0.0, 46: 1.0, 47: 1.0, 48: 0.0, 49: 1.0, 50: 0.0, 51: 1.0, 52: 0.0, 53: 0.0, 54: 1.0, 55: 0.0, 56: 1.0, 57: 0.0, 58: 1.0, 59: 1.0, 60: 1.0, 61: 1.0, 62: 1.0, 63: 0.0, 64: 1.0, 65: 1.0, 66: 1.0, 67: 0.0, 68: 1.0, 69: 0.0, 70: 1.0, 71: 1.0, 72: 1.0, 73: 1.0}
