### Load packages and own functions from functions.py file

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from collections import Counter
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn import datasets
from sklearn import svm
import xgboost as xgb
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectFromModel


from functions import AddBinaryString, Scoring,plot3

### Load data files

In [None]:
# Read the user table and the job-description table (in that order) from the
# applicant-material folder located one level above the working directory.
dfUser, dfJobs = (
    pd.read_csv(csvPath)
    for csvPath in (
        "../applicant_material/user.csv",
        "../applicant_material/job_desc.csv",
    )
)

### Clean data

In [None]:
# Drop the first character of every user_id and convert the remainder to int
# (assumes ids look like "<one prefix character><digits>" -- TODO confirm against the CSVs).
dfUser.user_id = dfUser.user_id.apply(lambda x: int(x[1:]))
dfJobs.user_id = dfJobs.user_id.apply(lambda x: int(x[1:]))

# Column-wise concatenation of both tables. This is built *before* the title
# clean-up below, so dfAll keeps the raw job titles.
dfAll = pd.concat([dfUser, dfJobs], axis=1)

# Replace brackets by spaces and make all letters lowercase.
# regex=False is passed explicitly: the default of `regex` differs between
# pandas versions, and a lone '(' or ')' is not a valid regular expression.
dfJobs.job_title_full = (
    dfJobs.job_title_full
    .str.replace('(', ' ', regex=False)
    .str.replace(')', ' ', regex=False)
    .str.lower()
)

###  Prepare jobs-data (one-hot encoding of strings)

In [None]:
# Switches controlling which derived columns are added to dfJobs.
salaryBool, keywordsBool = True, False
categCompaniesBool, categJobDescrBool = False, False

# Binary flag: 1 where a salary value is present, 0 where it is missing.
if salaryBool:
    dfJobs['Salary_Bool'] = np.where(dfJobs['salary'].notnull(), 1, 0)

# One binary column per word occurring in the job titles (via AddBinaryString
# from functions.py), ordered from most to least frequent word.
if keywordsBool:
    ListOfMostCommonWords = Counter(" ".join(dfJobs["job_title_full"]).split()).most_common()
    ListOfStrings = [word for word, _ in ListOfMostCommonWords]
    # Hand-picked alternatives that were tried instead of the full vocabulary:
    #ListOfStrings = ['Manager','Junior','Senior', 'Lead', 'Remote', 'M/F', 'Backend','Analyst']
    #ListOfStrings = ['product','machine','apac', 'pricing', 'manager', 'pricing', 'ux', 'owner','checkout']
    dfJobs = AddBinaryString(dfJobs, ListOfStrings)

# Dummy-encode at most one text column; the company switch takes precedence
# over the job-title switch.
dummyColumn = None
if categCompaniesBool:
    dummyColumn = 'company'
elif categJobDescrBool:
    dummyColumn = 'job_title_full'

if dummyColumn is not None:
    dfJobsDummies = pd.get_dummies(dfJobs, columns=[dummyColumn])
    dfJobs = dfJobsDummies

# Last expression of the cell: shows the resulting column index in the notebook.
dfJobs.columns

### Select features

In [None]:
# Label of the feature set; used by later cells in plot labels and file names.
featureName = 'Full'

# Select from which datasets we want which columns.
jobsBool    = True
userBool    = True
dropColJobs = ['job_title_full', 'user_id', 'company']   # columns to drop in Jobs dataframe
dropColUser = ['has_applied', 'user_id']                 # columns to drop in User dataframe

# Without at least one source table there is nothing to build `features` from;
# fail here with a clear message instead of a NameError further down.
if not (jobsBool or userBool):
    raise ValueError('At least one of jobsBool / userBool must be True.')

selectedFrames = []
if jobsBool:
    # errors='ignore': 'company' or 'job_title_full' no longer exists as a
    # plain column once it has been dummy-encoded in the preparation cell.
    selectedFrames.append(dfJobs.drop(dropColJobs, axis=1, errors='ignore'))
if userBool:
    selectedFrames.append(dfUser.drop(dropColUser, axis=1))

# Column-wise concatenation of the selected tables (a single table is passed through).
features = pd.concat(selectedFrames, axis=1)

print('#####################################################################################'
      '\n# The by-hand selected features in the set named "{0}" are:  \n#######################\n\n'.format(featureName),list(features.columns))

### Split data, data scaling, feature selection/PCA

In [None]:
# Features and target variable.
X, y = features, dfUser.has_applied

# Preprocessing switches.
fillnan        = 'mean'
featureSelBool = False
polyBool       = False
scalingBool    = True
PCABool        = False

# Split into train/test sets (80/20, fixed seed).
# trainX/testX feed the hold-out evaluation; the full X is preprocessed
# separately below and is passed to plot3 / cross_val_score in later cells.
trainX, testX, trainy, testy = train_test_split(X, y, test_size=0.2, random_state=1)

# Fill NaNs with column means.
if fillnan == 'mean':
    # The test set is imputed with the *training* means so that no statistic
    # computed on the hold-out data enters the evaluation pipeline.
    trainMeans = trainX.mean()
    trainX = trainX.fillna(trainMeans)
    testX  = testX.fillna(trainMeans)
    X      = X.fillna(X.mean())

# Tree-based feature selection.
if featureSelBool:
    # Selector for the hold-out pipeline: fitted on the training split only.
    clf    = ExtraTreesClassifier(n_estimators=50).fit(trainX, trainy)
    model  = SelectFromModel(clf, prefit=True)
    trainX, testX = model.transform(trainX), model.transform(testX)

    # Independent selector for the full data set. X is reduced exactly once
    # (previously it was first reduced by the train selector and then again
    # by a second one fitted on the already-reduced columns).
    clf   = ExtraTreesClassifier(n_estimators=50).fit(X, y)
    model = SelectFromModel(clf, prefit=True)
    X     = model.transform(X)
    print(trainX.shape)

# Polynomial features.
if polyBool:
    poly   = preprocessing.PolynomialFeatures(degree=2, interaction_only=False)
    trainX = poly.fit_transform(trainX)
    # Re-use the expansion fitted on the training data.
    testX  = poly.transform(testX)
    # NOTE(review): X is not expanded here, unlike in the other steps --
    # confirm whether that is intended.

# Scaling.
if scalingBool:
    scaler = StandardScaler()
    scaler.fit(trainX)
    # Apply the train-fitted transform to both the training set and the test set.
    trainX, testX = scaler.transform(trainX), scaler.transform(testX)
    # The full data set gets its own scaler.
    scaler = StandardScaler()
    scaler.fit(X)
    X      = scaler.transform(X)

# PCA keeping 90% of the variance.
if PCABool:
    pca = PCA(.9)
    pca.fit(trainX)
    trainX = pca.transform(trainX)
    testX  = pca.transform(testX)
    # Re-fit the same object on the full data set before projecting X.
    pca.fit(X)
    X      = pca.transform(X)

# Initialize score containers filled by the model cells below.
scores    = []   # one [fpr, tpr, label] entry per classifier
scoresAuc = []   # one [mean_auc, std_auc] entry per classifier
scoresAuc = []

In [None]:
# Sanity check: print the dimensions of the training matrix.
print(trainX.shape)

## Logistic regression  

In [None]:
# Label of the classification technique (used in plot labels).
clTechnique = 'Logistic'

# Logistic regression with the lbfgs solver, fitted on the training split.
# (max_iter = 10000 was tried as an option and is currently left at its default.)
model = LogisticRegression(solver='lbfgs').fit(trainX, trainy)

# Hold-out evaluation: Scoring (functions.py) returns the ROC curve points,
# which are stored together with a "<featureName>_<technique>" label.
fpr, tpr = Scoring(model, testX, testy, featureName, clTechnique=clTechnique)
rocLabel = '_'.join([featureName, clTechnique])
scores.append([fpr, tpr, rocLabel])

# Evaluation on the full data set: plot3 (functions.py) returns mean and std of the AUC.
mean_auc, std_auc = plot3(model, X, y, featureName, clTechnique)
scoresAuc.append([mean_auc, std_auc])

## Support Vector machines

In [None]:
# Label of the classification technique (used in plot labels).
clTechnique = 'SVM'

# RBF-kernel support vector classifier; probability=True enables predict_proba.
clf = svm.SVC(kernel='rbf', C=1, probability=True)
clf.fit(trainX, trainy)

# Mean accuracy on the hold-out split.
print(clf.score(testX, testy))

# ROC curve points on the hold-out split, stored with a "<featureName>_<technique>" label.
fpr, tpr = Scoring(clf=clf, testX=testX, testy=testy,
                   featureName=featureName, clTechnique=clTechnique)
rocLabel = '_'.join([featureName, clTechnique])
scores.append([fpr, tpr, rocLabel])

# Mean and std of the AUC on the full data set, as returned by plot3.
mean_auc, std_auc = plot3(clf, X, y, featureName, clTechnique)
scoresAuc.append([mean_auc, std_auc])



In [None]:
# Cross-validation: 10 folds on the training split, scored by ROC AUC.
auc = cross_val_score(clf, trainX, trainy, cv=10, scoring='roc_auc')
print(auc)

# Average of the per-fold scores.
meanFoldAuc = auc.mean()
print("AUC of Model with Cross Validation is:", meanFoldAuc)

## Decision Tree

In [None]:
# Gradient-boosted trees via XGBoost, limited to depth-1 trees.
# Reference: https://www.datacamp.com/community/tutorials/xgboost-in-python
clTechnique = 'DecTree'

# Split the hold-out data in half: one half monitors training (eval_set),
# the other half is kept for the final scoring.
valX, test2X, valy, test2y = train_test_split(testX, testy, test_size=0.5, random_state=3)

# eval_metric is configured on the estimator: passing it to fit() is
# deprecated in XGBoost 1.6 and no longer accepted from 2.0 on.
xg_class = xgb.XGBClassifier(max_depth=1, eval_metric="auc")

eval_set = [(valX, valy)]
xg_class.fit(trainX, trainy, eval_set=eval_set, verbose=False)

# ROC curve points on the untouched half of the hold-out data.
fpr, tpr = Scoring(clf=xg_class, testX=test2X, testy=test2y,
                   featureName=featureName, clTechnique=clTechnique)
scores.append([fpr, tpr, featureName + '_' + clTechnique])

# Mean and std of the AUC on the full data set, as returned by plot3.
mean_auc, std_auc = plot3(xg_class, X, y, featureName, clTechnique)
scoresAuc.append([mean_auc, std_auc])

#### Cross validation

In [None]:
# 10-fold cross-validation of the XGBoost model on the full data set, scored by ROC AUC.
auc = cross_val_score(xg_class, X, y, cv=10, scoring='roc_auc')
print(auc)

# Average of the per-fold scores.
meanFoldAuc = auc.mean()
print("Auc of Model with Cross Validation is:", meanFoldAuc)

#### Plot Tree

In [None]:
# Draw the first tree (num_trees=0) of the fitted XGBoost model, save it as
# 'Tree<featureName>.pdf' in the working directory and display it.

# NOTE(review): the next line only sets an attribute named `figsize` on the
# pyplot module; matplotlib does not read it, so the figure keeps its default
# size. To really get a 50x10 inch figure, create it with
# plt.subplots(figsize=(50, 10)) and pass ax=... to plot_tree -- but revisit
# dpi = 2000 first, since the tree is drawn as a raster image.
plt.figsize=(50,10)
xgb.plot_tree(xg_class,num_trees=0,)
plt.savefig('Tree'+ featureName + '.pdf', dpi = 2000)
plt.show()

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Label of the classification technique (used in plot labels).
clTechnique = 'RandomForest'

# Random forest with 500 trees, fitted on the training split.
clf = RandomForestClassifier(n_estimators=500)
clf.fit(trainX, trainy)

# ROC curve points on the hold-out split, stored with a "<featureName>_<technique>" label.
fpr, tpr = Scoring(clf=clf, testX=testX, testy=testy,
                   featureName=featureName, clTechnique=clTechnique)
rocLabel = '_'.join([featureName, clTechnique])
scores.append([fpr, tpr, rocLabel])

# Mean and std of the AUC on the full data set, as returned by plot3.
mean_auc, std_auc = plot3(clf, X, y, featureName, clTechnique)
scoresAuc.append([mean_auc, std_auc])

##### Cross validation

In [None]:
# 10-fold cross-validation of the current `clf` on the full data set, scored by ROC AUC.
auc = cross_val_score(clf, X, y, scoring='roc_auc', cv=10)
print(auc)

# Average of the per-fold scores, reported on the same 0-1 scale as in the
# other cross-validation cells (this cell used to print the value times 100
# under the identical label).
print("Auc of Model with Cross Validation is:", auc.mean())

###  Neural Network

In [None]:
from sklearn.neural_network import MLPClassifier

# Label of the classification technique (used in plot labels).
clTechnique = 'Neuronal'

# Small multilayer perceptron: two hidden layers with 3 units each.
# NOTE(review): according to the scikit-learn documentation,
# validation_fraction is only used when early_stopping=True, which is not set
# here -- confirm whether early stopping was intended.
clf = MLPClassifier(solver='adam', activation='relu',
                    hidden_layer_sizes=(3, 3), random_state=1, max_iter=1000,
                    validation_fraction=0.2, learning_rate_init=0.001, shuffle=True)
clf.fit(trainX, trainy)

# Keep the ROC curve like for every other classifier, so that `scores` and
# `scoresAuc` stay aligned entry by entry (the result used to be discarded).
fpr, tpr = Scoring(clf=clf, testX=testX, testy=testy,
                   featureName=featureName, clTechnique=clTechnique)
scores.append([fpr, tpr, featureName + '_' + clTechnique])

# Mean and std of the AUC on the full data set, as returned by plot3.
mean_auc, std_auc = plot3(clf, X, y, featureName, clTechnique)
scoresAuc.append([mean_auc, std_auc])

In [None]:
# 10-fold cross-validation on the full data set, scored by ROC AUC.
# Evaluates `clf` (the classifier fitted in this section) rather than
# `xg_class`, which was a leftover from the XGBoost cross-validation cell.
auc = cross_val_score(clf, X, y, scoring='roc_auc', cv=10)
print(auc)

# Average of the per-fold scores.
print("Auc of Model with Cross Validation is:", auc.mean())

## Test AUC as a function of sample size

In [None]:
from sklearn.ensemble import RandomForestClassifier

print(trainX.shape)

# Sub-sample sizes to evaluate. Sizes larger than the training set are
# skipped because rows are drawn without replacement.
nTrain = len(trainX)
Ns = [size for size in [250, 500, 750, 1000, 1200, 1400, 1599] if size <= nTrain]

clTechnique = 'RandomForest'
# Random forest with 500 trees; re-fitted for every sample size.
clf = RandomForestClassifier(n_estimators=500)

aucs        = []   # mean cross-validated AUC per sample size
aucsStd     = []   # std of the cross-validated AUC per sample size
testedScore = []   # AUC on the hold-out split per sample size

# Convert once instead of on every iteration.
trainXarr = np.asarray(trainX)
trainyarr = np.asarray(trainy)

# Seeded generator so that the learning curve is reproducible.
rng = np.random.RandomState(1)

for N in Ns:
    print(N)
    # Draw N distinct rows. Sampling with replacement would duplicate rows,
    # and duplicates landing in different CV folds inflate the AUC estimate.
    index     = rng.choice(nTrain, size=N, replace=False)
    trainXcut = trainXarr[index]
    trainycut = trainyarr[index]

    # 10-fold cross-validated AUC on the sub-sample.
    auc = cross_val_score(clf, trainXcut, trainycut, scoring='roc_auc', cv=10)
    print("Auc of Model with Cross Validation is:", auc.mean())
    aucs.append(auc.mean())
    aucsStd.append(auc.std())

    # Fit on the whole sub-sample and score the hold-out split using the
    # predicted probability of the second class (column 1).
    clf.fit(trainXcut, trainycut)
    testProbs = clf.predict_proba(testX)[:, 1]
    testedScore.append(roc_auc_score(testy, testProbs))



In [None]:
import os

# Plot AUC as a function of sample size: cross-validated mean with std error
# bars, plus the AUC measured on the hold-out split.
fig, ax = plt.subplots(figsize=(3.3, 2.2))
ax.errorbar(Ns, aucs, aucsStd, color='C0', lw=1, label='AUC Cross Val')
ax.plot(Ns, testedScore, '*-C1', markersize=5, lw=1, label='AUC Test Data')
ax.legend()
ax.set_ylabel('ROC AUC')
ax.set_xlabel('Sample size')
fig.tight_layout()

# savefig does not create missing directories; make sure the target exists.
os.makedirs('results', exist_ok=True)
fig.savefig('results/AUC_vs_sampleSize2.pdf')
plt.show()
plt.close(fig)


