# Classification Models.
  #### Each snippet of code has been commited out and it explains its purpose. Do let me know if any changes are required. Thank you.

In [113]:
#Importing all the necessary packages for data analysis and classifications

import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt
import math
import re
from sklearn import feature_selection
from sklearn.metrics import accuracy_score

## Data Preprocessing and cleaning

In [114]:
#Importing Dataset

df =  pd.read_csv('df.csv',  thousands=r' ')
df.head()

Unnamed: 0,_id,query_poste,query_ville,title,nom_de_la_boite,adress,text,date,lien_plus_info,contract,salary
0,pj_4cdc829d1a0fc791,data scientist,Île-de-France,Data Scientist (F/H),Novencia,- Paris (75),Paris (75)\r\nContexte\r\nData is fuel ! Quell...,"02/05/2019, 22:34:36",https://www.indeed.fr/rc/clk?jk=4cdc829d1a0fc7...,,
1,pj_84710307e240f03d,data scientist,Île-de-France,Administrateur Sales Force (H/F),OPEN XEROX,- Paris (75),"Paris (75)\r\nTemps plein, CDI\r\n2 500 € - 3 ...","10/09/2019, 10:34:37",https://www.indeed.fr/rc/clk?jk=4cdc829d1a0fc7...,"Temps plein, CDI",2 500 € - 3 000 € par mois
2,pj_4c2e1a76922820e4,data scientist,Île-de-France,Data Scientist H/F,AMD CONSULTING,- Paris (75),Paris (75)\r\n35 000 € - 55 000 € par an\r\nAM...,"18/09/2019, 10:34:38",https://www.indeed.fr/rc/clk?jk=4c2e1a76922820...,"Temps plein, CDI",35 000 € - 55 000 € par an
3,pj_41fafa94559b8b2d,data scientist,Île-de-France,Data Senior Developer - H/F,POLYCONSEIL,- Paris 8e (75),Paris 8e (75)\r\nCDI\r\n55 000 € par an\r\nPyt...,"28/09/2019, 10:34:39",https://www.indeed.fr/rc/clk?jk=41fafa94559b8b...,CDI,55 000 € par an
4,pj_5b66e921462381a3,data scientist,Île-de-France,Lead data engineer H/F,Car&Boat Media,- Paris 9e (75),Paris 9e (75)\r\nCDI\r\n65 000 € par an\r\nSpa...,"25/09/2019, 10:34:39",https://www.indeed.fr/rc/clk?jk=5b66e921462381...,CDI,65 000 € par an


In [115]:
#Analysing the Datatypes
df.dtypes

_id                object
query_poste        object
query_ville        object
title              object
nom_de_la_boite    object
adress             object
text               object
date               object
lien_plus_info     object
contract           object
salary             object
dtype: object

In [116]:
# Determining the cleaness of the dataset.

# The total number of missing values is 0 in only 6 variables which means no such missing operations needs to be applied
# on them. The rest of features include nom_de_la_boite containing 10, date containing 3, lien_plus_info (it ccontains 
# the url of the source from which the observation has been received but this is not much important here since our 
# objective is to find the relationship using classification, rather not to validate the authenticity of the data) 206, contract 
# containing 4718 and the salary containing 6581 missing values. 

# Therefore the most important variable is salary whose missing values has to be treated.


df.isnull().sum()

_id                   0
query_poste           0
query_ville           0
title                 0
nom_de_la_boite      10
adress                0
text                  0
date                  3
lien_plus_info      206
contract           4718
salary             6581
dtype: int64

In [117]:
# The observation containing the missing value of the salary can not be dropped because they are about 75% of the total data.
# The remaining observations will be very less to generalize the model.

df['salary'].value_counts()

35 000 € - 45 000 € par an    110
40 000 € - 45 000 € par an     92
40 000 € - 50 000 € par an     72
45 000 € par an                66
35 000 € - 40 000 € par an     62
30 000 € - 35 000 € par an     59
40 000 € par an                58
55 000 € par an                57
45 000 € - 55 000 € par an     50
50 000 € par an                50
45 000 € - 50 000 € par an     49
30 000 € - 40 000 € par an     46
40 000 € - 60 000 € par an     45
65 000 € par an                41
40 000 € - 55 000 € par an     40
35 000 € - 50 000 € par an     38
30 000 € - 50 000 € par an     36
35 000 € - 55 000 € par an     34
1 200 € par mois               33
34 000 € - 38 000 € par an     32
38 000 € - 42 000 € par an     30
50 000 € - 65 000 € par an     29
60 000 € - 70 000 € par an     27
48 000 € - 55 000 € par an     26
50 000 € - 55 000 € par an     25
60 000 € par an                22
60 000 € - 80 000 € par an     22
40 000 € - 65 000 € par an     20
35 000 € - 42 000 € par an     19
30 000 € - 37 

In [118]:
#There are 263 unique values in the salary column with 5 different units which are

#jour - Salary per day
#semanie - Salary per week
#mois -  Salary per month
#an -  Salary per annual year
#heure -  Salary per hour

# Let's find out how many observations are there in each cautegory.

yearly = 0
monthly = 0
weekly = 0
daily = 0
hourly = 0

def isNaN(num):
    return num != num

for i in df['salary']:
    
    if( not isNaN(i)):
        
        if('an' in i):
            yearly = yearly + 1
        elif('mois' in i):
            monthly = monthly + 1
        elif('semaine' in i):
            weekly = weekly + 1
        elif('heure' in i):
            hourly = hourly + 1
        elif('jour' in i):
            daily = daily + 1
            
print("Yearly: ", yearly, "\nMonthly: ", monthly, "\nweekly: ", weekly, "\ndaily: ", daily, "\nhourly: ", hourly)


print("\nTotal: ", yearly + monthly + weekly + hourly + daily)

print("\nNon Missing values: ", df.shape[0] - df['salary'].isnull().sum())


Yearly:  2057 
Monthly:  340 
weekly:  101 
daily:  31 
hourly:  32

Total:  2561

Non Missing values:  2561


In [119]:
# The majority of the observations are in years, therefore converting all the different units to the year.

for i in range(len(df['salary'])):
    
    if( not isNaN(df['salary'].iloc[i])):  
        
        if('an' in df['salary'].iloc[i]):
              
            df['salary'].iloc[i] = df['salary'].iloc[i].replace(" ","")
            temp = re.findall(r'\d+', df['salary'].iloc[i]) 
            res = list(map(int, temp))
            if(len(res)>1):
                final = str(res[0]) + " - " + str(res[1])
            else:
                final  = str(res[0])
            df['salary'].iloc[i] = final
            
        elif('mois' in df['salary'].iloc[i]):
            df['salary'].iloc[i] = df['salary'].iloc[i].replace(" ","")
            temp = re.findall(r'\d+', df['salary'].iloc[i]) 
            res = list(map(int, temp))
            
            if(len(res)>1):
                res[0] = res[0] * 12
                res[1] = res[1] * 12
            
                final = str(res[0]) + " - " + str(res[1])
            else:
                res[0] = res[0] * 12
            
                final  = str(res[0])
            
            df['salary'].iloc[i] = final
            
        elif('semaine' in df['salary'].iloc[i]):
            df['salary'].iloc[i] = df['salary'].iloc[i].replace(" ","")
            temp = re.findall(r'\d+', df['salary'].iloc[i]) 
            res = list(map(int, temp))
            if(len(res)>1):
                res[0] = res[0] * 4 * 12
                res[1] = res[1] * 4 * 12
            
                final = str(res[0]) + " - " + str(res[1])
            else:
                res[0] = res[0] * 4 * 12
                
                final  = str(res[0])
            
            df['salary'].iloc[i] = final
            
        elif('heure' in df['salary'].iloc[i]):
            df['salary'].iloc[i] = df['salary'].iloc[i].replace(" ","")
            temp = re.findall(r'\d+', df['salary'].iloc[i]) 
            res = list(map(int, temp))
            
            if(len(res)>1):
                res[0] = res[0] * 8 * 5 * 4  * 12
                res[1] = res[1] * 8 * 5 * 4  * 12
                final = str(res[0]) + " - " + str(res[1])
            else:
                res[0] = res[0] * 8 * 5 * 4  * 12
                
                final  = str(res[0])
            
            df['salary'].iloc[i] = final
            
        elif('jour' in df['salary'].iloc[i]):
            
            df['salary'].iloc[i] = df['salary'].iloc[i].replace(" ","")
            temp = re.findall(r'\d+', df['salary'].iloc[i]) 
            res = list(map(int, temp))
            
            
            if(len(res)>1):
                
                res[1] = res[1] * 5 * 4  * 12
                final = str(res[0]) + " - " + str(res[1])
            else:
                res[0] = res[0] * 5 * 4  * 12
                final  = str(res[0])
            
            df['salary'].iloc[i] = final
            

In [120]:
# Now we have converted the ambigious and messy salary data into a structured data whose unit is in annual salary.

#Now the job is to replace the missing values.

df.groupby(['query_poste'])['salary'].value_counts()


query_poste            salary        
business intelligence  40000 - 45000     16
                       30000 - 40000     11
                       32000 - 50000     11
                       6600               9
                       55000 - 70000      8
                       30000 - 37000      7
                       30000 - 42000      7
                       38000 - 42000      6
                       30000 - 38000      3
                       35000 - 45000      3
                       45000              3
                       21000              2
                       21000 - 22800      2
                       34000 - 40000      2
                       36000 - 45000      2
                       40000 - 46000      2
                       45000 - 55000      2
                       8400               2
                       28000 - 45000      1
data analyst           35000 - 50000     28
                       45000             22
                       55000          

In [121]:
#Replacing the missing values by the most repeated value (mode) of each group i.e of each post

data_scientist = df[df['query_poste'] == 'data scientist']['salary'].mode()
bi = df[df['query_poste'] == 'business intelligence']['salary'].mode()
da = df[df['query_poste'] == 'data analyst']['salary'].mode()
dev = df[df['query_poste'] == 'développeur']['salary'].mode()

for i in range(len(df['salary'])):
    
    if(isNaN(df['salary'].iloc[i])):  
        
        if(df['query_poste'].iloc[i] == 'data scientist'):
            df['salary'].iloc[i] = data_scientist[0]
        elif(df['query_poste'].iloc[i] == 'business intelligence'):
            df['salary'].iloc[i] = bi[0]
        elif(df['query_poste'].iloc[i] == 'data analyst'):
            df['salary'].iloc[i] = da[0]
        elif(df['query_poste'].iloc[i] == 'développeur'):
            df['salary'].iloc[i] = dev[0]
        
            

In [122]:
#Now we can see that they are no missing values at all. That's where the preprocessing of the data is completed.
#Now we can apply the models and classify the data.

df['salary'].isnull().sum()

0

In [123]:
#Need to make classes(bins) of the salary so that we can use the classification algorithms.

#After analysing the data and distribution of the classes in the salary column. we can now divide the salary into five different
#classes as mentioned below.

# very high - >=65000 
# high - 50000 - 65000
# average  - 40000 - 50000
# low  - 30000 - 40000
# very low - <=30000


for i in range(len(df['salary'])): 
              
            temp = re.findall(r'\d+', df['salary'].iloc[i]) 
            res = list(map(int, temp))
            if(len(res)>1):
                
                if(res[1]>=65000):
                    df['salary'].iloc[i] = 'very high'
                if(res[1]>=50000 and res[1]<65000):
                    df['salary'].iloc[i] = 'high'
                if(res[1]>=40000 and res[1]<50000):
                    df['salary'].iloc[i] = 'average'
                if(res[1]>=30000 and res[1]<40000):
                    df['salary'].iloc[i] = 'low'
                if(res[1]<30000):
                    df['salary'].iloc[i] = 'very low'
            else:
                if(res[0]>=65000):
                    df['salary'].iloc[i] = 'very high'
                if(res[0]>=50000 and res[0]<65000):
                    df['salary'].iloc[i] = 'high'
                if(res[0]>=40000 and res[0]<50000):
                    df['salary'].iloc[i] = 'average'
                if(res[0]>=30000 and res[0]<40000):
                    df['salary'].iloc[i] = 'low'
                if(res[0]<30000):
                    df['salary'].iloc[i] = 'very low'
                    

        


In [124]:
# The data is all set to apply the algorithms of classification now. The salaries have been classified into five
# different classes.

df['salary']

0       very high
1             low
2            high
3            high
4       very high
5       very high
6       very high
7       very high
8       very high
9       very high
10      very high
11      very high
12      very high
13      very high
14      very high
15      very high
16      very high
17      very high
18      very high
19           high
20           high
21           high
22           high
23           high
24           high
25           high
26           high
27           high
28           high
29           high
          ...    
9112      average
9113      average
9114      average
9115      average
9116      average
9117      average
9118      average
9119      average
9120      average
9121      average
9122      average
9123      average
9124      average
9125      average
9126      average
9127      average
9128      average
9129      average
9130      average
9131      average
9132      average
9133      average
9134      average
9135      average
9136      

## Prediction and Model building 

In [125]:
#Finding the correlation of the numerical features with target variable.

#ID variables are always to be removed since they does not make any sense to use for training.

# Analysing the categorical variables

important_categorical_varaibles = ['query_poste','query_ville']
important_categorical_varaibles = df[important_categorical_varaibles]
important_dummies = pd.get_dummies(important_categorical_varaibles)
target_variable = df['salary']
important_dummies

Unnamed: 0,query_poste_business intelligence,query_poste_data analyst,query_poste_data scientist,query_poste_développeur,query_ville_Bordeaux,query_ville_Bordeaux&start=40,query_ville_Lyon,query_ville_Nantes,query_ville_Nantes&start=660,query_ville_Toulouse,query_ville_Île-de-France
0,0,0,1,0,0,0,0,0,0,0,1
1,0,0,1,0,0,0,0,0,0,0,1
2,0,0,1,0,0,0,0,0,0,0,1
3,0,0,1,0,0,0,0,0,0,0,1
4,0,0,1,0,0,0,0,0,0,0,1
5,0,0,1,0,0,0,0,0,0,0,1
6,0,0,1,0,0,0,0,0,0,0,1
7,0,0,1,0,0,0,0,0,0,0,1
8,0,0,1,0,0,0,0,0,0,0,1
9,0,0,1,0,0,0,0,0,0,0,1


In [126]:
# Applying Chi Square to find the correlation between the two categorical variables.

feature_selection.chi2(important_dummies, target_variable)

(array([ 628.53151563, 3452.33470377, 4582.70601928,  859.97445637,
          29.04422965,  348.55269923,  229.83664129,  367.15049498,
         129.78628963,  426.01452206, 1077.05137719]),
 array([1.03464731e-134, 0.00000000e+000, 0.00000000e+000, 7.82319058e-185,
        7.65731580e-006, 3.60135850e-074, 1.43139520e-048, 3.47101476e-078,
        4.32618939e-027, 6.64584851e-091, 7.13308187e-232]))

In [127]:
important_dummies.columns

Index(['query_poste_business intelligence', 'query_poste_data analyst',
       'query_poste_data scientist', 'query_poste_développeur',
       'query_ville_Bordeaux', 'query_ville_Bordeaux&start=40',
       'query_ville_Lyon', 'query_ville_Nantes',
       'query_ville_Nantes&start=660', 'query_ville_Toulouse',
       'query_ville_Île-de-France'],
      dtype='object')

In [132]:
X_data = important_dummies[['query_poste_business intelligence', 'query_poste_data analyst',
       'query_poste_data scientist', 'query_poste_développeur',
       'query_ville_Bordeaux', 'query_ville_Bordeaux&start=40',
       'query_ville_Lyon', 'query_ville_Nantes',
       'query_ville_Nantes&start=660', 'query_ville_Toulouse',
       'query_ville_Île-de-France']]
Y_data  = df['salary']

## Correlation Matrix showing the relationship of City, Post and Salary.

In [129]:
dummy_data = X_data
dummy_data['salary'] = df['salary']
dummy_data.corr()

Unnamed: 0,query_poste_business intelligence,query_poste_data analyst,query_poste_data scientist,query_poste_développeur,query_ville_Bordeaux,query_ville_Bordeaux&start=40,query_ville_Lyon,query_ville_Nantes,query_ville_Nantes&start=660,query_ville_Toulouse,query_ville_Île-de-France
query_poste_business intelligence,1.0,-0.1447,-0.160187,-0.525716,0.091251,-0.111639,-0.005444,-0.076594,-0.066632,-0.04357,0.121701
query_poste_data analyst,-0.1447,1.0,-0.134136,-0.440218,-0.014674,-0.093483,-0.084747,-0.077452,-0.055795,-0.093957,0.244299
query_poste_data scientist,-0.160187,-0.134136,1.0,-0.487335,-0.033108,-0.103488,-0.128081,-0.101349,-0.061767,-0.099026,0.307781
query_poste_développeur,-0.525716,-0.440218,-0.487335,1.0,-0.034282,0.212356,0.146376,0.175119,0.126745,0.159992,-0.456815
query_ville_Bordeaux,0.091251,-0.014674,-0.033108,-0.034282,1.0,-0.054531,-0.087364,-0.067222,-0.032547,-0.08345,-0.190679
query_ville_Bordeaux&start=40,-0.111639,-0.093483,-0.103488,0.212356,-0.054531,1.0,-0.115548,-0.088909,-0.043047,-0.110372,-0.252194
query_ville_Lyon,-0.005444,-0.084747,-0.128081,0.146376,-0.087364,-0.115548,1.0,-0.14244,-0.068965,-0.176825,-0.404036
query_ville_Nantes,-0.076594,-0.077452,-0.101349,0.175119,-0.067222,-0.088909,-0.14244,1.0,-0.053066,-0.136059,-0.310887
query_ville_Nantes&start=660,-0.066632,-0.055795,-0.061767,0.126745,-0.032547,-0.043047,-0.068965,-0.053066,1.0,-0.065876,-0.150523
query_ville_Toulouse,-0.04357,-0.093957,-0.099026,0.159992,-0.08345,-0.110372,-0.176825,-0.136059,-0.065876,1.0,-0.385937


## Logistic Regression

In [133]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_data, Y_data, test_size=0.2, random_state=0)

logreg = LogisticRegression()

logreg.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [134]:
y_pred = logreg.predict(X_test)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logreg.score(X_test, y_test)))

Accuracy of logistic regression classifier on test set: 0.81


In [135]:
# Accuracy of Logistic Regression turns out to be 0.81 on the test set.

## Random Forest

In [136]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

In [137]:
clf = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0)
clf.fit(X_train, y_train)  

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=2, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [144]:
y_pred = clf.predict(X_test)
print('Accuracy of Random forest on test set: {:.2f}'.format(clf.score(X_test, y_test)))

Accuracy of Random forest on test set: 0.81


In [145]:
# Accuracy of Random Forest turns out to be 0.81 on the test set as well.

## Radial basis Function Kernel (SVM)



In [146]:
from sklearn.svm import SVC

# Create a SVC classifier using an RBF kernel with gamma auto selected.

clf = SVC(gamma='auto')

clf.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [147]:
y_pred = clf.predict(X_test)
print('Accuracy of Radial basis function kernal (SVM) classifier on test set: {:.2f}'.format(clf.score(X_test, y_test)))
accuracy_score(y_test, y_pred)

Accuracy of Radial basis function kernal (SVM) classifier on test set: 0.81


0.8086386003280481

In [148]:
# Create a SVC classifier using an RBF kernel with custom gamma of 0.01

svm = SVC(kernel='rbf', random_state=0, gamma=.01, C=1)
# Train the classifier
svm.fit(X_train, y_train)

SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.01, kernel='rbf',
    max_iter=-1, probability=False, random_state=0, shrinking=True, tol=0.001,
    verbose=False)

In [154]:
y_pred = clf.predict(X_test)
print('Accuracy of SVC classifier with gamma 0.01 on test set: {:.2f}'.format(clf.score(X_test, y_test)))



Accuracy of SVC classifier with gamma 0.01 on test set: 0.81


## Gradient Boosting Classifier

In [150]:
# import machine learning algorithms
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc

In [151]:
# train with Gradient Boosting algorithm
# compute the accuracy scores on train and validation sets when training with different learning rates

learning_rates = [0.05, 0.1, 0.25, 0.5, 0.75, 1]

for learning_rate in learning_rates:
    gb = GradientBoostingClassifier(n_estimators=20, learning_rate = learning_rate, max_features=2, max_depth = 2, random_state = 0)
    gb.fit(X_train, y_train)
    print("Learning rate: ", learning_rate)
    print("Accuracy score (training): {0:.3f}".format(gb.score(X_train, y_train)))
    print("Accuracy score (validation): {0:.3f}".format(gb.score(X_test, y_test)))
    print()

Learning rate:  0.05
Accuracy score (training): 0.783
Accuracy score (validation): 0.779

Learning rate:  0.1
Accuracy score (training): 0.803
Accuracy score (validation): 0.800

Learning rate:  0.25
Accuracy score (training): 0.811
Accuracy score (validation): 0.809

Learning rate:  0.5
Accuracy score (training): 0.811
Accuracy score (validation): 0.808

Learning rate:  0.75
Accuracy score (training): 0.811
Accuracy score (validation): 0.808

Learning rate:  1
Accuracy score (training): 0.811
Accuracy score (validation): 0.808



In [152]:
# Output confusion matrix and classification report of Gradient Boosting algorithm on validation set

gb = GradientBoostingClassifier(n_estimators=20, learning_rate = 0.5, max_features=2, max_depth = 2, random_state = 0)
gb.fit(X_train, y_train)
predictions = gb.predict(X_test)

print("Confusion Matrix:")
print(confusion_matrix(y_test, predictions))
print()
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix:
[[1136   11    0   13    0]
 [  79  169    0   33    0]
 [  68    2    0    5    0]
 [  36   15    0  173    0]
 [  69   11    0    9    0]]

Classification Report
              precision    recall  f1-score   support

     average       0.82      0.98      0.89      1160
        high       0.81      0.60      0.69       281
         low       0.00      0.00      0.00        75
   very high       0.74      0.77      0.76       224
    very low       0.00      0.00      0.00        89

    accuracy                           0.81      1829
   macro avg       0.47      0.47      0.47      1829
weighted avg       0.73      0.81      0.76      1829



  'precision', 'predicted', average, warn_for)


## XGBoost Classifier

In [None]:
from xgboost import XGBClassifier
xgb_clf = XGBClassifier()
xgb_clf.fit(X_train, y_train)

In [None]:
score = xgb_clf.score(X_test, y_test)
print(score)