## Import section

In [None]:
import sys
import scipy
import numpy as np
import matplotlib
import pandas
import sklearn
from pandas import read_csv
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split, learning_curve, permutation_test_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score

from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

from sklearn.decomposition.pca import PCA

In [None]:
# Load the result table, discard the listed columns (none of them is
# referenced by the feature/target selection below), then drop every row
# that still contains a missing value.
df = (
    read_csv("./data/result_table.csv", sep=",")
    .drop(columns=[
        'ID',
        'street',
        'country_code',
        'address_text',
        'marker_icon',
        'company_url',
        'latitude',
        'longitude',
        'employment_type',
        'published_at',
        'remote_interview',
        'id',
        'company_logo_url',
        'skills',
    ])
    .dropna()
)


In [None]:
# Columns used as classifier inputs; each name must exist as a column of `df`.
x_features = [
    '.NET', 'Design', 'Network', 'REST API', 'Embeded', 'Cloud', 'Database',
    'Android', 'IT', 'Soft Skills', 'Scrum master', 'Mobile', 'Common',
    'JavaScript', 'DevOps', 'Software engineering', 'Testing', 'Automation',
    'Shell Scripting', 'Backend', 'Data Science', 'Blockchain', 'C++',
    'Client Service', 'Front-end', 'Civil Engineering', 'Developer', 'iOS',
    'SQL', 'Python', 'PHP', 'Erlang', 'Scala', 'Git', 'Games', 'Golang',
    'Google', 'Java', 'Web', 'Consultant', 'Project Manager', 'Analitics',
    'CRM', 'Perl', 'R', 'Ruby', 'Rust', 'SAP', 'CSS', 'XML', 'Salesforce',
    'API', 'Data', 'Excel', 'Vert.x',
]
# Target column, kept as a one-element list so `y` is a single-column DataFrame.
y_column = ['experience_level']

x = df[x_features]
y = df[y_column]

# Hold out 40% of the rows for testing; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.4,
    random_state=2,
)

In [None]:
# Classifiers under comparison as (label, estimator, index) triples; the
# evaluation loop unpacks the third element as `subplot_row`.
# The SVM and MLP entries are left commented out, i.e. disabled.
models = [
    ('KNN', KNeighborsClassifier(), 0),
    ('CART', DecisionTreeClassifier(), 1),
    ('NB', GaussianNB(), 2),
    # ('SVM', SVC(gamma='auto'), 3),
    # ('MLP', MLPClassifier(alpha=1e-5, hidden_layer_sizes=(50,10), max_iter=5000), 4)
]

In [None]:
def get_learning_curve(classification_model, training_set_enlarging_step=10):
    """Compute learning-curve scores for a classifier on the global ``x``/``y``.

    Args:
        classification_model: Estimator handed to ``learning_curve``.
        training_set_enlarging_step: Number of evenly spaced training-set
            fractions, from 10% to 100%, at which the model is scored.

    Returns:
        The ``(train_sizes, train_scores, validation_scores)`` triple
        produced by ``learning_curve`` using 5-fold cross-validation and
        accuracy scoring.
    """
    fractions = np.linspace(0.1, 1, training_set_enlarging_step)
    sizes, fit_scores, held_out_scores = learning_curve(
        classification_model,
        x,
        y,
        train_sizes=fractions,
        cv=5,
        scoring='accuracy',
    )
    return sizes, fit_scores, held_out_scores

def plot_learning_curve(model, name):
    """Plot mean training and validation accuracy against training-set size.

    The scores come from ``get_learning_curve``, which uses accuracy scoring,
    so they are plotted as-is (higher is better) and labelled as accuracy.

    Args:
        model: Estimator whose learning curve is drawn.
        name: Label inserted into the plot title.
    """
    train_sizes, train_scores, test_scores = get_learning_curve(model)

    # Matplotlib 3.6 renamed the bundled 'seaborn' style to 'seaborn-v0_8';
    # pick whichever name this installation provides.
    style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
    plt.style.use(style)

    # Average over the cross-validation folds (axis 1) for each training size.
    plt.plot(train_sizes, train_scores.mean(axis=1), color='red', label='Training accuracy')
    plt.plot(train_sizes, test_scores.mean(axis=1), color='navy', label='Validation accuracy')
    plt.ylabel('Accuracy', fontsize=14)
    plt.xlabel('Training set size', fontsize=14)
    plt.title('Learning curves for a %s' % name, fontsize=18, y=1.03)
    plt.legend()
    plt.show()

def print_scores(cv_results, predictions):
    """Print cross-validation statistics and test-set metrics.

    Predictions are compared against the global ``y_test``.

    Args:
        cv_results: Array of per-fold cross-validation scores.
        predictions: Predicted labels for the test set.
    """
    cv_mean = cv_results.mean()
    print('\nMean %f' % cv_mean)
    cv_spread = cv_results.std()
    print('STD %f' % cv_spread)

    print('\nConfusion matrix:')
    matrix = confusion_matrix(y_test, predictions)
    print(matrix)

    accuracy = accuracy_score(y_test, predictions)
    print('\nAccuracy %f' % accuracy)
    # Precision and recall are averaged over classes, weighted by class support.
    precision = precision_score(y_test, predictions, average='weighted')
    print('Precision %f' % precision)
    recall = recall_score(y_test, predictions, average='weighted')
    print('Recall %f' % recall)

    print('\nClassification report:')
    report = classification_report(y_test, predictions)
    print(report)

In [None]:
# Local to this cell: wraps PCA and a classifier into a single estimator.
from sklearn.pipeline import make_pipeline

# One split is enough: nothing below overwrites the train/test partitions.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.4, random_state=2)

# Per-run cross-validation scores and their labels, index-aligned.
results = []
names = []

for name, model, subplot_row in models:
    # Each classifier is evaluated twice: on the raw features, and behind a
    # 10-component PCA. Putting PCA inside a pipeline means the projection is
    # fitted only on the training part of every split (no test-set leakage)
    # and the learning curve of the second run really includes the PCA step.
    variants = (
        (f"Running classification for: {name}", name, model),
        (
            f"Running classification after PCA: {name}",
            f"{name} + PCA",
            make_pipeline(PCA(n_components=10), model),
        ),
    )

    for banner, label, estimator in variants:
        print(f"---------------------------\n{banner}")
        kfold = StratifiedKFold(n_splits=10, random_state=1, shuffle=True)
        cv_results = cross_val_score(estimator, x_train, y_train, cv=kfold, scoring='accuracy')
        results.append(cv_results)
        names.append(label)

        # Make predictions on the held-out test set.
        estimator.fit(x_train, y_train)
        predictions = estimator.predict(x_test)

        print_scores(cv_results, predictions)
        plot_learning_curve(estimator, label)
