In [None]:
from sklearn import metrics
!pip install scipy numpy matplotlib pandas sklearn > /dev/null

In [None]:
# Load libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from pandas import read_csv
from pandas import read_json
from matplotlib import pyplot
from sklearn.model_selection import train_test_split, learning_curve, permutation_test_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report, roc_auc_score, roc_curve, plot_roc_curve, auc
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import calinski_harabasz_score, davies_bouldin_score, silhouette_score
from sklearn.cluster import KMeans
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
import json

<h2>Dataset configuration</h2>

In [None]:
class Config:
    """Plain holder for the settings of one dataset.

    Every constructor argument is stored unchanged under an attribute of the
    same name:

    * dataSourceUrl        -- location the dataset is read from
    * dataResultUrl        -- location for an exported copy of the dataset
    * test_size            -- value handed to the train/validation split
    * n_splits             -- number of cross-validation folds
    * should_describe_data -- whether the loading cell prints the dataset shape
    """

    def __init__(self, dataSourceUrl, dataResultUrl, test_size, n_splits, should_describe_data):
        # Input / output locations.
        self.dataSourceUrl, self.dataResultUrl = dataSourceUrl, dataResultUrl
        # Evaluation parameters.
        self.test_size, self.n_splits = test_size, n_splits
        # Reporting switch.
        self.should_describe_data = should_describe_data

def as_config(dict, dataSetName):
    """Build a Config from the ``dataSetName`` entry of a nested settings mapping.

    Args:
        dict: mapping of dataset name -> mapping holding the five Config fields.
            (The name shadows the built-in ``dict``; it is kept so that existing
            callers are unaffected.)
        dataSetName: key of the entry to convert.

    Returns:
        A Config populated from that entry.

    Raises:
        KeyError: if the dataset name or one of the five fields is missing.
    """
    entry = dict[dataSetName]
    field_order = (
        'dataSourceUrl',
        'dataResultUrl',
        'test_size',
        'n_splits',
        'should_describe_data',
    )
    return Config(*(entry[field] for field in field_order))

In [None]:
# Settings for every dataset, written as JSON text and keyed by dataset name.
# Each entry carries the five fields that as_config() reads.
json_config = """
{
    "it_data":{
        "dataSourceUrl": "./data/joinit_data.json",
        "dataResultUrl": "./data/joinit_data.csv",
        "test_size": 0.2,
        "n_splits": 10,
        "should_describe_data": true
    }
}"""
# Parse the JSON text; at this point `cfg` is a plain nested dict.
cfg = json.loads(json_config)

<h2>Load (and describe) dataset, create test and train datasets</h2>

In [None]:
# `cfg` starts out as the raw dict parsed from `json_config`. Convert it only
# while it is still a dict, so re-running this cell does not try to index into
# an already-built Config object (which would raise a TypeError).
if isinstance(cfg, dict):
    cfg = as_config(cfg, 'it_data')

dataset = read_json(cfg.dataSourceUrl)
# dataset.to_csv(cfg.dataResultUrl)
if cfg.should_describe_data:
    print(dataset.shape)

Load the generalized skill labels and add them as columns to the dataset.

In [None]:
# Skill -> generalized-label table; the file uses ':' as its field separator.
skill_map = read_csv('./data/skills_mapped.csv', sep=":")
unique_skill_columns = skill_map['mapping'].unique()

# Give the dataset one zero-filled column per distinct generalized label.
for skill in map(str, unique_skill_columns):
    dataset[skill] = 0

For each row, map every skill/level pair to its generalized column label.
The result is stored in `result_df` and saved to a file.

##### Note that this only works with a preformatted skills_mapped CSV.

In [None]:
# NOTE: `result_df` is the very same object as `dataset` (no copy is made), so
# the levels written below are also visible through `dataset`, which the later
# cells read.
result_df = dataset

# Build the Skill -> mapping lookup once instead of scanning `skill_map` for
# every skill of every row. keep='first' preserves the previous behaviour of
# using the first matching row of the CSV.
skill_to_label = (
    skill_map.drop_duplicates(subset='Skill', keep='first')
    .set_index('Skill')['mapping']
    .to_dict()
)

for index, row in dataset.iterrows():
    for skill_level_tuple in row['skills']:
        skill_name = skill_level_tuple['name']
        if skill_name not in skill_to_label:
            # Fail with the offending skill named, rather than an opaque IndexError.
            raise KeyError(f"Skill {skill_name!r} is missing from ./data/skills_mapped.csv")
        name = skill_to_label[skill_name]
        if name == '-':
            # '-' entries are skipped (presumably "no generalized label" -- confirm
            # against the CSV).
            continue
        level = skill_level_tuple['level']
        # Keep the highest level seen for this label; 0 means "nothing stored yet".
        if row[name] == 0 or not row[name] >= level:
            row[name] = level
    # `row` is a separate Series, so write it back explicitly.
    result_df.loc[index] = row
result_df.to_csv("./data/result_table.csv")

Old stuff after that cell

In [None]:
import difflib

# Collect every skill name that occurs in any row of the dataset.
skills_arr = [entry['name'] for skills_of_row in dataset['skills'] for entry in skills_of_row]
unique_skills = np.unique(np.array(skills_arr))

## Pass 1: drop skills that were already reported as a close match of an earlier skill.
similar = []
very_unique = []
for skill in unique_skills:
    if skill in similar:
        continue
    very_unique.append(skill)
    similar.extend(difflib.get_close_matches(skill, unique_skills, cutoff=0.1))

## Pass 2: keep only the first skill for each distinct first word.
first_word = []
very_very_unique = []
for skill in very_unique:
    leading_word = skill.split(' ')[0]
    if leading_word in first_word:
        continue
    first_word.append(leading_word)
    very_very_unique.append(skill)

# The printed count is the size after pass 1; the saved file holds the pass-2 result.
print(len(very_unique))
pd.DataFrame(very_very_unique).to_csv('./data/skills_double_trim.csv')

In [None]:
# Split the raw values into features (every column but the last) and the
# target (the last column).
array = dataset.values
x = array[:, :-1]
y = array[:, -1]

# From x and y, derive the training and validation arrays; the fixed seed keeps
# the split repeatable.
x_train, x_validation, y_train, y_validation = train_test_split(
    x,
    y,
    test_size=cfg.test_size,
    random_state=1,
)

<h2>Classification models</h2>

In [None]:
# (label, estimator, position) triples. The third item is unpacked as
# `subplot_row` by the evaluation code but is not otherwise used there.
models = [
    ('KNN', KNeighborsClassifier(), 0),
    ('CART', DecisionTreeClassifier(), 1),
    ('NB', GaussianNB(), 2),
    ('SVM', SVC(gamma='auto'), 3),
    ('MLP', MLPClassifier(alpha=1e-5, hidden_layer_sizes=(50, 10), max_iter=5000), 4),
]

<h2>Classification</h2>
kfold - k-krotna walidacja krzyżowa (k-fold cross-validation) to algorytm polegający na testowaniu nauczania (sprawdzaniu jego wydajności).
Zbiór TRENINGOWY jest dzielony na K podzbiorów. W każdej z k iteracji
brane jest k-1 podzbiorów, następuje ich nauczanie, a następnie sprawdzenie 'jakości' nauczonego modelu na pozostałym podzbiorze,
przy pomocy danego algorytmu uczenia maszynowego!

In [None]:
def plot_show():
    """Redraw the current figure, then pause for 0.1 seconds."""
    pause_seconds = 0.1
    pyplot.draw()
    pyplot.pause(pause_seconds)

def get_specificity(y_validate, y_predicted):
    """Return the specificity TN / (TN + FP), averaged over all classes.

    Args:
        y_validate: true labels.
        y_predicted: labels predicted for the same samples.

    Returns:
        The unweighted mean of the per-class specificities.
    """
    matrix = confusion_matrix(y_validate, y_predicted)

    # Per-class counts, one entry per class (one-vs-rest view of the matrix).
    true_pos = np.diag(matrix)
    false_pos = matrix.sum(axis=0) - true_pos
    false_neg = matrix.sum(axis=1) - true_pos
    true_neg = matrix.sum() - (false_pos + false_neg + true_pos)

    # Divide in floating point.
    true_neg = true_neg.astype(float)
    false_pos = false_pos.astype(float)

    per_class = true_neg / (true_neg + false_pos)
    return np.mean(per_class)
    
def get_learning_curve(classification_model, training_set_enlarging_step=10):
    """Compute learning-curve scores for a model on the module-level ``x`` / ``y``.

    Args:
        classification_model: estimator to evaluate.
        training_set_enlarging_step: number of evenly spaced training-set
            fractions between 0.1 and 1 to evaluate.

    Returns:
        The tuple (train_sizes, train_scores, validation_scores) produced by
        ``learning_curve`` with 5-fold cross-validation and accuracy scoring.
    """
    options = dict(
        estimator=classification_model,
        X=x,
        y=y,
        train_sizes=np.linspace(0.1, 1, training_set_enlarging_step),
        cv=5,
        scoring='accuracy',
    )
    sizes, fit_scores, cv_scores = learning_curve(**options)
    return sizes, fit_scores, cv_scores

def plot_learning_curve(model, name):
    """Plot mean training and validation accuracy against training-set size.

    Args:
        model: estimator handed to get_learning_curve().
        name: label inserted into the plot title.
    """
    train_sizes, train_scores, test_scores = get_learning_curve(model)

    # Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8*' and
    # later dropped the old names, so use the new name when it is available.
    style_name = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
    plt.style.use(style_name)

    # get_learning_curve() scores with 'accuracy', which is already "higher is
    # better": plot it as-is (not negated) and label the curves as accuracy so
    # they agree with the y-axis label.
    plt.plot(train_sizes, train_scores.mean(axis=1), color='red', label='Training accuracy')
    plt.plot(train_sizes, test_scores.mean(axis=1), color='navy', label='Validation accuracy')
    plt.ylabel('Accuracy', fontsize=14)
    plt.xlabel('Training set size', fontsize=14)
    plt.title('Learning curves for a %s' % name, fontsize=18, y=1.03)
    plt.legend()
    plt.show()
    
def compare_algorithms(results, names):
    """Draw one box plot per algorithm from its cross-validation scores.

    Args:
        results: sequence of score collections, one per algorithm.
        names: tick labels, in the same order as ``results``.
    """
    title = "Algorithm Comparison"
    figure = pyplot.figure()
    figure.suptitle(title)
    axes = figure.add_subplot(1, 1, 1)
    axes.set_title(title)
    axes.boxplot(results, labels=names)
    plot_show()

def print_scores(cv_results, predictions):
    """Print cross-validation statistics and validation-set metrics.

    Args:
        cv_results: array of cross-validation scores (mean and std are printed).
        predictions: predicted labels, compared against the module-level
            ``y_validation``.
    """
    print('\nMean %f' % cv_results.mean())
    print('STD %f' % cv_results.std())

    print('\nConfusion matrix:')
    print(confusion_matrix(y_validation, predictions))

    # Each metric is computed right before its line is printed.
    metric_lines = (
        ('\nAccuracy %f', lambda: accuracy_score(y_validation, predictions)),
        ('Precision %f', lambda: precision_score(y_validation, predictions, average = 'weighted')),
        ('Recall %f', lambda: recall_score(y_validation, predictions, average = 'weighted')),
        ('Specificity %f', lambda: get_specificity(y_validation, predictions)),
    )
    for template, compute in metric_lines:
        print(template % compute())

    print('\nClassification report:')
    print(classification_report(y_validation, predictions))

def plot_roc_curves():
    """Draw the ROC curve of every estimator in ``models`` on one shared axes.

    The curves are computed on the module-level ``x_validation`` /
    ``y_validation``; evaluate_per_dataset() fits each estimator before
    calling this function.
    """
    pyplot.figure()
    shared_axes = plt.gca()
    for _label, estimator, _position in models:
        plot_roc_curve(estimator, x_validation, y_validation, ax=shared_axes, alpha=0.8)
    plt.show()

def accuracySignificancy(model, x_train, y_train, cv):
    """Run a 100-permutation test on accuracy and plot the score distribution.

    Prints the classification score with its p-value, then shows a histogram of
    the permutation scores with two vertical markers: the actual score and the
    chance level (1 / number of classes).

    Args:
        model: estimator to evaluate.
        x_train: training features.
        y_train: training labels; also used to count the classes.
        cv: cross-validation splitter passed to ``permutation_test_score``.
    """
    fig = pyplot.figure()
    fig.suptitle("Estimating accuracy score's statistical significancy")
    ax = fig.add_subplot(1, 1, 1)
    n_classes = np.unique(y_train).size
    score, permutation_scores, pvalue = permutation_test_score(
        model, x_train, y_train, scoring="accuracy", cv=cv, n_permutations=100)
    print("Classification score %s (pvalue : %s)" % (score, pvalue))

    # View histogram of permutation scores
    ax.hist(permutation_scores, 20, label='Permutation scores', edgecolor='black')

    # Capture the histogram's y-range once, draw both markers across exactly that
    # range, then restore it so the markers do not stretch the axis.
    ylim = ax.get_ylim()
    ax.plot(2 * [score], ylim, '--g', linewidth=3,
            label='Classification Score'
            ' (pvalue %s)' % pvalue)
    ax.plot(2 * [1. / n_classes], ylim, '--k', linewidth=3, label='Luck')
    ax.set_ylim(ylim)

    # The labels above are only visible once a legend is drawn.
    ax.legend()
    ax.set_xlabel('Score')
    plt.show()

def clustering_scores(x, labels_true=None, k_max=15):
    """Fit KMeans for a range of cluster counts and plot four quality metrics.

    For every k in ``range(2, k_max)`` a KMeans model (random_state=1) is fitted
    on ``x``; inertia, silhouette, Calinski-Harabasz and Davies-Bouldin scores
    are collected and each is plotted against k on its own axes.

    Args:
        x: data to cluster.
        labels_true: accepted for compatibility; not used by this function.
        k_max: exclusive upper bound for the number of clusters (default 15,
            i.e. k = 2..14).
    """
    cluster_counts = list(range(2, k_max))
    inertias = []
    silhouette_values = []
    calinski_harabasz = []
    davies_bouldin = []
    for k in cluster_counts:
        clustering = KMeans(n_clusters=k, random_state=1).fit(x)
        labels = clustering.labels_
        inertias.append(clustering.inertia_)
        silhouette_values.append(silhouette_score(x, labels))
        calinski_harabasz.append(calinski_harabasz_score(x, labels))
        davies_bouldin.append(davies_bouldin_score(x, labels))

    # (title, y-axis label, values) for each of the stacked charts.
    charts = [
        ("Elbow chart", 'distortion', inertias),
        ("Silhouette score", 'Silhouette score', silhouette_values),
        ("Calinski Harabasz score", 'Calinski Harabasz score', calinski_harabasz),
        ("Davies Bouldin score", 'Davies Bouldin score', davies_bouldin),
    ]
    fig, ax = plt.subplots(len(charts), 1, figsize=(8, 32))
    for axes, (title, y_label, values) in zip(ax, charts):
        # Plot against the real cluster counts; plotting the list alone would
        # put the first point (k=2) at x=0 and mislabel the 'clusters' axis.
        axes.plot(cluster_counts, values)
        axes.set_title(title)
        axes.set_xlabel('clusters')
        axes.set_ylabel(y_label)
    plt.tight_layout()
    plt.show()
    
def count_percentage():
    """Display count / percentage tables for three columns of ``dataset``.

    For each of 'marker_icon', 'experience_level' and 'employment_type' a table
    is shown with the number of rows per value and its share of all rows,
    rounded to one decimal and suffixed with '%'.
    """
    def summarize(column):
        # One table: absolute counts next to their formatted percentage share.
        counts = dataset[column].value_counts()
        share = dataset[column].value_counts(normalize=True).mul(100).round(1).astype(str) + '%'
        return pd.DataFrame({'Count': counts, 'Percentage': share})

    experience_table = summarize('experience_level')
    marker_table = summarize('marker_icon')
    employment_table = summarize('employment_type')

    for table in (marker_table, experience_table, employment_table):
        display(table)

def evaluate_per_dataset():
    """Evaluate every entry of ``models`` and show the comparison plots.

    Per model: stratified k-fold cross-validation on the training split, a
    permutation test, a fit on the training split, validation-set metrics and
    a learning curve. Afterwards: ROC curves, a box-plot comparison of the
    cross-validation scores and the KMeans clustering metrics for ``x``.
    """
    results, names = [], []

    for name, model, _position in models:
        print(f"---------------------------\nRunning classification for: {name}")
        kfold = StratifiedKFold(n_splits=cfg.n_splits, random_state=1, shuffle=True)
        cv_results = cross_val_score(model, x_train, y_train, cv=kfold, scoring='accuracy')
        accuracySignificancy(model, x_train, y_train, kfold)
        results.append(cv_results)
        names.append(name)

        # Fit on the training split, then predict the held-out validation rows.
        model.fit(x_train, y_train)
        predictions = model.predict(x_validation)

        print_scores(cv_results, predictions)
        plot_learning_curve(model, name)

    # ROC curves of all fitted models.
    plot_roc_curves()

    # Box-plot comparison of the cross-validation scores.
    compare_algorithms(results, names)

    clustering_scores(x)

In [None]:
# Run the full evaluation: per-model metrics and plots, then the comparison charts.
evaluate_per_dataset()