In [None]:
from sklearn import metrics
!pip install scipy numpy matplotlib pandas sklearn tabulate seaborn jupyterthemes > /dev/null
%matplotlib notebook

In [None]:
# Load libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from pandas import read_csv
from pandas import read_json
from matplotlib import pyplot
from sklearn.model_selection import train_test_split, learning_curve, permutation_test_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report, roc_auc_score, roc_curve, plot_roc_curve, auc
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import calinski_harabasz_score, davies_bouldin_score, silhouette_score
from sklearn.cluster import KMeans
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import json
import itertools
import seaborn as sns
import urllib.request 
from tabulate import tabulate
from IPython.display import display, HTML
# notebook styles
# Optional jupyterthemes dark theme, currently disabled:
# from jupyterthemes import jtplot
# !jt -t onedork
# Inject CSS so elements with the `.container` class take 90% of the available width.
display(HTML("<style>.container { width:90% !important; }</style>"))
# jtplot.style(theme='onedork')

<h2>Dataset configuration</h2>

In [None]:
class Config:
    """Plain settings holder for one dataset.

    Attributes:
        dataSourceUrl: location the raw data is read from.
        dataResultUrl: location the converted data is written to.
        test_size: value forwarded as ``test_size`` to the train/test split.
        n_splits: number of folds used for cross-validation.
        should_describe_data: whether the notebook prints a data summary.
    """

    def __init__(self, dataSourceUrl, dataResultUrl, test_size, n_splits, should_describe_data):
        # Input / output locations.
        self.dataSourceUrl, self.dataResultUrl = dataSourceUrl, dataResultUrl
        # Evaluation parameters.
        self.test_size, self.n_splits = test_size, n_splits
        # Reporting switch.
        self.should_describe_data = should_describe_data

def as_config(dict, dataSetName):
    """Build a Config from the entry stored under ``dataSetName``.

    Args:
        dict: mapping of dataset name -> settings mapping (e.g. parsed JSON).
        dataSetName: key of the entry to convert.

    Returns:
        A Config populated from the entry's five settings.

    Raises:
        KeyError: if the dataset or one of its settings is missing.
    """
    settings = dict[dataSetName]
    # Order matches the positional parameters of Config.__init__.
    field_names = (
        'dataSourceUrl',
        'dataResultUrl',
        'test_size',
        'n_splits',
        'should_describe_data',
    )
    return Config(*[settings[field] for field in field_names])

In [None]:
# Notebook settings as a JSON document keyed by dataset name; each entry
# carries the five fields that as_config reads to build a Config.
json_config = """
{
    "it_data":{
        "dataSourceUrl": "./data/joinit_data.json",
        "dataResultUrl": "./data/joinit_data.csv",
        "test_size": 0.2,
        "n_splits": 10,
        "should_describe_data": true
    }
}"""
# Cache for the downloaded exchange-rate table; it starts empty and a later
# cell fills it only when it is still empty.
curr_table = []

<h2>Load (and describe) dataset, create test and train datasets</h2>

In [None]:
# Build the Config for the 'it_data' entry of json_config.
cfg = as_config(json.loads(json_config), 'it_data')

# Read the JSON source and immediately write the full frame out as CSV
# (the CSV therefore still contains the columns dropped below).
df = read_json(cfg.dataSourceUrl)
df.to_csv(cfg.dataResultUrl)
# Remove the address, URL and coordinate columns from the working frame.
df = df.drop(columns=["street", "address_text", "company_url", "latitude", "longitude", "company_logo_url"])

#### Save no-salary observations to separate dataframe: "salaryless_df"

In [None]:
# A row counts as "with salary" only when both the currency and the lower
# bound of the range are present; everything else goes to salaryless_df.
has_salary = df.salary_currency.notnull() & df.salary_from.notnull()
salaryless_df = df.loc[~has_salary]
df = df.loc[has_salary]
print(f"Found {salaryless_df.shape[0]} job ads without salary range or currency")

#### Check the number of unique country codes and currencies

In [None]:
def find_exchange_rate(code, table):
    """Return the "mid" exchange rate listed for currency ``code``.

    Args:
        code: currency code to look up, compared verbatim with each
            entry's "code" value.
        table: sequence whose first element is a mapping with a "rates"
            list; every rate entry is a mapping with "code" and "mid" keys.

    Returns:
        The "mid" value of the first entry whose "code" equals ``code``.

    Raises:
        KeyError: if no entry for ``code`` exists in the table.
    """
    # Use the table that was passed in, not the module-level cache, so the
    # function works for any caller-supplied table.
    for rate in table[0]["rates"]:
        if rate["code"] == code:
            return rate["mid"]
    raise KeyError(f"No exchange rate found for currency code {code!r}")
# Only do any work when the data mixes several countries or currencies.
uniq_countries = df['country_code'].nunique()
uniq_currencies = df['salary_currency'].nunique()
if uniq_countries != 1 or uniq_currencies != 1:
    print(f"Found {uniq_countries} countries and {uniq_currencies} currencies!")
    print("Dropping foreign countries and translating currencies...")
    # Take an independent copy: the rows are modified below, and assigning
    # into a filtered slice of the original frame is unreliable in pandas.
    df = df.loc[df["country_code"] == "PL"].copy()
    # Download the exchange-rate table once; later runs reuse the cached copy.
    if not curr_table:
        with urllib.request.urlopen("https://api.nbp.pl/api/exchangerates/tables/a/?format=json") as url:
            curr_table = json.loads(url.read().decode())
    salary_columns = ["salary_from", "salary_to"]
    # Converted amounts are fractional, so make the columns float up front.
    df[salary_columns] = df[salary_columns].astype(float)
    to_translate = df.loc[df["salary_currency"] != "pln"]
    for curr in to_translate.salary_currency.unique():
        ex_rate = find_exchange_rate(curr.upper(), curr_table)
        in_currency = df["salary_currency"] == curr
        # Scale both salary bounds, then relabel the rows as PLN.
        df.loc[in_currency, salary_columns] = df.loc[in_currency, salary_columns] * ex_rate
        df.loc[in_currency, "salary_currency"] = "pln"
    print(f"Unique countries: {df['country_code'].nunique()}, currencies: {df['salary_currency'].nunique()}, observations: {df.shape[0]}")

In [None]:
# Optional overview of the cleaned frame: shape, first row as an HTML
# table, and summary statistics.
if cfg.should_describe_data:
    print(df.shape)
    first_row_html = tabulate(df.head(1), headers=df.columns, tablefmt="html")
    display(HTML(first_row_html))
    summary = df.describe()
    print(summary)

In [None]:
def setAxesRanges(axes):
    """Give every axes four evenly spaced y ticks and 20% extra headroom.

    For each axes the current y limits are read, ticks are placed from the
    lower limit in steps of a quarter of the range, and the upper limit is
    raised to 1.2 times its current value.

    Args:
        axes: iterable of matplotlib-like axes objects.
    """
    for axis in axes:
        y_min, y_max = axis.get_ylim()
        tick_step = (y_max - y_min) / 4
        axis.yaxis.set_ticks(np.arange(y_min, y_max, tick_step))
        axis.set_ylim(top=y_max * 1.2)
def hasNumbers(inputString):
    """Return True if at least one character of ``inputString`` is a digit."""
    for symbol in inputString:
        if symbol.isdigit():
            return True
    return False
def transform_company_size(val):
    """Convert a textual company size into a single number.

    The characters ``+``, ``<``, ``>`` and spaces are stripped first. Then:
      * no dash       -> the value as an int;
      * "a-b"         -> the midpoint (a + b) / 2 (a float);
      * "a-" or "-b"  -> the one bound that is present, as an int.

    Raises:
        ValueError: if a remaining piece cannot be parsed as an integer.
    """
    cleaned = val.translate(str.maketrans("", "", "+<> "))
    low, dash, high = cleaned.partition("-")

    if not dash:
        return int(cleaned)
    if high == "":
        # Open-ended on the right: only the lower bound is known.
        return int(low)
    if low == "":
        # Open-ended on the left: only the upper bound is known.
        return int(high)
    return (int(low) + int(high)) / 2
# Turn every company_size entry that contains a digit into a number, then
# bucket the numbers into three bands: below 1000, 1000 up to (but not
# including) 5000, and 5000 or more.
raw_sizes = df["company_size"].copy().values
X = [transform_company_size(raw) for raw in raw_sizes if hasNumbers(raw)]
first_X = [size for size in X if size < 1000]
second_X = [size for size in X if 1000 <= size < 5000]
third_X = [size for size in X if size >= 5000]

sns.set(color_codes=True)

# Company-size histograms, one panel per size band (first_X, second_X, third_X).
f, axes = plt.subplots(3, 1)
sns.distplot(first_X, ax=axes[0], kde=False, hist=True);
sns.distplot(second_X, ax=axes[1], kde=False, hist=True);
sns.distplot(third_X, ax=axes[2], kde=False, hist=True);
f.tight_layout()
setAxesRanges(axes)

# Salary spread = width of the salary range as a percentage of its midpoint.
# It gets its own figure; without an explicit `ax` the histogram would be
# drawn on the current axes, i.e. on top of the last company-size panel.
spread_fig, spread_ax = plt.subplots()
spread = df.copy().assign(Spread=lambda df: 100*(df.salary_to-df.salary_from)/((df.salary_to+df.salary_from)/2)).Spread
sns.distplot(spread, ax=spread_ax, kde=False, hist=True);

# Joint distribution of the lower and upper salary bounds, restricted to rows
# whose upper bound is below 80 000. Both plots read their columns from the
# same filtered frame.
with sns.axes_style("white"):
    without_outlier=df.copy()[df.salary_to<80_000]
    sns.jointplot(x="salary_from", y="salary_to", data=without_outlier, kind="hex");
    sns.jointplot(x="salary_from", y="salary_to", data=without_outlier, kind="kde");

---------------------------------------------------------------------------

In [None]:
import difflib

# NOTE(review): `dataset` is not defined in any earlier visible cell (the
# loaded frame there is called `df`) - confirm which frame is meant here.
# Each row of the 'skills' column is iterated and every item's 'name' collected.
skills_arr = [entry['name'] for row in dataset['skills'] for entry in row]
unique_skills = np.unique(np.array(skills_arr))

# Pass 1: drop skills already reported as a close match of a kept skill.
# A set keeps the membership test constant-time instead of scanning a list.
similar = set()
very_unique = []
for skill in unique_skills:
    if skill not in similar:
        very_unique.append(skill)
        similar.update(difflib.get_close_matches(skill, unique_skills, cutoff=0.1))

# Pass 2: keep only the first skill seen for each distinct first word.
first_word = set()
very_very_unique = []
for skill in very_unique:
    leading_word = skill.split(' ')[0]
    if leading_word not in first_word:
        first_word.add(leading_word)
        very_very_unique.append(skill)

# Prints the size after pass 1; the CSV holds the result of pass 2.
print(len(very_unique))
pd.DataFrame(very_very_unique).to_csv('./data/skills_double_trim.csv')

In [None]:
# Every column except the last is a feature; the last column is the target.
array = dataset.values
n_features = len(dataset.columns) - 1
x = array[:, :n_features]
y = array[:, n_features]
# From x and y, derive the training and validation arrays.
x_train, x_validation, y_train, y_validation = train_test_split(
    x,
    y,
    test_size=cfg.test_size,
    random_state=1,
)

<h2>Classification models</h2>

In [None]:
# (name, estimator, index) triples consumed by the evaluation functions.
# The tree and the MLP are seeded with the same random_state=1 used for the
# train/test split, KMeans and StratifiedKFold, so repeated runs of the
# notebook give the same scores.
models = [
    ('KNN', KNeighborsClassifier(), 0),
    ('CART', DecisionTreeClassifier(random_state=1), 1),
    ('NB', GaussianNB(), 2),
    ('SVM', SVC(gamma='auto'), 3),
    ('MLP', MLPClassifier(alpha=1e-5, hidden_layer_sizes=(50,10), max_iter=5000, random_state=1), 4),
]

<h2>Classification</h2>
kfold – k-krotna walidacja krzyżowa (k-fold cross-validation) to metoda oceny jakości (wydajności) uczonego modelu.
Zbiór TRENINGOWY jest dzielony na k podzbiorów. W każdej z k iteracji
model jest uczony na k-1 podzbiorach, a następnie jego jakość jest sprawdzana na pozostałym podzbiorze,
przy użyciu danego algorytmu uczenia maszynowego.

In [None]:
def plot_show():
    """Redraw the current figure, then run the GUI event loop for 0.1 s."""
    pyplot.draw()
    pyplot.pause(interval=0.1)

def get_specificity(y_validate, y_predicted):
    """Return the specificity TN / (TN + FP), averaged over all classes.

    Each class is treated one-vs-rest using the confusion matrix of
    ``y_validate`` (true labels) against ``y_predicted``.
    """
    matrix = confusion_matrix(y_validate, y_predicted)

    true_pos = np.diag(matrix)
    # Column sums minus the diagonal: predicted as the class but wrong.
    false_pos = matrix.sum(axis=0) - true_pos
    # Row sums minus the diagonal: belonging to the class but missed.
    false_neg = matrix.sum(axis=1) - true_pos
    true_neg = matrix.sum() - (false_pos + false_neg + true_pos)

    false_pos = false_pos.astype(float)
    true_neg = true_neg.astype(float)

    per_class_specificity = true_neg / (true_neg + false_pos)
    return np.mean(per_class_specificity)
    
def get_learning_curve(classification_model, training_set_enlarging_step=10):
    """Compute accuracy learning-curve data for a model on the global x / y.

    Args:
        classification_model: estimator handed to ``learning_curve``.
        training_set_enlarging_step: number of evenly spaced training-set
            fractions between 0.1 and 1 to evaluate.

    Returns:
        Tuple ``(train_sizes, train_scores, validation_scores)`` as produced
        by ``learning_curve`` with 5-fold cross-validation.
    """
    fractions = np.linspace(0.1, 1, training_set_enlarging_step)
    sizes, scores_on_train, scores_on_validation = learning_curve(
        estimator=classification_model,
        X=x,
        y=y,
        train_sizes=fractions,
        cv=5,
        scoring='accuracy',
    )
    return sizes, scores_on_train, scores_on_validation

def plot_learning_curve(model,name):
    """Plot mean training and validation accuracy against training-set size.

    Args:
        model: estimator passed on to ``get_learning_curve``.
        name: label of the model, used in the plot title.
    """
    train_sizes, train_scores, test_scores = \
        get_learning_curve(model)

    # The 'seaborn' style sheet is named 'seaborn-v0_8' in newer matplotlib
    # releases; use whichever one this installation provides.
    style_name = 'seaborn' if 'seaborn' in plt.style.available else 'seaborn-v0_8'
    plt.style.use(style_name)

    # Dedicated figure: plain plt.plot would draw on whatever figure happens
    # to be current (e.g. one left open by a previous plotting function).
    fig, ax = plt.subplots()
    # The scores are accuracies (scoring='accuracy'), so they are plotted
    # as-is; negating them is only meaningful for "neg_*" error scorers.
    ax.plot(train_sizes, train_scores.mean(axis = 1), color= 'red', label = 'Training accuracy')
    ax.plot(train_sizes, test_scores.mean(axis = 1), color= 'navy', label = 'Validation accuracy')
    ax.set_ylabel('Accuracy', fontsize = 14)
    ax.set_xlabel('Training set size', fontsize = 14)
    ax.set_title('Learning curves for a %s' % name, fontsize = 18, y = 1.03)
    ax.legend()
    plt.show()
    
def compare_algorithms(results, names):
    """Draw one box plot per algorithm from its cross-validation scores.

    Args:
        results: sequence of score arrays, one per algorithm.
        names: labels for the boxes, in the same order as ``results``.
    """
    heading = "Algorithm Comparison"
    fig = pyplot.figure()
    fig.suptitle(heading)
    box_axes = fig.add_subplot(111)
    box_axes.set_title(heading)
    box_axes.boxplot(results, labels=names)
    plot_show()

def print_scores(cv_results, predictions):
    """Print cross-validation statistics and hold-out metrics for one model.

    Args:
        cv_results: array of cross-validation scores.
        predictions: predicted labels for the global ``y_validation``.
    """
    truth = y_validation

    # Cross-validation summary.
    print('\nMean %f' % cv_results.mean())
    print('STD %f' % cv_results.std())

    # Hold-out results.
    print('\nConfusion matrix:')
    print(confusion_matrix(truth, predictions))
    print('\nAccuracy %f' % accuracy_score(truth, predictions))
    for template, metric in (('Precision %f', precision_score), ('Recall %f', recall_score)):
        print(template % metric(truth, predictions, average = 'weighted'))
    print('Specificity %f' % get_specificity(truth, predictions))
    print('\nClassification report:')
    print(classification_report(truth, predictions))

def plot_roc_curves():
    """Overlay the ROC curve of every entry in ``models`` on one new figure.

    Curves are computed on the global validation split.
    NOTE(review): presumably each model must already be fitted when this
    runs - confirm against the caller.
    """
    pyplot.figure()
    roc_axes = plt.gca()
    for _name, estimator, _row in models:
        plot_roc_curve(estimator, x_validation, y_validation,
                       ax=roc_axes, alpha=0.8)
    plt.show()

def accuracySignificancy(model, x_train, y_train, cv):
    """Run a permutation test on the model's accuracy and plot the result.

    Prints the real cross-validated score with its p-value, then draws a
    histogram of the scores obtained on permuted labels together with two
    vertical markers: the real score and the chance level 1 / n_classes.

    Args:
        model: estimator to evaluate.
        x_train: training features.
        y_train: training labels; also used to count the classes.
        cv: cross-validation splitter handed to ``permutation_test_score``.
    """
    fig = pyplot.figure()
    fig.suptitle("Estimating accuracy score's statistical significancy")
    ax = fig.add_subplot(1,1,1)
    n_classes = np.unique(y_train).size
    # Seeded so the permutation scores and p-value are the same on every run.
    score, permutation_scores, pvalue = permutation_test_score(
        model, x_train, y_train, scoring="accuracy", cv=cv,
        n_permutations=100, random_state=1)
    print("Classification score %s (pvalue : %s)" % (score, pvalue))
    # View histogram of permutation scores
    ax.hist(permutation_scores, 20, label='Permutation scores',
             edgecolor='black')
    # Capture the histogram's y range once so both markers span the same
    # height, and restore it afterwards so the markers do not rescale the plot.
    ylim = ax.get_ylim()
    ax.plot(2 * [score], ylim, '--g', linewidth=3,
             label='Classification Score'
             ' (pvalue %s)' % pvalue)
    ax.plot(2 * [1. / n_classes], ylim, '--k', linewidth=3, label='Luck')
    ax.set_ylim(ylim)
    ax.set_xlabel('Score')
    # The labels set above only become visible once a legend is drawn.
    ax.legend()
    plt.show()

def clustering_scores(x, labels_true=None, k_range=15):
    """Fit KMeans for a range of cluster counts and plot four quality curves.

    For every k in ``range(2, k_range)`` the inertia, silhouette score,
    Calinski-Harabasz score and Davies-Bouldin score are computed and each
    is plotted against k in its own panel.

    Args:
        x: feature matrix to cluster.
        labels_true: accepted for interface compatibility; not used.
        k_range: exclusive upper bound of the cluster counts tried
            (default 15, i.e. k = 2..14).
    """
    cluster_counts = list(range(2, k_range))
    inertias = []
    silhouette_values = []
    calinski_harabasz = []
    davies_bouldin = []
    for k in cluster_counts:
        clustering = KMeans(n_clusters=k, random_state=1).fit(x)
        labels = clustering.labels_
        inertias.append(clustering.inertia_)
        silhouette_values.append(silhouette_score(x, labels))
        calinski_harabasz.append(calinski_harabasz_score(x, labels))
        davies_bouldin.append(davies_bouldin_score(x, labels))

    # (title, y label, values) for each of the four stacked panels.
    panels = [
        ("Elbow chart", 'distortion', inertias),
        ("Silhouette score", 'Silhouette score', silhouette_values),
        ("Calinski Harabasz score", 'Calinski Harabasz score', calinski_harabasz),
        ("Davies Bouldin score", 'Davies Bouldin score', davies_bouldin),
    ]
    fig, ax = plt.subplots(4, 1, figsize=(8,32))
    for panel_axes, (title, y_label, values) in zip(ax, panels):
        # Plot against the real cluster counts; plotting the bare list would
        # put the result for k=2 at x=0 under an axis labelled 'clusters'.
        panel_axes.plot(cluster_counts, values)
        panel_axes.set_title(title)
        panel_axes.set_xlabel('clusters')
        panel_axes.set_ylabel(y_label)
    plt.tight_layout()
    plt.show()

def evaluate_per_dataset():
    """Evaluate every entry of ``models`` and plot the comparisons.

    Per model: stratified k-fold cross-validation on the training split, a
    permutation significance test, a fit on the training split, printed
    metrics for the validation split and a learning curve. Afterwards the
    ROC curves, a box plot of the cross-validation scores and the
    clustering diagnostics for the global ``x`` are drawn.
    """
    results, names = [], []

    for name, model, _row in models:
        print(f"---------------------------\nRunning classification for: {name}")
        kfold = StratifiedKFold(n_splits=cfg.n_splits, random_state=1, shuffle=True)
        cv_results = cross_val_score(model, x_train, y_train, cv=kfold, scoring='accuracy')
        accuracySignificancy(model, x_train, y_train, kfold)
        results.append(cv_results)
        names.append(name)

        # Fit on the training split, then score the validation split.
        model.fit(x_train, y_train)
        predictions = model.predict(x_validation)

        print_scores(cv_results, predictions)
        plot_learning_curve(model, name)

    # ROC curves of all models on one figure.
    plot_roc_curves()

    # Box plot of the cross-validation scores per algorithm.
    compare_algorithms(results, names)

    clustering_scores(x)

In [None]:
# Run the evaluation of all configured models; prints metrics and draws the plots.
evaluate_per_dataset()