In [None]:
import sys
import os

import matplotlib.pyplot as plt

# Directory that contains the relative path 'src', resolved against the
# current working directory.
SCRIPT_DIR = os.path.split(os.path.abspath('src'))[0]

# Make the parent of that directory importable.
sys.path.append(os.path.split(SCRIPT_DIR)[0])

In [None]:
import pickle
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objects as go
import numpy as np

In [None]:
# Directory prefix (relative to the notebook) that the pickled inputs are read from.
LOAD_PATH = '../data/interim/'
# File name of the pickled survey frame used for the analysis.
LOAD_ANALYSIS_DF = '4.0-preprocessed-data-analysation.pkl'
# File name of the pickled frame whose 'DevType' block is used for the job-type plots.
LOAD_SKILLS_DEV = '7.0-Chosen_features_and_roles.pkl'

In [None]:
# Load both pickled frames from LOAD_PATH.
# NOTE(review): pickle files execute code on load; only read files this project produced.
survey = pd.read_pickle(f"{LOAD_PATH}{LOAD_ANALYSIS_DF}")
skills_dev_df = pd.read_pickle(f"{LOAD_PATH}{LOAD_SKILLS_DEV}")

In [None]:
# Read the pickled column selection and keep only its 'analysis' entry in the survey frame.
with open(f"{LOAD_PATH}chosen_columns.pkl", 'rb') as columns_file:
    chosen_columns = pickle.load(columns_file)

analysis_columns = chosen_columns['analysis']
survey = survey[analysis_columns]


In [None]:
# Display the survey frame after it was restricted to the chosen analysis columns.
survey

In [None]:
# Number of respondents per country, smallest counts first.
survey['Country'].value_counts().sort_values()

# 1. Employment vs Dev type
First, we look at the employment types and check whether there is any relationship between employment type and developer type.

In [None]:
def binarize(df, column):
    """One-hot encode a multi-label column of ``df``.

    Rows where ``column`` is null are left out of the fit and come back as
    all-zero rows, so the result has the same index as ``df[column]``.

    Args:
        df: DataFrame holding the column to encode.
        column: Name of the column; every non-null cell is handed to
            ``MultiLabelBinarizer`` as one sample's collection of labels.

    Returns:
        DataFrame of 0/1 indicators with one column per label seen.
    """
    has_labels = df[column].notnull()
    encoder = MultiLabelBinarizer()

    # Fit and transform only the rows that actually carry labels.
    indicators = encoder.fit_transform(df.loc[has_labels, column])
    encoded = pd.DataFrame(
        indicators,
        index=df.index[has_labels],
        columns=encoder.classes_,
    )

    # Re-insert the rows that were null, filled with zeros.
    return encoded.reindex(df[column].index, fill_value=0)


In [None]:
def change_labels(x: str) -> str:
    """Turn a raw job-type label into a shorter axis label.

    The rules are applied in this order:

    1. ``'Developer_'`` is removed and ``' dev'`` is appended.
    2. If the label contains ``' or '``, only the text after its first
       occurrence is kept.
    3. ``'Engineer_'`` is removed and ``' Engineer'`` is appended.
    4. Labels containing ``'back_end'`` or ``'full_stack'`` get a ``' dev'``
       suffix unless they already end with one.

    Args:
        x: Raw label, e.g. ``'Developer_front-end'``.

    Returns:
        The shortened label, e.g. ``'front-end dev'``.
    """
    if 'Developer_' in x:
        x = x.replace('Developer_', '') + ' dev'

    # "A or B" titles are reduced to "B" (text after the first ' or ').
    if ' or ' in x:
        x = x.partition(' or ')[2]

    if 'Engineer_' in x:
        # Single space: the previous two-space suffix produced "data  Engineer".
        x = x.replace('Engineer_', '') + ' Engineer'

    # Skip the suffix when rule 1 already added it, to avoid "... dev dev".
    if ('back_end' in x or 'full_stack' in x) and not x.endswith(' dev'):
        x += ' dev'
    return x

In [None]:
# Column-wise totals of the 'DevType' block, as a two-column frame.
jobs_freq = (
    skills_dev_df['DevType']
    .sum()
    .reset_index()
)
jobs_freq.columns = ['job_type', 'freq']

# Largest totals first.
jobs_freq = jobs_freq.sort_values('freq', ascending=False)
jobs_freq


In [102]:
# Flag labels that contain 'full_stack' or 'back_end' so they can be left out.
filtering = jobs_freq['job_type'].str.contains('full_stack|back_end')
job_freq_without_specific_titles = jobs_freq.loc[~filtering]

# Take the first ten remaining rows, then order them by ascending frequency.
most_10_freq_jobs = (
    job_freq_without_specific_titles
    .head(10)
    .sort_values(by='freq')['job_type']
    .tolist()
)
most_10_freq_jobs

['Engineer_data',
 'Developer_embedded applications or devices',
 'Data scientist or machine learning specialist',
 'Cloud infrastructure engineer',
 'DevOps specialist',
 'Developer_mobile',
 'Developer_desktop or enterprise applications',
 'Developer_front-end',
 'Developer_back-end',
 'Developer_full-stack']

In [103]:
def employment_vs_dev_type(dev_types):
    """Plot the share of part-time and freelance respondents per job type.

    Reads the notebook-level ``survey`` and ``skills_dev_df`` frames. For each
    job type, respondents are counted per employment type (full-time,
    part-time, freelancer) and converted to percentages of those three counts.
    The full-time row is dropped before plotting, so only part-time and
    freelancer bars are drawn.

    Args:
        dev_types: List of column labels to select from
            ``skills_dev_df['DevType']``; also used as the bar categories.

    Returns:
        A ``plotly.graph_objects.Figure`` with one horizontal grouped bar
        trace per remaining employment type.
    """
    # Short display name -> indicator column produced by binarizing 'Employment'.
    employment_columns = {
        'full-time': 'Employed_full-time',
        'part-time': 'Employed_part-time',
        'freelancer': 'Independent contractor_freelancer_or self-employed'
    }

    # DataFrame.rename maps old -> new, so the dict above must be inverted;
    # passing it as-is silently renamed nothing.
    long_to_short = {long: short for short, long in employment_columns.items()}
    employment_df = (
        binarize(survey, 'Employment')[list(employment_columns.values())]
        .rename(columns=long_to_short)
    )
    employment_df.columns = pd.MultiIndex.from_product([['Employment'], employment_df.columns])

    # Indicator columns for the requested job types.
    dev_df = skills_dev_df['DevType'][dev_types]
    dev_df.columns = pd.MultiIndex.from_product([['DevType'], dev_df.columns])

    # Align both blocks on the respondent index.
    employment_dev = employment_df.merge(dev_df, left_index=True, right_index=True)

    # (employment types x respondents) @ (respondents x job types) gives the
    # co-occurrence counts. The left operand is a bare array, so the row
    # labels are attached afterwards, in the same order as employment_columns.
    employment_dev_count = employment_dev['Employment'].values.T @ employment_dev['DevType']
    employment_dev_count.index = list(employment_columns)

    # Percentage of each employment type within a job type.
    employment_dev_percentage = (employment_dev_count / employment_dev_count.sum(axis=0)) * 100
    employment_dev_percentage = employment_dev_percentage.drop('full-time', axis=0)

    fig = go.Figure()
    for employment_type in employment_dev_percentage.index:
        fig.add_trace(go.Bar(
            name=employment_type,
            y=dev_types,
            x=employment_dev_percentage.loc[employment_type, :].values,
            orientation='h',
        ))

    layout = go.Layout(
        title="Most Common Jobs vs Employment Type",
        yaxis=dict(
            title="Job Type",
            # Categories sit at positions 0..n-1; show the shortened labels there.
            tickmode='array',
            tickvals=np.arange(len(dev_types)),
            ticktext=[change_labels(x) for x in dev_types],
            tickfont={'size': 15},
        ),
        xaxis=dict(
            title="Percentage (%)",
            tickfont={'size': 20}
        ),
        barmode='group',
        title_font_size=20,
        title_x=.5
    )

    fig.update_layout(layout)
    fig.update_traces(
        # NOTE(review): the sentence ends with "as " - presumably it relies on
        # the trace name Plotly shows next to the hover label; confirm.
        hovertemplate="%{x:.2f}% of %{y} are employed as ",
        textfont_size=20
    )
    fig.update_yaxes(tickangle=45, tickfont=dict(family='Rockwell', size=15))

    return fig

In [104]:
# Render the employment-type chart for the ten job types selected above.
employment_vs_dev_type(most_10_freq_jobs)


# 2. most common languages

In [107]:
def plot_most_used_languages():
    """Plot the five most used languages next to how many plan to adopt them.

    Reads the notebook-level ``survey`` frame. For the five languages with the
    highest 'LanguageHaveWorkedWith' counts, two percentages of all survey
    rows are shown:

    * ``used`` - respondents who have worked with the language;
    * ``planning to use`` - respondents who want to work with the language
      and have not worked with it yet.

    Returns:
        A ``plotly.graph_objects.Figure`` with two horizontal grouped bar
        traces, a reversed x-axis and the y-axis drawn on the right.
    """
    respondents = len(survey)

    # Five most used languages, as a percentage of all respondents.
    language_have_worked_with = binarize(survey, 'LanguageHaveWorkedWith')
    most_used_languages = (
        language_have_worked_with
        .sum(axis=0)
        .sort_values(ascending=False)
        .iloc[:5]
    )
    top_languages = most_used_languages.index
    most_used_languages_per = most_used_languages / respondents * 100
    most_used_languages_per.name = 'used'

    # Respondents who want a language they have not used yet. Only rows with
    # want=1 and have=0 count; the earlier weighted sum also added 0.5 for
    # every respondent who had used a language without wanting it again.
    language_want_to_work_with = binarize(survey, 'LanguageWantToWorkWith')
    have_top = language_have_worked_with[top_languages].astype(bool)
    want_top = (
        language_want_to_work_with
        # A top language nobody wants would be missing here; treat it as all zeros.
        .reindex(columns=top_languages, fill_value=0)
        .astype(bool)
    )
    wants_but_has_not_used = want_top & ~have_top
    most_planning_languages_per = wants_but_has_not_used.sum(axis=0) / respondents * 100
    most_planning_languages_per.name = 'planning to use'

    # One column per series; the column labels come from the series names.
    df = pd.concat([most_used_languages_per, most_planning_languages_per], axis=1)

    fig = go.Figure()
    for used_type in df.columns:
        percentages = df[used_type].values
        fig.add_trace(go.Bar(
            name=used_type,
            y=df.index,
            # Values are already percentages, matching the axis title.
            x=percentages,
            orientation='h',
            text=[f"{value}%" for value in percentages.astype(int)],
        ))

    layout = go.Layout(
        title="Most Used Languages",
        yaxis=dict(
            title="Programing Languages",
            tickmode='array',
            tickvals=np.arange(len(df.index)),
            ticktext=df.index,
            tickfont={'size': 15},
            # Draw the y-axis on the right-hand side of the plot.
            side='right',
        ),
        xaxis=dict(
            title="Percentage (%)",
            tickfont={'size': 20},
            # Bars grow from right to left.
            autorange="reversed"
        ),
        barmode='group',
        title_font_size=20,
        title_x=.5
    )
    fig.update_layout(layout)

    # Legend in the top-left corner, away from the right-anchored bars.
    fig.update_layout(legend=dict(
        yanchor="top",
        y=0.99,
        xanchor="left",
        x=0.01
    ))

    fig.update_yaxes(tickangle=45, tickfont=dict(family='Rockwell', size=15))

    return fig

In [108]:
# Render the most-used-languages chart.
plot_most_used_languages()

# 3. job salary

In [219]:
def jobs_salaries():
    """Plot median compensation per job type, sized by median experience.

    Reads the notebook-level ``survey`` and ``skills_dev_df`` frames. Only
    respondents with exactly one job type flagged in
    ``skills_dev_df['DevType']`` and a non-null ``CompTotal`` are used. For
    each job type the medians of ``CompTotal`` and ``YearsCodePro`` are
    computed, and job types are ordered by median ``CompTotal``, highest
    first.

    Returns:
        A Plotly figure: job type on the x-axis, median ``CompTotal`` on the
        y-axis, marker size given by median ``YearsCodePro``.
    """
    dev_type_flags = skills_dev_df['DevType']

    # Keep respondents with exactly one job type so every salary maps to a single role.
    single_role = dev_type_flags[dev_type_flags.sum(axis=1) == 1]
    # With a single flag set per row, idxmax returns that flag's column label.
    dev_type = single_role.idxmax(axis=1)

    # Select the matching survey rows and attach the role without mutating `survey`.
    survey_comp = survey.loc[dev_type.index, :].assign(DevType=dev_type)

    # Rows without a reported compensation cannot contribute to the median.
    survey_comp = survey_comp[survey_comp['CompTotal'].notna()]

    # NOTE(review): medians are taken over raw CompTotal values; if these are
    # reported in different currencies per country the comparison is skewed -
    # confirm that upstream preprocessing normalises them.
    dev_years_comp = (
        survey_comp
        .groupby('DevType')[['CompTotal', 'YearsCodePro']]
        .median()
        .sort_values(by='CompTotal', ascending=False)
    )

    fig = px.scatter(
        x=dev_years_comp.index,
        y=dev_years_comp.CompTotal,
        size=dev_years_comp.YearsCodePro.values,
    )
    layout = go.Layout(
        title="Most Paid Jobs With Respect To Years Of Experience",
        xaxis=dict(
            title="Job Type",
            # Categories sit at positions 0..n-1; show the shortened labels there.
            tickmode='array',
            tickvals=np.arange(len(dev_years_comp.index)),
            ticktext=[change_labels(x) for x in dev_years_comp.index],
            tickfont={'size': 15},
        ),
        yaxis=dict(
            title="Salary",
            tickfont={'size': 20}
        ),
        title_font_size=20,
        title_x=0.5
    )

    fig.update_yaxes(tickangle=45, tickfont=dict(family='Rockwell', size=15))
    fig.update_xaxes(tickangle=45, tickfont=dict(family='Rockwell', size=15))

    # Applied last, so the tick font sizes in `layout` override the size set above.
    fig.update_layout(layout)

    return fig

In [220]:
# Render the salary-by-job-type chart.
jobs_salaries()