In [168]:
import pandas as pd
from constants import *
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.io as pio
import math

In [146]:
def create_single_plot(x: list, y: list, xaxis_title: str = 'X axis', yaxis_title: str = 'Y axis', title: str = 'Title',
                       save: bool = False, y_range: tuple = None, save_path: str = 'plot'):
    fig_ = go.Figure()

    fig_.add_trace(go.Scatter(x=x, y=y, mode='lines+markers', line=dict(width=4), marker=dict(size=10)))

    layout_ = {
        "title": title,
        "xaxis_title": xaxis_title,
        "yaxis_title": yaxis_title
    }

    fig_.update_layout(**layout_)

    if y_range is not None:
        fig_.update_yaxes(range=list(y_range))

    if save:
        pio.write_image(fig_, f'plots/{save_path}.svg')

    return fig_


def combine_plots(figures: list, title: str = '', yaxis_title: str = '', xaxis_title: str = '', save: bool = True,
                  x_range: tuple = None, y_range: tuple = None, subplot_titles: bool = True, save_path: str = 'plot'):
    num_figures_ = len(figures)
    num_rows_ = num_cols_ = int(math.ceil(math.sqrt(num_figures_)))

    if subplot_titles:
        subplot_titles_ = [fi.layout.title.text for fi in figures]
        fig_ = make_subplots(rows=num_rows_, cols=num_cols_, subplot_titles=subplot_titles_)
    else:
        fig_ = make_subplots(rows=num_rows_, cols=num_cols_)

    for i, f in enumerate(figures):
        row_ = i // num_cols_ + 1
        col_ = i % num_cols_ + 1
        for r in f['data']:
            # Add the trace to the subplot
            fig_.add_trace(r, row=row_, col=col_)

    layout_ = {
        "title": title,
        "width": 1900,
        "height": 940,
        "showlegend": False
    }

    fig_.update_layout(**layout_)
    fig_.update_yaxes(title_text=yaxis_title)
    fig_.update_xaxes(title_text=xaxis_title)

    if x_range is not None:
        fig_.update_xaxes(range=list(x_range))
    if y_range is not None:
        fig_.update_yaxes(range=list(y_range))

    if save:
        pio.write_image(fig_, f'plots/{save_path}.svg')

    return fig_


def create_histogram(data: list, title: str = '', xaxis_title: str = '', yaxis_title: str = '',
                     x_range: tuple = None, y_range: tuple = None, save: bool = False, save_path: str = 'plot'):
    fig_ = go.Figure()

    fig_.add_trace(go.Histogram(x=data))

    # Layout settings
    layout = {
        'title': title,
        'xaxis_title': xaxis_title,
        'yaxis_title': yaxis_title
    }
    if x_range is not None:
        layout['xaxis'] = {'range': list(x_range)}

    if y_range is not None:
        layout['yaxis'] = {'range': list(y_range)}

    fig_.update_layout(**layout)

    if save:
        pio.write_image(fig_, f'plots/{save_path}.svg')

    return fig_

In [147]:
frame = pd.read_csv('data.csv')
frame.columns = ['year', 'title', 'category', 'currency', 'salary', 'salary_in_dollars', 'employee_residence',
                 'experience', 'employment_type', 'working_mode', 'company_location', 'company_size']

## Salary Trends Over Time (Temporal Analysis)

In [228]:
def stage_1(data, mode="average"):
    tmp_dict_ = {}
    category_ = data['category'].unique()
    year_ = sorted(data['year'].unique())

    for c in category_:
        tmp_dict_[c] = {
            'values': [],
            'years': {}
        }
        for y in year_:
            tmp_frame_ = data[(data['category'] == c) & (data['year'] == y)]
            tmp_dict_[c]['years'][f"{y}"] = data[(data['category'] == c) & (data['year'] == y)][
                'salary_in_dollars'].values

            if mode == "average":
                values_ = tmp_frame_['salary_in_dollars'].mean()
            elif mode == "median":
                values_ = tmp_frame_['salary_in_dollars'].median()

            tmp_dict_[c]['values'].append(values_)

    dates_ = [str(element) for element in year_]
    tmp_max_values_ = []
    for name_, value_ in tmp_dict_.items():
        tmp_max_values_.append(max(value_['values']))

    max_value_ = max(tmp_max_values_)

    line_plot_figures_ = []
    for name_, value_ in tmp_dict_.items():
        line_plot_figures_.append(
            create_single_plot(x=dates_, y=value_['values'], xaxis_title="Year", yaxis_title="Median Salary",
                               title=name_, save=False, y_range=(0, max_value_ * 1.2), save_path=f'stage_1/{name_}'))

        t = f'{mode.capitalize()} salary over time for each category'
        # Create a plot of median salaries
        combine_plots(figures=line_plot_figures_, title=t,
                      yaxis_title='Median', xaxis_title='Year', save=True, y_range=(0, max_value_ * 1.2),
                      save_path=f'stage_1/{t}')

    ### histograms
    max_values = {}
    for name_, value_ in tmp_dict_.items():
        series_ = value_['years']
        max_values[name_] = []
        for i, s in series_.items():

            if len(s) > 0:
                max_values[name_].append(max(s))

    histogram_figures = {}
    for name_, value_ in tmp_dict_.items():
        histogram_figures[name_] = []
        series_ = value_['years']
        for i, s in series_.items():
            histogram_figures[name_].append(
                create_histogram(data=s, title=f'{i}', xaxis_title='Year', yaxis_title='Frequency',
                                 x_range=(0, max(max_values[name_]) * 1.1), save=False,
                                 save_path=f'stage_1/{name_} {i}'))

    combined_histograms_ = []
    for name_, value_ in histogram_figures.items():
        t = name_.capitalize()
        combined_histograms_.append(
            combine_plots(figures=value_, title=t, yaxis_title='Frequency', xaxis_title='Value',
                          x_range=(0, max(max_values[name_]) * 1.1), save=True, subplot_titles=True,
                          save_path=f'stage_1/histograms/{t}'))

    return tmp_dict_


stage_1(frame, mode="average")
stage_1(frame, mode="median")

{'Data Engineering': {'values': [100000.0, 96282.0, 135000.0, 142200.0],
  'years': {'2020': array([100000,  54742,  59303,  70000,  74130,  79833, 106000, 112872,
          188000,  70139, 109024, 125000, 114047,  56000,  47899, 110000,
          130800], dtype=int64),
   '2021': array([ 66970,  75050,  45390, 150000, 200000,  60000, 200000, 100000,
           82528,  80000, 153000,  90000,  28476, 140000, 103160, 113476,
          150000, 115000, 150000,  89294, 276000, 160000, 200000, 174000,
           96282,  72212, 200000, 110000,  72500,  69741, 112000,  65013,
           28016, 111775,  93150, 160000,  26005,  66022,  20000, 165000,
          185000,  76833,  18000,  45391,  28369], dtype=int64),
   '2022': array([ 59888, 175000, 155000, 153600, 106800, 170000, 130000, 145000,
          100000, 175000, 120000, 145000, 115000, 260000, 175000, 250000,
           63000, 150000, 150000, 120000,  95000, 160000, 135000, 216000,
          144000,  85000,  65000, 130000, 115000, 105000