In [4]:
import pandas as pd
import numpy as np
import plotly.express as px

def safe_read(path):
    """Read a CSV file into a DataFrame, falling back to cp1251 when it is not UTF-8.

    The first attempt uses the pandas defaults (UTF-8 decoding). If that
    raises ``UnicodeDecodeError``, the file is re-read as cp1251 with the
    Python parsing engine, silently dropping any bytes that still cannot be
    decoded.

    Parameters
    ----------
    path : str or path-like
        Location of the CSV file.

    Returns
    -------
    pandas.DataFrame
        The parsed table.
    """
    try:
        return pd.read_csv(path)
    except UnicodeDecodeError:
        # read_csv has no `errors` keyword (passing it raises TypeError, which
        # made this fallback unusable); the decoding error handler is
        # `encoding_errors` (available since pandas 1.3).
        return pd.read_csv(
            path,
            encoding="cp1251",
            sep=",",
            engine="python",
            encoding_errors="ignore",
        )

# Load the dataset that every later cell works on.
df = safe_read(path="merged_data.csv")

In [5]:
# --- Missing values: absolute count per column -------------------------------
# Count NaNs per column, sort from largest to smallest, and keep only the
# columns that actually have at least one missing value.
miss = (
    df.isna()
    .sum()
    .sort_values(ascending=False)
    .reset_index(name="count")
    .rename(columns={"index": "column"})
    .loc[lambda d: d["count"] > 0]
)

fig = px.bar(
    miss,
    x="count",
    y="column",
    orientation="h",
    title="Количество пропущенных значений по столбцам",
)
fig.update_traces(
    marker_color="royalblue",
    # Bar labels use a space as the thousands separator.
    text=miss["count"].map(lambda v: f"{v:,.0f}".replace(",", " ")),
    textposition="inside",
    insidetextanchor="start",
    textfont_color="white",
    cliponaxis=False,
)
fig.update_layout(
    template="simple_white",
    xaxis={
        "title": "Пропущенные значения",
        "tickformat": ",",
        # Leave 12% of headroom to the right of the longest bar.
        "range": [0, miss["count"].max() * 1.12],
    },
    yaxis={
        "title": "Столбец",
        "automargin": True,
        "categoryorder": "total ascending",
    },
    margin={"l": 220, "r": 40, "t": 60, "b": 40},
    # Grow the figure with the number of bars, but never below 380 px.
    height=max(380, 30 * len(miss) + 120),
)
fig.show()

# --- Missing values: share of rows per column --------------------------------
# Same columns as above, with the count divided by the number of rows in df.
miss_share = miss.assign(share=miss["count"] / len(df))

fig2 = px.bar(
    miss_share,
    x="share",
    y="column",
    orientation="h",
    title="Доля пропусков по столбцам",
)
fig2.update_traces(
    marker_color="royalblue",
    text=miss_share["share"].map(lambda v: f"{v:.1%}"),
    textposition="outside",
    cliponaxis=False,
)
fig2.update_layout(
    template="simple_white",
    xaxis={"title": "Доля", "tickformat": ".0%"},
    yaxis={
        "title": "Столбец",
        "automargin": True,
        "categoryorder": "total ascending",
    },
    margin={"l": 220, "r": 40, "t": 60, "b": 40},
    height=max(380, 28 * len(miss_share) + 120),
)
fig2.show()

# --- Overall sparsity --------------------------------------------------------
# Total number of missing cells divided by the total number of cells in df
# (df.size is rows * columns).
sparsity = miss["count"].sum() / df.size
print(f"Общая разреженность по пропускам: {sparsity:.2%}")

Общая разреженность по пропускам: 9.56%


In [8]:
# Names of all numeric columns in df.
num_cols = list(df.select_dtypes(include="number").columns)

# Plot the two preferred columns when they are numeric; if neither is,
# fall back to the first five numeric columns.
show_cols = [c for c in ("Зарплата_от", "Зарплата_до") if c in num_cols]
if not show_cols:
    show_cols = num_cols[:5]

# Reshape to long form (one row per column/value pair) and drop missing
# values, so each selected column becomes one box on the x axis.
long = pd.melt(df[show_cols], var_name="column", value_name="value").dropna()

fig = px.box(
    long,
    x="column",
    y="value",
    points=False,
    title="Распределения и выбросы (boxplot)",
)
fig.update_layout(template="simple_white")
fig.show()