In [37]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import plotly.express as px
import plotly.subplots as sp
import plotly.graph_objects as go
from pathlib import Path

plt.style.use("ggplot")

os.makedirs("task4_outputs", exist_ok=True)

In [38]:
INPUT_CSV_PATH = "./data/kaggle_survey_2017_2021.csv"

df = pd.read_csv(INPUT_CSV_PATH, low_memory=False)

print("Shape:", df.shape)
df.head(3)

Shape: (106302, 293)


Unnamed: 0,-,Time from Start to Finish (seconds),Q1,Q2,Q3,Q4,Q5,Q6,Q7_Part_1,Q7_Part_2,...,Q38_B_Part_1,Q38_B_Part_2,Q38_B_Part_3,Q38_B_Part_4,Q38_B_Part_5,Q38_B_Part_6,Q38_B_Part_7,Q38_B_Part_9,Q38_B_Part_11,Q38_B_OTHER
0,Year,Duration (in seconds),What is your age (# years)?,What is your gender? - Selected Choice,In which country do you currently reside?,What is the highest level of formal education ...,Select the title most similar to your current ...,For how many years have you been writing code ...,What programming languages do you use on a reg...,What programming languages do you use on a reg...,...,"In the next 2 years, do you hope to become mor...","In the next 2 years, do you hope to become mor...","In the next 2 years, do you hope to become mor...","In the next 2 years, do you hope to become mor...","In the next 2 years, do you hope to become mor...","In the next 2 years, do you hope to become mor...","In the next 2 years, do you hope to become mor...","In the next 2 years, do you hope to become mor...","In the next 2 years, do you hope to become mor...","In the next 2 years, do you hope to become mor..."
1,2021,910,50-54,Man,India,Bachelorâ€™s degree,Other,5-10 years,Python,R,...,,,,,,,,,,
2,2021,784,50-54,Man,Indonesia,Masterâ€™s degree,Program/Project Manager,20+ years,,,...,,,,,,,,,,


In [39]:
df = df.drop_duplicates()

df.columns = df.columns.str.strip()

df = df.applymap(lambda x: x.strip().title() if isinstance(x, str) else x)

print("Shape after cleaning:", df.shape)

Shape after cleaning: (101846, 293)


In [40]:
missing_summary = df.isnull().mean().sort_values(ascending=False).head(10)
print("Top 10 columns with missing values (%):")
print(missing_summary)

threshold = 0.7
df = df.loc[:, df.isnull().mean() < threshold]

for col in df.select_dtypes(include="object"):
    df[col] = df[col].fillna("Unknown")

for col in df.select_dtypes(include=np.number):
    df[col] = df[col].fillna(df[col].median())

print("Remaining missing values:", df.isnull().sum().sum())

Top 10 columns with missing values (%):
Q16_Part_17      0.99999
Q34_B_Part_16    0.99999
Q9_Part_12       0.99999
Q34_A_Part_16    0.99999
Q32_A_Part_20    0.99999
Q14_Part_11      0.99999
Q27_B_Part_11    0.99999
Q10_Part_16      0.99999
Q19_Part_5       0.99999
Q32_B_Part_20    0.99999
dtype: float64
Remaining missing values: 0


In [41]:
encoded_df = df.copy()

for col in encoded_df.select_dtypes(include="object"):
    encoded_df[col + "__code"] = encoded_df[col].astype("category").cat.codes

encoded_df.head(3)

Unnamed: 0,-,Time from Start to Finish (seconds),Q1,Q2,Q3,Q4,Q5,Q6,Q7_Part_1,Q7_Part_3,...,Q20__code,Q21__code,Q22__code,Q23__code,Q24_Part_1__code,Q25__code,Q26__code,Q40_Part_1__code,Q41__code,Q42_Part_4__code
0,Year,Duration (In Seconds),What Is Your Age (# Years)?,What Is Your Gender? - Selected Choice,In Which Country Do You Currently Reside?,What Is The Highest Level Of Formal Education ...,Select The Title Most Similar To Your Current ...,For How Many Years Have You Been Writing Code ...,What Programming Languages Do You Use On A Reg...,What Programming Languages Do You Use On A Reg...,...,8,7,7,0,1,48,8,1,7,4
1,2021,910,50-54,Man,India,Bachelorâ€™S Degree,Other,5-10 Years,Python,Unknown,...,10,4,5,2,2,18,5,0,4,1
2,2021,784,50-54,Man,Indonesia,Masterâ€™S Degree,Program/Project Manager,20+ Years,Unknown,Sql,...,10,2,3,4,2,35,0,2,0,3


In [42]:
df.to_csv("task4_outputs/survey_clean.csv", index=False)
encoded_df.to_csv("task4_outputs/survey_encoded.csv", index=False)

print("Cleaned & encoded dataset")

Cleaned & encoded dataset


In [43]:
col_map = {
    "country": [c for c in df.columns if "Country" in c][0] if any("Country" in c for c in df.columns) else None,
    "language": [c for c in df.columns if "Language" in c][0] if any("Language" in c for c in df.columns) else None,
    "education": [c for c in df.columns if "Education" in c][0] if any("Education" in c for c in df.columns) else None,
    "compensation": [c for c in df.columns if "Compensation" in c][0] if any("Compensation" in c for c in df.columns) else None,
    "role": [c for c in df.columns if "Role" in c or "Job" in c][0] if any("Role" in c or "Job" in c for c in df.columns) else None,
    "experience": [c for c in df.columns if "Experience" in c][0] if any("Experience" in c for c in df.columns) else None,
}

col_map

{'country': None,
 'language': None,
 'education': None,
 'compensation': None,
 'role': None,
 'experience': None}

In [44]:
insights = []

if col_map["country"]:
    top_countries = df[col_map["country"]].value_counts().head(10)
    insights.append("1. Top respondent countries:\n" + str(top_countries.head(5)))
    plt.figure(figsize=(8,5))
    top_countries.plot(kind="bar")
    plt.title("Top Respondent Countries")
    plt.savefig("task4_outputs/top_countries.png")
    plt.close()

if col_map["language"]:
    langs = df[col_map["language"]].value_counts().head(10)
    insights.append("2. Most popular programming languages:\n" + str(langs.head(5)))
    plt.figure(figsize=(8,5))
    langs.plot(kind="bar")
    plt.title("Most Popular Programming Languages")
    plt.savefig("task4_outputs/top_languages.png")
    plt.close()

if col_map["education"]:
    edu = df[col_map["education"]].value_counts().head(10)
    insights.append("3. Education level distribution:\n" + str(edu.head(5)))
    plt.figure(figsize=(8,5))
    edu.plot(kind="bar")
    plt.title("Education Level Distribution")
    plt.savefig("task4_outputs/education.png")
    plt.close()

if col_map["compensation"]:
    comp = df[col_map["compensation"]].value_counts().head(10)
    insights.append("4. Compensation insights (top 5):\n" + str(comp.head(5)))
    plt.figure(figsize=(8,5))
    comp.plot(kind="bar")
    plt.title("Compensation Distribution")
    plt.savefig("task4_outputs/compensation.png")
    plt.close()

if col_map["role"]:
    roles = df[col_map["role"]].value_counts().head(10)
    insights.append("5. Role distribution:\n" + str(roles.head(5)))
    plt.figure(figsize=(8,5))
    roles.plot(kind="bar")
    plt.title("Role Distribution")
    plt.savefig("task4_outputs/roles.png")
    plt.close()

with open("task4_outputs/top5_insights.md", "w") as f:
    f.write("\n\n".join(insights))

print("Top 5 insights generated and saved in task4_outputs/")

Top 5 insights generated and saved in task4_outputs/


In [45]:
for i, ins in enumerate(insights, 1):
    print(f"--- Insight {i} ---\n{ins}\n")