In [5]:
# Cell A  – filter to full-time devs
print(df['Employment'].value_counts(dropna=False).head())   # see categories

df = df[df['Employment'] == 'Employed full-time'].copy()
print("Rows after filter →", df.shape)



Employment
Employed, full-time                                                         39041
Independent contractor, freelancer, or self-employed                         4846
Student, full-time                                                           4709
Employed, full-time;Independent contractor, freelancer, or self-employed     3557
Not employed, but looking for work                                           2341
Name: count, dtype: int64
Rows after filter → (0, 114)


In [6]:
# Cell B  – map “dissatisfied” answers to 1 (burned-out), the rest to 0
dissat = {'Very dissatisfied', 'Slightly dissatisfied'}

def burnout_flag(answer):
    if pd.isna(answer):
        return pd.NA          # keep as missing for now
    return 1 if answer in dissat else 0

df['burnout'] = df['JobSat'].apply(burnout_flag)

print(df['burnout'].value_counts(dropna=False))



Series([], Name: count, dtype: int64)


In [7]:
# Cell C – peek at the raw answers
df['AI_Search_Use'].value_counts(dropna=False)


KeyError: 'AI_Search_Use'

In [8]:
# List any columns that mention "AI" or "Search" so we can see the exact spelling
[col for col in df.columns if 'AI' in col or 'Search' in col][:20]



['AISearchDevHaveWorkedWith',
 'AISearchDevWantToWorkWith',
 'AISearchDevAdmired',
 'AISelect',
 'AISent',
 'AIBen',
 'AIAcc',
 'AIComplex',
 'AIToolCurrently Using',
 'AIToolInterested in Using',
 'AIToolNot interested in Using',
 'AINextMuch more integrated',
 'AINextNo change',
 'AINextMore integrated',
 'AINextLess integrated',
 'AINextMuch less integrated',
 'AIThreat',
 'AIEthics',
 'AIChallenges',
 'TimeSearching']

In [9]:
AI_FREQ_COL = 'AI Search Frequency'   # ← put your column name here exactly


In [10]:
freq_map = {
    'Multiple times per day':        3,
    'A few times per week':          2,
    'A few times per month':         1,
    'Less than once per month':      0,
    'Never':                         0
}

df['ai_freq'] = df[AI_FREQ_COL].map(freq_map)

# Quick check
print(df[[AI_FREQ_COL, 'ai_freq']].head())
print(df['ai_freq'].value_counts(dropna=False))


KeyError: 'AI Search Frequency'

In [11]:
import re

ai_cols = [c for c in df.columns if re.search(r'ai', c, flags=re.I)]
print("Columns that mention AI:", len(ai_cols))
for c in ai_cols:
    print(" •", c)


Columns that mention AI: 20
 • MainBranch
 • AISearchDevHaveWorkedWith
 • AISearchDevWantToWorkWith
 • AISearchDevAdmired
 • AISelect
 • AISent
 • AIBen
 • AIAcc
 • AIComplex
 • AIToolCurrently Using
 • AIToolInterested in Using
 • AIToolNot interested in Using
 • AINextMuch more integrated
 • AINextNo change
 • AINextMore integrated
 • AINextLess integrated
 • AINextMuch less integrated
 • AIThreat
 • AIEthics
 • AIChallenges


In [12]:
AI_TOOL_COL = 'AIToolCurrently Using'          # exact column name

def count_tools(cell):
    """Return number of AI tools listed (0 if NaN)."""
    if pd.isna(cell) or cell.strip() == '':
        return 0
    # split on comma and count non-empty parts
    return len([t for t in cell.split(',') if t.strip()])

df['ai_freq'] = df[AI_TOOL_COL].apply(count_tools)

# Quick sanity-check
print(df[[AI_TOOL_COL, 'ai_freq']].head(10))
print("\nai_freq value counts:\n", df['ai_freq'].value_counts(dropna=False).sort_index())


Empty DataFrame
Columns: [AIToolCurrently Using, ai_freq]
Index: []

ai_freq value counts:
 Series([], Name: count, dtype: int64)


In [13]:
print("Current shape:", df.shape)
print(df.head(3))


Current shape: (0, 116)
Empty DataFrame
Columns: [ResponseId, MainBranch, Age, Employment, RemoteWork, Check, CodingActivities, EdLevel, LearnCode, LearnCodeOnline, TechDoc, YearsCode, YearsCodePro, DevType, OrgSize, PurchaseInfluence, BuyNewTool, BuildvsBuy, TechEndorse, Country, Currency, CompTotal, LanguageHaveWorkedWith, LanguageWantToWorkWith, LanguageAdmired, DatabaseHaveWorkedWith, DatabaseWantToWorkWith, DatabaseAdmired, PlatformHaveWorkedWith, PlatformWantToWorkWith, PlatformAdmired, WebframeHaveWorkedWith, WebframeWantToWorkWith, WebframeAdmired, EmbeddedHaveWorkedWith, EmbeddedWantToWorkWith, EmbeddedAdmired, MiscTechHaveWorkedWith, MiscTechWantToWorkWith, MiscTechAdmired, ToolsTechHaveWorkedWith, ToolsTechWantToWorkWith, ToolsTechAdmired, NEWCollabToolsHaveWorkedWith, NEWCollabToolsWantToWorkWith, NEWCollabToolsAdmired, OpSysPersonal use, OpSysProfessional use, OfficeStackAsyncHaveWorkedWith, OfficeStackAsyncWantToWorkWith, OfficeStackAsyncAdmired, OfficeStackSyncHaveWorked

In [14]:
df = df[df['Employment'] == 'Employed full-time']


In [15]:
df_orig = pd.read_csv("data/raw/so_survey_2024.csv", low_memory=False)   # load fresh
df_orig['Employment'].value_counts(dropna=False).head(20)


Employment
Employed, full-time                                                                        39041
Independent contractor, freelancer, or self-employed                                        4846
Student, full-time                                                                          4709
Employed, full-time;Independent contractor, freelancer, or self-employed                    3557
Not employed, but looking for work                                                          2341
Employed, part-time                                                                         1266
Student, full-time;Employed, part-time                                                      1115
Employed, full-time;Student, full-time                                                       897
Employed, full-time;Student, part-time                                                       839
Student, full-time;Not employed, but looking for work                                        686
Not employed, and n

In [16]:
df = df_orig[df_orig['Employment'] == 'I am employed full-time'].copy()


In [17]:
print("After filter:", df.shape)


After filter: (0, 114)


In [18]:
AI_TOOL_COL = 'AIToolCurrently Using'   # same as before

def count_tools(cell):
    if pd.isna(cell) or cell.strip() == '':
        return 0
    return len([t for t in cell.split(',') if t.strip()])

df['ai_freq'] = df[AI_TOOL_COL].apply(count_tools)

print(df[[AI_TOOL_COL, 'ai_freq']].head())
print("\nai_freq value counts:\n", df['ai_freq'].value_counts().sort_index())


Empty DataFrame
Columns: [AIToolCurrently Using, ai_freq]
Index: []

ai_freq value counts:
 Series([], Name: count, dtype: int64)


In [19]:
# Reload only the Employment column so we see real spellings
df_orig = pd.read_csv("data/raw/so_survey_2024.csv", usecols=['Employment'])
print(df_orig['Employment'].value_counts(dropna=False).head(15))


Employment
Employed, full-time                                                         39041
Independent contractor, freelancer, or self-employed                         4846
Student, full-time                                                           4709
Employed, full-time;Independent contractor, freelancer, or self-employed     3557
Not employed, but looking for work                                           2341
Employed, part-time                                                          1266
Student, full-time;Employed, part-time                                       1115
Employed, full-time;Student, full-time                                        897
Employed, full-time;Student, part-time                                        839
Student, full-time;Not employed, but looking for work                         686
Not employed, and not looking for work                                        633
Student, part-time;Employed, part-time                                        558
I pre

In [20]:
# Reload full dataset (skip if df_orig already has all columns)
df = pd.read_csv("data/raw/so_survey_2024.csv", low_memory=False)

# Exact-match option
df = df[df['Employment'] == 'Employed, full-time'].copy()

# OR safer contains-match
# df = df[df['Employment'].str.contains('Employed, full', case=False, na=False)].copy()

print("After filter:", df.shape)        # expect ~39 k rows


After filter: (39041, 114)


In [21]:
AI_TOOL_COL = 'AIToolCurrently Using'   # column already exists

def count_tools(cell):
    if pd.isna(cell) or cell.strip() == '':
        return 0
    return len([t for t in cell.split(',') if t.strip()])

df['ai_freq'] = df[AI_TOOL_COL].apply(count_tools)

print(df[[AI_TOOL_COL, 'ai_freq']].head())
print("\nai_freq value counts:\n", df['ai_freq'].value_counts().sort_index())


  AIToolCurrently Using  ai_freq
0                   NaN        0
1                   NaN        0
2                   NaN        0
6                   NaN        0
8                   NaN        0

ai_freq value counts:
 ai_freq
0    18441
1    20600
Name: count, dtype: int64


In [22]:
# Safe numeric conversions
df['age']        = pd.to_numeric(df['Age'],          errors='coerce')
df['years_code'] = pd.to_numeric(df['YearsCodePro'], errors='coerce')

# Remote-work encoding (simple category → integer code)
df['remote_code'] = df['RemoteWorkStatus'].astype('category').cat.codes
# If your column is named just 'Remote' or 'RemoteWork', adjust the name above.

# Quick peek
df[['age', 'years_code', 'remote_code']].describe().T


KeyError: 'RemoteWorkStatus'

In [23]:
[c for c in df.columns if 'remote' in c.lower()]


['RemoteWork']

In [24]:
REMOTE_COL = 'RemoteWork'       # ← put your exact column name here

df['remote_code'] = (
    df[REMOTE_COL]
      .astype('category')        # turn text → pandas category
      .cat.codes                 # integer codes (-1 means NaN)
)


In [25]:
df[['age', 'years_code', 'remote_code']].describe().T


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,0.0,,,,,,,
years_code,33841.0,10.834195,8.607691,1.0,4.0,8.0,15.0,50.0
remote_code,39041.0,0.894521,0.878063,-1.0,0.0,1.0,2.0,2.0


In [26]:
[col for col in df.columns if 'age' in col.lower()]



['Age',
 'LanguageHaveWorkedWith',
 'LanguageWantToWorkWith',
 'LanguageAdmired',
 'age']

In [27]:
df['Age'].value_counts(dropna=False)


Age
25-34 years old       17282
35-44 years old       10775
18-24 years old        4839
45-54 years old        4234
55-64 years old        1600
65 years or older       182
Prefer not to say        71
Under 18 years old       58
Name: count, dtype: int64

In [28]:
age_map = {
    'Under 18'         : 16,   # midpoint of 0-18
    '18-24'            : 21,
    '25-34'            : 29.5,
    '35-44'            : 39.5,
    '45-54'            : 49.5,
    '55-64'            : 59.5,
    '65 or older'      : 70,
    'Prefer not to say': pd.NA
}


In [29]:
df['age'] = df['Age'].map(age_map)
print(df['age'].describe())


count       0
unique      0
top       NaN
freq      NaN
Name: age, dtype: object


In [30]:
df['years_code'] = pd.to_numeric(df['YearsCodePro'], errors='coerce')
df[['age', 'years_code']].describe().T


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
years_code,33841.0,10.834195,8.607691,1.0,4.0,8.0,15.0,50.0


In [31]:
df['remote_code'] = (
    df['RemoteWork']
      .astype('category')
      .cat.codes        # 0,1,2…  ; -1 means missing
)

df[['remote_code']].head()


Unnamed: 0,remote_code
0,2
1,2
2,2
6,2
8,1


In [32]:
df[['burnout', 'ai_freq', 'age', 'years_code', 'remote_code']].head()



KeyError: "['burnout'] not in index"

In [33]:
unique_ages = df['Age'].dropna().unique().tolist()
print("Unique Age strings ({}):".format(len(unique_ages)))
for v in unique_ages:
    print(repr(v))



Unique Age strings (8):
'Under 18 years old'
'35-44 years old'
'45-54 years old'
'25-34 years old'
'55-64 years old'
'18-24 years old'
'65 years or older'
'Prefer not to say'


In [34]:
age_map = {
    'Under 18 years old' : 16,      # midpoint of 0-18
    '18-24 years old'    : 21,
    '25-34 years old'    : 29.5,
    '35-44 years old'    : 39.5,
    '45-54 years old'    : 49.5,
    '55-64 years old'    : 59.5,
    '65 years or older'  : 70,
    'Prefer not to say'  : pd.NA    # keep as missing
}

df['age'] = df['Age'].map(age_map).astype(float)
print(df['age'].describe())


TypeError: float() argument must be a string or a real number, not 'NAType'

In [35]:
import numpy as np

age_map = {
    'Under 18 years old' : 16,
    '18-24 years old'    : 21,
    '25-34 years old'    : 29.5,
    '35-44 years old'    : 39.5,
    '45-54 years old'    : 49.5,
    '55-64 years old'    : 59.5,
    '65 years or older'  : 70,
    'Prefer not to say'  : np.nan       # ← changed
}

df['age'] = df['Age'].map(age_map).astype(float)
print(df['age'].describe())


count    38970.000000
mean        34.783205
std          9.851567
min         16.000000
25%         29.500000
50%         29.500000
75%         39.500000
max         70.000000
Name: age, dtype: float64


In [36]:
age_map['Prefer not to say'] = pd.NA     # keep as-is

df['age'] = (
    df['Age'].map(age_map)               # object dtype w/ pd.NA
          .pipe(pd.to_numeric, errors='coerce')  # converts pd.NA → NaN, rest → float
)
print(df['age'].describe())


count    38970.000000
mean        34.783205
std          9.851567
min         16.000000
25%         29.500000
50%         29.500000
75%         39.500000
max         70.000000
Name: age, dtype: float64


In [37]:
df[['burnout', 'ai_freq', 'age', 'years_code', 'remote_code']].head()


KeyError: "['burnout'] not in index"

In [38]:
df['JobSat'].value_counts(dropna=False)



JobSat
NaN     16504
8.0      5834
7.0      5032
6.0      2924
9.0      2824
10.0     1592
5.0      1533
3.0       900
4.0       848
2.0       599
0.0       242
1.0       209
Name: count, dtype: int64

In [39]:
df['burnout'] = df['JobSat'].apply(
    lambda x: 1 if pd.notna(x) and x <= 3 else 0 if pd.notna(x) else pd.NA
)

df['burnout'].value_counts(dropna=False)


burnout
0       20587
<NA>    16504
1        1950
Name: count, dtype: int64

In [40]:
[c for c in df.columns if 'AITool' in c or 'AI Tool' in c]


['AIToolCurrently Using',
 'AIToolInterested in Using',
 'AIToolNot interested in Using']

In [41]:
def count_tools(cell):
    if pd.isna(cell) or cell.strip() == '':
        return 0
    # split on commas, drop empty pieces, count what’s left
    return len([t for t in cell.split(',') if t.strip()])

df['ai_freq'] = df['AIToolCurrently Using'].apply(count_tools)

# Quick check
df['ai_freq'].value_counts(dropna=False).sort_index()


ai_freq
0    18441
1    20600
Name: count, dtype: int64

In [42]:
df['Age'].value_counts(dropna=False)


Age
25-34 years old       17282
35-44 years old       10775
18-24 years old        4839
45-54 years old        4234
55-64 years old        1600
65 years or older       182
Prefer not to say        71
Under 18 years old       58
Name: count, dtype: int64

In [43]:
import numpy as np

age_map = {
    'Under 18 years old' : 16.0,   # midpoint of 0-18
    '18-24 years old'    : 21.0,
    '25-34 years old'    : 29.5,
    '35-44 years old'    : 39.5,
    '45-54 years old'    : 49.5,
    '55-64 years old'    : 59.5,
    '65 years or older'  : 70.0,
    'Prefer not to say'  : np.nan  # keep as missing
}

df['age'] = df['Age'].map(age_map)

# sanity-check
print(df['age'].describe())


count    38970.000000
mean        34.783205
std          9.851567
min         16.000000
25%         29.500000
50%         29.500000
75%         39.500000
max         70.000000
Name: age, dtype: float64


In [44]:
df['years_code'] = pd.to_numeric(df['YearsCodePro'], errors='coerce')

# quick summary
print(df['years_code'].describe())


count    33841.000000
mean        10.834195
std          8.607691
min          1.000000
25%          4.000000
50%          8.000000
75%         15.000000
max         50.000000
Name: years_code, dtype: float64


In [45]:
[c for c in df.columns if 'remote' in c.lower()]


['RemoteWork', 'remote_code']

In [46]:
print("Raw RemoteWork values:")
print(df['RemoteWork'].value_counts(dropna=False), "\n")

print("remote_code preview:")
print(df['remote_code'].value_counts(dropna=False).sort_index())


Raw RemoteWork values:
RemoteWork
Hybrid (some remote, some in-person)    17281
Remote                                  13193
In-person                                8552
NaN                                        15
Name: count, dtype: int64 

remote_code preview:
remote_code
-1       15
 0    17281
 1     8552
 2    13193
Name: count, dtype: int64


In [47]:
from pathlib import Path

# make sure the processed folder exists
Path("data/processed").mkdir(parents=True, exist_ok=True)

# save
df.to_parquet("data/processed/so_clean.parquet", index=False)

print("✅  Saved to data/processed/so_clean.parquet")


ImportError: Unable to find a usable engine; tried using: 'pyarrow', 'fastparquet'.
A suitable version of pyarrow or fastparquet is required for parquet support.
Trying to import the above resulted in these errors:
 - Missing optional dependency 'pyarrow'. pyarrow is required for parquet support. Use pip or conda to install pyarrow.
 - Missing optional dependency 'fastparquet'. fastparquet is required for parquet support. Use pip or conda to install fastparquet.

In [1]:
from pathlib import Path
Path("data/processed").mkdir(parents=True, exist_ok=True)
df.to_parquet("data/processed/so_clean.parquet", index=False)
print("✅  Saved to data/processed/so_clean.parquet")


NameError: name 'df' is not defined

In [2]:
import pandas as pd, numpy as np
from pathlib import Path

# 1. Load raw CSV
df = pd.read_csv("data/raw/so_survey_2024.csv", low_memory=False)

# 2. Keep full-time employees only
df = df[df['Employment'] == 'Employed, full-time'].copy()

# 3. burnout flag
df['burnout'] = df['JobSat'].apply(
    lambda x: 1 if pd.notna(x) and x <= 3 else 0 if pd.notna(x) else np.nan
)

# 4. ai_freq
def count_tools(cell):
    if pd.isna(cell) or cell.strip() == '':
        return 0
    return len([t for t in cell.split(',') if t.strip()])
df['ai_freq'] = df['AIToolCurrently Using'].apply(count_tools)

# 5. age mapping
age_map = {
    'Under 18 years old': 16.0, '18-24 years old': 21.0, '25-34 years old': 29.5,
    '35-44 years old': 39.5, '45-54 years old': 49.5, '55-64 years old': 59.5,
    '65 years or older': 70.0, 'Prefer not to say': np.nan
}
df['age'] = df['Age'].map(age_map)

# 6. years_code & remote_code
df['years_code']  = pd.to_numeric(df['YearsCodePro'], errors='coerce')
df['remote_code'] = df['RemoteWork'].astype('category').cat.codes

# 7. Save parquet
Path("data/processed").mkdir(parents=True, exist_ok=True)
df.to_parquet("data/processed/so_clean.parquet", index=False)
print("✅  Saved to data/processed/so_clean.parquet")


✅  Saved to data/processed/so_clean.parquet
