In [1]:
import pandas as pd

# 3.1 Load CSV from your data folder
df = pd.read_csv('/Users/honeymustard/Desktop/data/Data_Job_WA.csv')
# 3.2 Quick checks
print(df.shape)              # prints (rows, cols)
print(df.columns.tolist())   # lists column names
df.head()                    # shows first 5 rows


(892, 12)
['Job_title', 'Company', 'State', 'City', 'Min_Salary', 'Max_Salary', 'Job_Desc', 'Industry', 'Rating', 'Date_Posted', 'Valid_until', 'Job_Type']


Unnamed: 0,Job_title,Company,State,City,Min_Salary,Max_Salary,Job_Desc,Industry,Rating,Date_Posted,Valid_until,Job_Type
0,Data Scientist,ManTech,VA,Chantilly,108486,145165,This company is in a hiring surge in response ...,Business Services,4.1,2020-04-30,2020-06-06,FULL_TIME
1,Data Scientist,GEICO,MD,Chevy Chase,69285,113337,GEICO’s Data Science team uses predictive anal...,Insurance,3.3,2020-05-02,2020-06-06,FULL_TIME
2,Data Scientist,Tecolote Research,DC,Washington,74791,102528,"\nData Scientist\n \n Location: Washington, D...",Aerospace & Defense,4.0,2020-05-01,2020-06-06,FULL_TIME
3,Data Scientist,Systems &amp;amp; Technology Research,VA,Arlington,94721,115685,STR is a government research contractor specia...,Aerospace & Defense,4.7,2020-04-25,2020-06-06,FULL_TIME
4,Chief Data Scientist,ManTech,VA,Alexandria,-1,-1,This company is in a hiring surge in response ...,Business Services,4.1,2020-05-06,2020-06-06,FULL_TIME


In [2]:
# 4.1 Regex replace to strip numbers at end of string
df['company_clean'] = df['Company'] \
    .str.replace(r'\s*\d+\.\d+$', '', regex=True)  # :contentReference[oaicite:2]{index=2}
# 4.2 Trim whitespace
df['company_clean'] = df['company_clean'].str.strip()  # :contentReference[oaicite:3]{index=3}
# 4.3 Verify a few examples
df[['Company','company_clean']].drop_duplicates().head(10)


Unnamed: 0,Company,company_clean
0,ManTech,ManTech
1,GEICO,GEICO
2,Tecolote Research,Tecolote Research
3,Systems &amp;amp; Technology Research,Systems &amp;amp; Technology Research
5,Booz Allen Hamilton Inc.,Booz Allen Hamilton Inc.
6,Novetta,Novetta
8,GetWellNetwork,GetWellNetwork
10,The Knot Worldwide,The Knot Worldwide
11,Amazon,Amazon
12,General Dynamics Information Technology,General Dynamics Information Technology


In [3]:
# 5.1 Lowercase and trim
df['job_type_clean']    = df['Job_Type'].str.lower().str.strip()     # :contentReference[oaicite:4]{index=4}
df['industry_clean']    = df['Industry'].str.lower().str.strip()

# 5.2 Map synonyms to one label
job_map = {'ft':'full-time','full time':'full-time','contractor':'contract'}
df['job_type_clean'] = df['job_type_clean'].replace(job_map)         # :contentReference[oaicite:5]{index=5}

# 5.3 Inspect unique values
print(df['job_type_clean'].unique())
print(df['industry_clean'].value_counts().head(10))


['full_time' 'intern' 'part_time' 'other']
industry_clean
business services            254
information technology       148
aerospace & defense          132
biotech & pharmaceuticals     73
government                    69
finance                       23
accounting & legal            13
education                     12
manufacturing                  9
health care                    5
Name: count, dtype: int64


In [8]:
from sklearn.preprocessing import OneHotEncoder

ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)      # :contentReference[oaicite:7]{index=7}
categories = ['job_type_clean','industry_clean','City','State']
encoded = ohe.fit_transform(df[categories])

# Convert to DataFrame
feat_names = ohe.get_feature_names_out(categories)
df_ohe_skl = pd.DataFrame(encoded, columns=feat_names, index=df.index)

# Merge back into original DataFrame
df = pd.concat([df, df_ohe_skl], axis=1)                               # :contentReference[oaicite:8]{index=8}


In [9]:
set(['company_clean','job_type_clean','industry_clean']) - set(df.columns)


set()

In [10]:
df_ohe.head()


Unnamed: 0,Job_title,Company,Min_Salary,Max_Salary,Job_Desc,Industry,Rating,Date_Posted,Valid_until,Job_Type,...,City_Suitland,City_Tysons,City_Vienna,City_Washington,City_White Oak,State_DC,State_MD,State_NC,State_TX,State_VA
0,Data Scientist,ManTech,108486,145165,This company is in a hiring surge in response ...,Business Services,4.1,2020-04-30,2020-06-06,FULL_TIME,...,False,False,False,False,False,False,False,False,False,True
1,Data Scientist,GEICO,69285,113337,GEICO’s Data Science team uses predictive anal...,Insurance,3.3,2020-05-02,2020-06-06,FULL_TIME,...,False,False,False,False,False,False,True,False,False,False
2,Data Scientist,Tecolote Research,74791,102528,"\nData Scientist\n \n Location: Washington, D...",Aerospace & Defense,4.0,2020-05-01,2020-06-06,FULL_TIME,...,False,False,False,True,False,True,False,False,False,False
3,Data Scientist,Systems &amp;amp; Technology Research,94721,115685,STR is a government research contractor specia...,Aerospace & Defense,4.7,2020-04-25,2020-06-06,FULL_TIME,...,False,False,False,False,False,False,False,False,False,True
4,Chief Data Scientist,ManTech,-1,-1,This company is in a hiring surge in response ...,Business Services,4.1,2020-05-06,2020-06-06,FULL_TIME,...,False,False,False,False,False,False,False,False,False,True


In [11]:
set(feat_names) & set(df.columns)  # should be empty before concatenation


{'City_Adelphi',
 'City_Alexandria',
 'City_Andrews AFB',
 'City_Annandale',
 'City_Annapolis Junction',
 'City_Arlington',
 'City_Beltsville',
 'City_Bethesda',
 'City_Bowie',
 'City_Burke',
 'City_Centreville',
 'City_Chantilly',
 'City_Chevy Chase',
 'City_College Park',
 'City_Columbia',
 'City_Crystal City',
 'City_Crystal City, state=Virginia, Virginia',
 'City_Fairfax',
 'City_Falls Church',
 'City_Fort Belvoir',
 'City_Fort Meade',
 'City_Fulton',
 'City_Gaithersburg',
 'City_Germantown',
 'City_Greenbelt',
 'City_Herndon',
 'City_Lanham',
 'City_Laurel',
 'City_Mc Lean',
 'City_McLean',
 'City_North Bethesda',
 'City_Raleigh',
 'City_Reston',
 'City_Rockville',
 'City_Rosslyn',
 'City_San Antonio',
 'City_Silver Spring',
 'City_Springfield',
 'City_Sterling',
 'City_Suitland',
 'City_Tysons',
 'City_Vienna',
 'City_Washington',
 'City_White Oak',
 'State_DC',
 'State_MD',
 'State_NC',
 'State_TX',
 'State_VA',
 'State_nan',
 'industry_clean_accounting & legal',
 'industry_clea

In [13]:
df.to_csv('/Users/honeymustard/Desktop/data/Data_Job_WA_cleaned.csv', index=False)                # :contentReference[oaicite:9]{index=9}
