In [4]:
import pandas as pd

df = pd.read_csv("../data/DataAnalyst.csv")

print("Rows:", len(df))
print("\nColumns:\n", df.columns)

print("\nNull percentages:\n", df.isna().mean().round(3))

print("\nSample salaries:")
salary_col = [c for c in df.columns if "salary" in c.lower()][0]
print(df[salary_col].dropna().head(15))

print("\nTop job titles:")
title_col = [c for c in df.columns if "title" in c.lower()][0]
print(df[title_col].value_counts().head(15))

print("\nTop locations:")
loc_col = [c for c in df.columns if "location" in c.lower()][0]
print(df[loc_col].value_counts().head(15))



Rows: 2253

Columns:
 Index(['Unnamed: 0', 'Job Title', 'Salary Estimate', 'Job Description',
       'Rating', 'Company Name', 'Location', 'Headquarters', 'Size', 'Founded',
       'Type of ownership', 'Industry', 'Sector', 'Revenue', 'Competitors',
       'Easy Apply'],
      dtype='object')

Null percentages:
 Unnamed: 0           0.0
Job Title            0.0
Salary Estimate      0.0
Job Description      0.0
Rating               0.0
Company Name         0.0
Location             0.0
Headquarters         0.0
Size                 0.0
Founded              0.0
Type of ownership    0.0
Industry             0.0
Sector               0.0
Revenue              0.0
Competitors          0.0
Easy Apply           0.0
dtype: float64

Sample salaries:
0     $37K-$66K (Glassdoor est.)
1     $37K-$66K (Glassdoor est.)
2     $37K-$66K (Glassdoor est.)
3     $37K-$66K (Glassdoor est.)
4     $37K-$66K (Glassdoor est.)
5     $37K-$66K (Glassdoor est.)
6     $37K-$66K (Glassdoor est.)
7     $37K-$66K (Glass

In [2]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2872 entries, 0 to 2871
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   title          2872 non-null   object 
 1   company        2872 non-null   object 
 2   description    2872 non-null   object 
 3   onsite_remote  2872 non-null   object 
 4   salary         0 non-null      float64
 5   location       2872 non-null   object 
 6   criteria       2872 non-null   object 
 7   posted_date    2872 non-null   object 
 8   link           2872 non-null   object 
dtypes: float64(1), object(8)
memory usage: 202.1+ KB


In [30]:
import pandas as pd
import re

df = pd.read_csv("../data/DataAnalyst.csv")

df = df.drop(columns=["Unnamed: 0", "Competitors"])

df = df.rename(columns={
    "Job Title": "job_title_raw",
    "Salary Estimate": "salary_raw",
    "Job Description": "job_description",
    "Company Name": "company_name",
    "Type of ownership": "company_type",
    "Founded": "founded_year"
})

def parse_salary(s):
    if pd.isna(s) or s.strip() == "-1":
        return None, None, "USD", "yearly"

    s = s.replace("(Glassdoor est.)", "")
    s = s.replace("$", "")
    s = s.replace("K", "000")
    s = s.strip()

    parts = s.split("-")

    if len(parts) != 2:
        return None, None, "USD", "yearly"

    try:
        low = int(parts[0])
        high = int(parts[1])
        return low, high, "USD", "yearly"
    except:
        return None, None, "USD", "yearly"


df[["salary_min", "salary_max", "salary_currency", "salary_type"]] = (
    df["salary_raw"].apply(lambda x: pd.Series(parse_salary(x)))
)

def clean_title_and_seniority(t):
    if pd.isna(t):
        return None, None

    t = t.lower()

    # Remove bracket junk
    t = re.sub(r"\(.*?\)", "", t)
    t = re.sub(r"\[.*?\]", "", t)

    # Remove punctuation & numbers
    t = re.sub(r"[0-9.,:/\-]", " ", t)
    t = re.sub(r"\s+", " ", t).strip()

    # Seniority detection (order matters)
    if any(k in t for k in ["lead", "principal", "head"]):
        return "Lead Data Analyst", "Lead"

    if any(k in t for k in ["senior", "sr"]):
        return "Senior Data Analyst", "Senior"

    if any(k in t for k in ["junior", "jr", "associate", "entry"]):
        return "Junior Data Analyst", "Junior"

    # Role detection fallback
    if "analyst" in t:
        return "Data Analyst", "Mid"

    # If somehow it's not even an analyst role
    return "Data Analyst", "Mid"

df[["title_clean", "seniority_level"]] = (
    df["job_title_raw"]
    .apply(lambda x: pd.Series(clean_title_and_seniority(x)))
)

def split_location(loc):
    if pd.isna(loc):
        return None, None, "USA"

    loc = loc.strip()

    # Remove anything in parentheses
    loc = re.sub(r"\(.*?\)", "", loc)

    parts = [p.strip() for p in loc.split(",") if p.strip() != ""]

    # Case: City, State
    if len(parts) == 2:
        return parts[0], parts[1].upper(), "USA"

    # Case: City, County, State
    if len(parts) == 3:
        return parts[0], parts[2].upper(), "USA"

    # Fallback: try last token as state
    if len(parts) > 1:
        return parts[0], parts[-1].upper(), "USA"

    return None, None, "USA"

df[["city", "state", "country"]] = (
    df["Location"].apply(lambda x: pd.Series(split_location(x)))
)

cols = [
    "job_title_raw",
    "title_clean",
    "seniority_level",
    "salary_raw",
    "salary_min",
    "salary_max",
    "salary_currency",
    "salary_type",
    "job_description",
    "company_name",
    "Industry",       # will rename to industry later
    "Sector",         # will rename to sector later
    "company_type",
    "founded_year",
    "Size",           # will rename to size later
    "Revenue",        # will rename to revenue later
    "Rating",         # will rename to rating later
    "Location",       # will rename to location later
    "city",
    "state",
    "country",
    "Easy Apply",     # will rename to easy_apply later
    "Headquarters"   # will rename to headquarters later
]

df = df[cols]

df = df.rename(columns={
    "Industry": "industry",
    "Sector": "sector",
    "Size": "size",
    "Revenue": "revenue",
    "Rating": "rating",
    "Location": "location",
    "Easy Apply": "easy_apply",
    "Headquarters": "headquarters"
})

df["salary_min"] = df["salary_min"].astype("Int64")
df["salary_max"] = df["salary_max"].astype("Int64")

df["easy_apply"] = (
    df["easy_apply"]
    .astype(str)
    .str.strip()
    .replace({
        "True": True,
        "-1": False,
    })
)

df["easy_apply"] = df["easy_apply"].astype("boolean")

df.to_csv("../data/clean_seed.csv", index=False)

  .replace({


In [19]:
print(df[["salary_raw", "salary_min", "salary_max"]].head(15))

print("\nSalary range:")
print(df[["salary_min", "salary_max"]].describe())

print("\nNull salary rows:")
print(df[df["salary_min"].isna()].shape[0])

                    salary_raw  salary_min  salary_max
0   $37K-$66K (Glassdoor est.)     37000.0     66000.0
1   $37K-$66K (Glassdoor est.)     37000.0     66000.0
2   $37K-$66K (Glassdoor est.)     37000.0     66000.0
3   $37K-$66K (Glassdoor est.)     37000.0     66000.0
4   $37K-$66K (Glassdoor est.)     37000.0     66000.0
5   $37K-$66K (Glassdoor est.)     37000.0     66000.0
6   $37K-$66K (Glassdoor est.)     37000.0     66000.0
7   $37K-$66K (Glassdoor est.)     37000.0     66000.0
8   $37K-$66K (Glassdoor est.)     37000.0     66000.0
9   $37K-$66K (Glassdoor est.)     37000.0     66000.0
10  $37K-$66K (Glassdoor est.)     37000.0     66000.0
11  $37K-$66K (Glassdoor est.)     37000.0     66000.0
12  $37K-$66K (Glassdoor est.)     37000.0     66000.0
13  $37K-$66K (Glassdoor est.)     37000.0     66000.0
14  $37K-$66K (Glassdoor est.)     37000.0     66000.0

Salary range:
          salary_min     salary_max
count    2252.000000    2252.000000
mean    54266.873890   89979.1296

In [20]:
print(df[["Location", "city", "state"]].head(15))

print("\nTop states:")
print(df["state"].value_counts().head(15))

print("\nNull city rows:")
print(df[df["city"].isna()].shape[0])

           Location         city state
0      New York, NY     New York    NY
1      New York, NY     New York    NY
2      New York, NY     New York    NY
3      New York, NY     New York    NY
4      New York, NY     New York    NY
5      New York, NY     New York    NY
6      New York, NY     New York    NY
7      New York, NY     New York    NY
8      New York, NY     New York    NY
9      New York, NY     New York    NY
10     New York, NY     New York    NY
11    Fairfield, NJ    Fairfield    NJ
12     New York, NY     New York    NY
13     New York, NY     New York    NY
14  Jersey City, NJ  Jersey City    NJ

Top states:
state
CA    626
TX    394
NY    345
IL    164
PA    114
AZ     97
CO     96
NC     90
NJ     86
WA     54
VA     48
OH     35
UT     33
FL     27
IN     23
Name: count, dtype: int64

Null city rows:
0


In [21]:
print(df["title_clean"].value_counts())
print("\n")
print(df["seniority_level"].value_counts())
print("\n")
print(df[["job_title_raw", "title_clean", "seniority_level"]].sample(15))

title_clean
Data Analyst           1677
Senior Data Analyst     411
Junior Data Analyst      98
Lead Data Analyst        67
Name: count, dtype: int64


seniority_level
Mid       1677
Senior     411
Junior      98
Lead        67
Name: count, dtype: int64


                                          job_title_raw          title_clean  \
1543                       Data Analyst- German Fluency         Data Analyst   
1479                                       Data Analyst         Data Analyst   
1317                    Data Analyst - Healthcare Fraud         Data Analyst   
290   Data Analyst,, Sales Operations - Oracle Data ...         Data Analyst   
1159                    Data Analyst (Philadelphia, PA)         Data Analyst   
1927                                       Data Analyst         Data Analyst   
1852                 Financial Data Analyst - Full Time         Data Analyst   
1592                        Bioinformatics Data Analyst         Data Analyst   
1418                    

In [28]:
df = pd.read_csv("../data/DataAnalyst.csv")

df["Easy Apply"].unique()

array(['True', '-1'], dtype=object)