# 12 - Create Static Data for App MVP

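This notebook builds a small static dataset for the app MVP. It selects three rows (index labels 323, 925 and 1023) from the processed test set (`X_test.parquet`), drops the `probability` column if present, and attaches the matching `Employee_Name` from the simulated HR data. The combined rows are saved as a Parquet file in two locations (`../data/app/` and `../app/data/`) for use by the app backend.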

In [2]:
from pathlib import Path

import pandas as pd

In [3]:
# Lift pandas' display truncation so frames are shown in full:
# untruncated cell text, every row, and every column.
for display_option in ('display.max_colwidth', 'display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, None)

In [4]:
# Read X_test.parquet from the processed-data folder into `df`
df = pd.read_parquet(path="../data/processed/X_test.parquet")

In [5]:
# Index labels of the three rows to pull out of `df` (label-based lookup, all columns kept)
row_indices = [323, 925, 1023]
selected_rows = df.loc[row_indices, :]

In [7]:
# Load the simulated HR table and take the rows carrying the same index labels.
# NOTE(review): presumably hr_data shares its index with X_test — confirm.
hr_data = pd.read_parquet(path="../data/interim/hr_data_simulated.parquet")
hr_selected_rows = hr_data.loc[row_indices, :]

In [8]:
# Show the employee names for the selected rows
hr_selected_rows.loc[:, "Employee_Name"]

323     Mccullough, Quinn
925        Hodge, Aaliyah
1023      Mercado, Emmett
Name: Employee_Name, dtype: object

In [9]:
# Remove the 'probability' column when present. errors='ignore' makes the drop a
# no-op if the column is missing, and reassigning (instead of inplace=True) avoids
# mutating a selection taken from `df` and keeps the cell safe to re-run.
selected_rows = selected_rows.drop(columns='probability', errors='ignore')

# Employee names for the same index labels, taken straight from the HR table as a Series
hr_selected_rows = hr_data.loc[row_indices, "Employee_Name"]

# Concatenate along columns (axis 1); pandas aligns the two pieces on their index labels
combined_rows = pd.concat([selected_rows, hr_selected_rows], axis=1)

# Sanity check: exactly one combined row per requested label
assert len(combined_rows) == len(row_indices), "unexpected row count after combining"

In [11]:
# Build a two-column lookup (Candidate_ID, Employee_Name) for the selected rows.
# It is derived from hr_data rather than from the previous hr_selected_rows so the
# cell can be re-run: resetting an already-reset index would add a second ID column.
# rename_axis names the index first, so reset_index emits it as 'Candidate_ID'
# whether or not the index already had a name.
hr_selected_rows = (
    hr_data.loc[row_indices, "Employee_Name"]
    .rename_axis("Candidate_ID")
    .reset_index()
)

# NOTE(review): combined_rows is built before this step and is the frame that gets
# saved, so Candidate_ID does not appear to reach the saved file — confirm whether
# the app needs it.

# Display the updated DataFrame
hr_selected_rows.head()

   candidate_id      Employee_Name
0           323  Mccullough, Quinn
1           925     Hodge, Aaliyah
2          1023    Mercado, Emmett


In [12]:
# Save the combined rows to the data/app folder as Parquet.
# index=False means the row labels (323, 925, 1023) are not written to the file.
# NOTE(review): unless X_test already carries an ID column, the saved rows have no
# candidate identifier — confirm this is what the app expects.
static_data_path = Path("../data/app/static_data.parquet")
static_data_path.parent.mkdir(parents=True, exist_ok=True)  # a fresh checkout may lack the folder
combined_rows.to_parquet(static_data_path, index=False)

In [13]:
# Write a second copy of the same rows into the app's own data folder (row labels omitted)
app_static_data_path = Path("../app/data/static_data.parquet")
app_static_data_path.parent.mkdir(parents=True, exist_ok=True)  # create the folder if missing
combined_rows.to_parquet(app_static_data_path, index=False)