In [2]:
import pandas as pd
import numpy as np

# SPSS Loader and Editor

Prepares our SPSS file using these standards (https://docs.google.com/document/d/1LZ_i9fHxzdG6w6_Ie7i7wYuiAET9Ubc-SEA3mEiOSDs/edit?tab=t.0)

This notebook loads an SPSS file and then allows the user to change it

In [3]:
# from google.colab import drive
# Mount Google Drive
# drive.mount('/content/drive')
!pip install pyreadstat



In [4]:
!pip install openpyxl



# Load and Edit STUDENT IBM SPSS File

In [5]:
import boto3
import os
import pyreadstat

# Define local file path
#local_file_path_student_miss = "/content/drive/My Drive/PISA_2022_Background_questionnaire_STUDENT_filtered_recode_miss.csv"
local_file_path_student_miss = "PISA_2022_Background_questionnaire_edited.SAV"  # Change as needed

# Define S3 details
bucket_name = "sagemaker-us-west-2-986030204467"
file_key = "capstone/testfiles/PISA_2022_Background_questionnaire_edited.SAV"

# AWS credentials are usually stored in ~/.aws/credentials or IAM roles (if running on AWS services)
s3_client = boto3.client("s3")

# Check if the file exists locally
if os.path.exists(local_file_path_student_miss):
    print(f"📂 Loading data from local file... {local_file_path_student_miss}")
    student_dataset, student_meta = pyreadstat.read_sav(local_file_path_student_miss)
    print("Load completed")
else:
    print("☁️ Downloading data from S3...")
    
    # Create S3 client
    s3_client = boto3.client("s3")

    # Download the file from S3
    response = s3_client.get_object(Bucket=bucket_name, Key=file_key)

    # Read the file into pandas DataFrame
    student_dataset, student_meta = pyreadstat.read_sav(response["Body"])

    # Save a local copy for future use
    student_dataset.to_sav(local_file_path_student_miss, index=False)
    print(f"✅ File saved locally as {local_file_path_student_miss}")

📂 Loading data from local file... PISA_2022_Background_questionnaire_edited.SAV
Load completed


In [1]:
filtered_df = student_dataset[(student_dataset["CNT"] == "USA") & (student_dataset["LANGN"] == 922)]
filtered_df.head(10)

NameError: name 'student_dataset' is not defined

# Add -999 as a Categorical Column

If a categorical column is 'missing' in SPSS, we need to make is something that is NaN

In [26]:
# Identify categorical columns
categorical_cols = student_dataset.select_dtypes(include=['category']).columns

# Add -999 as a new category
for col in categorical_cols:
    student_dataset[col] = student_dataset[col].cat.add_categories([-999])

# Replace all NaN's with -999

CSV's need the 'missing values' from SAV to be 'something', we will use -999

In [27]:
# Now replace NaN with -999
student_dataset.fillna(-999, inplace=True)

# Check a few variables to see if they now have -999's

In [28]:
student_dataset['ST021Q01TA'].head(10)

0   -999.0
1   -999.0
2      6.0
3   -999.0
4   -999.0
5   -999.0
6   -999.0
7   -999.0
8   -999.0
9   -999.0
Name: ST021Q01TA, dtype: float64

LANGTEST_PAQ has a lot of 'System Missing' variables which are .'s (dots)

In [29]:
student_dataset['LANGTEST_PAQ'].head(10)

0   -999.0
1   -999.0
2   -999.0
3   -999.0
4   -999.0
5   -999.0
6   -999.0
7   -999.0
8   -999.0
9   -999.0
Name: LANGTEST_PAQ, dtype: float64

In [30]:
student_dataset['ST322Q01JA'].head(10)

0      5.0
1   -999.0
2   -999.0
3      3.0
4      3.0
5      3.0
6      5.0
7      1.0
8   -999.0
9   -999.0
Name: ST322Q01JA, dtype: float64

In [31]:
student_dataset[student_dataset['ST256Q02JA'] == 5.0].head(10)

Unnamed: 0,CNT,CNTRYID,CNTSCHID,CNTSTUID,CYC,NatCen,STRATUM,SUBNATIO,REGION,OECD,...,PV5MPRE,PV6MPRE,PV7MPRE,PV8MPRE,PV9MPRE,PV10MPRE,SENWT,VER_DAT,MATH_Average_score,MATH_Proficient
42,ALB,8.0,800232.0,800053.0,08MS,800,ALB03,80000,800.0,0.0,...,448.232,449.867,413.254,433.048,408.494,428.838,0.5258,03MAY23:10:11:25,405.0012,0.0
84,ALB,8.0,800253.0,800109.0,08MS,800,ALB08,80000,800.0,0.0,...,443.281,429.318,461.789,446.097,473.5,387.17,1.5876,03MAY23:10:11:26,430.2083,1.0
143,ALB,8.0,800266.0,800180.0,08MS,800,ALB01,80000,800.0,0.0,...,366.181,310.848,240.076,181.98,273.417,308.886,0.62707,03MAY23:10:11:25,309.102,0.0
153,ALB,8.0,800008.0,800192.0,08MS,800,ALB10,80000,800.0,0.0,...,350.679,337.529,371.003,309.67,320.21,289.271,0.71568,03MAY23:10:11:26,310.5375,0.0
162,ALB,8.0,800286.0,800205.0,08MS,800,ALB07,80000,800.0,0.0,...,303.086,386.598,288.889,285.513,321.619,408.19,0.62724,03MAY23:10:11:26,355.982,0.0
200,ALB,8.0,800191.0,800248.0,08MS,800,ALB05,80000,800.0,0.0,...,443.514,403.478,413.072,433.904,503.552,515.897,0.76754,03MAY23:10:11:25,411.242,0.0
267,ALB,8.0,800108.0,800328.0,08MS,800,ALB06,80000,800.0,0.0,...,334.877,389.624,364.871,321.214,358.059,355.716,0.6255,03MAY23:10:11:26,294.4648,0.0
270,ALB,8.0,800055.0,800332.0,08MS,800,ALB03,80000,800.0,0.0,...,507.269,534.171,543.555,455.654,514.238,500.791,0.6939,03MAY23:10:11:25,507.3029,1.0
271,ALB,8.0,800007.0,800333.0,08MS,800,ALB10,80000,800.0,0.0,...,363.699,356.505,323.537,411.367,256.575,352.282,0.80794,03MAY23:10:11:26,346.985,0.0
379,ALB,8.0,800286.0,800471.0,08MS,800,ALB07,80000,800.0,0.0,...,390.681,403.371,433.98,316.0,393.471,371.491,0.55452,03MAY23:10:11:26,360.7511,0.0


In [32]:
student_dataset[student_dataset['LANGN'] == 999].head(10)

Unnamed: 0,CNT,CNTRYID,CNTSCHID,CNTSTUID,CYC,NatCen,STRATUM,SUBNATIO,REGION,OECD,...,PV5MPRE,PV6MPRE,PV7MPRE,PV8MPRE,PV9MPRE,PV10MPRE,SENWT,VER_DAT,MATH_Average_score,MATH_Proficient
12,ALB,8.0,800040.0,800015.0,08MS,800,ALB03,80000,800.0,0.0,...,251.333,232.104,261.224,261.809,237.493,241.997,0.77623,03MAY23:10:11:26,249.6507,0.0
24,ALB,8.0,800172.0,800031.0,08MS,800,ALB04,80000,800.0,0.0,...,480.081,467.938,485.822,449.964,457.826,477.458,0.77731,03MAY23:10:11:25,456.0084,1.0
64,ALB,8.0,800157.0,800080.0,08MS,800,ALB08,80000,800.0,0.0,...,317.813,354.375,412.546,398.409,368.4,365.332,0.72901,03MAY23:10:11:26,381.7627,0.0
71,ALB,8.0,800130.0,800088.0,08MS,800,ALB03,80000,800.0,0.0,...,457.776,435.416,460.337,485.492,485.201,439.603,0.82871,03MAY23:10:11:25,475.3291,1.0
75,ALB,8.0,800281.0,800094.0,08MS,800,ALB03,80000,800.0,0.0,...,315.062,282.749,263.78,320.757,329.398,366.29,0.57916,03MAY23:10:11:25,282.5159,0.0
78,ALB,8.0,800115.0,800099.0,08MS,800,ALB03,80000,800.0,0.0,...,271.203,276.95,413.716,279.566,362.352,226.675,0.76431,03MAY23:10:11:25,294.6531,0.0
83,ALB,8.0,800281.0,800107.0,08MS,800,ALB03,80000,800.0,0.0,...,239.755,350.775,346.888,264.382,290.628,257.486,0.57916,03MAY23:10:11:25,291.0274,0.0
89,ALB,8.0,800065.0,800115.0,08MS,800,ALB03,80000,800.0,0.0,...,359.148,373.75,352.655,400.553,316.995,323.861,0.85009,03MAY23:10:11:26,309.4011,0.0
93,ALB,8.0,800241.0,800120.0,08MS,800,ALB05,80000,800.0,0.0,...,365.814,385.002,446.048,433.398,394.145,401.901,0.81769,03MAY23:10:11:25,403.636,0.0
107,ALB,8.0,800267.0,800136.0,08MS,800,ALB06,80000,800.0,0.0,...,484.609,535.808,447.311,524.476,475.762,469.584,0.71612,03MAY23:10:11:26,493.6695,1.0


In [33]:
student_dataset.loc[student_dataset['ST322Q01JA'].isin([6, 6.0]), ['ST322Q01JA']]

Unnamed: 0,ST322Q01JA
19,6.0
21,6.0
36,6.0
53,6.0
72,6.0
...,...
613718,6.0
613719,6.0
613731,6.0
613739,6.0


In [37]:
student_dataset.loc[student_dataset['ST250D06JA'].isin([9999997, 9999998, 9999999]), ['ST250D06JA']]

Unnamed: 0,ST250D06JA


# Edit the file and remove unneeded columns per Selenes XLSX sheet

In this case we use 

#### 1. Download Student Variables Inclusion/Exclusion Sheet

In [42]:
# Define the Google Sheets export URL
# Variables to include - Students.xlsx
sheet_url = "https://docs.google.com/spreadsheets/d/1rb0AVCWQAEQ-c5vYfKn1aHtmK9bjvq3u/export?format=xlsx"

# Read the Excel file directly from the URL
xls = pd.ExcelFile(sheet_url)

#### 2. Check Dataframe Shape before editing (should have 1280 columns)

In [43]:
student_dataset.shape

(613744, 1280)

#### 3. Use XLS Sheet to Include/Exclude Columns

In [44]:
# Read the Exclude sheet
exclude_df = pd.read_excel(xls, sheet_name="Exclude")

# Read the main data sheet (assuming it's the first sheet)
include_df = pd.read_excel(xls, sheet_name=xls.sheet_names[0])

# Filter columns to keep based on the Exclude sheet conditions
# BRING BACK IN THE CONSTITUENT WLE COLUMNS
columns_to_keep = exclude_df[
    (exclude_df["Include"] == "x") & (exclude_df["Reason for exclusion"] == "Can use WLE")
]["NAME"].tolist()

# BRING IN ANY COLUMNS that are NOT WLE
# Identify columns where "Include" = 'O' and add them to columns_to_keep
columns_to_keep += include_df[
    (include_df["Include"] == "o") & (include_df["Reasons for inclusion"] != "WLE")
]["NAME"].tolist()

# Filter dataframe columns
df_filtered = student_dataset[columns_to_keep]

In [45]:
def parse_nullable_values(value):
    """Convert nullable column values into lists."""
    if pd.isna(value):  # If NaN, return empty list
        return []
    if isinstance(value, str):  # Check if the value is a string (like '998,999')
        return [int(v.strip()) for v in value.split(",")]  # Convert CSV string to list of ints
    return [value]  # If it's a single number, wrap it in a list
    
def count_nullables_before(df, include_df):
    """
    Count occurrences of nullable values before replacement.

    :param df: Pandas DataFrame (student dataset)
    :param include_df: DataFrame containing the 'Include' sheet
    :return: Dictionary with counts of nullable values per column
    """
    count_dict = {}

    for _, row in include_df.iterrows():
        col = row['NAME']
        nullables = parse_nullable_values(row['nullables'])
        print(nullables)
        if pd.notna(col) and pd.notna(row['nullables']) and col in df.columns:
            count_dict[col] = df[col].isin(nullables).sum()

    return count_dict

# Count nullables before replacement
nullables_before = count_nullables_before(df_filtered, include_df)
nullables_before

[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[6]
[6]
[6]
[6]
[6]
[6]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[997, 998, 999]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[6]
[6]
[6]
[6]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[4]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]


{'ST322Q01JA': 39491,
 'ST322Q02JA': 25559,
 'ST322Q03JA': 17624,
 'ST322Q04JA': 41568,
 'ST322Q06JA': 51716,
 'ST322Q07JA': 37880,
 'LANGN': 25848,
 'IC184Q01JA': 13423,
 'IC184Q02JA': 12960,
 'IC184Q03JA': 23619,
 'IC184Q04JA': 30010,
 'WB161Q01HA': 12339}

#### 4. Replace all 'other nullables' with -999

In [46]:
def replace_values_with_neg999(df, include_df, exclude_df):
    """
    Replace specified nullable values in columns based on 'Include' and 'Exclude' sheets.

    :param df: Pandas DataFrame (student dataset)
    :param include_df: DataFrame containing the 'Include' sheet
    :param exclude_df: DataFrame containing the 'Exclude' sheet
    :return: Modified DataFrame
    """
    # Build dictionary of columns and their nullable values from 'Include' sheet
    replace_dict = {
        row['NAME']: parse_nullable_values(row['nullables'])
        for _, row in include_df.iterrows()
        if pd.notna(row['NAME']) and pd.notna(row['nullables'])  # Ensure valid column names and values
    }

    # Add nullable values from 'Exclude' sheet
    for _, row in exclude_df.iterrows():
        col = row['NAME']
        nullables = parse_nullable_values(row['nullables'])

        if pd.notna(col) and pd.notna(row['nullables']):  # Ensure valid column names and values
            if col in replace_dict:
                replace_dict[col].extend(nullables)  # Add to existing column list
            else:
                replace_dict[col] = nullables  # Create new entry

    # Apply replacements using .loc to avoid SettingWithCopyWarning
    for col, values in replace_dict.items():
        if col in df.columns:
            df.loc[:, col] = df[col].replace(values, -999)
    
    return df

# Apply replacements based on 'Include' sheet
df_filtered = replace_values_with_neg999(df_filtered, include_df, exclude_df)

In [47]:
df_filtered.loc[df_filtered['ST322Q06JA'].isin([6, 6.0]), ['ST322Q01JA']]

Unnamed: 0,ST322Q01JA


In [48]:
df_filtered[df_filtered['ST256Q02JA'] == 5.0].head(10)

Unnamed: 0,ST250Q01JA,ST250Q02JA,ST250Q03JA,ST250Q04JA,ST250Q05JA,ST251Q01JA,ST251Q02JA,ST251Q03JA,ST251Q04JA,ST251Q06JA,...,WB177Q02HA,WB177Q03HA,WB177Q04HA,WB032Q01NA,WB032Q02NA,WB031Q01NA,EXERPRAC,STUBMI,WORKPAY,WORKHOME


In [49]:
def count_neg999_after(df, include_df, nullables_before):
    """
    Count occurrences of -999 after replacement, but only for columns that had nullables before.

    :param df: Pandas DataFrame (student dataset)
    :param include_df: DataFrame containing the 'Include' sheet
    :param nullables_before: Dictionary of nullable counts before replacement
    :return: Dictionary with counts of -999 per column (only for nullables)
    """
    count_dict = {}

    for col in nullables_before.keys():  # Only check columns that had nullables before
        if col in df.columns:
            count_dict[col] = (df[col] == -999).sum()

    return count_dict

# Count -999 occurrences after replacement (only for columns that had nullables before)
neg999_after = count_neg999_after(df_filtered, include_df, nullables_before)
neg999_after

{'ST322Q01JA': 205532,
 'ST322Q02JA': 193213,
 'ST322Q03JA': 186338,
 'ST322Q04JA': 210225,
 'ST322Q06JA': 220603,
 'ST322Q07JA': 205987,
 'LANGN': 25848,
 'IC184Q01JA': 298237,
 'IC184Q02JA': 299614,
 'IC184Q03JA': 309229,
 'IC184Q04JA': 316167,
 'WB161Q01HA': 507446}

#### Should be 0 998/999 now

In [50]:
df_filtered.loc[df_filtered['LANGN'].isin([998, 999]), ['LANGN']]

Unnamed: 0,LANGN


In [52]:
#df_filtered.loc[df_filtered['ST251D08JA'].isin([9999997, 9999998, 9999999]), ['ST251D08JA']]

In [53]:
# Count occurrences of each unique value in LANGN
langn_counts = df_filtered["LANGN"].value_counts()

# Print the counts
print(langn_counts)

LANGN
 156.0    99330
 313.0    60495
 500.0    42910
-999.0    25848
 232.0    16882
          ...  
 677.0        3
 263.0        3
 428.0        3
 567.0        2
 566.0        1
Name: count, Length: 238, dtype: int64


#### 2. Check Dataframe Shape AFTER editing (should have 735 columns)

In [54]:
df_filtered.shape

(613744, 731)

# Remap Countries

In [55]:
# Define the country code to name mapping
country_mapping = {
    "ALB": "Albania", "ARE": "United_Arab_Emirates", "ARG": "Argentina", "AUS": "Australia",
    "AUT": "Austria", "BEL": "Belgium", "BGR": "Bulgaria", "BRA": "Brazil", "BRN": "Brunei_Darussalam",
    "CAN": "Canada", "CHE": "Switzerland", "CHL": "Chile", "COL": "Colombia", "CRI": "Costa_Rica",
    "CZE": "Czech_Republic", "DEU": "Germany", "DNK": "Denmark", "DOM": "Dominican_Republic",
    "ESP": "Spain", "EST": "Estonia", "FIN": "Finland", "FRA": "France", "GBR": "United_Kingdom",
    "GEO": "Georgia", "GRC": "Greece", "GTM": "Guatemala", "HRV": "Croatia", "HUN": "Hungary",
    "IDN": "Indonesia", "IRL": "Ireland", "ISL": "Iceland", "ISR": "Israel", "ITA": "Italy",
    "JAM": "Jamaica", "JOR": "Jordan", "JPN": "Japan", "KAZ": "Kazakhstan", "KHM": "Cambodia",
    "KOR": "Korea", "KSV": "Kosovo", "LTU": "Lithuania", "LVA": "Latvia", "MAR": "Morocco",
    "MDA": "Republic_of_Moldova", "MEX": "Mexico", "MKD": "North_Macedonia", "MLT": "Malta",
    "MNE": "Montenegro", "MNG": "Mongolia", "MYS": "Malaysia", "NLD": "Netherlands", "NOR": "Norway",
    "NZL": "New_Zealand", "PAN": "Panama", "PER": "Peru", "PHL": "Philippines", "POL": "Poland",
    "PRT": "Portugal", "PRY": "Paraguay", "PSE": "Palestinian_Authority", "QAT": "Qatar",
    "QCY": "Cyprus", "ROU": "Romania", "SAU": "Saudi_Arabia", "SGP": "Singapore", "SLV": "El_Salvador",
    "SRB": "Serbia", "SVK": "Slovak_Republic", "SVN": "Slovenia", "SWE": "Sweden", "TAP": "Taiwan",
    "THA": "Thailand", "TUR": "Türkiye", "URY": "Uruguay", "USA": "United_States", "UZB": "Uzbekistan",
    "VNM": "Vietnam"
}

# Apply the mapping to the 'CNT' column
df_filtered.loc[:, "CNT"] = df_filtered["CNT"].replace(country_mapping)

In [56]:
df_filtered["CNT"].unique()

array(['Albania', 'QAZ', 'Argentina', 'Australia', 'Austria', 'Belgium',
       'Brazil', 'Brunei_Darussalam', 'Bulgaria', 'Cambodia', 'Canada',
       'Chile', 'Taiwan', 'Colombia', 'Costa_Rica', 'Croatia',
       'Czech_Republic', 'Denmark', 'Dominican_Republic', 'El_Salvador',
       'Estonia', 'Finland', 'France', 'Georgia', 'Palestinian_Authority',
       'Germany', 'Greece', 'Guatemala', 'HKG', 'Hungary', 'Iceland',
       'Indonesia', 'Ireland', 'Israel', 'Italy', 'Kosovo', 'Jamaica',
       'Japan', 'Kazakhstan', 'Jordan', 'Korea', 'Latvia', 'Lithuania',
       'MAC', 'Malaysia', 'Malta', 'Mexico', 'Mongolia',
       'Republic_of_Moldova', 'Montenegro', 'Morocco', 'Netherlands',
       'New_Zealand', 'Norway', 'Panama', 'Paraguay', 'Peru',
       'Philippines', 'Poland', 'Portugal', 'Qatar', 'Romania',
       'Saudi_Arabia', 'Serbia', 'Singapore', 'Slovak_Republic',
       'Vietnam', 'Slovenia', 'Spain', 'Sweden', 'Switzerland',
       'Thailand', 'United_Arab_Emirates', 'Türki

In [57]:
df_filtered.head(5)

Unnamed: 0,ST250Q01JA,ST250Q02JA,ST250Q03JA,ST250Q04JA,ST250Q05JA,ST251Q01JA,ST251Q02JA,ST251Q03JA,ST251Q04JA,ST251Q06JA,...,WB177Q02HA,WB177Q03HA,WB177Q04HA,WB032Q01NA,WB032Q02NA,WB031Q01NA,EXERPRAC,STUBMI,WORKPAY,WORKHOME
0,-999.0,1.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,4.0,...,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,0.0,-999.0,0.0,10.0
1,2.0,2.0,2.0,1.0,2.0,1.0,2.0,1.0,1.0,1.0,...,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0
2,1.0,1.0,1.0,1.0,1.0,2.0,3.0,3.0,3.0,2.0,...,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,0.0,0.0
3,1.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,10.0,-999.0,0.0,10.0
4,1.0,1.0,1.0,1.0,1.0,3.0,1.0,2.0,3.0,1.0,...,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,2.0,-999.0,0.0,4.0


# Remove non-nationally representative countries

In [58]:
# Define the values to remove
remove_countries = ["QUR", "HKG", "MAC", "QAZ"]

# Filter out the unwanted rows
df_filtered = df_filtered[~df_filtered["CNT"].isin(remove_countries)]

In [59]:
df_filtered.shape

(591857, 731)

In [60]:
test = df_filtered[(df_filtered["CNT"] == "USA") & (df_filtered["LANGN"] == 922)]
test.head(10)

Unnamed: 0,ST250Q01JA,ST250Q02JA,ST250Q03JA,ST250Q04JA,ST250Q05JA,ST251Q01JA,ST251Q02JA,ST251Q03JA,ST251Q04JA,ST251Q06JA,...,WB177Q02HA,WB177Q03HA,WB177Q04HA,WB032Q01NA,WB032Q02NA,WB031Q01NA,EXERPRAC,STUBMI,WORKPAY,WORKHOME


# Write out our new file

In [61]:
df_filtered.to_csv('student_filtered_and_edited.csv')