In [1]:
import pandas as pd
import numpy as np
import os

# Load the dataset
file_path = '01_Immigrant Coding Training.xlsx'
xls = pd.ExcelFile(file_path)

# Read the first sheet (assuming it is the main one for processing)
sheet1_df = pd.read_excel(xls, sheet_name='Sheet1')

# Filter out the data where Q1 == 1
filtered_df = sheet1_df[sheet1_df['Q1'] == 1]

# Check the shape
print(f"Filtered dataset shape: {filtered_df.shape}")

Filtered dataset shape: (1557, 19)


In [2]:
# Combine the Q3 columns into a single list representation
filtered_df['Q3_clean'] = filtered_df.loc[:, 'Q3_1':'Q3_8'].values.tolist()

# Shuffle the dataset with a fixed random seed for reproducibility
filtered_df = filtered_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Split exactly 1000 for training and 557 for testing
train_df = filtered_df.iloc[:1000]
test_df = filtered_df.iloc[1000:]

# Save the test set
test_path = 'Data_test/test_set.xlsx'
os.makedirs('Data_test/', exist_ok=True)
test_df.to_excel(test_path, index=False)

# Split the training set into 5 parts of 200 samples each
train_splits = [train_df.iloc[i:i + 200] for i in range(0, 1000, 200)]

# Create a directory to store the splits
output_dir = 'Data_train/'
os.makedirs(output_dir, exist_ok=True)

# Save each split into separate files
file_paths = []
for i, split in enumerate(train_splits, 1):
    file_name = f'train_{i}.xlsx'
    split_path = os.path.join(output_dir, file_name)
    split.to_excel(split_path, index=False)
    file_paths.append(split_path)

print("Processing Complete!")
print(f"Test set saved at: {test_path}")
print(f"Train splits saved at: {output_dir}")
print("File paths:")
for path in file_paths:
    print(path)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['Q3_clean'] = filtered_df.loc[:, 'Q3_1':'Q3_8'].values.tolist()


Processing Complete!
Test set saved at: Data_test/test_set.xlsx
Train splits saved at: Data_train/
File paths:
Data_train/train_1.xlsx
Data_train/train_2.xlsx
Data_train/train_3.xlsx
Data_train/train_4.xlsx
Data_train/train_5.xlsx
