In [222]:
import pandas as pd
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

In [223]:
# Load train/test with all columns

train_path = "./train_all_cols_v3.csv"

test_path = "./test_all_cols_v3.csv"

train_df = pd.read_csv(train_path, dtype={"isbn": "string"})
test_df = pd.read_csv(test_path, dtype={"isbn": "string"})

predict_df = pd.read_csv("target_books_cleaned.csv")

train_df = train_df.loc[train_df["isbn"].str.startswith(r"978")]
test_df = test_df[test_df["isbn"].str.startswith(r"978")]

#predict_df = pd.read_excel("target_books_cleaned_one_hot.xlsx")

print('Train shape:', train_df.shape)

print('Test shape:', test_df.shape)

train_df

Train shape: (34389, 35)
Test shape: (8562, 35)


Unnamed: 0,isbn,publisher,print_length,item_weight,length,width,height,rating,number_of_reviews,price,...,book_format_paperback,reading_age_adolescence or above,reading_age_baby,reading_age_preadolescence,reading_age_preschool,reading_age_toddler,Quarter_num_1,Quarter_num_2,Quarter_num_3,Quarter_num_4
0,9780008430306,harper_collins,312,15.20,7.87,6.02,0.75,4.8,11824.0,7.99,...,0,0,0,0,0,1,0,0,1,0
1,9780008430306,harper_collins,312,15.20,7.87,6.02,0.75,4.8,11824.0,7.99,...,0,0,0,0,0,1,0,0,0,1
2,9780008430306,harper_collins,312,15.20,7.87,6.02,0.75,4.8,11824.0,7.99,...,0,0,0,0,0,1,1,0,0,0
3,9780008430306,harper_collins,312,15.20,7.87,6.02,0.75,4.8,11824.0,7.99,...,0,0,0,0,0,1,0,1,0,0
4,9780008430306,harper_collins,312,15.20,7.87,6.02,0.75,4.8,11824.0,7.99,...,0,0,0,0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34384,9782764351444,phidal,10,36.96,10.00,7.60,1.62,4.8,1203.0,13.38,...,0,0,0,0,0,1,0,0,1,0
34385,9782764351444,phidal,10,36.96,10.00,7.60,1.62,4.8,1203.0,13.38,...,0,0,0,0,0,1,0,0,0,1
34386,9782764351444,phidal,10,36.96,10.00,7.60,1.62,4.8,1203.0,13.38,...,0,0,0,0,0,1,1,0,0,0
34387,9782764351444,phidal,10,36.96,10.00,7.60,1.62,4.8,1203.0,13.38,...,0,0,0,0,0,1,0,1,0,0


In [None]:
# Separate features and target
TARGET_COL = "Next_Q1"
OPP_TARGET_COL = "Next_Q1_log1p"  # Alternative target column

# train_df[TARGET_COL] = train_df[OPP_TARGET_COL].apply(lambda x: np.log1p(x))
# test_df[TARGET_COL] = test_df[OPP_TARGET_COL].apply(lambda x: np.log1p(x))

# Optionally keep target as log1p for modeling
y_train = train_df[TARGET_COL]
y_test = test_df[TARGET_COL]

# Drop id column and all *_log1p columns from features
cols_to_drop = (
    ['isbn', TARGET_COL, OPP_TARGET_COL, "publisher", 'Next_Q2', 'Next_Q3', 'Next_Q4', 'price','Book_Flag']
)

X_train = train_df.drop(columns=cols_to_drop)
X_test = test_df.drop(columns=cols_to_drop)

numeric_cols = X_train.select_dtypes(include='number').columns.tolist()
numeric_cols = [col for col in numeric_cols if any(("quarter" not in col.lower(), "reading_age" not in col.lower(), "channel" not in col.lower()))]

for col in numeric_cols:
    # Check if all values are non-negative
    if (X_train[col] >= 0).all():
        # Log transform first
        X_train[col] = np.log1p(X_train[col])
        X_test[col] = np.log1p(X_test[col])

        # Then clip on transformed data
        col_np = X_train[col].to_numpy()
        col_mean = col_np.mean()
        col_std = col_np.std()
        col_lower = col_mean - 3 * col_std
        col_upper = col_mean + 3 * col_std

        X_train[col] = np.clip(col_np, col_lower, col_upper)
        X_test[col] = np.clip(X_test[col].to_numpy(), col_lower, col_upper)

In [226]:
# Show only columns with nulls in both CSVs
print("Columns with nulls in train_all_cols.csv:")
null_train = X_train.isnull().sum()
print(null_train[null_train > 0])

print("\nColumns with nulls in test_all_cols.csv:")
null_test = X_test.isnull().sum()
print(null_test[null_test > 0])

Columns with nulls in train_all_cols.csv:
Series([], dtype: int64)

Columns with nulls in test_all_cols.csv:
Series([], dtype: int64)


In [227]:
# Train KNN on train_all_cols and predict Next_Q1_log1p for test_all_cols

k = 40  # you can tune this

knn = KNeighborsRegressor(n_neighbors=k, metric='euclidean')

knn.fit(X_train, y_train)
# Predict on test set

y_pred_test = knn.predict(X_test)

print('First 10 predictions for Next_Q1:')

rmse = mean_squared_error(y_test, y_pred_test)


print(f'RMSE on test: {rmse:.4f}')

print(y_test)
print(y_pred_test)

First 10 predictions for Next_Q1:
RMSE on test: 83.7941
0       174
1        26
2         2
3        28
4        43
       ... 
8557      3
8558      0
8559      7
8560     29
8561      0
Name: Next_Q1, Length: 8562, dtype: int64
[100.475  42.85   23.25  ...   1.525   4.8    18.75 ]


In [228]:
predict_df[X_train.columns]

Unnamed: 0,print_length,item_weight,length,width,height,rating,number_of_reviews,channel,Quarters_since_first,Previous_quarter_qty,...,book_format_paperback,reading_age_adolescence or above,reading_age_baby,reading_age_preadolescence,reading_age_preschool,reading_age_toddler,Quarter_num_1,Quarter_num_2,Quarter_num_3,Quarter_num_4
0,224,19.19,8.3,5.6,0.7,4.8,7001,0,0,0,...,0,0,0,1,0,0,0,0,0,1
1,224,36.96,8.4,5.7,0.8,4.8,3970,0,0,0,...,0,0,0,1,0,0,0,0,0,1
2,208,36.96,8.35,5.67,0.71,4.7,6726,0,0,0,...,0,0,0,0,1,0,0,0,0,1
3,208,9.9,8.31,5.71,0.75,4.3,8,0,0,0,...,1,0,0,1,0,0,0,0,0,1
4,272,7.3,7.72,5.16,0.67,4.7,1166,0,0,0,...,1,0,0,1,0,0,0,0,0,1
5,256,40.96,11.97,8.98,0.71,4.8,2489,0,0,0,...,0,0,0,1,0,0,0,0,0,1
6,192,1.8,10.39,7.44,0.39,4.8,659,0,0,0,...,1,0,0,1,0,0,0,0,0,1


In [230]:
predict_df_ordered = predict_df[X_train.columns]
y_actual = knn.predict(predict_df_ordered)

# Combine isbn and predictions into a single DataFrame
results_df = predict_df[["isbn"]].copy()
results_df["pred_next_q1"] = y_actual

# Preview in notebook
results_df.head()

# Save to CSV
output_path = "knn_target_books_predictions.csv"
results_df.to_csv(output_path, index=False)
print(f"Saved predictions to {output_path}")

Saved predictions to knn_target_books_predictions.csv
