In [None]:
import sys
from pathlib import Path

# Add repo root to sys.path.
# `__file__` is not defined inside a Jupyter kernel, so referencing it
# directly raises NameError. Fall back to the current working directory,
# which is assumed to be the folder containing this notebook (the usual
# Jupyter behaviour) -- TODO confirm if the kernel is started elsewhere.
try:
    _here = Path(__file__).resolve().parent
except NameError:
    _here = Path.cwd().resolve()

# The repo root sits two levels above the notebook's folder.
REPO_ROOT = _here.parent.parent

# Guard against piling up duplicate entries when the cell is re-run.
if str(REPO_ROOT) not in sys.path:
    sys.path.append(str(REPO_ROOT))
from src.utils.packages import *
from src.config.config import RAW_DATA_PATH, PROCESSED_DATA_PATH, MODEL_PATH, VAL_RESULTS_PATH, OUTPUT_PATH
from src.utils.utils import preprocess_data, train_and_validate_models, predict_test_data


In [None]:
# Read the raw train and test splits from the configured raw-data folder.
train_df, test_df = (
    pd.read_csv(os.path.join(RAW_DATA_PATH, csv_name))
    for csv_name in ("train.csv", "test.csv")
)

In [20]:

# Stack train and test into a single frame so the cleaning steps below are
# applied to both; the `source` column records which split a row came from.
tagged_frames = [
    split.assign(source=tag)
    for split, tag in ((train_df, "train_data"), (test_df, "test_data"))
]
data = pd.concat(tagged_frames, ignore_index=True, copy=False)

# Canonical fat-content labels and the lowercased, trimmed spellings that
# should collapse onto each of them.
fat_aliases = {
    'Low Fat': ('lf', 'low fat'),
    'Regular': ('reg', 'regular'),
}
mapping = {
    alias: label
    for label, aliases in fat_aliases.items()
    for alias in aliases
}

# Trim whitespace and lowercase first so the lookup is spelling-insensitive.
fat_normalised = data['Item_Fat_Content'].str.strip().str.lower()

# Values without a canonical label keep their normalised (lowercase) form.
data['Item_Fat_Content'] = fat_normalised.map(mapping).fillna(fat_normalised)

def _most_common(values):
    """Return the first mode of ``values``, or None when there is no mode."""
    modes = values.mode()
    return None if modes.empty else modes.iloc[0]


# Most frequent Outlet_Size within each Outlet_Type.
outlet_size_mode = data.groupby('Outlet_Type')['Outlet_Size'].agg(_most_common)

# Fill missing sizes with the typical size for that row's Outlet_Type.
size_from_type = data['Outlet_Type'].map(outlet_size_mode)
data['Outlet_Size'] = data['Outlet_Size'].fillna(size_from_type)

# Impute missing Item_Weight from group means, trying the most specific
# grouping first. The finest grouping alone leaves NaN whenever every row of
# a group is missing its weight, so progressively coarser groupings are used
# as fallbacks, ending with the overall mean.
weight_group_levels = [
    ['Item_Identifier', 'Item_Type', 'Outlet_Location_Type'],
    ['Item_Identifier'],
    ['Item_Type'],
]

# `data['Item_Weight']` is left untouched inside the loop so that every group
# mean is computed from observed weights only, never from imputed ones.
item_weight = data['Item_Weight']
for group_keys in weight_group_levels:
    item_weight = item_weight.fillna(
        data.groupby(group_keys)['Item_Weight'].transform('mean')
    )

# Last resort: the mean of all observed weights.
data['Item_Weight'] = item_weight.fillna(data['Item_Weight'].mean())
# Item types grouped under the coarse category each one belongs to.
item_types_by_category = {
    'Food': [
        'Dairy',
        'Meat',
        'Fruits and Vegetables',
        'Baking Goods',
        'Snack Foods',
        'Frozen Foods',
        'Breakfast',
        'Canned',
        'Breads',
        'Starchy Foods',
        'Seafood',
    ],
    'Drinks': ['Soft Drinks', 'Hard Drinks'],
    'Non-Consumable': ['Household', 'Health and Hygiene'],
    'Miscellaneous': ['Others'],
}

# Invert the grouping into a flat Item_Type -> category lookup.
category_map = {
    item_type: category
    for category, item_types in item_types_by_category.items()
    for item_type in item_types
}

# Item types absent from the lookup fall back to 'Miscellaneous'.
item_category = data['Item_Type'].map(category_map)
data['Item_Category'] = item_category.fillna('Miscellaneous')

# Outlet age in years, measured against a fixed reference year.
reference_year = 2013
data['Outlet_Years'] = reference_year - data['Outlet_Establishment_Year']

# Column layout: object-dtype (categorical) columns first, all others after.
cat_cols = list(data.select_dtypes(include='object').columns)
num_cols = list(data.select_dtypes(exclude='object').columns)
data = data.loc[:, cat_cols + num_cols]

# Independent snapshot of the frame as it stands at this point.
data_old = data.copy()

In [21]:
# Preview the first five rows of the snapshot.
data_old.head()

Unnamed: 0,Item_Identifier,Item_Fat_Content,Item_Type,Outlet_Identifier,Outlet_Size,Outlet_Location_Type,Outlet_Type,source,Item_Category,Item_Weight,Item_Visibility,Item_MRP,Outlet_Establishment_Year,Item_Outlet_Sales,Outlet_Years
0,FDA15,Low Fat,Dairy,OUT049,Medium,Tier 1,Supermarket Type1,train_data,Food,9.3,0.016047,249.8092,1999,3735.138,14
1,DRC01,Regular,Soft Drinks,OUT018,Medium,Tier 3,Supermarket Type2,train_data,Drinks,5.92,0.019278,48.2692,2009,443.4228,4
2,FDN15,Low Fat,Meat,OUT049,Medium,Tier 1,Supermarket Type1,train_data,Food,17.5,0.01676,141.618,1999,2097.27,14
3,FDX07,Regular,Fruits and Vegetables,OUT010,Small,Tier 3,Grocery Store,train_data,Food,19.2,0.0,182.095,1998,732.38,15
4,NCD19,Low Fat,Household,OUT013,High,Tier 3,Supermarket Type1,train_data,Non-Consumable,8.93,0.0,53.8614,1987,994.7052,26


In [22]:
def combine_columns(frame, columns, sep='_'):
    """Build one string key per row from several columns.

    Each value is converted with ``str`` and its spaces are replaced by
    underscores; the per-column pieces are then joined with ``sep``.
    Works column-wise instead of calling a Python lambda once per row.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame holding the columns to combine.
    columns : list of str
        Column names, in the order they should appear in the key.
    sep : str, default '_'
        Separator placed between the pieces.

    Returns
    -------
    pandas.Series
        The combined keys, indexed like ``frame``.
    """
    pieces = [
        # map(str) mirrors the original per-value str() call, so a missing
        # value becomes the literal text 'nan' rather than staying null.
        frame[column].map(str).str.replace(' ', '_', regex=False)
        for column in columns
    ]
    combined = pieces[0]
    for piece in pieces[1:]:
        combined = combined + sep + piece
    return combined


cols_to_combine = ['Outlet_Identifier', 'Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type']
data['Outlet_Combined'] = combine_columns(data, cols_to_combine)

cols_to_combine = ['Item_Fat_Content', 'Item_Category']
data['Item_Combined'] = combine_columns(data, cols_to_combine)

# One-hot encode the two combined keys; get_dummies replaces each listed
# column with one indicator column per distinct value.
data = pd.get_dummies(data, columns=['Outlet_Combined','Item_Combined'])

In [23]:
# Display the combined frame after one-hot encoding.
data

Unnamed: 0,Item_Identifier,Item_Fat_Content,Item_Type,Outlet_Identifier,Outlet_Size,Outlet_Location_Type,Outlet_Type,source,Item_Category,Item_Weight,...,Outlet_Combined_OUT035_Small_Tier_2_Supermarket_Type1,Outlet_Combined_OUT045_Small_Tier_2_Supermarket_Type1,Outlet_Combined_OUT046_Small_Tier_1_Supermarket_Type1,Outlet_Combined_OUT049_Medium_Tier_1_Supermarket_Type1,Item_Combined_Low_Fat_Drinks,Item_Combined_Low_Fat_Food,Item_Combined_Low_Fat_Miscellaneous,Item_Combined_Low_Fat_Non-Consumable,Item_Combined_Regular_Drinks,Item_Combined_Regular_Food
0,FDA15,Low Fat,Dairy,OUT049,Medium,Tier 1,Supermarket Type1,train_data,Food,9.30,...,False,False,False,True,False,True,False,False,False,False
1,DRC01,Regular,Soft Drinks,OUT018,Medium,Tier 3,Supermarket Type2,train_data,Drinks,5.92,...,False,False,False,False,False,False,False,False,True,False
2,FDN15,Low Fat,Meat,OUT049,Medium,Tier 1,Supermarket Type1,train_data,Food,17.50,...,False,False,False,True,False,True,False,False,False,False
3,FDX07,Regular,Fruits and Vegetables,OUT010,Small,Tier 3,Grocery Store,train_data,Food,19.20,...,False,False,False,False,False,False,False,False,False,True
4,NCD19,Low Fat,Household,OUT013,High,Tier 3,Supermarket Type1,train_data,Non-Consumable,8.93,...,False,False,False,False,False,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14199,FDB58,Regular,Snack Foods,OUT046,Small,Tier 1,Supermarket Type1,test_data,Food,10.50,...,False,False,True,False,False,False,False,False,False,True
14200,FDD47,Regular,Starchy Foods,OUT018,Medium,Tier 3,Supermarket Type2,test_data,Food,7.60,...,False,False,False,False,False,False,False,False,False,True
14201,NCO17,Low Fat,Health and Hygiene,OUT045,Small,Tier 2,Supermarket Type1,test_data,Non-Consumable,10.00,...,False,True,False,False,False,False,False,True,False,False
14202,FDJ26,Regular,Canned,OUT017,Small,Tier 2,Supermarket Type1,test_data,Food,15.30,...,False,False,False,False,False,False,False,False,False,True


In [None]:
# Split the combined frame back into its train and test parts using the
# `source` tag.
is_train_row = data['source'] == "train_data"
is_test_row = data['source'] == "test_data"

# `source` is removed from both splits; the test split additionally drops
# the Item_Outlet_Sales column.
train = data.loc[is_train_row].drop(columns=['source'])
test = data.loc[is_test_row].drop(columns=['Item_Outlet_Sales', 'source'])

# Make sure the output folder exists before writing either file.
os.makedirs(PROCESSED_DATA_PATH, exist_ok=True)

pre_processed_data_csv_path_train = os.path.join(PROCESSED_DATA_PATH, "train_preprocessed.csv")
pre_processed_data_csv_path_test = os.path.join(PROCESSED_DATA_PATH, "test_preprocessed.csv")

# Write both splits without the row index.
train.to_csv(pre_processed_data_csv_path_train, index=False)
test.to_csv(pre_processed_data_csv_path_test, index=False)