## Import the Libraries

In [4]:
import pandas as pd
import numpy as np

## Load the dataset

In [5]:
# Location of the raw CSV, relative to the notebook's working directory.
path = './Train.csv'

# Read the full file into memory, then preview the first rows
# (head() defaults to five rows).
df = pd.read_csv(filepath_or_buffer=path)
df.head()

Unnamed: 0,sku_name,starting_inventory,sellin,sellin_channel_1,sellin_channel_2,sellin_channel_3,sellin_channel_4,sellin_channel_5,sellin_channel_6,sellin_channel_7,...,month,year,product_lifecycle_stage,FLAG100,disc_month,cum_disc,CAT_GENDER_BOTH,CAT_GENDER_MEN,CAT_GENDER_WOMEN,Weeks
0,YOSHWARDTERR,0,1013,0,0,0,1013,0,0,0,...,7,2016,U,0.0,0,0,0,1,0,1
1,YOSHWARDTERR,0,2026,0,0,0,2026,0,0,0,...,2,2017,U,0.0,0,0,0,1,0,0
2,YOSHWARDTERR,0,1013,0,0,0,1013,0,0,0,...,5,2017,U,0.0,0,0,0,1,0,0
3,YOSHUANEMARX,0,320108,4052,40520,240081,4052,6078,18234,0,...,4,2018,W,0.270966,1,1,0,0,1,1
4,YOSHUANEMARX,0,132703,2026,0,81040,3039,25325,18234,0,...,5,2018,W,0.063004,0,1,0,0,1,0


## Split the dataset into train and test sets
Our problem revolves around demand forecasting and inventory optimization, where we aim to predict the stock requirements for the upcoming quarter using historical data. To achieve this, the dataset has been divided based on month and year information. 
- The **train dataset** includes data from January 2016 to July 2021.  
- The **test dataset**, which represents completely unseen data (akin to production data), spans from August 2021 to October 2021. This test set will be used for evaluating predictions, model performance, and analyzing business impact.

In [6]:
# Boundaries of the chronological split (all bounds are inclusive).
train_year_start, train_year_end = 2016, 2021
train_month_end = 7
test_year = 2021
test_month_start, test_month_end = 8, 10

# Training rows: the union of three calendar segments.
train_set = df.loc[
    # 1) every month of the years strictly between the start and end year
    (df['year'].gt(train_year_start) & df['year'].lt(train_year_end))
    # 2) the start year, from month 1 onwards
    | (df['year'].eq(train_year_start) & df['month'].ge(1))
    # 3) the end year, up to and including the last training month
    | (df['year'].eq(train_year_end) & df['month'].le(train_month_end))
]

# Testing rows: a single month window inside the test year.
test_set = df.loc[
    df['year'].eq(test_year)
    & df['month'].between(test_month_start, test_month_end)
]

In [7]:
# (rows, columns) of the training split.
train_set.shape

(43018, 45)

In [8]:
# (rows, columns) of the testing split.
test_set.shape

(1889, 45)

In [9]:
# Preview the first five training rows (head() defaults to n=5).
train_set.head()

Unnamed: 0,sku_name,starting_inventory,sellin,sellin_channel_1,sellin_channel_2,sellin_channel_3,sellin_channel_4,sellin_channel_5,sellin_channel_6,sellin_channel_7,...,month,year,product_lifecycle_stage,FLAG100,disc_month,cum_disc,CAT_GENDER_BOTH,CAT_GENDER_MEN,CAT_GENDER_WOMEN,Weeks
0,YOSHWARDTERR,0,1013,0,0,0,1013,0,0,0,...,7,2016,U,0.0,0,0,0,1,0,1
1,YOSHWARDTERR,0,2026,0,0,0,2026,0,0,0,...,2,2017,U,0.0,0,0,0,1,0,0
2,YOSHWARDTERR,0,1013,0,0,0,1013,0,0,0,...,5,2017,U,0.0,0,0,0,1,0,0
3,YOSHUANEMARX,0,320108,4052,40520,240081,4052,6078,18234,0,...,4,2018,W,0.270966,1,1,0,0,1,1
4,YOSHUANEMARX,0,132703,2026,0,81040,3039,25325,18234,0,...,5,2018,W,0.063004,0,1,0,0,1,0


In [10]:
# Preview the first five testing rows (head() defaults to n=5).
test_set.head()

Unnamed: 0,sku_name,starting_inventory,sellin,sellin_channel_1,sellin_channel_2,sellin_channel_3,sellin_channel_4,sellin_channel_5,sellin_channel_6,sellin_channel_7,...,month,year,product_lifecycle_stage,FLAG100,disc_month,cum_disc,CAT_GENDER_BOTH,CAT_GENDER_MEN,CAT_GENDER_WOMEN,Weeks
47,YOSHTLYNYOSHZZ,329225,54702,43559,0,0,0,0,5065,1013,...,8,2021,N,0.0,0,0,0,1,0,1
48,YOSHTLYNYOSHZZ,114469,221847,82053,91170,16208,0,20260,6078,2026,...,9,2021,N,0.233333,1,1,0,1,0,0
49,YOSHTLYNYOSHZZ,156002,163093,122573,30390,0,0,0,2026,7091,...,10,2021,N,0.0,0,1,0,1,0,1
136,YOSHRENECARL,1490123,204626,128651,47611,16208,0,0,3039,8104,...,8,2021,N,0.0,0,0,0,0,1,1
137,YOSHRENECARL,1349316,961337,437616,486240,6078,0,0,16208,14182,...,9,2021,N,0.3,1,1,0,0,1,0


In [17]:
# Persist both splits as CSV files in the current working directory.
# index=False keeps the DataFrame index out of the written files.

# Training split
train_set.to_csv(path_or_buf="NEW-TRAIN-SPLIT.csv", index=False)

# Testing split
test_set.to_csv(path_or_buf="NEW-TEST-SET.csv", index=False)