# Feature Engineering

In [8]:
import pandas as pd
import numpy as np

In [85]:
trnsaction_data = "trnsact_clean.csv"

In [2]:
trans = pd.read_csv(trnsaction_data)

In [86]:
trans.head(5)

Unnamed: 0,SKU,STORE,REGISTER,TRANNUM,INTERID,SALEDATE,STYPE,QUANTITY,ORGPRICE,AMT,SEQ,MIC,season
0,3,202,290,1100,326708721,2005-01-18,P,1,440.0,30.0,3500000,818,Winter
1,3,202,540,2700,326708721,2005-01-29,R,1,440.0,30.0,15200000,818,Winter
2,3,303,500,2100,23702074,2004-08-18,P,1,440.0,12.0,4600000,48,Summer
3,3,709,360,500,0,2005-08-14,P,1,440.0,30.0,6500000,818,Summer
4,3,802,660,400,0,2005-08-09,P,1,440.0,30.0,4700000,599,Summer


## Feature: dicount or not

In [9]:
def impute_price(group):
    # If there's a non-zero price, use it to replace 0s
    non_zero_prices = group['ORGPRICE'].replace(0, np.nan).dropna()
    if not non_zero_prices.empty:
        return group['ORGPRICE'].replace(0, non_zero_prices.iloc[0])
    return group['ORGPRICE']

trans['ORGPRICE'] = trans.groupby('SKU').apply(impute_price).reset_index(level=0, drop=True)


In [51]:
trans

Unnamed: 0,SKU,STORE,REGISTER,TRANNUM,INTERID,SALEDATE,STYPE,QUANTITY,ORGPRICE,AMT,SEQ,MIC
0,3,202,290,1100,326708721,2005-01-18,P,1,440.0,30.00,3500000,818
1,3,202,540,2700,326708721,2005-01-29,R,1,440.0,30.00,15200000,818
2,3,303,500,2100,23702074,2004-08-18,P,1,440.0,12.00,4600000,48
3,3,709,360,500,0,2005-08-14,P,1,440.0,30.00,6500000,818
4,3,802,660,400,0,2005-08-09,P,1,440.0,30.00,4700000,599
...,...,...,...,...,...,...,...,...,...,...,...,...
120916891,9999997,7507,750,2800,0,2005-04-23,P,1,39.0,39.00,593900116,395
120916892,9999997,7907,790,3200,0,2005-05-03,P,1,39.0,39.00,502600061,395
120916893,9999997,7907,800,700,0,2005-05-04,R,1,39.0,39.00,600900066,395
120916894,9999997,7907,800,700,0,2005-05-04,P,1,39.0,39.00,601000066,395


In [24]:
percent_discount = trans.groupby(['SKU']).apply(lambda x: (100 * (x.ORGPRICE - x.AMT) / x.ORGPRICE).mean()).reset_index()

In [28]:
percent_discount.replace([np.inf, -np.inf], np.nan, inplace=True)
percent_discount.dropna(inplace = True)
percent_discount

Unnamed: 0,SKU,0
0,3,93.835341
2,5,72.754011
5,23,-89.814815
6,24,-0.470433
8,55,72.933747
...,...,...
714494,9999967,0.000000
714495,9999970,76.253333
714496,9999974,42.863092
714497,9999992,0.000000


In [42]:
df_dicsount = pd.DataFrame(percent_discount).rename(columns={0: "rate"})

In [44]:
df_dicsount['discounted'] = df_dicsount['rate'].apply(lambda x: 1 if x > 0 else (-1 if x < 0 else 0))


In [45]:
df_dicsount

Unnamed: 0,SKU,rate,discounted
0,3,93.835341,1
2,5,72.754011,1
5,23,-89.814815,-1
6,24,-0.470433,-1
8,55,72.933747,1
...,...,...,...
714494,9999967,0.000000,0
714495,9999970,76.253333,1
714496,9999974,42.863092,1
714497,9999992,0.000000,0


## Feature: Most saled season for sku

In [61]:
def get_season(date):
    month = date.month
    if 3 <= month <= 5:
        return 'Spring'
    elif 6 <= month <= 8:
        return 'Summer'
    elif 9 <= month <= 11:
        return 'Fall'
    else:
        return 'Winter'

# Apply the function to create a new 'season' column
trans['SALEDATE'] = pd.to_datetime(trans['SALEDATE'])
trans['season'] = trans['SALEDATE'].apply(get_season)

In [71]:
sales_by_sku_season = trans.groupby(['SKU', 'season']).size().reset_index(name='sales_count')
sales_by_season = trans.groupby(['SKU']).size()


In [87]:
# Find the most sold season for each SKU
most_sold_season_per_sku = sales_by_sku_season.sort_values('sales_count', ascending=False).drop_duplicates('SKU')

# Rename the column for clarity
most_sold_season_per_sku.rename(columns={'season': 'most_sold_season'}, inplace=True)

# Display the DataFrame with each SKU and its most sold season
most_sold_season_per_sku.reset_index(drop=True)
most_sold_season_per_sku = most_sold_season_per_sku.sort_values('SKU')[['SKU', 'most_sold_season']]
most_sold_season_per_sku


Unnamed: 0,SKU,most_sold_season
2,3,Summer
7,4,Winter
10,5,Winter
14,8,Winter
15,20,Summer
...,...,...
1807576,9999967,Summer
1807577,9999970,Fall
1807579,9999974,Summer
1807580,9999992,Summer


In [89]:
merged_df = pd.merge(df_dicsount, most_sold_season_per_sku, on='SKU', how='inner')
merged_df

Unnamed: 0,SKU,rate,discounted,most_sold_season
0,3,93.835341,1,Summer
1,5,72.754011,1,Winter
2,23,-89.814815,-1,Summer
3,24,-0.470433,-1,Summer
4,55,72.933747,1,Fall
...,...,...,...,...
702865,9999967,0.000000,0,Summer
702866,9999970,76.253333,1,Fall
702867,9999974,42.863092,1,Summer
702868,9999992,0.000000,0,Summer


In [90]:
pickle_filename = "discounted_season_feature.pkl"
most_sold_season_per_sku.to_pickle(pickle_filename)