In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import metrics


##### Reading data

In [2]:
dirpath = '/Users/parulgaba/Desktop/Capstone-Ethos/ConfidentialData/csvdata/'

data_path = '/Users/parulgaba/Desktop/Capstone-Ethos/ethos-retail-model/data/'

filename = data_path + 'regression_data/' + 'aggregated_summary_store_type_12_weeks.csv'
chunksize = 10 ** 5
rows=0
summary_df = pd.DataFrame()
for chunk in pd.read_csv(filename, chunksize=chunksize):
    summary_df=pd.concat([summary_df,chunk])
    rows+=chunk.shape[0]
    
summary_df.fillna(0)
print(summary_df.shape)
print (rows)

(476711, 47)
476711


In [4]:
summary_df['stock_prevailing_mrp'] = summary_df['stock_prevailing_mrp'].div(10000)
summary_df['billing'] = summary_df['billing'].div(10000)

In [5]:
items_no_sales = summary_df.groupby(['item_no']).agg({'sales_quantity':'sum'}).reset_index()
unique_item_no_sales = items_no_sales[items_no_sales['sales_quantity'] == 0]['item_no'].unique()
summary_df = summary_df[~summary_df['item_no'].isin(unique_item_no_sales)]
print("Unique items removed with no sales at all for all 3 three years : " + str(len(unique_item_no_sales)))

Unique items removed with no sales at all for all 3 three years : 4324


In [6]:
summary_df.shape

(438196, 47)

In [7]:
summary_df['item_no']=summary_df['item_no'].astype(str)
summary_df['period']=summary_df['period'].astype(str)
summary_df['case_shape']=summary_df['case_shape'].astype(str)

In [8]:
brands = summary_df['brand'].unique()

In [10]:
print (len(summary_df['item_no'].unique()))

14749


In [11]:
item_cols = ['case_size_range', 'gender','movement', 'material', 'dial_color', 'strap_type', 'strap_color','precious_stone', 'glass', 'case_shape', 'watch_type']



In [None]:
# Create a new df with Other items

summary_item_pareto_final = pd.DataFrame()
list = []

for brand in brands:
    items_combined_df = pd.DataFrame()
    summary_by_brand_df = summary_df[summary_df['brand'] == brand]
    item_series = summary_by_brand_df.fillna(0).groupby('item_no').agg({'sales_quantity':'sum'}).sort_values('sales_quantity',ascending=False)
    items_combined_df = pd.concat([items_combined_df, item_series])
    
    mask=items_combined_df.cumsum()/items_combined_df.sum()>0.95
    mask=mask.iloc[:,0]
    
    levels=len(summary_by_brand_df['item_no'].unique())
    
    summary_by_brand_df['brand'] = np.where(summary_by_brand_df['item_no'].isin(item_series[mask].index),'Other',summary_by_brand_df['brand'])         
    summary_by_brand_df['item_no'] = np.where(summary_by_brand_df['item_no'].isin(item_series[mask].index),'Other',summary_by_brand_df['item_no'])
    
    for col in item_cols:
        summary_by_brand_df[col] = np.where(summary_by_brand_df['item_no'].isin(item_series[mask].index),'Other',summary_by_brand_df[col])
    
    new_levels=len(summary_by_brand_df['item_no'].unique())
    
    freq=summary_by_brand_df['item_no'].value_counts()/summary_by_brand_df['item_no'].value_counts().sum()*100
    freq=freq.round(2)
    
    sale_qty=summary_by_brand_df.groupby(['item_no']).agg({'sales_quantity':'sum'}).sort_values('sales_quantity',ascending=False)
    sale_qty=sale_qty/sale_qty.sum()*100
    sale_qty=sale_qty.round(2)
    
    try:
        Other_Sales_Qty=sale_qty['sales_quantity']['Other']
    except:
        Other_Sales_Qty=0
    bill=summary_by_brand_df.groupby(['item_no']).agg({'billing':'sum'}).sort_values('billing',ascending=False)
    bill=bill/bill.sum()*100
    bill=bill.round(2)
    try:
        Other_bill=bill['billing']['Other']
    except:
        Other_bill=0
        
    list.append([brand, levels, new_levels,Other_bill,Other_Sales_Qty])
    
    cols=['Brand', 'Orig SKU count', 'New SKU count', 'Other%(Billing)', 'Other%(Sales Qty)']
    item_pareto_summary = pd.DataFrame(list, columns=cols)
    item_pareto_summary=item_pareto_summary.set_index("Brand")
    
    summary_item_pareto_final = pd.concat([summary_item_pareto_final, summary_by_brand_df])
#items_combined_df.head(5)


In [15]:
print (len(summary_item_pareto_final['item_no'].unique()))

9706


In [16]:
item_pareto_summary.sort_values(by = ['Other%(Billing)']).head()

Unnamed: 0_level_0,Orig SKU count,New SKU count,Other%(Billing),Other%(Sales Qty)
Brand,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
B027,97,83,9.68,5.0
B033,69,63,6.75,5.15
B063,670,397,16.49,5.02
B083,789,389,26.09,5.02
B100,1045,542,21.48,5.02


In [17]:
summary_item_pareto_final[summary_item_pareto_final['item_no'] == 'Other'].shape

(62768, 47)

In [19]:
summary_item_pareto_final.columns

Index(['period', 'item_no', 'state', 'store_type', 'brand', 'store_location',
       'city_type', 'region', 'quantity', 'purchase_quantity',
       'transfer_quantity', 'available_quantity', 'sales_quantity',
       'purchase_cost_amount', 'purchase_mrp', 'purchase_date',
       'stock_prevailing_mrp', 'store_in', 'product_group_code',
       'transfer_cost_amount', 'sales_department', 'days_to_sell',
       'num_of_customers', 'total_price', 'line_discount', 'crm_line_discount',
       'discount', 'tax', 'cost', 'billing', 'contribution', 'trade_incentive',
       'trade_incentive_value', 'total_contribution', 'case_size',
       'case_size_range', 'gender', 'movement', 'material', 'dial_color',
       'strap_type', 'strap_color', 'precious_stone', 'glass', 'case_shape',
       'watch_type', 'area_code'],
      dtype='object')

In [23]:
summarize_method = {
         'brand' :'first', 'store_location' :'first',
       'city_type' :'first', 'region' :'first', 'quantity' :'mean', 'purchase_quantity' :'mean',
       'transfer_quantity' :'mean', 'available_quantity' :'mean', 'sales_quantity' :'mean',
       'purchase_cost_amount' :'mean', 'purchase_mrp' :'mean', 'purchase_date' :'first',
       'stock_prevailing_mrp' :'mean', 'store_in' :'first', 'product_group_code' :'first',
       'transfer_cost_amount' :'mean', 'sales_department' :'first', 'days_to_sell' :'mean',
       'num_of_customers' :'mean', 'total_price' :'mean', 'line_discount' :'mean', 'crm_line_discount' :'mean',
       'discount' :'mean', 'tax' :'mean', 'cost' :'mean', 'billing' :'mean', 'contribution' :'mean', 'trade_incentive' :'mean',
       'trade_incentive_value' :'mean', 'total_contribution' :'mean', 'case_size' :'mean',
       'case_size_range' :'first', 'gender' :'first', 'movement' :'first', 'material' :'first', 'dial_color' :'first',
       'strap_type' :'first', 'strap_color' :'first', 'precious_stone' :'first', 'glass' :'first', 'case_shape' :'first',
       'watch_type' :'first', 'area_code' :'first'
}


sales_sum_df = summary_item_pareto_final.groupby(['store_type','item_no','period','state']).agg(summarize_method).reset_index()
sales_sum_df.shape

(375664, 47)

## Adding log S/So - Location code

In [24]:
#reading market shares
market_share=pd.read_excel(data_path + "market_share_encoded.xlsx", header=0,index_col=0)

#computing market size for each state-period
market_sizes=sales_sum_df.groupby(['state','period']).agg({'sales_quantity':'sum'})
market_sizes=market_sizes.reset_index()
market_sizes=pd.merge(market_sizes,market_share, left_on='state', right_on='SubCode', how='left')#.drop('Attribute_x', axis=1)
market_sizes['Market Size']=market_sizes['sales_quantity'].div(market_sizes['Market Share'], axis=0)

#computing number of stores per state
x=sales_sum_df.groupby(['state','period'])['store_type'].unique()
l=[]
store_nos=pd.DataFrame()
for i in range(len(x)):
    l.append([x.index[i][0],x.index[i][1],len(x[i])])
cols=['state','period','Store numbers']
store_nos = pd.DataFrame(l, columns=cols)

#merging market sizes with number of stores per market
market_sizes=pd.merge(market_sizes,store_nos, how='inner')

#computing market size per store
market_sizes['per store market']=market_sizes['Market Size']/market_sizes['Store numbers']

#adding market share per store-period to the main data
market_sizes=market_sizes[['state','period','per store market']]#extracting only relevant columns from market_sizes
merge_cols=['state','period']
summary_with_market_shares=pd.merge(sales_sum_df,market_sizes, on=merge_cols,how='inner')

#computing So

# summary_with_market_shares['so'] = summary_with_market_shares['per store type market']-summary_with_market_shares['sales_quantity']

summary_with_market_shares['so']=summary_with_market_shares['per store market']-summary_with_market_shares['sales_quantity']
summary_with_market_shares = summary_with_market_shares[summary_with_market_shares['so'] != 0]

#computing log(S/So) [replacing zeros with 1e-08 so that logs dont create a problem]
summary_with_market_shares['so']=summary_with_market_shares['sales_quantity'].replace(0,10**(-5)).div(summary_with_market_shares['so'],axis=0)
summary_with_market_shares['so']=np.log(summary_with_market_shares['so'])



In [25]:
print(len(summary_with_market_shares[summary_with_market_shares['so'].isin([np.inf, -np.inf, np.nan])]))

0


In [26]:
#summary_with_market_shares = summary_with_market_shares[~(summary_with_market_shares['so'].isin([np.inf, -np.inf, np.nan]))]

In [27]:
#checking for NaNs
d=summary_with_market_shares[['sales_quantity','per store market','so']]
d[d.isna().any(axis=1)]

Unnamed: 0,sales_quantity,per store market,so


In [28]:
summary_with_market_shares['so'].head()

0   -19.807252
1   -19.807252
2   -19.807252
3   -19.807252
4    -8.294076
Name: so, dtype: float64

In [29]:
summary_with_market_shares['per store market'].head()

0    4001.10742
1    4001.10742
2    4001.10742
3    4001.10742
4    4001.10742
Name: per store market, dtype: float64

In [31]:
#extracting specific columns

col=[ 'item_no','period', 'state', 'region',
       'brand', 'stock_prevailing_mrp', 'store_type', 'store_location', 'city_type',
       'available_quantity',  'case_size_range',
       'gender', 'movement', 'material', 'dial_color', 'strap_type',
       'strap_color', 'precious_stone', 'glass', 'case_shape', 'watch_type','billing','sales_quantity','so']

summary_final=summary_with_market_shares.loc[:,col]
#df_north_final.fillna(0, inplace=True)

summary_final['item_no']=summary_final['item_no'].astype(str)
summary_final['period']=summary_final['period'].astype(str)
summary_final['case_shape']=summary_final['case_shape'].astype(str)

#### Creating dummy variables

In [32]:
#creating dummy variables
cols=['brand','state','region', 'store_type', 'store_location', 'city_type',
       'case_size_range', 'gender', 'movement', 'material', 'dial_color',
       'strap_type', 'strap_color', 'precious_stone', 'glass', 'case_shape',
       'watch_type', 'period']
summary_final_dummies=pd.get_dummies(data=summary_final, columns=cols)

print('Done')

Done


In [33]:
summary_final_dummies.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 374005 entries, 0 to 375663
Columns: 378 entries, item_no to period_9
dtypes: float64(5), object(1), uint8(372)
memory usage: 152.7+ MB


In [34]:
#for So as the target variable
#creating seperate df for independent and dependent features
y=summary_final_dummies.loc[:, summary_final_dummies.columns == 'sales_quantity']
X=summary_final_dummies.drop(columns =['sales_quantity','item_no','billing', 'so'])



In [35]:
summary_final_dummies.shape


(374005, 378)

In [36]:
#checking for duplicate column names
duplicate_columns = summary_final_dummies.columns[summary_final_dummies.columns.duplicated()]
duplicate_columns

Index([], dtype='object')

In [37]:
#performing train and test split on data
X_train,X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


"""
split=0.2
test=int(len(X)*split)
train=len(X)-test
X_train=X.head(train)
y_train=y.head(train)
X_test=X.tail(test)
y_test=y.tail(test)
"""

'\nsplit=0.2\ntest=int(len(X)*split)\ntrain=len(X)-test\nX_train=X.head(train)\ny_train=y.head(train)\nX_test=X.tail(test)\ny_test=y.tail(test)\n'

In [45]:
y2 = summary_final_dummies.loc[:, summary_final_dummies.columns == 'so']
X_train2,X_test2, y_train2, y_test2 = train_test_split(X, y2, test_size=0.2, random_state=42)

### Regression Modelling

In [38]:
from sklearn.linear_model import LinearRegression
reg_model = LinearRegression()  
reg_model.fit(X_train, y_train)

LinearRegression()

In [39]:
preds = reg_model.predict(X_test)

In [40]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
rmse = np.sqrt(mean_squared_error(y_test, preds))


In [41]:
r2=r2_score(y_test, preds)
print("Linear regression RMSE: %.2f, Test R2: %.2f" % (rmse,r2))

Linear regression RMSE: 1.26, Test R2: 0.77


## Linear regression on so

In [46]:
from sklearn.linear_model import LinearRegression
reg_model_so = LinearRegression()  
reg_model_so.fit(X_train2, y_train2)
preds_so = reg_model_so.predict(X_test2)
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
rmse = np.sqrt(mean_squared_error(y_test2, preds_so))
r2=r2_score(y_test2, preds_so)
print("Linear regression RMSE: %.2f, Test R2: %.2f" % (rmse,r2))

Linear regression RMSE: 5.24, Test R2: 0.02


## XGBoost

In [47]:
#!pip install xgboost
import xgboost as xgb
from sklearn.metrics import mean_squared_error

  data = yaml.load(f.read()) or {}
  defaults = yaml.load(f)


In [48]:

#
xg_reg = xgb.XGBRegressor(objective ='reg:squarederror', colsample_bylevel=0.5, min_child_weight=7, colsample_bytree = 0.5, reg_alpha=0.7, reg_lambda=0.7, subsample=0.3, learning_rate = 0.15,max_depth = 10, alpha = 10, n_estimators = 5)
xg_reg.fit(X_train,y_train)
preds = xg_reg.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, preds))


In [49]:
from sklearn.metrics import r2_score
r2 = r2_score(y_test,preds)
print("Xgboost RMSE: %.2f, Test R2: %.2f" % (rmse,r2))

Xgboost RMSE: 2.56, Test R2: 0.05


In [50]:
params = {'colsample_bytree': [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9],
          'subsample': [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9],
          'colsample_bylevel':[0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9],
          'min_child_weight':[1,3,5,7] ,
          'reg_lambda':[0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9],
          'n_estimator':[3,5,7,10],
          'learning_rate': [0.05,0.1,0.15,0.2,0.25,0.3],
          'max_depth': [3,4,5,6,7,8,10,12,15], 
          'reg_alpha': [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]
         }

from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
xg_reg = xgb.XGBRegressor(objective ='reg:squarederror')
#scoring='r2'
scoring='neg_mean_squared_error'
random_search=RandomizedSearchCV(xg_reg,param_distributions=params,scoring=scoring,n_iter=20,cv=5,verbose=3)

random_search.fit(X_train,y_train)


Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV] subsample=0.8, reg_lambda=0.2, reg_alpha=0.4, n_estimator=5, min_child_weight=5, max_depth=4, learning_rate=0.25, colsample_bytree=0.9, colsample_bylevel=0.4 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Parameters: { n_estimator } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[CV]  subsample=0.8, reg_lambda=0.2, reg_alpha=0.4, n_estimator=5, min_child_weight=5, max_depth=4, learning_rate=0.25, colsample_bytree=0.9, colsample_bylevel=0.4, score=-0.676, total= 1.4min
[CV] subsample=0.8, reg_lambda=0.2, reg_alpha=0.4, n_estimator=5, min_child_weight=5, max_depth=4, learning_rate=0.25, colsample_bytree=0.9, colsample_bylevel=0.4 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.4min remaining:    0.0s


Parameters: { n_estimator } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[CV]  subsample=0.8, reg_lambda=0.2, reg_alpha=0.4, n_estimator=5, min_child_weight=5, max_depth=4, learning_rate=0.25, colsample_bytree=0.9, colsample_bylevel=0.4, score=-2.527, total= 1.4min
[CV] subsample=0.8, reg_lambda=0.2, reg_alpha=0.4, n_estimator=5, min_child_weight=5, max_depth=4, learning_rate=0.25, colsample_bytree=0.9, colsample_bylevel=0.4 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  2.8min remaining:    0.0s


Parameters: { n_estimator } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[CV]  subsample=0.8, reg_lambda=0.2, reg_alpha=0.4, n_estimator=5, min_child_weight=5, max_depth=4, learning_rate=0.25, colsample_bytree=0.9, colsample_bylevel=0.4, score=-0.664, total= 1.4min
[CV] subsample=0.8, reg_lambda=0.2, reg_alpha=0.4, n_estimator=5, min_child_weight=5, max_depth=4, learning_rate=0.25, colsample_bytree=0.9, colsample_bylevel=0.4 
Parameters: { n_estimator } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[CV]  subsample=0.8, reg_lambda=0.2, reg_alpha=0.4, n_estimator=5, mi

KeyboardInterrupt: 

In [None]:
random_search.best_params_

In [None]:
#best model based on the output of random_search.best_estimator_
best_gb=xgb.XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=0.9,
             colsample_bynode=1, colsample_bytree=0.8, gamma=0,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.1, max_delta_step=0, max_depth=12,
             min_child_weight =7, missing=np.nan,
              n_estimator=3, n_estimators=100,
             n_jobs=0, num_parallel_tree=1, objective='reg:squarederror',
             random_state=0, reg_alpha=0.5, reg_lambda=0.3, scale_pos_weight=1,
             subsample=0.2, tree_method='exact', validate_parameters=1,
             verbosity=0)

In [None]:
from sklearn.model_selection import cross_val_score as cvs
score=cvs(best_gb,X_train,y_train,cv=2,scoring='r2')


In [None]:
score

In [None]:
score.mean()

In [None]:
best_gb.fit(X_train,y_train)
preds = best_gb.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, preds))
print("RMSE: %f" % (rmse))

In [None]:
from sklearn.metrics import r2_score
rmse = np.sqrt(mean_squared_error(y_test, preds))
r2=r2_score(y_test,preds)
print("RMSE: %.2f, R2: %.2f" % (rmse,r2))