In [82]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import metrics


##### Reading data

In [161]:
dirpath = '/Users/parulgaba/Desktop/Capstone-Ethos/ConfidentialData/csvdata/'

data_path = '/Users/parulgaba/Desktop/Capstone-Ethos/ethos-retail-model/data/'

filename = data_path + 'regression_data/' + 'aggregated_summary_store_type_12_weeks.csv'
chunksize = 10 ** 5
rows=0
summary_df = pd.DataFrame()
for chunk in pd.read_csv(filename, chunksize=chunksize):
    summary_df=pd.concat([summary_df,chunk])
    rows+=chunk.shape[0]
    
summary_df.fillna(0)
print(summary_df.shape)
print (rows)

(476711, 47)
476711


In [118]:
summary_df.shape

(476711, 47)

In [None]:
summary_df['stock_prevailing_mrp'] = summary_df['stock_prevailing_mrp'].div(10000)
summary_df['billing'] = summary_df['billing'].div(10000)

In [120]:
items_no_sales = summary_df.groupby(['item_no']).agg({'sales_quantity':'sum'}).reset_index()
unique_item_no_sales = items_no_sales[items_no_sales['sales_quantity'] == 0]['item_no'].unique()
summary_df = summary_df[~summary_df['item_no'].isin(unique_item_no_sales)]
print("Unique items removed with no sales at all for all 3 three years : " + str(len(unique_item_no_sales)))

Unique items removed with no sales at all for all 3 three years : 4324


In [121]:
summary_df.shape

(438196, 47)

In [122]:
summary_df['item_no']=summary_df['item_no'].astype(str)
#summary_df['period']=summary_df['period'].astype(str)
summary_df['case_shape']=summary_df['case_shape'].astype(str)

In [123]:
def paretoItems(df,cols = ['brand']):
    lst=[]
    
    for col in cols:     
        series=df.fillna(0).groupby([col]).agg({'billing':'sum'}).sort_values('billing',ascending=False)
        mask=series.cumsum()/series.sum()>0.9
        #nos=mask.value_counts()[1]
        mask=mask.iloc[:,0]
        levels=len(df[col].unique())
        
        if levels>10:
            df['brand'] = np.where(df[col].isin(series[mask].index),'Other',df['brand'])         
            df[col] = np.where(df[col].isin(series[mask].index),'Other',df[col])         
        new_levels=len(df[col].unique())

        freq=df[col].value_counts()/df[col].value_counts().sum()*100
        freq=freq.round(2)

        sale_qty=df.groupby([col]).agg({'sales_quantity':'sum'}).sort_values('sales_quantity',ascending=False)
        sale_qty=sale_qty/sale_qty.sum()*100
        sale_qty=sale_qty.round(2)
        try:
            Other_Sales_Qty=sale_qty['sales_quantity']['Other']
        except:
            Other_Sales_Qty=0
        
        bill=df.groupby([col]).agg({'billing':'sum'}).sort_values('billing',ascending=False)
        bill=bill/bill.sum()*100
        bill=bill.round(2)
        try:
            Other_bill=bill['billing']['Other']
        except:
            Other_bill=0
        
        #comparison=mrp.merge(sale_qty, left_index=True, right_index=True)
        lst.append([col.upper(),levels, new_levels,Other_bill,Other_Sales_Qty])
        #print ("%s-Originally %d levels,combined %d levels into 'Other'.New Levels %d.By MRP,Other is %2.1f and by sale qty others is %2.1f"%(col.upper(),levels, levels-new_levels, new_levels,mrp['stock_prevailing_mrp']['Other'],sale_qty['sales_quantity']['Other']))
    
    cols=['Feature', 'Orig Levels', 'New Levels', 'Other%(Billing)', 'Other%(Sales Qty)']
    df1 = pd.DataFrame(lst, columns=cols)
    df1=df1.set_index("Feature")
    
    return df1,df

In [124]:
summary_items = summary_df

summary_df_items_pareto, summary_items = paretoItems(summary_items, ['item_no'])

summary_df_items_pareto

Unnamed: 0_level_0,Orig Levels,New Levels,Other%(Billing),Other%(Sales Qty)
Feature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ITEM_NO,14749,10062,1.0,8.89


In [125]:
summarize_method={'brand':'first','region':'first',
                  'city_type':'first','purchase_quantity':'sum','transfer_quantity':'sum','sales_quantity':'sum',
                  'available_quantity': 'mean', 'stock_prevailing_mrp':'mean',
                  'days_to_sell':'mean','num_of_customers':'sum','total_price':'mean','case_size_range':'first',
                  'gender':'first','movement':'first','material':'first','dial_color':'first', 'strap_type':'first',
                  'strap_color':'first', 'precious_stone':'first', 'glass':'first', 'case_shape':'first',
                  'watch_type':'first'}


sales_sum_df = summary_items.groupby(['store_type','item_no','period','state']).agg(summarize_method).reset_index()
sales_sum_df.shape

(399722, 26)

In [126]:
summary_df = sales_sum_df
summary_df.shape

(399722, 26)

## Adding log S/So - Location code

In [127]:
#reading market shares
market_share=pd.read_excel(data_path + "market_share_encoded.xlsx", header=0,index_col=0)

#computing market size for each state-period
market_sizes=summary_df.groupby(['state','period']).agg({'sales_quantity':'sum'})
market_sizes=market_sizes.reset_index()
market_sizes=pd.merge(market_sizes,market_share, left_on='state', right_on='SubCode', how='left')#.drop('Attribute_x', axis=1)
market_sizes['Market Size']=market_sizes['sales_quantity'].div(market_sizes['Market Share'], axis=0)

#computing number of stores per state
x=summary_df.groupby(['state','period'])['store_type'].unique()
l=[]
store_nos=pd.DataFrame()
for i in range(len(x)):
    l.append([x.index[i][0],x.index[i][1],len(x[i])])
cols=['state','period','Store numbers']
store_nos = pd.DataFrame(l, columns=cols)

#merging market sizes with number of stores per market
market_sizes=pd.merge(market_sizes,store_nos, how='inner')

#computing market size per store
market_sizes['per store market']=market_sizes['Market Size']/market_sizes['Store numbers']

#adding market share per store-period to the main data
market_sizes=market_sizes[['state','period','per store market']]#extracting only relevant columns from market_sizes
merge_cols=['state','period']
summary_with_market_shares=pd.merge(summary_df,market_sizes, on=merge_cols,how='inner')

#computing So

# summary_with_market_shares['so'] = summary_with_market_shares['per store type market']-summary_with_market_shares['sales_quantity']

summary_with_market_shares['so']=summary_with_market_shares['per store market']-summary_with_market_shares['sales_quantity']
summary_with_market_shares = summary_with_market_shares[summary_with_market_shares['so'] != 0]

#computing log(S/So) [replacing zeros with 1e-08 so that logs dont create a problem]
summary_with_market_shares['so']=summary_with_market_shares['sales_quantity'].replace(0,10**(-5)).div(summary_with_market_shares['so'],axis=0)
summary_with_market_shares['so']=np.log(summary_with_market_shares['so'])



In [128]:
print(len(summary_with_market_shares[summary_with_market_shares['so'].isin([np.inf, -np.inf, np.nan])]))

0


In [129]:
#summary_with_market_shares = summary_with_market_shares[~(summary_with_market_shares['so'].isin([np.inf, -np.inf, np.nan]))]

In [130]:
#checking for NaNs
d=summary_with_market_shares[['sales_quantity','per store market','so']]
d[d.isna().any(axis=1)]

Unnamed: 0,sales_quantity,per store market,so


In [131]:
summary_with_market_shares['so'].head()

0   -19.408827
1   -19.408827
2   -19.408827
3    -7.895529
4   -19.408827
Name: so, dtype: float64

In [132]:
summary_with_market_shares['per store market'].head()

0    2686.25
1    2686.25
2    2686.25
3    2686.25
4    2686.25
Name: per store market, dtype: float64

In [133]:
#extracting specific columns

col=[ 'item_no','period', 'state', 'region',
       'brand', 'stock_prevailing_mrp', 'store_type', 'store_location', 'city_type',
       'available_quantity',  'case_size_range',
       'gender', 'movement', 'material', 'dial_color', 'strap_type',
       'strap_color', 'precious_stone', 'glass', 'case_shape', 'watch_type','billing','sales_quantity','so']

summary_final=summary_with_market_shares.loc[:,col]
#df_north_final.fillna(0, inplace=True)

summary_final['item_no']=summary_final['item_no'].astype(str)
#summary_final['period']=summary_final['period'].astype(str)
summary_final['case_shape']=summary_final['case_shape'].astype(str)


Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return self._getitem_tuple(key)


In [134]:
summary_final.shape

(397853, 24)

#### Defining a function for doing pareto analysis on features

The function combines all levels of a categorical features that cummulatively account for ~10% or less by Sales billings into a new level called "others". Features with less than 10 levels are not considered for pareto analysis.

In [159]:
def pareto(df,cols):
    lst=[]
    
    for col in cols:
                
        series=df.fillna(0).groupby([col]).agg({'billing':'sum'}).sort_values('billing',ascending=False)
        mask=series.cumsum()/series.sum()>0.9 
        #nos=mask.value_counts()[1]
        mask=mask.iloc[:,0]
        levels=len(df[col].unique())
        
        if levels>10:
            df[col] = np.where(df[col].isin(series[mask].index),'Other',df[col])         
        new_levels=len(df[col].unique())

        freq=df[col].value_counts()/df[col].value_counts().sum()*100
        freq=freq.round(2)

        sale_qty=df.groupby([col]).agg({'sales_quantity':'sum'}).sort_values('sales_quantity',ascending=False)
        sale_qty=sale_qty/sale_qty.sum()*100
        sale_qty=sale_qty.round(2)
        try:
            Other_Sales_Qty=sale_qty['sales_quantity']['Other']
        except:
            Other_Sales_Qty=0
        
        bill=df.groupby([col]).agg({'billing':'sum'}).sort_values('billing',ascending=False)
        bill=bill/bill.sum()*100
        bill=bill.round(2)
        try:
            Other_bill=bill['billing']['Other']
        except:
            Other_bill=0
        
        #comparison=mrp.merge(sale_qty, left_index=True, right_index=True)
        lst.append([col.upper(),levels, new_levels,Other_bill,Other_Sales_Qty])
        #print ("%s-Originally %d levels,combined %d levels into 'Other'.New Levels %d.By MRP,Other is %2.1f and by sale qty others is %2.1f"%(col.upper(),levels, levels-new_levels, new_levels,mrp['stock_prevailing_mrp']['Other'],sale_qty['sales_quantity']['Other']))
    
    cols=['Feature', 'Orig Levels', 'New Levels', 'Other%(Billing)', 'Other%(Sales Qty)']
    df1 = pd.DataFrame(lst, columns=cols)
    df1=df1.set_index("Feature")
    
    return df1,df

In [160]:
import numpy as np
cols=['case_size_range', 'gender','movement', 'material', 'dial_color', 'strap_type', 'strap_color','precious_stone', 'glass', 'case_shape', 'watch_type']
summary,summary_final_pareto=pareto(summary_final, cols)
summary


Unnamed: 0_level_0,Orig Levels,New Levels,Other%(Billing),Other%(Sales Qty)
Feature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CASE_SIZE_RANGE,13,13,0,0
GENDER,3,3,0,0
MOVEMENT,6,6,0,0
MATERIAL,56,56,0,0
DIAL_COLOR,47,47,0,0
STRAP_TYPE,58,58,0,0
STRAP_COLOR,44,44,0,0
PRECIOUS_STONE,8,8,0,0
GLASS,7,7,0,0
CASE_SHAPE,6,6,0,0


In [137]:
summary_final_pareto.shape

(397853, 24)

#### Creating dummy variables

In [138]:
#creating dummy variables
cols=['brand','state','region', 'store_type', 'store_location', 'city_type',
       'case_size_range', 'gender', 'movement', 'material', 'dial_color',
       'strap_type', 'strap_color', 'precious_stone', 'glass', 'case_shape',
       'watch_type']
summary_final_dummies=pd.get_dummies(data=summary_final_pareto, columns=cols)

print('Done')

Done


In [139]:
summary_final_dummies.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 397853 entries, 0 to 399721
Columns: 360 entries, item_no to watch_type_smart watch
dtypes: float64(4), int64(2), object(1), uint8(353)
memory usage: 158.2+ MB


In [140]:
#for So as the target variable
#creating seperate df for independent and dependent features
y=summary_final_dummies.loc[:, summary_final_dummies.columns == 'so']
X=summary_final_dummies.drop(columns =['sales_quantity','item_no','billing', 'so', 'period'])



In [141]:
summary_final_dummies.shape

(397853, 360)

In [142]:
#checking for duplicate column names
duplicate_columns = summary_final_dummies.columns[summary_final_dummies.columns.duplicated()]
duplicate_columns

Index([], dtype='object')

In [143]:
#performing train and test split on data
X_train,X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


"""
split=0.2
test=int(len(X)*split)
train=len(X)-test
X_train=X.head(train)
y_train=y.head(train)
X_test=X.tail(test)
y_test=y.tail(test)
"""

'\nsplit=0.2\ntest=int(len(X)*split)\ntrain=len(X)-test\nX_train=X.head(train)\ny_train=y.head(train)\nX_test=X.tail(test)\ny_test=y.tail(test)\n'

### Regression Modelling

In [144]:
from sklearn.linear_model import LinearRegression
reg_model = LinearRegression()  
reg_model.fit(X_train, y_train)

LinearRegression()

In [145]:
preds = reg_model.predict(X_test)

In [146]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
rmse = np.sqrt(mean_squared_error(y_test, preds))


In [147]:
r2=r2_score(y_test, preds)
print("Linear regression RMSE: %.2f, Test R2: %.2f" % (rmse,r2))

Linear regression RMSE: 4.84, Test R2: 0.13


## XGBoost

In [148]:
#!pip install xgboost
import xgboost as xgb
from sklearn.metrics import mean_squared_error

In [149]:

#
xg_reg = xgb.XGBRegressor(objective ='reg:squarederror', colsample_bylevel=0.5, min_child_weight=7, colsample_bytree = 0.5, reg_alpha=0.7, reg_lambda=0.7, subsample=0.3, learning_rate = 0.15,max_depth = 10, alpha = 10, n_estimators = 5)
xg_reg.fit(X_train,y_train)
preds = xg_reg.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, preds))


In [150]:
from sklearn.metrics import r2_score
r2 = r2_score(y_test,preds)
print("Xgboost RMSE: %.2f, Test R2: %.2f" % (rmse,r2))

Xgboost RMSE: 8.92, Test R2: -1.97


In [151]:
params = {'colsample_bytree': [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9],
          'subsample': [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9],
          'colsample_bylevel':[0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9],
          'min_child_weight':[1,3,5,7] ,
          'reg_lambda':[0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9],
          'n_estimator':[3,5,7,10],
          'learning_rate': [0.05,0.1,0.15,0.2,0.25,0.3],
          'max_depth': [3,4,5,6,7,8,10,12,15], 
          'reg_alpha': [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]
         }

from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
xg_reg = xgb.XGBRegressor(objective ='reg:squarederror')
#scoring='r2'
scoring='neg_mean_squared_error'
random_search=RandomizedSearchCV(xg_reg,param_distributions=params,scoring=scoring,n_iter=20,cv=5,verbose=3)

random_search.fit(X_train,y_train)


Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV] subsample=0.9, reg_lambda=0.2, reg_alpha=0.6, n_estimator=3, min_child_weight=1, max_depth=12, learning_rate=0.2, colsample_bytree=0.1, colsample_bylevel=0.3 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Parameters: { n_estimator } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[CV]  subsample=0.9, reg_lambda=0.2, reg_alpha=0.6, n_estimator=3, min_child_weight=1, max_depth=12, learning_rate=0.2, colsample_bytree=0.1, colsample_bylevel=0.3, score=-21.910, total=  48.3s
[CV] subsample=0.9, reg_lambda=0.2, reg_alpha=0.6, n_estimator=3, min_child_weight=1, max_depth=12, learning_rate=0.2, colsample_bytree=0.1, colsample_bylevel=0.3 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   48.3s remaining:    0.0s


Parameters: { n_estimator } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[CV]  subsample=0.9, reg_lambda=0.2, reg_alpha=0.6, n_estimator=3, min_child_weight=1, max_depth=12, learning_rate=0.2, colsample_bytree=0.1, colsample_bylevel=0.3, score=-21.893, total=  46.6s
[CV] subsample=0.9, reg_lambda=0.2, reg_alpha=0.6, n_estimator=3, min_child_weight=1, max_depth=12, learning_rate=0.2, colsample_bytree=0.1, colsample_bylevel=0.3 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  1.6min remaining:    0.0s


Parameters: { n_estimator } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[CV]  subsample=0.9, reg_lambda=0.2, reg_alpha=0.6, n_estimator=3, min_child_weight=1, max_depth=12, learning_rate=0.2, colsample_bytree=0.1, colsample_bylevel=0.3, score=-22.056, total=  47.7s
[CV] subsample=0.9, reg_lambda=0.2, reg_alpha=0.6, n_estimator=3, min_child_weight=1, max_depth=12, learning_rate=0.2, colsample_bytree=0.1, colsample_bylevel=0.3 
Parameters: { n_estimator } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[CV]  subsample=0.9, reg_lambda=0.2, reg_alpha=0.6, n_estimator=3, m

[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed: 150.3min finished


Parameters: { n_estimator } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




RandomizedSearchCV(cv=5,
                   estimator=XGBRegressor(base_score=None, booster=None,
                                          colsample_bylevel=None,
                                          colsample_bynode=None,
                                          colsample_bytree=None, gamma=None,
                                          gpu_id=None, importance_type='gain',
                                          interaction_constraints=None,
                                          learning_rate=None,
                                          max_delta_step=None, max_depth=None,
                                          min_child_weight=None, missing=nan,
                                          monotone_constraints=None,
                                          n_estimators=100, n...
                                                             0.5, 0.6, 0.7, 0.8,
                                                             0.9],
                                        'l

In [152]:
random_search.best_params_

{'subsample': 0.9,
 'reg_lambda': 0.8,
 'reg_alpha': 0.1,
 'n_estimator': 3,
 'min_child_weight': 5,
 'max_depth': 12,
 'learning_rate': 0.15,
 'colsample_bytree': 0.5,
 'colsample_bylevel': 0.3}

In [153]:
#best model based on the output of random_search.best_estimator_
best_gb=xgb.XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=0.9,
             colsample_bynode=1, colsample_bytree=0.8, gamma=0,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.1, max_delta_step=0, max_depth=12,
             min_child_weight =7, missing=np.nan,
              n_estimator=3, n_estimators=100,
             n_jobs=0, num_parallel_tree=1, objective='reg:squarederror',
             random_state=0, reg_alpha=0.5, reg_lambda=0.3, scale_pos_weight=1,
             subsample=0.2, tree_method='exact', validate_parameters=1,
             verbosity=0)

In [154]:
from sklearn.model_selection import cross_val_score as cvs
score=cvs(best_gb,X_train,y_train,cv=2,scoring='r2')


In [155]:
score

array([0.1887955 , 0.18992373])

In [156]:
score.mean()

0.18935961073841756

In [157]:
best_gb.fit(X_train,y_train)
preds = best_gb.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, preds))
print("RMSE: %f" % (rmse))

RMSE: 4.630097


In [158]:
from sklearn.metrics import r2_score
rmse = np.sqrt(mean_squared_error(y_test, preds))
r2=r2_score(y_test,preds)
print("RMSE: %.2f, R2: %.2f" % (rmse,r2))

RMSE: 4.63, R2: 0.20
