In [61]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import metrics


##### Reading data

In [62]:
dirpath = '/Users/parulgaba/Desktop/Capstone-Ethos/ConfidentialData/csvdata/'

data_path = '/Users/parulgaba/Desktop/Capstone-Ethos/ethos-retail-model/data/'

filename = data_path + 'regression_data/' + 'aggregated_summary_period_2_weeks.csv'
chunksize = 10 ** 5

# Read the CSV in chunks and concatenate ONCE. Calling pd.concat inside the
# loop re-copies every row read so far on each iteration (quadratic work).
chunks = list(pd.read_csv(filename, chunksize=chunksize))
rows = sum(chunk.shape[0] for chunk in chunks)  # independent row tally, printed as a sanity check
summary_df = pd.concat(chunks) if chunks else pd.DataFrame()
del chunks  # release the per-chunk frames once they are merged

# NOTE(review): this cell used to call `summary_df.fillna(0)` and throw the
# result away, so missing values were never actually filled. The no-op call is
# removed instead of being assigned: filling would turn missing `state` /
# `period` keys into a real "0" group in the later groupby/merge cells, which
# currently drop those rows. Confirm the intended NaN handling before adding it.
print(summary_df.shape)
print(rows)

(2857603, 48)
2857603


In [63]:
# List the column names of the loaded summary frame.
summary_df.columns

Index(['period', 'location_code', 'item_no', 'brand', 'store_type',
       'store_location', 'city_type', 'region', 'state', 'quantity',
       'purchase_quantity', 'transfer_quantity', 'available_quantity',
       'sales_quantity', 'purchase_cost_amount', 'purchase_mrp',
       'purchase_date', 'stock_prevailing_mrp', 'store_in',
       'product_group_code', 'transfer_cost_amount', 'sales_department',
       'days_to_sell', 'num_of_customers', 'total_price', 'line_discount',
       'crm_line_discount', 'discount', 'tax', 'cost', 'billing',
       'contribution', 'trade_incentive', 'trade_incentive_value',
       'total_contribution', 'case_size', 'case_size_range', 'gender',
       'movement', 'material', 'dial_color', 'strap_type', 'strap_color',
       'precious_stone', 'glass', 'case_shape', 'watch_type', 'area_code'],
      dtype='object')

In [64]:
# Divide the prevailing MRP by one million.
# NOTE: this overwrites the column it reads, so re-running the cell divides again.
MRP_SCALE = 1000000
summary_df['stock_prevailing_mrp'] = summary_df['stock_prevailing_mrp'] / MRP_SCALE


In [65]:
# Row count of summary_df.
print (len(summary_df))

2857603


## Adding log S/So - Location code

In [66]:
#reading market shares
market_share=pd.read_excel(data_path + "market_share_encoded.xlsx", header=0,index_col=0)
merge_cols=['state','period']

#computing market size for each state-period:
#  Market Size = total sales quantity of the state-period / the state's market share
market_sizes=summary_df.groupby(merge_cols).agg({'sales_quantity':'sum'}).reset_index()
market_sizes=pd.merge(market_sizes,market_share, left_on='state', right_on='SubCode', how='left')
market_sizes['Market Size']=market_sizes['sales_quantity'].div(market_sizes['Market Share'])

#computing number of distinct stores per state-period
#(dropna=False counts a missing location_code as one level, like len(unique()) did)
store_nos=(
    summary_df.groupby(merge_cols)['location_code']
    .nunique(dropna=False)
    .reset_index(name='Store numbers')
)

#merging market sizes with number of stores per market (join keys made explicit)
market_sizes=pd.merge(market_sizes,store_nos, on=merge_cols, how='inner')

#computing market size per store
market_sizes['per store market']=market_sizes['Market Size']/market_sizes['Store numbers']

#adding per-store market size per state-period to the main data
market_sizes=market_sizes[merge_cols+['per store market']]#extracting only relevant columns from market_sizes
summary_with_market_shares=pd.merge(summary_df,market_sizes, on=merge_cols,how='inner')

#computing So = per-store market size minus the row's own sales quantity
outside_share=summary_with_market_shares['per store market']-summary_with_market_shares['sales_quantity']

#rows with So == 0 are dropped to avoid dividing by zero; .copy() so that the
#column assignment below writes to an independent frame, not a slice
nonzero_so=outside_share != 0
summary_with_market_shares=summary_with_market_shares[nonzero_so].copy()
outside_share=outside_share[nonzero_so]

#computing log(S/So) [zero sales are replaced with 1e-05 so that the log is defined]
#NOTE: a negative So (sales above the per-store market) makes the ratio negative
#and the log NaN; the NaN check in the following cell is what catches that.
share_ratio=summary_with_market_shares['sales_quantity'].replace(0,10**(-5)).div(outside_share)
summary_with_market_shares['so']=np.log(share_ratio)



In [67]:
#checking for NaNs: show any row with a missing value in the columns behind log(S/So)
nan_check_cols = ['sales_quantity', 'per store market', 'so']
d = summary_with_market_shares.loc[:, nan_check_cols]
d.loc[d.isna().any(axis='columns')]

Unnamed: 0,sales_quantity,per store market,so


In [68]:
# Peek at the first values of the 'so' column.
summary_with_market_shares['so'].head()

0   -16.562781
1   -16.562781
2   -16.562781
3   -16.562781
4   -16.562781
Name: so, dtype: float64

In [69]:
# Peek at the first values of the 'per store market' column.
summary_with_market_shares['per store market'].head()

0    156.0
1    156.0
2    156.0
3    156.0
4    156.0
Name: per store market, dtype: float64

In [70]:
#extracting specific columns

col=[
    'location_code', 'item_no', 'state', 'region',
    'brand', 'stock_prevailing_mrp', 'store_type', 'store_location', 'city_type',
    'available_quantity', 'period', 'case_size_range',
    'gender', 'movement', 'material', 'dial_color', 'strap_type',
    'strap_color', 'precious_stone', 'glass', 'case_shape', 'watch_type',
    'billing', 'sales_quantity', 'so',
]

# Take a copy of the selected columns and cast the two identifier-like
# columns to strings in a single pass.
summary_final=summary_with_market_shares.loc[:,col].astype({'item_no': str, 'case_shape': str})


In [71]:
# (rows, columns) of the column-selected frame.
summary_final.shape

(2853102, 25)

#### Defining a function for doing pareto analysis on features

The function combines the levels of a categorical feature that cumulatively account for roughly the bottom 10% of sales billing into a new level called "Other". Features with 10 or fewer levels are left unchanged.

In [72]:
def _share_of_level(df, col, value_col, level):
    """Return `level`'s percentage share (2 d.p.) of `value_col` summed by `col`, or 0 if absent."""
    totals = df.groupby(col)[value_col].sum()
    share = (totals / totals.sum() * 100).round(2)
    return share.get(level, 0)


def pareto(df, cols, threshold=0.9, min_levels=10, other_label='Other'):
    """Merge the low-billing tail of each categorical feature into one level.

    For every column in `cols`, levels are ranked by total `billing`
    (descending). Every level whose cumulative billing share is above
    `threshold` is replaced by `other_label`, but only when the column has
    more than `min_levels` distinct values. The level that crosses the
    threshold is itself merged, so the merged share can exceed
    ``1 - threshold``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns in `cols` plus 'billing' and 'sales_quantity'.
        It is NOT modified; the function works on a copy.
    cols : iterable of str
        Categorical columns to process.
    threshold : float, default 0.9
        Cumulative billing share above which levels are merged.
    min_levels : int, default 10
        Columns with this many distinct values or fewer are left untouched.
    other_label : str, default 'Other'
        Name given to the merged level.

    Returns
    -------
    (report, result) : tuple of pandas.DataFrame
        `report` is indexed by upper-cased feature name with the columns
        'Orig Levels', 'New Levels', 'Other%(Billing)', 'Other%(Sales Qty)'.
        `result` is the copy of `df` with the merged levels applied.
    """
    # Work on a copy so the caller's frame is untouched and the call is
    # repeatable (re-running on the same input gives the same report).
    df = df.copy()
    report_rows = []

    for col in cols:
        # fillna(0) keeps a missing level in the ranking (as key 0) instead of
        # letting groupby drop it. Only the two columns needed are filled,
        # rather than the whole frame on every iteration.
        billing_by_level = (
            df[[col, 'billing']].fillna(0)
            .groupby(col)['billing'].sum()
            .sort_values(ascending=False)
        )
        in_tail = billing_by_level.cumsum() / billing_by_level.sum() > threshold
        levels = len(df[col].unique())

        if levels > min_levels:
            tail_levels = billing_by_level[in_tail].index
            df[col] = np.where(df[col].isin(tail_levels), other_label, df[col])
        new_levels = len(df[col].unique())

        # Share of the merged level after merging; 0 when nothing was merged.
        other_sales_pct = _share_of_level(df, col, 'sales_quantity', other_label)
        other_billing_pct = _share_of_level(df, col, 'billing', other_label)

        report_rows.append([col.upper(), levels, new_levels, other_billing_pct, other_sales_pct])

    report_cols = ['Feature', 'Orig Levels', 'New Levels', 'Other%(Billing)', 'Other%(Sales Qty)']
    df1 = pd.DataFrame(report_rows, columns=report_cols).set_index("Feature")

    return df1, df

In [73]:
# Categorical product attributes whose rare levels are merged by pareto().
# (numpy is already imported in the notebook's first cell, so it is not re-imported here.)
cols=[
    'case_size_range', 'gender', 'movement', 'material', 'dial_color',
    'strap_type', 'strap_color', 'precious_stone', 'glass', 'case_shape',
    'watch_type',
]
summary,summary_final_pareto=pareto(summary_final, cols)
summary

Unnamed: 0_level_0,Orig Levels,New Levels,Other%(Billing),Other%(Sales Qty)
Feature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CASE_SIZE_RANGE,13,7,11.17,15.38
GENDER,3,3,0.0,0.0
MOVEMENT,7,7,0.0,0.0
MATERIAL,67,7,11.42,14.42
DIAL_COLOR,52,8,11.67,12.27
STRAP_TYPE,71,8,11.24,15.96
STRAP_COLOR,50,7,11.97,24.48
PRECIOUS_STONE,10,10,0.0,0.0
GLASS,7,7,0.0,0.0
CASE_SHAPE,6,6,0.0,0.0


In [74]:
# (rows, columns) of the frame returned by pareto().
summary_final_pareto.shape

(2853102, 25)

#### Creating dummy variables

In [75]:
#sorting by period (ascending is the default order)
summary_final_pareto=summary_final_pareto.sort_values(by='period')
print('Done')

Done


In [76]:
#creating dummy variables: one indicator column per level of each listed feature
cols=[
    'brand', 'state', 'region', 'store_type', 'store_location', 'city_type',
    'case_size_range', 'gender', 'movement', 'material', 'dial_color',
    'strap_type', 'strap_color', 'precious_stone', 'glass', 'case_shape',
    'watch_type',
]
summary_final_dummies=pd.get_dummies(summary_final_pareto, columns=cols)

print('Done')

Done


In [77]:
# fillna returns a new frame; the result has to be assigned back, otherwise
# the call has no effect and any remaining NaNs stay in the data.
summary_final_dummies = summary_final_dummies.fillna(0)
summary_final_dummies.shape


(2853102, 207)

In [78]:
#for So as the target variable
#target: the 'so' column, kept as a one-column DataFrame
y = summary_final_dummies[['so']]

#features: everything except the target, the identifiers and the raw sales measures
non_feature_cols = ['sales_quantity', 'location_code', 'item_no', 'billing', 'so', 'available_quantity']
X = summary_final_dummies.drop(columns=non_feature_cols)



In [79]:
#checking for duplicate column names (an empty Index means there are none)
column_index = summary_final_dummies.columns
duplicate_columns = column_index[column_index.duplicated()]
duplicate_columns

Index([], dtype='object')

In [80]:
#performing train and test split on data (random 80/20 split, fixed seed)
X_train,X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Alternative kept for reference only (not executed): an ordered split that
# trains on the first 80% of rows and tests on the last 20%.
#   split = 0.2
#   test = int(len(X) * split)
#   train = len(X) - test
#   X_train, y_train = X.head(train), y.head(train)
#   X_test, y_test = X.tail(test), y.tail(test)

'\nsplit=0.2\ntest=int(len(X)*split)\ntrain=len(X)-test\nX_train=X.head(train)\ny_train=y.head(train)\nX_test=X.tail(test)\ny_test=y.tail(test)\n'

In [81]:
# Oversampling experiment, currently disabled: the code below sits inside a
# string literal, so none of it is executed; the cell only echoes the string.
#!pip install -U imbalanced-learn
"""
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=11915008)
X_bal, y_bal = ros.fit_resample(X_train, y_train)

print('Done')
"""

"\nfrom imblearn.over_sampling import RandomOverSampler\nros = RandomOverSampler(random_state=11915008)\nX_bal, y_bal = ros.fit_resample(X_train, y_train)\n\nprint('Done')\n"

### Regression Modelling

In [82]:
# Fit a linear regression of the 'so' target on the training features.
from sklearn.linear_model import LinearRegression
reg_model = LinearRegression()  
reg_model.fit(X_train, y_train)

LinearRegression()

In [83]:
# Predictions of the fitted model for the held-out test features.
preds = reg_model.predict(X_test)

In [84]:
from sklearn.metrics import mean_squared_error, r2_score

# Root mean squared error between the test targets and the predictions.
test_mse = mean_squared_error(y_test, preds)
rmse = np.sqrt(test_mse)


In [85]:
# R2 score on the test split, printed together with the RMSE from the previous cell.
r2=r2_score(y_test, preds)
print("Test RMSE: %.2f, Test R2: %.2f" % (rmse,r2))

Test RMSE: 2.66, Test R2: 0.06
