In [None]:
from sklearn.preprocessing import MinMaxScaler
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, make_scorer
import matplotlib.pyplot as plt

In [None]:
upper = pd.read_csv('upper_city1_full.csv')
middle = pd.read_csv('middle_city1_full.csv')
lower = pd.read_csv('lower_city1_full.csv')

In [None]:
city1_full = pd.concat([upper,middle,lower], ignore_index=True)

In [None]:
city1_full['start'] = city1_full['start'].apply(str)

In [None]:
city1_full = city1_full.drop(['Unnamed: 0','OUTLET_ID'],axis=1)

In [None]:
city1_wo_na = city1_full.fillna(0)

In [None]:
# Column labels of the cleaned frame as a plain list, minus the first 'start'
# entry; the list is displayed as the cell output.
col = city1_wo_na.columns.tolist()
col.remove('start')
col

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
scaler = MinMaxScaler()

In [None]:
city1_wo_na[col] = scaler.fit_transform(city1_wo_na[col].as_matrix())

In [None]:
city1_wo_na.head()

In [None]:
city1_wo_na_corr = city1_wo_na.corr()

In [None]:
city1_wo_na_corr.head()

In [None]:
sales = pd.read_csv('EDA File.csv')

In [None]:
sales = sales[['ID','Type','Sales']]

In [None]:
final_data = city1_wo_na.merge(sales,how='left',left_on = 'start',right_on ='ID').drop(['ID'],axis=1)

In [None]:
train_data = final_data[final_data['Type']=='Store1']

In [None]:
train_data.info()

In [None]:
col_names = list(train_data.columns)

In [None]:
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
from scipy import stats
from sklearn.feature_selection import chi2
from sklearn.feature_selection import f_regression,mutual_info_regression

In [None]:
# Predictor matrix and target vector as NumPy arrays.
# DataFrame.as_matrix() was removed in pandas 1.0; to_numpy() replaces it.
X = train_data.drop(['start','Sales','Type'],axis=1).to_numpy()
y = np.array(train_data['Sales'])

In [None]:
def f_regression(X,Y):
    """Run scikit-learn's univariate F-test with centering switched off.

    Thin wrapper that forwards ``X`` and ``Y`` to
    ``sklearn.feature_selection.f_regression`` with ``center=False`` and
    returns whatever that function returns (the notebook unpacks it as
    ``(f_statistics, p_values)``).

    Note: this definition deliberately reuses the name of the library
    function, so after this cell runs ``f_regression`` refers to the wrapper.
    """
    # Import the function from its submodule explicitly: a bare
    # `import sklearn` does not load `sklearn.feature_selection` by itself.
    from sklearn.feature_selection import f_regression as _sk_f_regression
    return _sk_f_regression(X, Y, center=False)

In [None]:
from sklearn.feature_selection import SelectKBest
# Score the predictors with the notebook's f_regression wrapper and keep the
# 50 best; fit() returns the fitted selector, so both steps are chained.
featureSelector = SelectKBest(score_func=f_regression, k=50).fit(X, y)
# get_support(indices=True) gives 0-based column positions; print them 1-based.
kept_positions = featureSelector.get_support(indices=True)
print ([position + 1 for position in kept_positions])

In [None]:
# F-statistics, rescaled so the largest one equals 1. nanmax is used because a
# single NaN score would otherwise turn every rescaled value into NaN.
f_test, _ = f_regression(X, y)
f_test /= np.nanmax(f_test)

# Mutual information, rescaled the same way. The estimator is randomised, so a
# fixed seed keeps the numbers in the plot titles stable between runs.
mi = mutual_info_regression(X, y, random_state=42)
mi /= np.max(mi)

# One scatter panel per predictor, for the first three predictors (fewer if X
# has fewer columns).
n_panels = min(3, X.shape[1])
fig, axes = plt.subplots(1, n_panels, figsize=(15, 5), squeeze=False)
for i, ax in enumerate(axes[0]):
    ax.scatter(X[:, i], y, edgecolor='black', s=20)
    ax.set_xlabel("$x_{}$".format(i + 1), fontsize=14)
    if i == 0:
        # Only the left-most panel carries the shared y label.
        ax.set_ylabel("$y$", fontsize=14)
    ax.set_title("F-test={:.2f}, MI={:.2f}".format(f_test[i], mi[i]),
                 fontsize=16)
plt.show()

In [None]:
f_test, p_values = f_regression(X, y)

In [None]:
feature_pvalue = pd.DataFrame({'feature_name':list(train_data.drop(['start','Sales','Type'],axis=1).columns),'p_values':list(p_values)})

In [None]:
feat_imp_p_values = feature_pvalue.nsmallest(50,columns='p_values')

In [None]:
feat_imp_p_values.to_csv('feat_imp_p_values.csv')

In [None]:
param_grid = {'n_estimators': [200, 500, 1000], 'max_features': [5, 10, 20, 30],'max_depth':[4,6,8]}
clf = GridSearchCV(RandomForestRegressor(), 
                   param_grid, 
                   cv=5, 
                   scoring=make_scorer(mean_squared_error))

In [None]:
clf.fit(train_data[col_names], train_data['Sales'])

In [None]:
clf.best_params_

In [None]:
clf = RandomForestRegressor(max_depth=4, max_features=5, n_estimators=1000)
clf.fit(train_data[col_names], train_data['Sales'])

In [None]:
importance = clf.feature_importances_
importance = pd.DataFrame(importance, index=col_names, 
                          columns=["Importance"])
importance["Std"] = np.std([tree.feature_importances_
                            for tree in clf.estimators_], axis=0)

In [None]:
X = train_data[col_names]
y = train_data['Sales']

In [None]:
feat_importances = pd.Series(clf.feature_importances_, index=X.columns)
feat_importances = feat_importances.nlargest(20)
plt.figure(figsize=(15,10))
feat_importances.plot(kind='barh',color='blue',width=0.7)
plt.ylabel('Features',fontsize=15)
#plt.ylabel('ylabel', fontsize=16)
plt.xlabel('Feature_Importance_Value',fontsize=15)
#plt.savefig('feature_imp_10.jpg')

In [None]:
feat_importances = pd.Series(clf.feature_importances_, index=X.columns)

In [None]:
col_cos = list(final_data.columns)
col_cos.remove('Type')
col_cos.remove('Sales')

In [None]:
cos_data = final_data[col_cos]

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
cos_data.set_index('start', inplace = True)

In [None]:
cosine_sim_df = cosine_similarity(cos_data)

In [None]:
cosine_sim_df = pd.DataFrame(cosine_sim_df, index = cos_data.index, columns = cos_data.index)

In [None]:
cosine_sim_df.head()

## Feature importance and cosine similarity, section by section

In [None]:
list(city1_wo_na.columns).sort()

In [None]:
list(train_data.columns)

In [None]:
# Subset of the training rows used for the proximity model.
# NOTE(review): 'feature_cols...' looks like a stand-in for a column list that
# is not shown here; as written it selects a column literally named
# 'feature_cols...'. TODO: supply the real proximity column names.
proximity_city1 = train_data[['feature_cols...','Sales']]

In [None]:
# Predictor names for the proximity model: every selected column except the
# 'start' identifier and the 'Sales' target. drop() raises KeyError unless
# both labels are present, so 'start' presumably belongs in the list above.
# TODO confirm.
prox_col = list(proximity_city1.drop(['start','Sales'],axis=1).columns)

In [None]:
# Pairwise correlation of those columns, computed on final_data rather than
# on the train_data subset.
prox_corr = final_data[prox_col].corr()

In [None]:
param_grid_prox_feat = {
    'n_estimators': [200, 500, 1000], 
    'max_features': [5, 10,15],
    'max_depth':[4,6,8]}
proximity_city1_feat= GridSearchCV(RandomForestRegressor(), 
                                   param_grid_prox_feat, 
                                   cv=5, 
                                   scoring=make_scorer(mean_squared_error))

In [None]:
proximity_city1_feat.fit(proximity_city1[prox_col], 
                         proximity_city['Sales'])

In [None]:
proximity_city1_feat.best_params_

In [None]:
proximity_city1_feat = RandomForestRegressor(max_depth=8, 
                                             max_features=15, 
                                             n_estimators=500)
proximity_city1_feat.fit(proximity_city1[prox_col], proximity_city1['Sales'])

In [None]:
importance = proximity_city1_feat.feature_importances_
importance = pd.DataFrame(importance, index=prox_col, 
                          columns=["Importance"])
# importance["Std"] = np.std([tree.feature_importances_
#                             for tree in clf.estimators_], axis=0)

In [None]:
X = proximity_city1[prox_col]
y = proximity_city1['OOH Sales']

In [None]:
feat_importances = pd.Series(proximity_city1_feat.feature_importances_, index=X.columns)
feat_importances = feat_importances.nlargest(20)
plt.figure(figsize=(15,10))
feat_importances.plot(kind='barh',color='blue',width=0.7)
plt.ylabel('Features',fontsize=15)
#plt.ylabel('ylabel', fontsize=16)
plt.xlabel('Feature_Importance_Value',fontsize=15)
#plt.savefig('feature_imp_10.jpg')

In [None]:
feat_importances = pd.Series(proximity_city1_feat.feature_importances_, index=X.columns)
feat_importances.head()

In [None]:
demographic_col = list(set(col_names).difference(prox_col).difference(ohe_col))

In [None]:
ohe_col = ['Variable1',....]

In [None]:
#demographic_col.insert(0,'OOH Sales')
demographic_col.insert(0,'start')

In [None]:
demographic_city1_feat = final_data[demographic_col]

In [None]:
demographic_col.remove('start')

In [None]:
param_grid_demo_feat = {'n_estimators': [200, 500, 1000], 
                        'max_features': [5, 10,15],
                        'max_depth':[4,6,8]}
clf_demo_feat= GridSearchCV(RandomForestRegressor(), 
                            param_grid_demo_feat, 
                            cv=5, 
                            scoring=make_scorer(mean_squared_error))

In [None]:
clf_demo_feat.fit(train_data[demographic_col], train_data['Sales'])

In [None]:
clf_demo_feat.best_params_

In [None]:
clf_demo_feat = RandomForestRegressor(max_depth= 8, 
                                      max_features= 15, 
                                      n_estimators= 500)

In [None]:
clf_demo_feat.fit(train_data[demographic_col], 
                  train_data['Sales'])

In [None]:
importance = clf_demo_feat.feature_importances_
importance = pd.DataFrame(importance, 
                          index=demographic_col, 
                          columns=["Importance"])
# importance["Std"] = np.std([tree.feature_importances_
#                             for tree in clf.estimators_], axis=0)

In [None]:
feat_importances = pd.Series(clf_demo_feat.feature_importances_, 
                             index=demographic_col)
feat_importances = feat_importances.nlargest(20)
plt.figure(figsize=(15,10))
feat_importances.plot(kind='barh',color='blue',width=0.7)
plt.ylabel('Features',fontsize=15)
#plt.ylabel('ylabel', fontsize=16)
plt.xlabel('Feature_Importance_Value',fontsize=15)
#plt.savefig('feature_imp_10.jpg')

In [None]:
all_feat = pd.read_csv('feature_importance_all_features.csv')
demographic_feat = pd.read_csv('feature_imp_demographic_city1.csv')
proximity_feat = pd.read_csv('feat_imp_proximity_city1.csv')

In [None]:
all_feat = all_feat.sort_values('Score',ascending=False)
demographic_feat = demographic_feat.sort_values('Score',ascending=False)
proximity_feat = proximity_feat.sort_values('Score',ascending=False)

In [None]:
top_50_feat = all_feat.nlargest(50,columns='Score')

In [None]:
top_50_feat['Score']= top_50_feat['Score']/top_50_feat['Score'].sum()

In [None]:
col_top_50=list(top_50_feat['Features'])
col_top_50.insert(0,'start')

In [None]:
top_50_f_data = final_data[col_top_50]

In [None]:
col_top_50.remove('start')

In [None]:
for i in col_top_50:
    top_50_f_data[i] = top_50_f_data[i]*top_50_feat[top_50_feat['Features']==i]['Score'].iloc[0]

In [None]:
top_50_f_data.set_index('start', inplace = True)

In [None]:
cosine_sim_top_50 = cosine_similarity(top_50_f_data)

In [None]:
top_50_f_data_col = list(top_50_f_data.index)

In [None]:
cosine_sim_top_50 = pd.DataFrame(cosine_sim_top_50, 
                                 index = top_50_f_data_col, 
                                 columns = top_50_f_data_col)

In [None]:
cosine_sim_top_50.reset_index(inplace = True)

In [None]:
cosine_sim_top_50 = cosine_sim_top_50.rename(columns={'index':'start'})

In [None]:
long_cosine_sim_top_50 = cosine_sim_top_50.melt(id_vars = ['start'], 
                                       value_vars = list(cosine_sim_top_50.iloc[:, 1:].columns) 
                                       )

In [None]:
long_cosine_sim_top_50 = long_cosine_sim_top_50.loc[(long_cosine_sim_top_50['start']!=long_cosine_sim_top_50['variable']),:]

In [None]:
long_cosine_sim_top_50.columns = ['Potential Store','Reference Store','Similarity']

In [None]:
final_product_df = pd.read_csv('product_data.csv')

In [None]:
Product_Sales_subset = final_product_df[['ID', 'Sales']]

In [None]:
Potential_Sales = long_cosine_sim_top_50.merge(Product_Sales_subset, 
                                               how = 'left', 
                                               left_on = 'Reference Store', 
                                               right_on = 'ID')

In [None]:
Potential_Sales = Potential_Sales[['Potential Store', 
                                   'Reference Store', 
                                   'Similarity', 
                                   'Sales']]

In [None]:
Potential_Sales['Sales'] = Potential_Sales['Sales'].fillna(0)

In [None]:
outlet_top5 = Potential_Sales.groupby(by = ['Potential Store', 
                                            'Reference Store']).apply(lambda x: x.sort_values('Similarity',ascending = False).head(5))

In [None]:
Potential_Sales['Predicted Sales cumulative'] = Potential_Sales['Similarity']*Potential_Sales['Sales']

In [None]:
Potential_Sales = Potential_Sales.groupby(by = ['Potential Store']).agg({'Predicted Sales cumulative':sum, 'Similarity':sum})

In [None]:
Potential_Sales['Predicted Sales Actual'] = Potential_Sales['Predicted Sales cumulative']/Potential_Sales['Similarity']

In [None]:
Potential_Sales.reset_index(inplace = True)

In [None]:
Potential_Sales_ideal = Potential_Sales[['Potential Store', 'Predicted Sales Actual']].merge(Product_Sales_subset, 
                                                                                             how = 'left', 
                                                                                             left_on = 'Potential Store', 
                                                                                             right_on = 'ID')

In [None]:
Potential_Sales_ideal = Potential_Sales_ideal[['Potential Store', 
                                               'Predicted Sales Actual', 
                                               'Sales']]

## Using p-values for feature importance

In [None]:
col_top_50=list(feat_imp_p_values['feature_name'])
col_top_50.insert(0,'start')

In [None]:
top_50_f_data = final_data[col_top_50]

In [None]:
top_50_f_data.set_index('start',inplace=True)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
cosine_sim_top_50 = cosine_similarity(top_50_f_data)

In [None]:
top_50_f_data_col = list(top_50_f_data.index)

cosine_sim_top_50 = pd.DataFrame(cosine_sim_top_50, 
                                 index = top_50_f_data_col, 
                                 columns = top_50_f_data_col)

In [None]:
cosine_sim_top_50.reset_index(inplace = True)

In [None]:
cosine_sim_top_50 = cosine_sim_top_50.rename(columns={'index':'start'})

In [None]:
long_cosine_sim_top_50 = cosine_sim_top_50.melt(id_vars = ['start'], 
                                       value_vars = list(cosine_sim_top_50.iloc[:, 1:].columns) 
                                       )

In [None]:
long_cosine_sim_top_50.columns = ['Potential Store','Reference Store','Similarity']


sales_info2 = pd.read_csv('Product_sales.csv')

sales_info2['ID'] = sales_info2['ID'].apply(str)

Product_Sales_subset = sales_info2.loc[sales_info2['Type'] == "Store Type 1", :]

Potential_Sales = long_cosine_sim_top_50.merge(Product_Sales_subset[['ID', 'Sales']], 
                                               how = 'left', 
                                               left_on = 'Reference Store', 
                                               right_on = 'ID')

In [None]:
Potential_Sales = Potential_Sales.dropna(axis = 0, how = 'any')

Potential_Sales['Potential Store'] = Potential_Sales['Potential Store'].apply(str)

Potential_Sales = Potential_Sales.loc[Potential_Sales['Potential Store'] != Potential_Sales['Reference Store'], :]

In [None]:
Potential_Sales = Potential_Sales.merge(sales_info2[['ID', 'Type']], 
                                        how = 'left', 
                                        left_on = 'Potential Store', 
                                        right_on = 'ID')

In [None]:
Potential_Sales = Potential_Sales.loc[Potential_Sales['Type'].apply(str) == 'Store Type 2', :]


In [None]:
Potential_Sales = Potential_Sales.loc[Potential_Sales['Sales'] != 0, :]

In [None]:
Potential_Sales = Potential_Sales[['Potential Store', 
                                   'Reference Store', 
                                   'Similarity', 
                                   'Sales']]

Potential_Sales['Potential Sales'] = Potential_Sales['Similarity']*Potential_Sales['Sales']

Potential_Sales = Potential_Sales.drop(['Sales'], axis = 1)

Potential_Sales.reset_index(drop = True, 
                            inplace = True)

In [None]:
top10list = Potential_Sales.groupby(by = 'Potential Store').apply(lambda x: x.sort_values('Similarity',
                                                                                          ascending = False).head(10))

In [None]:
top10list = top10list.drop(['Potential Store'], axis = 1)

top10list = top10list.reset_index()

In [None]:
top10list = top10list.drop(['level_1'], axis = 1)

In [None]:
pot_sales = top10list.groupby('Potential Store').agg({'Potential Sales':sum, 'Similarity':sum}).reset_index()

In [None]:
pot_sales['Sales Opportunity'] = pot_sales['Potential Sales']/pot_sales['Similarity']

pot_sales = pot_sales.drop(['Potential Sales'], axis = 1)

In [None]:
pot_sales = pot_sales.merge(sales_info2[['ID','Sales']],
                            how='left',
                            left_on='Potential Store',
                            right_on='ID').drop(['ID'],axis=1)

In [None]:
pot_sales.to_csv('potential_sales_city1_p_values.csv')