## Importing the necessary libraries

In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd

import numpy as np
import matplotlib.pyplot as plt
import datetime as dt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.cluster import KMeans

# machine learning
from xgboost import XGBClassifier, XGBRegressor
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_absolute_error

## Loading the SuperStore data

In [2]:
# Laoding the data into a dataframe.
super_store_data = pd.read_excel("Data/Global_superstore_2016.xlsx")

In [3]:
super_store_data.shape

(51290, 24)

In [4]:
super_store_data.info() 
# info() gives us the number of non null values and the data type of each column.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51290 entries, 0 to 51289
Data columns (total 24 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   Row ID          51290 non-null  int64         
 1   Order ID        51290 non-null  object        
 2   Order Date      51290 non-null  datetime64[ns]
 3   Ship Date       51290 non-null  datetime64[ns]
 4   Ship Mode       51290 non-null  object        
 5   Customer ID     51290 non-null  object        
 6   Customer Name   51290 non-null  object        
 7   Segment         51290 non-null  object        
 8   Postal Code     9994 non-null   float64       
 9   City            51290 non-null  object        
 10  State           51290 non-null  object        
 11  Country         51290 non-null  object        
 12  Region          51290 non-null  object        
 13  Market          51290 non-null  object        
 14  Product ID      51290 non-null  object        
 15  Ca

Dropping the Customer Name column, as it contains personally identifiable information.

In [5]:
super_store_data = super_store_data.drop("Customer Name",axis=1)
# Looking at the top 5 rows of the dataframe.
super_store_data.head()

Unnamed: 0,Row ID,Order ID,Order Date,Ship Date,Ship Mode,Customer ID,Segment,Postal Code,City,State,...,Product ID,Category,Sub-Category,Product Name,Sales,Quantity,Discount,Profit,Shipping Cost,Order Priority
0,40098,CA-2014-AB10015140-41954,2014-11-11,2014-11-13,First Class,AB-100151402,Consumer,73120.0,Oklahoma City,Oklahoma,...,TEC-PH-5816,Technology,Phones,Samsung Convoy 3,221.98,2,0.0,62.1544,40.77,High
1,26341,IN-2014-JR162107-41675,2014-02-05,2014-02-07,Second Class,JR-162107,Corporate,,Wollongong,New South Wales,...,FUR-CH-5379,Furniture,Chairs,"Novimex Executive Leather Armchair, Black",3709.395,9,0.1,-288.765,923.63,Critical
2,25330,IN-2014-CR127307-41929,2014-10-17,2014-10-18,First Class,CR-127307,Consumer,,Brisbane,Queensland,...,TEC-PH-5356,Technology,Phones,"Nokia Smart Phone, with Caller ID",5175.171,9,0.1,919.971,915.49,Medium
3,13524,ES-2014-KM1637548-41667,2014-01-28,2014-01-30,First Class,KM-1637548,Home Office,,Berlin,Berlin,...,TEC-PH-5267,Technology,Phones,"Motorola Smart Phone, Cordless",2892.51,5,0.1,-96.54,910.16,Medium
4,47221,SG-2014-RH9495111-41948,2014-11-05,2014-11-06,Same Day,RH-9495111,Consumer,,Dakar,Dakar,...,TEC-CO-6011,Technology,Copiers,"Sharp Wireless Fax, High-Speed",2832.96,8,0.0,311.52,903.04,Critical


# CALCULATING CLV MATHEMATICALLY

CLTV = ((Average Order Value x Purchase Frequency)/Churn Rate) x Profit margin.

 Customer Value = Average Order Value * Purchase Frequency

In [6]:
super_store_data

Unnamed: 0,Row ID,Order ID,Order Date,Ship Date,Ship Mode,Customer ID,Segment,Postal Code,City,State,...,Product ID,Category,Sub-Category,Product Name,Sales,Quantity,Discount,Profit,Shipping Cost,Order Priority
0,40098,CA-2014-AB10015140-41954,2014-11-11,2014-11-13,First Class,AB-100151402,Consumer,73120.0,Oklahoma City,Oklahoma,...,TEC-PH-5816,Technology,Phones,Samsung Convoy 3,221.980,2,0.0,62.1544,40.770,High
1,26341,IN-2014-JR162107-41675,2014-02-05,2014-02-07,Second Class,JR-162107,Corporate,,Wollongong,New South Wales,...,FUR-CH-5379,Furniture,Chairs,"Novimex Executive Leather Armchair, Black",3709.395,9,0.1,-288.7650,923.630,Critical
2,25330,IN-2014-CR127307-41929,2014-10-17,2014-10-18,First Class,CR-127307,Consumer,,Brisbane,Queensland,...,TEC-PH-5356,Technology,Phones,"Nokia Smart Phone, with Caller ID",5175.171,9,0.1,919.9710,915.490,Medium
3,13524,ES-2014-KM1637548-41667,2014-01-28,2014-01-30,First Class,KM-1637548,Home Office,,Berlin,Berlin,...,TEC-PH-5267,Technology,Phones,"Motorola Smart Phone, Cordless",2892.510,5,0.1,-96.5400,910.160,Medium
4,47221,SG-2014-RH9495111-41948,2014-11-05,2014-11-06,Same Day,RH-9495111,Consumer,,Dakar,Dakar,...,TEC-CO-6011,Technology,Copiers,"Sharp Wireless Fax, High-Speed",2832.960,8,0.0,311.5200,903.040,Critical
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51285,29002,IN-2015-KE1642066-42174,2015-06-19,2015-06-19,Same Day,KE-1642066,Corporate,,Kure,Hiroshima,...,OFF-FA-3072,Office Supplies,Fasteners,"Advantus Thumb Tacks, 12 Pack",65.100,5,0.0,4.5000,1.010,Medium
51286,34337,US-2014-ZD21925140-41765,2014-05-06,2014-05-10,Standard Class,ZD-219251408,Consumer,37421.0,Chattanooga,Tennessee,...,FUR-FU-4070,Furniture,Furnishings,"Eldon Image Series Desk Accessories, Burgundy",16.720,5,0.2,3.3440,1.930,High
51287,31315,CA-2012-ZD21925140-41147,2012-08-26,2012-08-31,Second Class,ZD-219251404,Consumer,94109.0,San Francisco,California,...,OFF-AR-5321,Office Supplies,Art,Newell 341,8.560,2,0.0,2.4824,1.580,High
51288,9596,MX-2013-RB1979518-41322,2013-02-17,2013-02-21,Standard Class,RB-1979518,Home Office,,Valinhos,São Paulo,...,OFF-BI-2919,Office Supplies,Binders,"Acco Index Tab, Economy",13.440,2,0.0,2.4000,1.003,Medium


Profit margin is the commonly used profitability ratio. It represents how much percentage of total sales has earned as the gain.

In [7]:
profit_margin = super_store_data["Profit"].sum()/ (super_store_data["Sales"].sum())

In [8]:
# Creating a customer level dataframe
customer_level_data = super_store_data.groupby("Customer ID").aggregate({"Order Date": lambda x: (x.max() - x.min()).days,
                                                   "Order ID": lambda x: len(x),
                                                   "Sales" : lambda x: x.sum()})
customer_level_data.columns = ['num_days','num_transactions','spent_money']
customer_level_data

Unnamed: 0_level_0,num_days,num_transactions,spent_money
Customer ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AA-10315102,918,6,544.6560
AA-10315120,0,1,2713.4100
AA-10315139,319,13,2955.7980
AA-103151402,483,6,4780.5520
AA-103151404,553,3,753.5080
...,...,...,...
ZD-2192548,385,4,1302.1680
ZD-2192564,0,4,1225.3920
ZD-219257,0,1,59.9400
ZD-2192582,569,2,678.1014


In [9]:
#Calculating the average order value for each customer
customer_level_data['avg_order_value']=customer_level_data['spent_money']/customer_level_data['num_transactions']

In [10]:
customer_level_data.head()

Unnamed: 0_level_0,num_days,num_transactions,spent_money,avg_order_value
Customer ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AA-10315102,918,6,544.656,90.776
AA-10315120,0,1,2713.41,2713.41
AA-10315139,319,13,2955.798,227.369077
AA-103151402,483,6,4780.552,796.758667
AA-103151404,553,3,753.508,251.169333


In [11]:
purchase_frequency=sum(customer_level_data['num_transactions'])/customer_level_data.shape[0]
purchase_frequency

2.9451622164800457

In [12]:
# Repeat Rate
repeat_rate=customer_level_data[customer_level_data.num_transactions > 1].shape[0]/customer_level_data.shape[0]
repeat_rate

0.6517370083261557

In [13]:
#Churn Rate
churn_rate=1-repeat_rate
churn_rate

0.34826299167384434

In [14]:
purchase_frequency,repeat_rate,churn_rate

(2.9451622164800457, 0.6517370083261557, 0.34826299167384434)

In [15]:
customer_level_data["profit"] = customer_level_data["spent_money"]* profit_margin
customer_level_data.head()

Unnamed: 0_level_0,num_days,num_transactions,spent_money,avg_order_value,profit
Customer ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
AA-10315102,918,6,544.656,90.776,63.220035
AA-10315120,0,1,2713.41,2713.41,314.954533
AA-10315139,319,13,2955.798,227.369077,343.089316
AA-103151402,483,6,4780.552,796.758667,554.894588
AA-103151404,553,3,753.508,251.169333,87.462182


In [16]:
customer_level_data['CLV']=(customer_level_data['avg_order_value']*purchase_frequency)/churn_rate

In [17]:
#Customer Lifetime Value
customer_level_data['cust_lifetime_value']=customer_level_data['CLV']*customer_level_data['profit']
customer_level_data.head()

Unnamed: 0_level_0,num_days,num_transactions,spent_money,avg_order_value,profit,CLV,cust_lifetime_value
Customer ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
AA-10315102,918,6,544.656,90.776,63.220035,767.667113,48531.94
AA-10315120,0,1,2713.41,2713.41,314.954533,22946.545573,7227119.0
AA-10315139,319,13,2955.798,227.369077,343.089316,1922.796365,659690.9
AA-103151402,483,6,4780.552,796.758667,554.894588,6737.964058,3738860.0
AA-103151404,553,3,753.508,251.169333,87.462182,2124.068443,185775.7


In [18]:
super_store_data2 = super_store_data.copy()

super_store_data2['month_yr'] = super_store_data2['Order Date'].apply(lambda x: x.strftime('%b-%Y'))
super_store_data2.head()

Unnamed: 0,Row ID,Order ID,Order Date,Ship Date,Ship Mode,Customer ID,Segment,Postal Code,City,State,...,Category,Sub-Category,Product Name,Sales,Quantity,Discount,Profit,Shipping Cost,Order Priority,month_yr
0,40098,CA-2014-AB10015140-41954,2014-11-11,2014-11-13,First Class,AB-100151402,Consumer,73120.0,Oklahoma City,Oklahoma,...,Technology,Phones,Samsung Convoy 3,221.98,2,0.0,62.1544,40.77,High,Nov-2014
1,26341,IN-2014-JR162107-41675,2014-02-05,2014-02-07,Second Class,JR-162107,Corporate,,Wollongong,New South Wales,...,Furniture,Chairs,"Novimex Executive Leather Armchair, Black",3709.395,9,0.1,-288.765,923.63,Critical,Feb-2014
2,25330,IN-2014-CR127307-41929,2014-10-17,2014-10-18,First Class,CR-127307,Consumer,,Brisbane,Queensland,...,Technology,Phones,"Nokia Smart Phone, with Caller ID",5175.171,9,0.1,919.971,915.49,Medium,Oct-2014
3,13524,ES-2014-KM1637548-41667,2014-01-28,2014-01-30,First Class,KM-1637548,Home Office,,Berlin,Berlin,...,Technology,Phones,"Motorola Smart Phone, Cordless",2892.51,5,0.1,-96.54,910.16,Medium,Jan-2014
4,47221,SG-2014-RH9495111-41948,2014-11-05,2014-11-06,Same Day,RH-9495111,Consumer,,Dakar,Dakar,...,Technology,Copiers,"Sharp Wireless Fax, High-Speed",2832.96,8,0.0,311.52,903.04,Critical,Nov-2014


In [19]:
sale=super_store_data2.pivot_table(index=['Customer ID'],columns=['month_yr'],values='Sales',aggfunc='sum',fill_value=0).reset_index()
sale.head()


month_yr,Customer ID,Apr-2012,Apr-2013,Apr-2014,Apr-2015,Aug-2012,Aug-2013,Aug-2014,Aug-2015,Dec-2012,...,Nov-2014,Nov-2015,Oct-2012,Oct-2013,Oct-2014,Oct-2015,Sep-2012,Sep-2013,Sep-2014,Sep-2015
0,AA-10315102,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,AA-10315120,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,AA-10315139,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2575.23,0.0,...,164.7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,185.88,0.0
3,AA-103151402,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,AA-103151404,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,26.96,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
sale['CLV']=sale.iloc[:,2:].sum(axis=1)
sale.head()

month_yr,Customer ID,Apr-2012,Apr-2013,Apr-2014,Apr-2015,Aug-2012,Aug-2013,Aug-2014,Aug-2015,Dec-2012,...,Nov-2015,Oct-2012,Oct-2013,Oct-2014,Oct-2015,Sep-2012,Sep-2013,Sep-2014,Sep-2015,CLV
0,AA-10315102,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,544.656
1,AA-10315120,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2713.41
2,AA-10315139,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2575.23,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,185.88,0.0,2955.798
3,AA-103151402,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4780.552
4,AA-103151404,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,26.96,0.0,0.0,0.0,0.0,0.0,0.0,753.508


In [21]:
super_store_data2["Order Date"].max()

Timestamp('2015-12-31 00:00:00')

In [22]:
# Selecting features as the latest 6 month data

X2=sale[['Dec-2015','Nov-2015', 'Oct-2015','Sep-2015','Aug-2015','Jul-2015']]
y=sale[['CLV']]

## CLV prediction based on the latest 6 month sales data:

Here, I am performing predictive modeling on the customer lifetime value instead of the sales expected in next 90 days

#### Performing CLV predictive modelling:

##### XGBoost

In [23]:
xgb = XGBRegressor(objective='reg:squarederror', random_state=123)

In [24]:
xgb =GridSearchCV(estimator=xgb, param_grid = dict(learning_rate = [0.01, 0.1, 0.3, 0.5]), scoring='neg_mean_absolute_error', refit= True, cv = 5) 

In [25]:
xgb.fit(X2, y)

GridSearchCV(cv=5,
             estimator=XGBRegressor(base_score=None, booster=None,
                                    colsample_bylevel=None,
                                    colsample_bynode=None,
                                    colsample_bytree=None,
                                    enable_categorical=False, gamma=None,
                                    gpu_id=None, importance_type=None,
                                    interaction_constraints=None,
                                    learning_rate=None, max_delta_step=None,
                                    max_depth=None, min_child_weight=None,
                                    missing=nan, monotone_constraints=None,
                                    n_estimators=100, n_jobs=None,
                                    num_parallel_tree=None, predictor=None,
                                    random_state=123, reg_alpha=None,
                                    reg_lambda=None, scale_pos_weight=None,
        

In [26]:
xgb.best_score_

-553.2658364061244

In [27]:
xgb.best_params_

{'learning_rate': 0.01}

In [28]:
xgb.best_estimator_

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.01, max_delta_step=0,
             max_depth=6, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=100, n_jobs=8,
             num_parallel_tree=1, predictor='auto', random_state=123,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [29]:
X2.merge(sale["Customer ID"],left_index=True, right_index=True)

Unnamed: 0,Dec-2015,Nov-2015,Oct-2015,Sep-2015,Aug-2015,Jul-2015,Customer ID
0,0.0,0.0,0.0,0.0,0.00,0.0,AA-10315102
1,0.0,0.0,0.0,0.0,0.00,0.0,AA-10315120
2,0.0,0.0,0.0,0.0,2575.23,0.0,AA-10315139
3,0.0,0.0,0.0,0.0,0.00,0.0,AA-103151402
4,0.0,0.0,0.0,0.0,0.00,0.0,AA-103151404
...,...,...,...,...,...,...,...
17410,0.0,0.0,0.0,0.0,0.00,0.0,ZD-2192548
17411,0.0,0.0,0.0,0.0,0.00,0.0,ZD-2192564
17412,0.0,0.0,0.0,0.0,0.00,0.0,ZD-219257
17413,0.0,0.0,0.0,0.0,0.00,0.0,ZD-2192582


In [30]:
predictions_xgb_reg_6model = xgb.predict(X2)


In [31]:
len(predictions_xgb_reg_6model[predictions_xgb_reg_6model>0])

17415

In [32]:
pd.DataFrame({"prediction_xgb_6m" : predictions_xgb_reg_6model.tolist()}).merge(X2,left_index=True, right_index=True).merge(sale["Customer ID"],left_index=True, right_index=True)

Unnamed: 0,prediction_xgb_6m,Dec-2015,Nov-2015,Oct-2015,Sep-2015,Aug-2015,Jul-2015,Customer ID
0,397.932434,0.0,0.0,0.0,0.0,0.00,0.0,AA-10315102
1,397.932434,0.0,0.0,0.0,0.0,0.00,0.0,AA-10315120
2,2416.921875,0.0,0.0,0.0,0.0,2575.23,0.0,AA-10315139
3,397.932434,0.0,0.0,0.0,0.0,0.00,0.0,AA-103151402
4,397.932434,0.0,0.0,0.0,0.0,0.00,0.0,AA-103151404
...,...,...,...,...,...,...,...,...
17410,397.932434,0.0,0.0,0.0,0.0,0.00,0.0,ZD-2192548
17411,397.932434,0.0,0.0,0.0,0.0,0.00,0.0,ZD-2192564
17412,397.932434,0.0,0.0,0.0,0.0,0.00,0.0,ZD-219257
17413,397.932434,0.0,0.0,0.0,0.0,0.00,0.0,ZD-2192582


In [33]:
pd.DataFrame({"prediction_xgb_6m" : predictions_xgb_reg_6model.tolist()}).merge(X2,left_index=True, right_index=True).merge(sale["Customer ID"]
                                                                                                                            ,left_index=True, right_index=True)\
.to_csv("XGB_reg_6m_prediction.csv")

##### Linear regression

In [34]:

lr = LinearRegression()

In [35]:
parameters = {'fit_intercept':[True,False],  'copy_X':[True, False]}
lr_reg_model = GridSearchCV(lr,parameters, cv=5, scoring='neg_mean_absolute_error')
lr_reg_model.fit(X2, y)

GridSearchCV(cv=5, estimator=LinearRegression(),
             param_grid={'copy_X': [True, False],
                         'fit_intercept': [True, False]},
             scoring='neg_mean_absolute_error')

In [36]:
lr_reg_model.best_score_

-575.0095331297374

In [37]:
lr_reg_model.best_params_

{'copy_X': True, 'fit_intercept': False}

In [38]:
predictions_lr_reg_6model = lr_reg_model.predict(X2).reshape(-1)
predictions_lr_reg_6model

array([   0.        ,    0.        , 3157.10327512, ...,    0.        ,
          0.        ,    0.        ])

In [39]:

pd.DataFrame({"prediction_lr_6m" : predictions_lr_reg_6model.tolist()}).merge(X2,left_index=True, right_index=True).merge(sale["Customer ID"],left_index=True, right_index=True)

Unnamed: 0,prediction_lr_6m,Dec-2015,Nov-2015,Oct-2015,Sep-2015,Aug-2015,Jul-2015,Customer ID
0,0.000000,0.0,0.0,0.0,0.0,0.00,0.0,AA-10315102
1,0.000000,0.0,0.0,0.0,0.0,0.00,0.0,AA-10315120
2,3157.103275,0.0,0.0,0.0,0.0,2575.23,0.0,AA-10315139
3,0.000000,0.0,0.0,0.0,0.0,0.00,0.0,AA-103151402
4,0.000000,0.0,0.0,0.0,0.0,0.00,0.0,AA-103151404
...,...,...,...,...,...,...,...,...
17410,0.000000,0.0,0.0,0.0,0.0,0.00,0.0,ZD-2192548
17411,0.000000,0.0,0.0,0.0,0.0,0.00,0.0,ZD-2192564
17412,0.000000,0.0,0.0,0.0,0.0,0.00,0.0,ZD-219257
17413,0.000000,0.0,0.0,0.0,0.0,0.00,0.0,ZD-2192582


In [40]:
len(predictions_lr_reg_6model[predictions_lr_reg_6model>0])

4914

In [41]:

pd.DataFrame({"prediction_lr_6m" : predictions_lr_reg_6model.tolist()}).merge(X2,left_index=True, right_index=True)\
.merge(sale["Customer ID"],left_index=True, right_index=True)\
.to_csv("lr_reg_6m_prediction.csv")

##### Randomforest

In [42]:
rf = RandomForestRegressor()
param_grid = dict(n_estimators = [10, 20, 50, 100, 500, 1000], max_depth = [2, 5], max_features = ['auto', 'sqrt', 'log2'])
rf_regressor =GridSearchCV(estimator=rf, param_grid = param_grid, scoring='neg_mean_absolute_error', refit= True, cv = 5) 

In [43]:
rf_regressor.fit(X2, y.values.ravel())

GridSearchCV(cv=5, estimator=RandomForestRegressor(),
             param_grid={'max_depth': [2, 5],
                         'max_features': ['auto', 'sqrt', 'log2'],
                         'n_estimators': [10, 20, 50, 100, 500, 1000]},
             scoring='neg_mean_absolute_error')

In [44]:
rf_regressor.best_score_

-618.8253810675002

In [45]:
predictions_rf_reg_6model = rf_regressor.predict(X2)

In [46]:
len(predictions_rf_reg_6model[predictions_rf_reg_6model>0])

17415

In [47]:

pd.DataFrame({"prediction_rf_6m" : predictions_rf_reg_6model.tolist()}).merge(X2,left_index=True, right_index=True).merge(sale["Customer ID"],left_index=True, right_index=True)

Unnamed: 0,prediction_rf_6m,Dec-2015,Nov-2015,Oct-2015,Sep-2015,Aug-2015,Jul-2015,Customer ID
0,658.033047,0.0,0.0,0.0,0.0,0.00,0.0,AA-10315102
1,658.033047,0.0,0.0,0.0,0.0,0.00,0.0,AA-10315120
2,3615.848335,0.0,0.0,0.0,0.0,2575.23,0.0,AA-10315139
3,658.033047,0.0,0.0,0.0,0.0,0.00,0.0,AA-103151402
4,658.033047,0.0,0.0,0.0,0.0,0.00,0.0,AA-103151404
...,...,...,...,...,...,...,...,...
17410,658.033047,0.0,0.0,0.0,0.0,0.00,0.0,ZD-2192548
17411,658.033047,0.0,0.0,0.0,0.0,0.00,0.0,ZD-2192564
17412,658.033047,0.0,0.0,0.0,0.0,0.00,0.0,ZD-219257
17413,658.033047,0.0,0.0,0.0,0.0,0.00,0.0,ZD-2192582


In [48]:
pd.DataFrame({"prediction_rf_6m" : predictions_rf_reg_6model.tolist()}).merge(X2,left_index=True, right_index=True)\
.merge(sale["Customer ID"],left_index=True, right_index=True)\
.to_csv("rf_reg_6m_prediction.csv")