In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import joblib
import warnings
warnings.filterwarnings('ignore')
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [2]:
# Load the raw export and discard the session, identity, location, product,
# payment and return columns in one chained step. What remains is Gender, Age,
# CreditScore, MonthlyIncome, Country, the campaign column and OrderConfirmation.
df = pd.read_csv("Customer360Insights.csv").drop(
    columns=[
        'SessionStart', 'CustomerID', 'FullName', 'State', 'City', 'Category',
        'Product', 'Cost', 'Price', 'Quantity', 'CartAdditionTime', 'OrderConfirmationTime',
        'PaymentMethod', 'SessionEnd', 'OrderReturn', 'ReturnReason',
    ]
)

In [3]:
# Preview the rows ordered by ascending monthly income (returns a new frame; `df` is not modified).
df.sort_values('MonthlyIncome')



Unnamed: 0,Gender,Age,CreditScore,MonthlyIncome,Country,CampaignSchema,OrderConfirmation
664,Female,37,732,3001,China,Google-ads,True
1272,Female,37,732,3001,China,Billboard-QR code,True
8,Male,59,609,3003,Spain,Billboard-QR code,True
1308,Male,59,609,3003,Spain,Google-ads,False
1531,Male,35,753,3008,Canada,E-mails,True
...,...,...,...,...,...,...,...
1323,Female,40,626,7997,China,Billboard-QR code,True
579,Female,40,626,7997,China,Instagram-ads,True
1929,Male,39,627,7998,Japan,Billboard-QR code,True
1152,Male,39,627,7998,Japan,Instagram-ads,True


In [4]:
# Integer codes used for the categorical inputs:
#   Gender:            {'Female': 0, 'Male': 1}
#   Country:           {'Australia': 0, 'Canada': 1, 'China': 2, 'India': 3, 'Italy': 4,
#                       'Japan': 5, 'Spain': 6, 'UK': 7, 'USA': 8}
#   CampaignSchema:    {'Billboard-QR code': 0, 'E-mails': 1, 'Facebook-ads': 2,
#                       'Google-ads': 3, 'Instagram-ads': 4, 'Twitter-ads': 5}
#   OrderConfirmation: {'False': 0, 'True': 1}
import itertools

# Candidate values exposed in Power BI. Categorical inputs are given as the
# integer codes listed above; numeric inputs are a handful of representative values.
gender = np.arange(2, dtype=np.int32)
age = list(range(20, 61, 10))
credit = [600, 640, 680, 720, 780]
inc = list(range(3500, 7501, 1000))
country = np.arange(9, dtype=np.int32)
camp = np.arange(6, dtype=np.int32)

# Cartesian product of all candidate values: one tuple per combination,
# ordered (gender, age, credit, inc, country, camp) with the last field varying fastest.
pbi_test = [combo for combo in itertools.product(gender, age, credit, inc, country, camp)]

In [5]:
# Turn the list of combinations into a frame whose columns are named after
# every column of `df` except the target, in the same order.
pbi_test = pd.DataFrame(pbi_test, columns=df.columns.drop('OrderConfirmation'))
pbi_test

Unnamed: 0,Gender,Age,CreditScore,MonthlyIncome,Country,CampaignSchema
0,0,20,600,3500,0,0
1,0,20,600,3500,0,1
2,0,20,600,3500,0,2
3,0,20,600,3500,0,3
4,0,20,600,3500,0,4
...,...,...,...,...,...,...
13495,1,60,780,7500,8,1
13496,1,60,780,7500,8,2
13497,1,60,780,7500,8,3
13498,1,60,780,7500,8,4


In [10]:
# Label-encode the categorical columns, split the data, and fit the scaler that
# is then applied to the Power BI grid.
label = LabelEncoder()
# Work on a copy: a plain `df_labeled = df` binds the same object, so the loop
# below would overwrite the raw frame in place.
df_labeled = df.copy()

# Every column except the three numeric ones is cast to str and label-encoded.
# LabelEncoder numbers the distinct values in sorted order, which is what the
# hard-coded integer codes of the Power BI grid rely on.
for i in df.drop(['Age', 'CreditScore', 'MonthlyIncome'], axis=1):
    df_labeled[i] = label.fit_transform(df_labeled[i].astype('str'))

X = df_labeled.drop(['OrderConfirmation'], axis= 1)
y = df_labeled['OrderConfirmation']
# Fixed seed: without it the split (and therefore the scaler's mean/std and the
# exported probabilities) changes on every run.
# NOTE(review): the loaded model was presumably trained on data scaled with its
# own split — confirm the seed matches, or persist and load the training scaler.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

scaler = StandardScaler()
# fit_transform fits the scaler on the training rows and returns the scaled
# matrix, so the variable name now matches its content (scaler.fit alone
# returns the scaler object itself).
X_train_scaled = scaler.fit_transform(X_train)
# Scale the Power BI grid with the statistics learned from the training rows.
pbi_test_scaled = scaler.transform(pbi_test)

In [12]:
# Load the model object stored in 'LRmodel.joblib' (presumably a fitted
# logistic-regression classifier, judging by the file name — verify).
# NOTE(review): joblib files are pickle-based, and loading one can execute
# arbitrary code; only load files that come from a trusted source.
model = joblib.load('LRmodel.joblib')

In [13]:
# Score every row of the scaled grid and label the two probability columns.
# NOTE(review): the column labels assume the model's classes are ordered
# (0 = no order, 1 = order) — confirm against model.classes_.
probabilities = pd.DataFrame(
    model.predict_proba(pbi_test_scaled),
    columns=['No Order', 'Order'],
)
probabilities

Unnamed: 0,No Order,Order
0,0.563485,0.436515
1,0.566994,0.433006
2,0.570497,0.429503
3,0.573992,0.426008
4,0.577480,0.422520
...,...,...
13495,0.329276,0.670724
13496,0.332437,0.667563
13497,0.335614,0.664386
13498,0.338805,0.661195


In [14]:
# Show the grid rows from lowest to highest 'No Order' probability,
# i.e. most likely orders first (display only; `probabilities` is unchanged).
probabilities.sort_values('No Order')

Unnamed: 0,No Order,Order
6744,0.254086,0.745914
6745,0.256802,0.743198
6746,0.259537,0.740463
6738,0.260593,0.739407
6747,0.262290,0.737710
...,...,...
6752,0.653639,0.346361
6761,0.655626,0.344374
6753,0.656865,0.343135
6754,0.660076,0.339924


In [15]:
# Human-readable names, listed in the same order as the integer codes.
lgender = ['Female', 'Male']
lcountry = ['Australia', 'Canada', 'China' , 'India', 'Italy', 'Japan', 'Spain', 'UK', 'USA']
lcamp = ['Billboard-QR code', 'E-mails', 'Facebook-ads', 'Google-ads', 'Instagram-ads', 'Twitter-ads']

# NOTE: plain assignment binds the same DataFrame object, so the replacements
# below also change `pbi_test` itself.
pbi_test_nolab = pbi_test

# Swap each integer code for its name, one categorical column at a time.
# The campaign column key is written with a trailing space, as in the rest of
# this notebook (presumably matching the CSV header — verify).
for column, codes, names in (
    ('Gender', gender, lgender),
    ('Country', country, lcountry),
    ('CampaignSchema ', camp, lcamp),
):
    pbi_test_nolab[column] = pbi_test[column].replace(codes, names)

pbi_test_nolab

Unnamed: 0,Gender,Age,CreditScore,MonthlyIncome,Country,CampaignSchema
0,Female,20,600,3500,Australia,Billboard-QR code
1,Female,20,600,3500,Australia,E-mails
2,Female,20,600,3500,Australia,Facebook-ads
3,Female,20,600,3500,Australia,Google-ads
4,Female,20,600,3500,Australia,Instagram-ads
...,...,...,...,...,...,...
13495,Male,60,780,7500,USA,E-mails
13496,Male,60,780,7500,USA,Facebook-ads
13497,Male,60,780,7500,USA,Google-ads
13498,Male,60,780,7500,USA,Instagram-ads


In [17]:
# Join the decoded grid with its probabilities (rounded to 3 decimals), column-wise.
# The decoded frame is referenced by its own name rather than through
# `pbi_test`, so this cell no longer depends on `pbi_test` having been
# modified in place by the decoding step.
pbi_csv = pd.concat([pbi_test_nolab, probabilities.round(3)], axis=1)
pbi_csv.head()

Unnamed: 0,Gender,Age,CreditScore,MonthlyIncome,Country,CampaignSchema,No Order,Order
0,Female,20,600,3500,Australia,Billboard-QR code,0.563,0.437
1,Female,20,600,3500,Australia,E-mails,0.567,0.433
2,Female,20,600,3500,Australia,Facebook-ads,0.57,0.43
3,Female,20,600,3500,Australia,Google-ads,0.574,0.426
4,Female,20,600,3500,Australia,Instagram-ads,0.577,0.423


In [19]:
# Write the combined table to disk for Power BI, without the row index column.
pbi_csv.to_csv(path_or_buf="LR_PBI_testval.csv", index=False)