In [1]:
# Import the dependencies
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Read the CSV into a DataFrame
df = pd.read_csv("/content/sales.csv")
df.tail(50)

Unnamed: 0,PageVisitsCat1,PageVisitDurationCat1,PageVisitsCat2,PageVisitDurationCat2,PageVisitsCat3,PageVisitDurationCat3,BounceRates,ExitRates,PageValues,HowCloseToSpecialDay,Month,VisitorType,Weekend,Revenue
12233,1.0,73.75,0.0,0.0,45.0,2674.339216,0.00087,0.020139,0.0,0.0,Nov,Returning_Visitor,False,False
12234,1.0,88.0,3.0,540.333333,57.0,1656.410714,0.0,0.00226,0.0,0.0,Nov,Returning_Visitor,False,False
12235,0.0,0.0,0.0,0.0,125.0,7453.766667,0.001626,0.013132,0.0,0.0,Nov,Returning_Visitor,False,False
12236,0.0,0.0,0.0,0.0,37.0,854.55,0.005405,0.023166,0.0,0.0,Dec,Returning_Visitor,False,False
12237,0.0,0.0,0.0,0.0,45.0,2650.416667,0.013333,0.034074,0.0,0.0,Dec,Returning_Visitor,True,False
12238,6.0,369.333333,2.0,225.5,133.0,3918.363736,0.0,0.009275,7.147604,0.0,Nov,Returning_Visitor,False,False
12239,0.0,0.0,0.0,0.0,4.0,197.416667,0.016667,0.0875,0.0,0.0,Nov,Returning_Visitor,False,False
12240,8.0,167.910714,6.0,547.75,111.0,6340.152381,0.003361,0.009432,44.219794,0.0,Dec,Returning_Visitor,False,False
12241,3.0,100.0,0.0,0.0,27.0,730.791667,0.016,0.027556,0.0,0.0,Nov,Returning_Visitor,False,False
12242,0.0,0.0,0.0,0.0,3.0,88.5,0.0,0.033333,0.0,0.0,Nov,Returning_Visitor,True,False


In [3]:
# List the columns
for col in df.columns:
    print(col)

PageVisitsCat1
PageVisitDurationCat1
PageVisitsCat2
PageVisitDurationCat2
PageVisitsCat3
PageVisitDurationCat3
BounceRates
ExitRates
PageValues
HowCloseToSpecialDay
Month
VisitorType
Weekend
Revenue


### Description of columns:

**PageVisitsCatX:** Number of pages visited by a visitor. Cat1 is administrative, Cat2 is informational, and Cat3 is product related.

**PageVisitDurationCatX:** How long a visitor stayed on the category of pages.

**BounceRates:** Percentage of visitors who landed and exited a page.

**ExitRates:** Percentage of visitors who left the site from that page.

**PageValues:** A measurement of a page's contribution to a sale.

**HowCloseToSpecialDay:** How close is browsing date to a special day or a holiday, such as Valentine's day? Higher numbers are closer.

**Month:** Month of visit.

**VisitorType:** Is the visitor a new or returning visitor?

**Weekend:** Did the visit occur on a weekend day?

**Revenue:** Did the visit conclude in a sale?

## Perform Data Analysis

In [4]:
# Retrieve the number visits for each month.
Number_visit = df.groupby(['PageVisitsCat1', 'PageVisitsCat2', 'PageVisitsCat3']).Month.count().reset_index()
Number_visit

Unnamed: 0,PageVisitsCat1,PageVisitsCat2,PageVisitsCat3,Month
0,0.0,0.0,0.0,5
1,0.0,0.0,1.0,547
2,0.0,0.0,2.0,370
3,0.0,0.0,3.0,331
4,0.0,0.0,4.0,281
...,...,...,...,...
3126,24.0,5.0,34.0,1
3127,24.0,6.0,178.0,1
3128,24.0,6.0,189.0,1
3129,26.0,9.0,183.0,1


In [5]:
# Retrieve the number of visits that resulted in a purchase or not.
purchase_not = df.groupby(['PageVisitsCat3']).Revenue.sum().reset_index()
purchase_not

Unnamed: 0,PageVisitsCat3,Revenue
0,0.0,6
1,1.0,13
2,2.0,20
3,3.0,25
4,4.0,18
...,...,...
306,518.0,0
307,534.0,1
308,584.0,0
309,686.0,0


In [6]:
# What percentage of visits resulted in a purchase?
df['purchase'] = df.Revenue.notnull()
per_purchase = df.groupby(['PageVisitsCat3']).purchase.apply(lambda x:100 * x / float(x.sum()))
per_purchase

0        0.168919
1        0.217391
2        0.217391
3        0.303030
4        0.458716
           ...   
12278    1.666667
12279    0.263158
12280    0.253165
12281    0.370370
12282    0.219298
Name: purchase, Length: 12283, dtype: float64

In [7]:
# Filter the DataFrame for all purchases.
df.filter(["PageVisitsCat3","Month", "purchase"]).head()

Unnamed: 0,PageVisitsCat3,Month,purchase
0,1.0,Feb,True
1,2.0,Feb,True
2,2.0,Feb,True
3,10.0,Feb,True
4,19.0,Feb,True


In [8]:
# What is the number of visits for each month that resulted in a purchase?
Num_vis_purchase = df.groupby(['PageVisitsCat3', 'purchase']).Month.count().reset_index()
Num_vis_purchase

Unnamed: 0,PageVisitsCat3,purchase,Month
0,0.0,True,35
1,1.0,True,592
2,2.0,True,460
3,3.0,True,456
4,4.0,True,400
...,...,...,...
306,518.0,True,1
307,534.0,True,1
308,584.0,True,1
309,686.0,True,1


In [9]:
# Which month had the greatest number of visits where a purchase was made?
Max_vst = df.groupby(['PageVisitsCat3', 'purchase']).Month.max().reset_index()
Max_vst

Unnamed: 0,PageVisitsCat3,purchase,Month
0,0.0,True,Sep
1,1.0,True,Sep
2,2.0,True,Sep
3,3.0,True,Sep
4,4.0,True,Sep
...,...,...,...
306,518.0,True,Oct
307,534.0,True,Nov
308,584.0,True,Nov
309,686.0,True,Aug


In [10]:
# How many visits were from returning visitors?

df_ret = df.groupby(['PageVisitsCat1', 'PageVisitsCat2', 'PageVisitsCat3'])['VisitorType'].apply(lambda x: (x=='Returning_Visitor').count()).reset_index(name='Returning_Visitor')
df_ret

Unnamed: 0,PageVisitsCat1,PageVisitsCat2,PageVisitsCat3,Returning_Visitor
0,0.0,0.0,0.0,5
1,0.0,0.0,1.0,547
2,0.0,0.0,2.0,370
3,0.0,0.0,3.0,331
4,0.0,0.0,4.0,281
...,...,...,...,...
3126,24.0,5.0,34.0,1
3127,24.0,6.0,178.0,1
3128,24.0,6.0,189.0,1
3129,26.0,9.0,183.0,1


In [11]:
# How many visits were from new visitors?
new_vis = df.groupby(['PageVisitsCat1', 'PageVisitsCat2', 'PageVisitsCat3'])['VisitorType'].apply(lambda x: (x=='New_Visitor').count()).reset_index(name='New_Visitor')
new_vis

Unnamed: 0,PageVisitsCat1,PageVisitsCat2,PageVisitsCat3,New_Visitor
0,0.0,0.0,0.0,5
1,0.0,0.0,1.0,547
2,0.0,0.0,2.0,370
3,0.0,0.0,3.0,331
4,0.0,0.0,4.0,281
...,...,...,...,...
3126,24.0,5.0,34.0,1
3127,24.0,6.0,178.0,1
3128,24.0,6.0,189.0,1
3129,26.0,9.0,183.0,1


In [12]:
# How many visits took place on a weekday? And, on the weekend?
new_vis = df.groupby(['PageVisitsCat1', 'PageVisitsCat2', 'PageVisitsCat3'])['Weekend'].apply(lambda x: ((x=='True') & (x=='False')).count()).reset_index(name='Week_vst')
new_vis

Unnamed: 0,PageVisitsCat1,PageVisitsCat2,PageVisitsCat3,Week_vst
0,0.0,0.0,0.0,5
1,0.0,0.0,1.0,547
2,0.0,0.0,2.0,370
3,0.0,0.0,3.0,331
4,0.0,0.0,4.0,281
...,...,...,...,...
3126,24.0,5.0,34.0,1
3127,24.0,6.0,178.0,1
3128,24.0,6.0,189.0,1
3129,26.0,9.0,183.0,1


## Perform Logistic Regression 

In [24]:
# Convert categorical variables to binary variables 
df2 = pd.get_dummies(df)
df2

Unnamed: 0,PageVisitsCat1,PageVisitDurationCat1,PageVisitsCat2,PageVisitDurationCat2,PageVisitsCat3,PageVisitDurationCat3,BounceRates,ExitRates,PageValues,HowCloseToSpecialDay,...,Month_Jul,Month_June,Month_Mar,Month_May,Month_Nov,Month_Oct,Month_Sep,VisitorType_New_Visitor,VisitorType_Other,VisitorType_Returning_Visitor
0,0.0,0.0,0.0,0.0,1.0,0.000000,0.200000,0.200000,0.000000,0.0,...,0,0,0,0,0,0,0,0,0,1
1,0.0,0.0,0.0,0.0,2.0,64.000000,0.000000,0.100000,0.000000,0.0,...,0,0,0,0,0,0,0,0,0,1
2,0.0,0.0,0.0,0.0,2.0,2.666667,0.050000,0.140000,0.000000,0.0,...,0,0,0,0,0,0,0,0,0,1
3,0.0,0.0,0.0,0.0,10.0,627.500000,0.020000,0.050000,0.000000,0.0,...,0,0,0,0,0,0,0,0,0,1
4,0.0,0.0,0.0,0.0,19.0,154.216667,0.015789,0.024561,0.000000,0.0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12278,3.0,145.0,0.0,0.0,53.0,1783.791667,0.007143,0.029031,12.241717,0.0,...,0,0,0,0,0,0,0,0,0,1
12279,0.0,0.0,0.0,0.0,5.0,465.750000,0.000000,0.021333,0.000000,0.0,...,0,0,0,0,1,0,0,0,0,1
12280,0.0,0.0,0.0,0.0,6.0,184.250000,0.083333,0.086667,0.000000,0.0,...,0,0,0,0,1,0,0,0,0,1
12281,4.0,75.0,0.0,0.0,15.0,346.000000,0.000000,0.021053,0.000000,0.0,...,0,0,0,0,1,0,0,0,0,1


In [25]:
# List the columns
for col in df2:
  print(col)

PageVisitsCat1
PageVisitDurationCat1
PageVisitsCat2
PageVisitDurationCat2
PageVisitsCat3
PageVisitDurationCat3
BounceRates
ExitRates
PageValues
HowCloseToSpecialDay
Weekend
Revenue
purchase
Month_Aug
Month_Dec
Month_Feb
Month_Jul
Month_June
Month_Mar
Month_May
Month_Nov
Month_Oct
Month_Sep
VisitorType_New_Visitor
VisitorType_Other
VisitorType_Returning_Visitor


In [26]:
# Separate the features and target variables.
y=df2["Revenue"]
X=df2.drop(columns="Revenue")

In [27]:
# Split the dataset into training and testing sets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test= train_test_split(X, 
                                                   y, 
                                                   random_state=1, 
                                                   stratify=y)

In [29]:
print(X_train.shape)
print(X_test.shape)
print(X.shape)

# Instantiate a logistic regression model
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(solver='lbfgs', random_state=1)
classifier

(9212, 25)
(3071, 25)
(12283, 25)


LogisticRegression(random_state=1)

In [30]:
# Train the model
classifier.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression(random_state=1)

In [31]:
# Use the testing data to make predictions.
predictions = classifier.predict(X_test)

pd.DataFrame({"Prediction": predictions, "Actual": y_test}).reset_index(drop=True)

Unnamed: 0,Prediction,Actual
0,False,False
1,False,False
2,False,False
3,False,False
4,False,False
...,...,...
3066,False,False
3067,False,False
3068,False,True
3069,False,False


In [32]:
# Calculate the accuracy. 
print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")


Training Data Score: 0.8797221016066001
Testing Data Score: 0.8870074894171279


In [33]:
# Print the confusion matrix.

from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, predictions)


array([[2541,   53],
       [ 294,  183]])

In [34]:
# Print a classification report
from sklearn.metrics import classification_report
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

       False       0.90      0.98      0.94      2594
        True       0.78      0.38      0.51       477

    accuracy                           0.89      3071
   macro avg       0.84      0.68      0.72      3071
weighted avg       0.88      0.89      0.87      3071



## Overall Model Performance
----

- **Question:** Can the logistic regression model predict whether visiting a website will result in a purchase? 

- **Answer:**  Yes, because of the accuracy of the model, which is 89%.
