### Logistic Reg Model - Binary Classifier


In [1]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
import pandas as pd
import tensorflow as tf
import pandas as pd

# Import and read the csv.
df = pd.read_csv('model_df1_official.csv', index_col=[0])
df.head()

Unnamed: 0,spend,Impressions,leads,State Tiers,Agency Tiers
770,7.28,427,0.0,4,4
771,13.32,1042,1.0,4,4
772,10.11,337,0.0,4,4
773,0.0,0,1.0,4,4
774,7.35,379,1.0,4,4


In [2]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 157672 entries, 770 to 160484
Data columns (total 5 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   spend         157672 non-null  float64
 1   Impressions   157672 non-null  int64  
 2   leads         157672 non-null  float64
 3   State Tiers   157672 non-null  int64  
 4   Agency Tiers  157672 non-null  int64  
dtypes: float64(2), int64(3)
memory usage: 7.2 MB


In [3]:
df2 = df.drop(columns=['State Tiers', 'Agency Tiers'])
df2

Unnamed: 0,spend,Impressions,leads
770,7.28,427,0.0
771,13.32,1042,1.0
772,10.11,337,0.0
773,0.00,0,1.0
774,7.35,379,1.0
...,...,...,...
160480,2.01,43,0.0
160481,1.38,31,0.0
160482,11.27,673,0.0
160483,11.34,800,0.0


In [4]:
# steps: 
# create the model with LogisticRegression()
# train the model with model.fit()
# make predictions with model.predict()
# validate with accuracy_score()

### Seperate features, x and target, y

In [5]:
y = df2["leads"]
X = df2.drop(columns="leads")

### Split into train and test features

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1, 
                                                    stratify=y)
X_train.shape

(118254, 2)

### Create Log Reg Model 

In [7]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(solver='lbfgs',
                                max_iter=200,
                                random_state=1)

### Fit/train or model using training data

In [8]:
classifier.fit(X_train, y_train)

LogisticRegression(max_iter=200, random_state=1)

### Make predictions

In [9]:
y_pred = classifier.predict(X_test)
results = pd.DataFrame({"Prediction": y_pred, "Actual": y_test}).reset_index(drop=True)
results.head(20)

Unnamed: 0,Prediction,Actual
0,1.0,1.0
1,1.0,1.0
2,1.0,0.0
3,1.0,0.0
4,1.0,0.0
5,1.0,1.0
6,1.0,1.0
7,1.0,1.0
8,1.0,1.0
9,1.0,0.0


### Validate

In [10]:
# accuracy score
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_pred))

0.5570551524684154


In [11]:
# confusion matrix
from sklearn.metrics import confusion_matrix, classification_report
matrix = confusion_matrix(y_test, y_pred)
print(matrix)
# TP = 0
# FP = 0
# FN = 17,460
# TN = 21,958

# precision = TP/(TP+FP), in this case a precision of 0 means what?
# sensitivity = TP/(TP+FN), in this case a score of 0 means what? = recall

[[    0 17460]
 [    0 21958]]


In [12]:
# classification report
report = classification_report(y_test, y_pred)
print(report)

### note: 
# ytest are the outcomes, either yes or no on leads column target 
# y_pred are the predictions
# do we read the accuracy score as 56% on predictions as a lead? 
# F1 = harmonic mean, takes sensitivity and precision = .72 here means what?

              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00     17460
         1.0       0.56      1.00      0.72     21958

    accuracy                           0.56     39418
   macro avg       0.28      0.50      0.36     39418
weighted avg       0.31      0.56      0.40     39418



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
