### SVM - Binary Classifier

In [None]:
# svm can categorize our leads target into 2 classes of yes or no leads
# how do we draw our boundary line? 
# soft margins for errors in classifying as yes/no lead?


In [1]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
import pandas as pd
import tensorflow as tf
import pandas as pd

# Import and read the csv.
# # b1 view
df = pd.read_csv("../ML_Data_&_Preprocessing/b1_df_nonEncoded.csv", index_col=[0])
df.head()

Unnamed: 0,state,spend,Impressions,leads,link_clicks,reach,Agency Tiers
0,TN,8.96,492,0,0,483,
1,GA,15.92,797,0,3,773,
2,MO,11.16,836,1,2,798,
3,RI,8.8,759,0,2,729,
4,CA,5.83,500,0,0,476,


In [2]:
df['leads'].value_counts()

0    57153
1    45676
Name: leads, dtype: int64

In [3]:
# check on different state values for testing 
df['state'].value_counts()

TX    12200
CA     9216
OH     7480
TN     5689
FL     4675
CO     4518
IL     4425
PA     4360
MO     4318
NJ     3974
NC     3823
MI     3791
AZ     3660
GA     2751
MN     2409
WA     2303
WI     2004
NY     1978
OR     1800
MA     1737
LA     1603
CT     1491
IN     1310
KY     1242
VA     1148
KS     1098
IA      927
RI      908
MS      873
ME      747
NM      707
OK      613
SC      565
UT      451
ID      392
ND      387
AR      368
NV      338
AL      290
NE      260
Name: state, dtype: int64

In [48]:
df2 = df.loc[df['state']== 'NE']
df2

Unnamed: 0,state,spend,Impressions,leads,link_clicks,reach,Agency Tiers
42405,NE,31.53,1552,1,12,1425,
42773,NE,32.75,1587,1,4,1477,
43645,NE,32.21,1740,0,5,1628,
43857,NE,32.17,1655,1,13,1542,
44050,NE,31.85,1570,1,8,1512,
...,...,...,...,...,...,...,...
99048,NE,37.72,2418,0,32,2130,
99066,NE,20.37,2170,0,8,1540,
99119,NE,31.31,2449,0,8,1727,
99121,NE,30.07,3100,0,8,1853,


In [49]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 260 entries, 42405 to 99155
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   state         260 non-null    object 
 1   spend         260 non-null    float64
 2   Impressions   260 non-null    int64  
 3   leads         260 non-null    int64  
 4   link_clicks   260 non-null    int64  
 5   reach         260 non-null    int64  
 6   Agency Tiers  0 non-null      float64
dtypes: float64(2), int64(4), object(1)
memory usage: 16.2+ KB


In [50]:
df3 = df2.drop(df2.columns[[0,6]], axis=1)

In [51]:
df3.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 260 entries, 42405 to 99155
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   spend        260 non-null    float64
 1   Impressions  260 non-null    int64  
 2   leads        260 non-null    int64  
 3   link_clicks  260 non-null    int64  
 4   reach        260 non-null    int64  
dtypes: float64(1), int64(4)
memory usage: 12.2 KB


### Seperate features, x and target, y

In [52]:
# index, with iloc document for only portion of my rows to test
y = df3["leads"]
X = df3.drop(columns="leads")

### Split into train and test features

In [53]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,
   y,  random_state=1, stratify=y)
X_train.shape

(195, 4)

In [54]:
from sklearn.preprocessing import MinMaxScaler
scaling = MinMaxScaler(feature_range=(-1,1)).fit(X_train)
X_train = scaling.transform(X_train)
X_test = scaling.transform(X_test)

### Create SVM instance 

In [55]:
from sklearn.svm import SVC
model = SVC(kernel='linear')

### Fit/train or model using training data

In [56]:
model.fit(X_train, y_train)

SVC(kernel='linear')

### Make predictions

In [57]:
y_pred = model.predict(X_test)
results = pd.DataFrame({
   "Prediction": y_pred,
   "Actual": y_test
}).reset_index(drop=True)
results.head()

Unnamed: 0,Prediction,Actual
0,0,1
1,0,0
2,0,0
3,0,0
4,0,0


### Validate

In [58]:
# accuracy score
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)

0.9230769230769231

In [59]:
# confusion matrix
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

array([[60,  0],
       [ 5,  0]], dtype=int64)

In [60]:
# classification report
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.92      1.00      0.96        60
           1       0.00      0.00      0.00         5

    accuracy                           0.92        65
   macro avg       0.46      0.50      0.48        65
weighted avg       0.85      0.92      0.89        65



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [61]:
data_results_b1 = {
        'State': ['KY', 'OH', 'MO', 'NE'],
        'Predictive Accuracy': [.98, .90, .75, .92],
        'Tier': ['T2', 'T1', 'T3', 'T4']}
data_results_b1

{'State': ['KY', 'OH', 'MO', 'NE'],
 'Predictive Accuracy': [0.98, 0.9, 0.75, 0.92],
 'Tier': ['T2', 'T1', 'T3', 'T4']}

In [62]:
# create the new df to display brand, state, and model accuracy 
df_results_b1 = pd.DataFrame(data_results_b1)
df_results_b1

Unnamed: 0,State,Predictive Accuracy,Tier
0,KY,0.98,T2
1,OH,0.9,T1
2,MO,0.75,T3
3,NE,0.92,T4


In [63]:
df_results_b1.to_csv('b1_resultsSVM.csv')