### SVM - Binary Classifier

In [None]:
# An SVM can classify our `leads` target into two classes: lead (yes) or no lead (no).
# Open questions:
#   - How is the decision boundary between the two classes drawn?
#   - Should we use a soft margin to tolerate some rows being misclassified as lead / no lead?


In [1]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
import pandas as pd
import tensorflow as tf
import pandas as pd

# Load the non-encoded b3 dataset, using the CSV's first column as the row index.
b3_csv_path = "../ML_Data_&_Preprocessing/b3_df_nonEncoded.csv"
df = pd.read_csv(b3_csv_path, index_col=[0])
df.head()

Unnamed: 0,state,spend,Impressions,leads,link_clicks,reach,Agency Tiers
0,IL,47.33,1780,0,7,1689,4.0
1,NY,63.65,1857,1,10,1737,4.0
2,OK,32.53,1718,0,7,1527,4.0
3,SC,32.31,1725,1,14,1645,4.0
4,CA,101.13,3745,1,28,3513,4.0


In [2]:
# Class balance of the binary `leads` target across the whole dataset.
df['leads'].value_counts()

1    21698
0    12272
Name: leads, dtype: int64

In [3]:
# Row count per state, used to pick which state's subset to model below.
df['state'].value_counts()

CA    7685
TX    4254
VA    3239
CO    2386
FL    1758
MI    1655
GA    1620
NY    1486
NC    1050
TN     970
WA     960
NJ     919
CT     893
OH     842
MO     777
MN     628
IL     605
MA     454
SC     365
OK     365
MD     357
DC     250
SD     246
NE     173
PA      33
Name: state, dtype: int64

In [47]:
# State whose rows are modelled in this run; change this single value to
# evaluate a different state.
STATE_CODE = 'DC'

# Keep only the rows belonging to the selected state.
df2 = df.loc[df['state'] == STATE_CODE]

# Fail early on a mistyped or absent state code instead of letting the
# train/test split raise a less obvious error on an empty frame later.
assert not df2.empty, f"no rows found for state {STATE_CODE!r}"
df2

Unnamed: 0,state,spend,Impressions,leads,link_clicks,reach,Agency Tiers
10533,DC,33.91,1532,0,15,1458,3.0
10543,DC,29.54,1625,1,7,1597,3.0
10546,DC,30.91,1227,0,11,1199,3.0
10550,DC,32.16,1538,1,8,1488,3.0
10576,DC,31.76,1542,1,12,1525,3.0
...,...,...,...,...,...,...,...
15224,DC,32.42,1310,1,5,1280,3.0
15226,DC,35.93,1623,1,7,1574,3.0
15254,DC,31.12,1641,1,11,1600,3.0
15266,DC,33.87,1303,1,6,1270,3.0


In [48]:
# Inspect column dtypes and non-null counts of the single-state subset.
df2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 250 entries, 10533 to 15275
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   state         250 non-null    object 
 1   spend         250 non-null    float64
 2   Impressions   250 non-null    int64  
 3   leads         250 non-null    int64  
 4   link_clicks   250 non-null    int64  
 5   reach         250 non-null    int64  
 6   Agency Tiers  250 non-null    float64
dtypes: float64(2), int64(4), object(1)
memory usage: 15.6+ KB


In [49]:
# Drop the `state` label and the `Agency Tiers` column (positions 0 and 6 in
# df2.info()). Selecting them by name keeps this cell correct even if the
# column order of the input CSV changes.
df3 = df2.drop(columns=['state', 'Agency Tiers'])

In [50]:
# Confirm that only the numeric modelling columns (features plus `leads`) remain.
df3.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 250 entries, 10533 to 15275
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   spend        250 non-null    float64
 1   Impressions  250 non-null    int64  
 2   leads        250 non-null    int64  
 3   link_clicks  250 non-null    int64  
 4   reach        250 non-null    int64  
dtypes: float64(1), int64(4)
memory usage: 11.7 KB


### Separate features (X) and target (y)

In [51]:
# The target is the binary `leads` column; every remaining column is a feature.
# (Earlier note: .iloc could be used here to experiment on only a portion of the rows.)
target_col = "leads"
X = df3.drop(columns=target_col)
y = df3[target_col]

### Split into train and test features

In [52]:
from sklearn.model_selection import train_test_split

# Stratified split with a fixed seed so the class ratio of `leads` is kept in
# both partitions and the split is reproducible.
splits = train_test_split(X, y, stratify=y, random_state=1)
X_train, X_test, y_train, y_test = splits
X_train.shape

(187, 4)

In [53]:
from sklearn.preprocessing import MinMaxScaler

# Learn the [-1, 1] min-max mapping from the training split only, then apply
# that same mapping to both splits.
scaling = MinMaxScaler(feature_range=(-1, 1))
scaling.fit(X_train)
X_train, X_test = scaling.transform(X_train), scaling.transform(X_test)

### Create SVM instance 

In [54]:
from sklearn.svm import SVC
# Support vector classifier with a linear kernel; every other hyperparameter
# is left at its library default.
# NOTE(review): the confusion matrix recorded further down shows every test row
# predicted as class 1 (no class-0 predictions). Consider evaluating
# class_weight='balanced' or a non-linear kernel — to be confirmed by experiment.
model = SVC(kernel='linear')

### Fit/train our model using training data

In [55]:
# Train the classifier on the scaled training features and their labels.
model.fit(X_train, y_train)

SVC(kernel='linear')

### Make predictions

In [56]:
# Predict on the held-out split and line the predictions up against the true labels.
y_pred = model.predict(X_test)
comparison = {"Prediction": y_pred, "Actual": y_test}
results = pd.DataFrame(comparison).reset_index(drop=True)
results.head()

Unnamed: 0,Prediction,Actual
0,1,1
1,1,1
2,1,1
3,1,1
4,1,1


### Validate

In [57]:
from sklearn.metrics import accuracy_score

# Share of test rows whose predicted label matches the actual label.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.7619047619047619

In [58]:
from sklearn.metrics import confusion_matrix

# Counts of actual (rows) versus predicted (columns) labels on the test split.
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[ 0, 15],
       [ 0, 48]], dtype=int64)

In [59]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 on the test split.
# The model never predicts class 0 here, so that class's precision is undefined.
# zero_division=0 reports it as 0.00 explicitly instead of emitting
# UndefinedMetricWarning on every averaged row.
print(classification_report(y_test, y_pred, zero_division=0))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        15
           1       0.76      1.00      0.86        48

    accuracy                           0.76        63
   macro avg       0.38      0.50      0.43        63
weighted avg       0.58      0.76      0.66        63



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [62]:
# Per-state SVM results for the b3 dataset, one (state, accuracy, tier) record per row.
# The values are hard-coded: the DC figure matches the accuracy computed above
# (0.7619, rounded); the other states are presumably from earlier runs of this
# notebook with a different state filter — confirm.
# NOTE(review): the T1-T4 labels do not obviously correspond to the
# `Agency Tiers` values shown in the data (4.0 for IL/NY/CA, 3.0 for DC) — confirm.
_b3_records = [
    ('CA', .80, 'T1'),
    ('NY', .85, 'T2'),
    ('IL', .78, 'T3'),
    ('DC', .76, 'T4'),
]
_b3_columns = ('State', 'Predictive Accuracy', 'Tier')

# Transpose the records into the column-oriented dict expected downstream.
data_results_b3 = {
    column: list(values)
    for column, values in zip(_b3_columns, zip(*_b3_records))
}
data_results_b3

{'State': ['CA', 'NY', 'IL', 'DC'],
 'Predictive Accuracy': [0.8, 0.85, 0.78, 0.76],
 'Tier': ['T1', 'T2', 'T3', 'T4']}

In [63]:
# Build the results table with one row per state: state, model accuracy, and tier.
# (There is no brand column in the dict; the "b3" in the variable name is the only brand marker.)
df_results_b3 = pd.DataFrame(data_results_b3)
df_results_b3

Unnamed: 0,State,Predictive Accuracy,Tier
0,CA,0.8,T1
1,NY,0.85,T2
2,IL,0.78,T3
3,DC,0.76,T4


In [64]:
# Persist the per-state results table to the current working directory.
# NOTE(review): with pandas' default index=True the row index is written as an
# unnamed first column — confirm downstream readers expect that (the input CSV
# above is loaded with index_col=[0] for the same reason).
df_results_b3.to_csv('b3_resultsSVM.csv')