### SVM - Binary Classifier

In [None]:
# An SVM can classify our `leads` target into two classes: lead (1) or no lead (0).
# Open question: how should the boundary line between the two classes be drawn?
# Open question: should we use a soft margin to tolerate some misclassified yes/no leads?


In [1]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
import pandas as pd
import tensorflow as tf
import pandas as pd

# Load the non-encoded b2 dataset; the first CSV column is used as the row index.
df = pd.read_csv(
    "../ML_Data_&_Preprocessing/b2_df_nonEncoded.csv",
    index_col=[0],
)
df.head()

Unnamed: 0,state,spend,Impressions,leads,link_clicks,reach,Agency Tiers
0,IL,47.33,1780,0,7,1689,4.0
1,NY,63.65,1857,1,10,1737,4.0
2,OK,32.53,1718,0,7,1527,4.0
3,SC,32.31,1725,1,14,1645,4.0
4,CA,101.13,3745,1,28,3513,4.0


In [2]:
# Class balance of the binary target: number of rows with a lead (1) vs. without (0).
df['leads'].value_counts()

1    21698
0    12272
Name: leads, dtype: int64

In [3]:
# Row counts per state, to see which state subsets are available for testing the model.
df['state'].value_counts()

CA    7685
TX    4254
VA    3239
CO    2386
FL    1758
MI    1655
GA    1620
NY    1486
NC    1050
TN     970
WA     960
NJ     919
CT     893
OH     842
MO     777
MN     628
IL     605
MA     454
SC     365
OK     365
MD     357
DC     250
SD     246
NE     173
PA      33
Name: state, dtype: int64

In [49]:
# Restrict the data to a single state (South Dakota) for this model run.
df2 = df.loc[df['state'].eq('SD')]
df2

Unnamed: 0,state,spend,Impressions,leads,link_clicks,reach,Agency Tiers
27429,SD,0.00,0,0,0,0,
27430,SD,47.75,3774,1,18,3507,
27431,SD,0.00,0,0,0,0,
27451,SD,0.00,0,0,0,0,
27452,SD,47.22,3483,1,29,3222,
...,...,...,...,...,...,...,...
32957,SD,26.24,1624,1,36,1519,
33007,SD,14.72,974,1,29,934,
33008,SD,36.69,2453,1,75,2159,
33043,SD,53.53,3144,1,83,2824,


In [50]:
# Column dtypes and non-null counts for the state subset
# (in the recorded run, 'Agency Tiers' has 0 non-null values for SD).
df2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 246 entries, 27429 to 33087
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   state         246 non-null    object 
 1   spend         246 non-null    float64
 2   Impressions   246 non-null    int64  
 3   leads         246 non-null    int64  
 4   link_clicks   246 non-null    int64  
 5   reach         246 non-null    int64  
 6   Agency Tiers  0 non-null      float64
dtypes: float64(2), int64(4), object(1)
memory usage: 15.4+ KB


In [51]:
# Keep only the numeric modelling columns. 'state' is constant within this
# single-state subset and 'Agency Tiers' has no non-null values here, so neither
# can help the classifier. Dropping by name (instead of by positions 0 and 6)
# keeps this correct even if the column order of the CSV changes.
df3 = df2.drop(columns=["state", "Agency Tiers"])

In [52]:
# Confirm that only the numeric columns (four features plus the `leads` target) remain.
df3.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 246 entries, 27429 to 33087
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   spend        246 non-null    float64
 1   Impressions  246 non-null    int64  
 2   leads        246 non-null    int64  
 3   link_clicks  246 non-null    int64  
 4   reach        246 non-null    int64  
dtypes: float64(1), int64(4)
memory usage: 11.5 KB


### Separate features (X) and target (y)

In [53]:
# Features are every remaining column except the target; the target is the binary `leads` flag.
X = df3.drop(columns=["leads"])
y = df3["leads"]

### Split into train and test sets

In [54]:
from sklearn.model_selection import train_test_split

# stratify=y keeps the lead / no-lead ratio the same in both splits;
# random_state=1 makes the split reproducible. No test_size is passed, so the
# library default applies (184 of the 246 rows went to train in the recorded run).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=1
)
X_train.shape

(184, 4)

In [55]:
from sklearn.preprocessing import MinMaxScaler

# Rescale every feature to the range [-1, 1]. The scaler is fitted on the
# training split only and then reused on the test split, so no information
# from the test rows leaks into the scaling parameters.
scaling = MinMaxScaler(feature_range=(-1, 1))
X_train = scaling.fit_transform(X_train)
X_test = scaling.transform(X_test)

### Create SVM instance 

In [56]:
from sklearn.svm import SVC

# Linear-kernel support vector classifier. C is the soft-margin penalty;
# 1.0 is scikit-learn's default, written out here so the setting is visible.
model = SVC(kernel="linear", C=1.0)

### Fit/train our model using training data

In [57]:
# Train the classifier on the scaled training features and their labels.
model.fit(X=X_train, y=y_train)

SVC(kernel='linear')

### Make predictions

In [58]:
# Predict labels for the held-out rows and line them up against the true labels.
y_pred = model.predict(X_test)
results = (
    pd.DataFrame({"Prediction": y_pred, "Actual": y_test})
    .reset_index(drop=True)  # discard the original row labels carried by y_test
)
results.head()

Unnamed: 0,Prediction,Actual
0,1,1
1,1,1
2,1,0
3,1,1
4,1,1


### Validate

In [59]:
# Accuracy: share of test rows whose predicted label equals the true label.
from sklearn.metrics import accuracy_score
accuracy_score(y_true=y_test, y_pred=y_pred)

0.9032258064516129

In [60]:
# Confusion matrix: rows are the actual classes, columns are the predicted classes.
from sklearn.metrics import confusion_matrix
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[15,  6],
       [ 0, 41]], dtype=int64)

In [61]:
# classification report
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.71      0.83        21
           1       0.87      1.00      0.93        41

    accuracy                           0.90        62
   macro avg       0.94      0.86      0.88        62
weighted avg       0.92      0.90      0.90        62



In [67]:
# Hand-entered summary of the SVM results per state; each list position is one state.
# NOTE(review): these numbers are typed in, not computed by this notebook. The
# recorded accuracy for SD above is 0.903 (0.93 is the class-1 F1 score) — confirm.
# NOTE(review): the recorded output of this cell lists Tier as
# ['T4', 'T3', 'T2', 'T1'], the reverse of the list below — confirm which order is intended.
data_results_b2 = {}
data_results_b2['State'] = ['CA', 'GA', 'WA', 'SD']
data_results_b2['Predictive Accuracy'] = [0.79, 0.81, 0.74, 0.93]
data_results_b2['Tier'] = ['T1', 'T2', 'T3', 'T4']
data_results_b2

{'State': ['CA', 'GA', 'WA', 'SD'],
 'Predictive Accuracy': [0.79, 0.81, 0.74, 0.93],
 'Tier': ['T4', 'T3', 'T2', 'T1']}

In [68]:
# create the new df to display brand, state, and model accuracy 
df_results_b2 = pd.DataFrame(data_results_b2)
df_results_b2

Unnamed: 0,State,Predictive Accuracy,Tier
0,CA,0.79,T4
1,GA,0.81,T3
2,WA,0.74,T2
3,SD,0.93,T1


In [66]:
# Persist the per-state summary table as a CSV in the current working directory.
# to_csv writes the DataFrame index as an unnamed first column by default.
# NOTE(review): this cell's recorded execution count (66) precedes the cells that
# build df_results_b2 (67, 68), so the file on disk may predate the current table — re-run to confirm.
df_results_b2.to_csv('b2_resultsSVM.csv')