# Import dependencies

In [33]:
from pathlib import Path
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# Find the data file

In [26]:
data = Path('../Resources/data/data_small.csv')
raw_df = pd.read_csv(data)
raw_df.head()

Unnamed: 0.1,Unnamed: 0,name,job_title,industry,location,overall_rating,work_life_balance,culture_values,diversity_inclusion,career_opp,comp_benefits,total employee estimate
0,1182,aldi,Customer Service Representative,retail,"Bristol, England, England",3,1.0,2.0,2.0,2.0,1.0,268
1,1183,aldi,Customer Service Representative,printing,"Bristol, England, England",3,1.0,2.0,2.0,2.0,1.0,7
2,1184,aldi,Sales Assistant,retail,"York, England, England",4,5.0,4.0,4.0,5.0,4.0,268
3,1185,aldi,Sales Assistant,printing,"York, England, England",4,5.0,4.0,4.0,5.0,4.0,7
4,1186,aldi,Assistant Manager,retail,"Derby, England, England",4,3.0,4.0,5.0,5.0,4.0,268


# Preprocessing stage
## prepare the X and y variables by dropping the appropriate columns from the raw dataframe

In [20]:
y = raw_df['overall_rating']
X = raw_df.drop('overall_rating', axis=1)
X = pd.get_dummies(X)

# train_test_split

In [27]:
X_train, X_test, y_train, y_test = train_test_split(X, y,  random_state=1, stratify=y)
X_train.shape

(750, 562)

# SVM Model

In [28]:
model = SVC(kernel='linear')

# Train the model

In [29]:
model.fit(X_train, y_train)

SVC(kernel='linear')

# Create predictions with the model

In [31]:
y_pred = model.predict(X_test)

results = pd.DataFrame({
   "Prediction": y_pred,
   "Actual": y_test
}).reset_index(drop=True)

results.head()

Unnamed: 0,Prediction,Actual
0,3,4
1,3,1
2,3,3
3,4,4
4,5,5


In [32]:
accuracy_score(y_test, y_pred)

0.6

In [35]:
confusion_matrix(y_test, y_pred)

array([[24,  3,  5,  0,  0],
       [ 6,  2, 13,  1,  0],
       [ 1,  5, 33, 18,  2],
       [ 1,  1, 11, 42, 13],
       [ 0,  0,  4, 16, 49]], dtype=int64)

In [36]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           1       0.75      0.75      0.75        32
           2       0.18      0.09      0.12        22
           3       0.50      0.56      0.53        59
           4       0.55      0.62      0.58        68
           5       0.77      0.71      0.74        69

    accuracy                           0.60       250
   macro avg       0.55      0.55      0.54       250
weighted avg       0.59      0.60      0.59       250

