# Linear Support Vector Classifier

In [1]:
# Import packages
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn import preprocessing
import pandas as pd
import itertools
import matplotlib.pyplot as plt
import numpy as np

In [2]:
# Load data into a dataset
df = pd.read_csv("mbit_preprocessed_features.csv", index_col=0)
display(df.head())

Unnamed: 0,type,posts,encodedType,preprocessed_posts,extro_intro,intu_obs,feel_think,prosp_judg,avg_word_count,avg_exclam_count,avg_fullstop_count,avg_emoji_count,avg_count_of_hello,avg_count_of_hi,avg_count_of_extroverted_bigrams,avg_count_of_extroverted_stylistic_impressions,avg_count_of_interoverted_quantifiers,avg_count_of_introverted_first_person_singular_pronoun,avg_count_of_introverted_negations
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...,8,"['youtube', 'tumblr', 'enfp', 'intj', 'moment'...",0,1,1,0,12.1,0.0,0.16,0.08,0.0,0.0,0.14,0.0,0.06,3.54,0.02
1,ENTP,'I'm finding the lack of me in these posts ver...,3,"['im', 'finding', 'the', 'lack', 'of', 'post',...",1,1,0,1,24.38,0.0,0.04,0.08,0.0,0.0,0.28,0.0,0.14,5.58,0.1
2,INTP,'Good one _____ https://www.youtube.com/wat...,11,"['good', 'one', 'youtube', 'of', 'course', 'i'...",0,1,0,1,17.7,0.0,0.08,0.0,0.0,0.0,0.26,0.0,0.0,5.18,0.04
3,INTJ,"'Dear INTP, I enjoyed our conversation the o...",10,"['dear', 'intp', 'i', 'enjoyed', 'conversation...",0,1,0,0,22.26,0.0,0.14,0.02,0.0,0.0,0.2,0.0,0.1,6.12,0.1
4,ENTJ,'You're fired.|||That's another silly misconce...,2,"['youre', 'fired', 'thats', 'another', 'silly'...",1,1,0,0,20.32,0.0,0.08,0.06,0.0,0.0,0.14,0.0,0.0,6.1,0.1


In [3]:
# Create the train/test split
def create_train_test_split(X, Y):
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=42069)
    return X_train, X_test, Y_train, Y_test

In [4]:
# Loads input data from dataset and vectorizes it
X = df.iloc[:, 3].values #Preprocessed_posts

# Create vectorizer
vectorizer = TfidfVectorizer()
matrix = vectorizer.fit_transform(X)
X = pd.DataFrame(matrix.toarray(), columns=vectorizer.get_feature_names_out())

print("Vectorizer created {} features.".format(len(vectorizer.get_feature_names_out())))

Vectorizer created 140798 features.


### Gridsearch

In [6]:
# Gridsearch
Y = df.iloc[:, 2].values

X_train, X_test, Y_train, Y_test = create_train_test_split(X, Y)

parameters = {       
'penalty': ['l1', 'l2'],
'loss': ['hinge', 'squared_hinge'],
'dual': [True, False],
}

estimator = LinearSVC()

stratified_3_fold_cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
grid_search_estimator = GridSearchCV(estimator, parameters, scoring='accuracy', cv=stratified_3_fold_cv)

grid_search_estimator.fit(X_train, Y_train)
display(grid_search_estimator.best_params_)

12 fits failed out of a total of 24.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
3 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\AGANDO\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\AGANDO\anaconda3\lib\site-packages\sklearn\svm\_classes.py", line 257, in fit
    self.coef_, self.intercept_, n_iter_ = _fit_liblinear(
  File "C:\Users\AGANDO\anaconda3\lib\site-packages\sklearn\svm\_base.py", line 1204, in _fit_liblinear
    solver_type = _get_liblinear_solver_type(multi_class, penalty, loss, dual)
  File "C:\Users\AGANDO\anaconda3\lib\site-packages\sklearn\svm\_base.py", line

{'dual': False, 'loss': 'squared_hinge', 'penalty': 'l1'}

The errors were expected because of some unavailable combinations.

### Multi-class model (with found parameters)

In [7]:
# Multi-class model
types = sorted(list(set(df.iloc[:, 0].values)))

# Train the model
estimator = LinearSVC(penalty='l1', dual=False)
estimator.fit(X_train, Y_train)
    
# Prediction
predicted = estimator.predict(X_test)
    
# Evaluation
print("Classification Report:")
print(classification_report(Y_test, predicted, target_names=types, zero_division=0))

Classification Report:
              precision    recall  f1-score   support

        ENFJ       0.55      0.31      0.40        55
        ENFP       0.72      0.63      0.67       212
        ENTJ       0.76      0.43      0.55        74
        ENTP       0.62      0.64      0.63       196
        ESFJ       1.00      0.36      0.53        11
        ESFP       0.00      0.00      0.00        14
        ESTJ       0.67      0.18      0.29        11
        ESTP       0.69      0.32      0.44        28
        INFJ       0.67      0.70      0.69       436
        INFP       0.67      0.81      0.73       545
        INTJ       0.65      0.62      0.64       365
        INTP       0.65      0.76      0.70       378
        ISFJ       0.81      0.60      0.69        58
        ISFP       0.55      0.44      0.49        73
        ISTJ       0.79      0.51      0.62        61
        ISTP       0.67      0.57      0.62        86

    accuracy                           0.67      2603
   

## Including additional features

In [5]:
# Add features
X['avg_word_count'] = df.iloc[:, 8].values
X['avg_exclam_count'] = df.iloc[:, 9].values
X['avg_fullstop_count'] = df.iloc[:, 10].values
X['avg_emoji_count'] = df.iloc[:, 11].values
X['avg_count_of_hello'] = df.iloc[:, 12].values
X['avg_count_of_hi'] = df.iloc[:, 13].values
X['avg_count_of_extroverted_bigrams'] = df.iloc[:, 14].values
X['avg_count_of_extroverted_stylistic_impressions'] = df.iloc[:, 15].values
X['avg_count_of_interoverted_quantifiers'] = df.iloc[:, 16].values
X['avg_count_of_introverted_first_person_singular_pronoun'] = df.iloc[:, 17].values
X['avg_count_of_introverted_negations'] = df.iloc[:, 18].values

scaler = preprocessing.MinMaxScaler()
X[['avg_word_count', 'avg_count_of_introverted_first_person_singular_pronoun']] = scaler.fit_transform(
    X[['avg_word_count', 'avg_count_of_introverted_first_person_singular_pronoun']])
display(X)

Unnamed: 0,aa,aaa,aaaa,aaaagh,aaaah,aaaahh,aaaahhh,aaaaim,aaaand,aaaanyways,...,avg_exclam_count,avg_fullstop_count,avg_emoji_count,avg_count_of_hello,avg_count_of_hi,avg_count_of_extroverted_bigrams,avg_count_of_extroverted_stylistic_impressions,avg_count_of_interoverted_quantifiers,avg_count_of_introverted_first_person_singular_pronoun,avg_count_of_introverted_negations
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.00,0.16,0.08,0.0,0.0,0.14,0.0,0.06,0.276995,0.02
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.00,0.04,0.08,0.0,0.0,0.28,0.0,0.14,0.436620,0.10
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.00,0.08,0.00,0.0,0.0,0.26,0.0,0.00,0.405321,0.04
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.00,0.14,0.02,0.0,0.0,0.20,0.0,0.10,0.478873,0.10
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.00,0.08,0.06,0.0,0.0,0.14,0.0,0.00,0.477308,0.10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8670,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.00,0.04,0.08,0.0,0.0,0.18,0.0,0.12,0.339593,0.10
8671,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.06,0.26,0.12,0.0,0.0,0.12,0.0,0.06,0.557121,0.12
8672,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.00,0.14,0.04,0.0,0.0,0.16,0.0,0.02,0.475743,0.06
8673,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.00,0.08,0.00,0.0,0.0,0.30,0.0,0.16,0.699531,0.22


### Multi-class model (default parameters)

In [6]:
# Multi-class model
Y = df.iloc[:, 2].values
X_train, X_test, Y_train, Y_test = create_train_test_split(X, Y)

# Train the model
estimator = LinearSVC()
estimator.fit(X_train, Y_train)
    
# Prediction
predicted = estimator.predict(X_test)
    
# Evaluation
print("Classification Report:")
types = sorted(list(set(df.iloc[:, 0].values)))
print(classification_report(Y_test, predicted, target_names=types, zero_division=0))

Classification Report:
              precision    recall  f1-score   support

        ENFJ       0.48      0.24      0.32        55
        ENFP       0.68      0.61      0.65       212
        ENTJ       0.78      0.43      0.56        74
        ENTP       0.64      0.62      0.63       196
        ESFJ       0.80      0.36      0.50        11
        ESFP       0.00      0.00      0.00        14
        ESTJ       0.50      0.09      0.15        11
        ESTP       0.78      0.25      0.38        28
        INFJ       0.65      0.69      0.67       436
        INFP       0.63      0.82      0.71       545
        INTJ       0.68      0.60      0.63       365
        INTP       0.66      0.76      0.70       378
        ISFJ       0.82      0.57      0.67        58
        ISFP       0.62      0.41      0.50        73
        ISTJ       0.73      0.49      0.59        61
        ISTP       0.71      0.62      0.66        86

    accuracy                           0.66      2603
   

### Multi-class model (found parameters)

In [7]:
# Train the model (number of iterations had to be increased because of a ConvergenceWarning)
estimator = LinearSVC(penalty='l1', dual=False, max_iter=4000)
estimator.fit(X_train, Y_train)
    
# Prediction
predicted = estimator.predict(X_test)
    
# Evaluation
print("Classification Report:")
types = sorted(list(set(df.iloc[:, 0].values)))
print(classification_report(Y_test, predicted, target_names=types, zero_division=0))

Classification Report:
              precision    recall  f1-score   support

        ENFJ       0.55      0.33      0.41        55
        ENFP       0.71      0.64      0.67       212
        ENTJ       0.73      0.43      0.54        74
        ENTP       0.62      0.64      0.63       196
        ESFJ       1.00      0.36      0.53        11
        ESFP       0.00      0.00      0.00        14
        ESTJ       0.67      0.18      0.29        11
        ESTP       0.67      0.29      0.40        28
        INFJ       0.67      0.69      0.68       436
        INFP       0.67      0.82      0.73       545
        INTJ       0.66      0.62      0.64       365
        INTP       0.64      0.76      0.69       378
        ISFJ       0.83      0.59      0.69        58
        ISFP       0.55      0.42      0.48        73
        ISTJ       0.79      0.49      0.61        61
        ISTP       0.67      0.57      0.62        86

    accuracy                           0.66      2603
   