# Linear Support Vector Classifier

In [None]:
# Install packages
%pip install -q -U imbalanced-learn

In [1]:
# Import packages
from sklearn.model_selection import train_test_split
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn import preprocessing
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
import pandas as pd
import itertools
import matplotlib.pyplot as plt
import numpy as np

In [2]:
# Load data into a dataset
df = pd.read_csv("mbit_preprocessed_features.csv", index_col=0)
display(df.head())

Unnamed: 0,type,posts,encodedType,preprocessed_posts,extro_intro,intu_obs,feel_think,prosp_judg,avg_word_count,avg_exclam_count,avg_fullstop_count,avg_emoji_count,avg_count_of_hello,avg_count_of_hi,avg_count_of_extroverted_bigrams,avg_count_of_extroverted_stylistic_impressions,avg_count_of_interoverted_quantifiers,avg_count_of_introverted_first_person_singular_pronoun,avg_count_of_introverted_negations
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...,8,"['youtube', 'tumblr', 'enfp', 'intj', 'moment'...",0,1,1,0,12.1,0.0,0.16,0.08,0.0,0.0,0.14,0.0,0.06,3.54,0.02
1,ENTP,'I'm finding the lack of me in these posts ver...,3,"['im', 'finding', 'the', 'lack', 'of', 'post',...",1,1,0,1,24.38,0.0,0.04,0.08,0.0,0.0,0.28,0.0,0.14,5.58,0.1
2,INTP,'Good one _____ https://www.youtube.com/wat...,11,"['good', 'one', 'youtube', 'of', 'course', 'i'...",0,1,0,1,17.7,0.0,0.08,0.0,0.0,0.0,0.26,0.0,0.0,5.18,0.04
3,INTJ,"'Dear INTP, I enjoyed our conversation the o...",10,"['dear', 'intp', 'i', 'enjoyed', 'conversation...",0,1,0,0,22.26,0.0,0.14,0.02,0.0,0.0,0.2,0.0,0.1,6.12,0.1
4,ENTJ,'You're fired.|||That's another silly misconce...,2,"['youre', 'fired', 'thats', 'another', 'silly'...",1,1,0,0,20.32,0.0,0.08,0.06,0.0,0.0,0.14,0.0,0.0,6.1,0.1


In [3]:
# Create the train/test split
def create_train_test_split(X, Y):
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=42069)
    return X_train, X_test, Y_train, Y_test

# Plot the confusion matrix (copied from Exercise 3)
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()

In [None]:
# Loads input data from dataset and vectorizes it
X = df.iloc[:, 3].values #Preprocessed_posts

# Create vectorizer
vectorizer = TfidfVectorizer()
matrix = vectorizer.fit_transform(X)
X = pd.DataFrame(matrix.toarray(), columns=vectorizer.get_feature_names_out())

# Add features
scaler = preprocessing.MinMaxScaler()
X['avg_word_count'] = df.iloc[:, 8].values
X['avg_exclam_count'] = df.iloc[:, 9].values
X['avg_fullstop_count'] = df.iloc[:, 10].values
X['avg_emoji_count'] = df.iloc[:, 11].values
X['avg_count_of_hello'] = df.iloc[:, 12].values
X['avg_count_of_hi'] = df.iloc[:, 13].values
X['avg_count_of_extroverted_bigrams'] = df.iloc[:, 14].values
X['avg_count_of_extroverted_stylistic_impressions'] = df.iloc[:, 15].values
X['avg_count_of_interoverted_quantifiers'] = df.iloc[:, 16].values
X['avg_count_of_introverted_first_person_singular_pronoun'] = df.iloc[:, 17].values
X['avg_count_of_introverted_negations'] = df.iloc[:, 18].values

X[['avg_word_count', 'avg_count_of_introverted_first_person_singular_pronoun']] = scaler.fit_transform(
    X[['avg_word_count', 'avg_count_of_introverted_first_person_singular_pronoun']])

print("Vectorizer created {} features.".format(len(vectorizer.get_feature_names_out())))

In [None]:
# Get actual types of test data
Y_multi = df.iloc[:, 2].values
X_train, X_test, Y_train, Y_test_basis = train_test_split(X, Y_multi, test_size=0.3, random_state=42069)
types = sorted(list(set(df.iloc[:, 0].values)))

def combine_evaluate(intro, sensi, think, judgi):
    # Create dataframe
    df1 = df[['encodedType', 'extro_intro', 'intu_obs', 'feel_think', 'prosp_judg']].drop_duplicates().reset_index(drop=True)

    predict_df = pd.DataFrame(np.transpose([intro, sensi, think, judgi]), 
                             columns =['extro_intro', 'intu_obs', 'feel_think', 'prosp_judg'])
    predict_df = predict_df.merge(df1, how = 'left', on = predict_df.columns.tolist())
    
    # Evaluate
    print("Classification Report:")
    print(classification_report(Y_test_basis, predict_df['encodedType'], target_names=types, zero_division=0))

    cnf_matrix = confusion_matrix(Y_test_basis, predict_df['encodedType'])
    np.set_printoptions(precision=2)
    plot_confusion_matrix(cnf_matrix, classes=types)

### Gridsearch

In [5]:
Y = df.iloc[:, 4].values # Introverted-Extroverted

X_train, X_test, Y_train, Y_test = create_train_test_split(X, Y)

parameters = {       
'penalty': ['l1', 'l2'],
'loss': ['hinge', 'squared_hinge'],
'dual': [True, False],
}

estimator = LinearSVC()

stratified_3_fold_cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
grid_search_estimator = GridSearchCV(estimator, parameters, scoring='accuracy', cv=stratified_3_fold_cv)

grid_search_estimator.fit(X_train, Y_train)
display(grid_search_estimator.best_params_)

12 fits failed out of a total of 24.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
3 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\AGANDO\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\AGANDO\anaconda3\lib\site-packages\sklearn\svm\_classes.py", line 257, in fit
    self.coef_, self.intercept_, n_iter_ = _fit_liblinear(
  File "C:\Users\AGANDO\anaconda3\lib\site-packages\sklearn\svm\_base.py", line 1204, in _fit_liblinear
    solver_type = _get_liblinear_solver_type(multi_class, penalty, loss, dual)
  File "C:\Users\AGANDO\anaconda3\lib\site-packages\sklearn\svm\_base.py", line

{'dual': False, 'loss': 'squared_hinge', 'penalty': 'l1'}

In [6]:
Y_intro = df.iloc[:, 4].values
Y_sensi = df.iloc[:, 5].values
Y_think = df.iloc[:, 6].values
Y_judgi = df.iloc[:, 7].values

# Train the model and print the classification report (modified)
def train_predict_model(X, Y, types, balancing, found_parameters):
    X_train, X_test, Y_train, Y_test = create_train_test_split(X, Y)
    
    if (balancing):
        sampler = RandomOverSampler()
        X_train, Y_train = sampler.fit_resample(X_train, Y_train)
    
    # Train the model
    estimator = LinearSVC()
    if (found_parameters):
        estimator = LinearSVC(penalty='l1', dual=False)
    
    estimator.fit(X_train, Y_train)
    
    # Prediction
    predicted = estimator.predict(X_test)
    
    # Evaluation
    print("Classification Report:")
    print(classification_report(Y_test, predicted, target_names=types, zero_division=0))
    
    cnf_matrix = confusion_matrix(Y_test, predicted)
    np.set_printoptions(precision=2)
    plot_confusion_matrix(cnf_matrix, classes=types)
        
    return predicted

# Binary models (unbalanced, found parameters)

### Binary model: Introverted-Extroverted

In [None]:
# Binary: Introverted-Extroverted
predicted_intro = train_predict_model(X, Y_intro, ['Introverted', 'Extroverted'], False, True)

### Binary model: Sensing-Intuition

In [None]:
# Binary: Sensing-Intuition
predicted_sensi = train_predict_model(X, Y_sensi, ['Sensing', 'Intuition'], False, True)

### Binary model: Thinking-Feeling

In [None]:
# Binary: Thinking-Feeling
predicted_think = train_predict_model(X, Y_think, ['Thinking', 'Feeling'], False, True)

### Binary model: Judging-Perceiving

In [None]:
# Judging-Perceiving
predicted_judgi = train_predict_model(X, Y_judgi, ['Judging', 'Perceiving'], False, True)

### Estimating MBTI

In [None]:
combine_evaluate(predicted_intro, predicted_sensi, predicted_think, predicted_judgi)

# Binary models (balanced, default parameters)

### Binary model: Introverted-Extroverted

In [None]:
# Binary: Introverted-Extroverted
predicted_intro = train_predict_model(X, Y_intro, ['Introverted', 'Extroverted'], True, False)

### Binary model: Sensing-Intuition

In [None]:
# Binary: Sensing-Intuition
predicted_sensi = train_predict_model(X, Y_sensi, ['Sensing', 'Intuition'], True, False)

### Binary model: Thinking-Feeling

In [None]:
# Binary: Thinking-Feeling
predicted_think = train_predict_model(X, Y_think, ['Thinking', 'Feeling'], True, False)

### Binary model: Judging-Perceiving

In [None]:
# Judging-Perceiving
predicted_judgi = train_predict_model(X, Y_judgi, ['Judging', 'Perceiving'], True, False)

### Estimating MBTI

In [None]:
combine_evaluate(predicted_intro, predicted_sensi, predicted_think, predicted_judgi)

# Binary models (balanced, found parameters)

### Binary model: Introverted-Extroverted

In [None]:
# Binary: Introverted-Extroverted
predicted_intro = train_predict_model(X, Y_intro, ['Introverted', 'Extroverted'], True, True)

### Binary model: Sensing-Intuition

In [None]:
# Binary: Sensing-Intuition
predicted_sensi = train_predict_model(X, Y_sensi, ['Sensing', 'Intuition'], True, True)

### Binary model: Thinking-Feeling

In [None]:
# Binary: Thinking-Feeling
predicted_think = train_predict_model(X, Y_think, ['Thinking', 'Feeling'], True, True)

### Binary model: Judging-Perceiving

In [None]:
# Judging-Perceiving
predicted_judgi = train_predict_model(X, Y_judgi, ['Judging', 'Perceiving'], True, True)

### Estimating MBTI

In [None]:
combine_evaluate(predicted_intro, predicted_sensi, predicted_think, predicted_judgi)