# Linear Support Vector Classifier

In [7]:
# Import packages
from sklearn.model_selection import train_test_split
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn import preprocessing
from sklearn import feature_extraction, feature_selection, model_selection
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
import pandas as pd
import itertools
import matplotlib.pyplot as plt
import numpy as np

In [2]:
# Load data into a dataset
df = pd.read_csv("mbit_preprocessed_features.csv", index_col=0)
display(df.head())

Unnamed: 0,type,posts,encodedType,preprocessed_posts,extro_intro,intu_obs,feel_think,prosp_judg,avg_word_count,avg_exclam_count,avg_fullstop_count,avg_emoji_count,avg_count_of_hello,avg_count_of_hi,avg_count_of_extroverted_bigrams,avg_count_of_extroverted_stylistic_impressions,avg_count_of_interoverted_quantifiers,avg_count_of_introverted_first_person_singular_pronoun,avg_count_of_introverted_negations
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...,8,"['youtube', 'tumblr', 'enfp', 'intj', 'moment'...",0,1,1,0,12.1,0.0,0.16,0.08,0.0,0.0,0.14,0.0,0.06,3.54,0.02
1,ENTP,'I'm finding the lack of me in these posts ver...,3,"['im', 'finding', 'the', 'lack', 'of', 'post',...",1,1,0,1,24.38,0.0,0.04,0.08,0.0,0.0,0.28,0.0,0.14,5.58,0.1
2,INTP,'Good one _____ https://www.youtube.com/wat...,11,"['good', 'one', 'youtube', 'of', 'course', 'i'...",0,1,0,1,17.7,0.0,0.08,0.0,0.0,0.0,0.26,0.0,0.0,5.18,0.04
3,INTJ,"'Dear INTP, I enjoyed our conversation the o...",10,"['dear', 'intp', 'i', 'enjoyed', 'conversation...",0,1,0,0,22.26,0.0,0.14,0.02,0.0,0.0,0.2,0.0,0.1,6.12,0.1
4,ENTJ,'You're fired.|||That's another silly misconce...,2,"['youre', 'fired', 'thats', 'another', 'silly'...",1,1,0,0,20.32,0.0,0.08,0.06,0.0,0.0,0.14,0.0,0.0,6.1,0.1


In [14]:
# Create the train/test split
def train_test_split(df, test_size, rs, balancing, binary):
    """
    :param balancing: if true return balanced train set
    :param binary: if true return 0 (extroverted) and 1 (introverted) classes
    :param df: (preprocessed) dataset
    :param test_size: choose train/test split ratio <float>
    :param rs: choose random state <float>
    :return: Train and Test Split with and without encoding of <string> labels
    """

    enc = OrdinalEncoder()
    df["encoded_types"] = enc.fit_transform(df[["type"]])
    # print("Raw Dataset:")
    # print(df)

    if binary:

        df = df.replace({"type": {"INTJ": "Introverted",
                                "INTP": "Introverted",
                                "ENTJ": "Extroverted",
                                "ENTP": "Extroverted",
                                "INFJ": "Introverted",
                                "INFP": "Introverted",
                                "ENFJ": "Extroverted",
                                "ENFP": "Extroverted",
                                "ISTJ": "Introverted",
                                "ISFJ": "Introverted",
                                "ESTJ": "Extroverted",
                                "ESFJ": "Extroverted",
                                "ISTP": "Introverted",
                                "ISFP": "Introverted",
                                "ESTP": "Extroverted",
                                "ESFP": "Extroverted"}})



    # Create training and test split
    ## get X
    X_train, X_test = model_selection.train_test_split(df, test_size=test_size, random_state=rs)
    # print("X Training Set:" + "länge=" + str(len(X_train)))
    # print(X_train)
    # print("X Test Set:" + "länge=" + str(len(X_test)))
    # print(X_test)

    ## get target
    y_train = X_train["encodedType"].values
    print("Y Train Set:" + "länge=" + str(len(y_train)))
    # print(y_train)
    y_test = X_test["encodedType"].values
    print("Y Test Set:" + "länge=" + str(len(y_test)))
    # print(y_test)

    # Balancing
    if balancing:
        oversample = RandomOverSampler(sampling_strategy="minority")
        X_over, y_over = oversample.fit_resample(X_train, y_train)
        X_train = X_over
        y_train = y_over

    else:
        None

    return X_train, X_test, y_train, y_test

def feature_generator(X_train, X_test, y_train, n_gram, p_value):
    ## TF-IDF (advanced variant of BoW)
    TFIDF = feature_extraction.text.TfidfVectorizer(max_features=10000, ngram_range=(1, n_gram))

    ##Extract Vocabulary
    corpus = X_train["preprocessed_posts"].values.astype(str)
    TFIDF.fit(corpus)
    X_training = TFIDF.transform(corpus)
    dic_vocabulary = TFIDF.vocabulary_
    # print("Training vocabulary size before dimension reduction: " + str(len(dic_vocabulary)))

    ##Look up Position of a certain word in the Sparse Matrix
    word = "think"
    # print("Position of the word " + word + " in matrix: " + str(dic_vocabulary[word]))

    # Feature Selection

    ##Reduce Dimensionality for sparse data with Chi-Quadrat
    X_names = TFIDF.get_feature_names_out()
    p_value_limit = p_value
    features = pd.DataFrame()
    # print("Top Features for Each Class:")
    for cat in np.unique(y_train):
        chi2, p = feature_selection.chi2(X_training, y_train == cat)
        features = features.append(pd.DataFrame(
            {"feature": X_names, "score": 1 - p, "y": cat}))
        features = features.sort_values(["y", "score"], ascending=[True, False])
        features = features[features["score"] > p_value_limit]
    X_names = features["feature"].unique().tolist()
    print(X_names)
    print(len(X_names))

    for cat in np.unique(y_train):
        print("# {}:".format(cat))
        print("  . selected features:",
              len(features[features["y"] == cat]))
        print("  . top features:", ",".join(
            features[features["y"] == cat]["feature"].values[:10]))
        print(" ")

    ##Re-Fit vectorizer on corpus with new set of words and create new sparse matrix
    TFIDF = feature_extraction.text.TfidfVectorizer(vocabulary=X_names)
    TFIDF.fit(corpus)
    X_train_vec = TFIDF.transform(corpus)
    dic_vocabulary = TFIDF.vocabulary_
    print("Training vocabulary size after dimension reduction: " + str(len(dic_vocabulary)))

    return X_train_vec, None, TFIDF

# Train the model and print the classification report
def train_predict_model(X, Y, types, show_matrix):
    X_train, X_test, Y_train, Y_test = create_train_test_split(X, Y)
    
    # Train the model
    estimator = LinearSVC()

    estimator.fit(X_train, Y_train)
    
    # Prediction
    predicted = estimator.predict(X_test)
    
    # Evaluation
    print("Classification Report:")
    print(classification_report(Y_test, predicted, target_names=types, zero_division=0))
    
    if show_matrix:
        cnf_matrix = confusion_matrix(Y_test, predicted)
        np.set_printoptions(precision=2)
        plot_confusion_matrix(cnf_matrix, classes=types)
        
    return predicted

# Plot the confusion matrix (copied from Exercise 3)
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()

In [15]:
X_train, X_test_base, y_train, y_test = train_test_split(df=df, test_size=0.3, rs=42069, 
                                                                             balancing=False, binary=False)
X_train_vec, none, vectorizer = feature_generator(X_train, X_test_base, y_train, 2, 0.95)

Y Train Set:länge=6072
Y Test Set:länge=2603


  features = features.append(pd.DataFrame(
  features = features.append(pd.DataFrame(
  features = features.append(pd.DataFrame(
  features = features.append(pd.DataFrame(
  features = features.append(pd.DataFrame(
  features = features.append(pd.DataFrame(
  features = features.append(pd.DataFrame(
  features = features.append(pd.DataFrame(
  features = features.append(pd.DataFrame(
  features = features.append(pd.DataFrame(
  features = features.append(pd.DataFrame(
  features = features.append(pd.DataFrame(
  features = features.append(pd.DataFrame(
  features = features.append(pd.DataFrame(
  features = features.append(pd.DataFrame(
  features = features.append(pd.DataFrame(


['enfj', 'enfjs', 'the enfj', 'enfj the', 'seeker', 'welcome welcome', 'thanks input', 'giggle', 'enfp', 'enfps', 'im enfp', 'am enfp', 'the enfp', 'emojid', 'of enfps', 'xd', 'enfps the', 'enfp im', 'of enfp', 'the enfps', 'sosx', 'be enfp', 'enfp the', 'entj', 'entjs', 'the entj', 'entj the', 'entp', 'entps', 'im entp', 'of entps', 'the entp', 'entp im', 'entp the', 'of entp', 'entps the', 'ne', 'be entp', 'parrot', 'esfjs', 'esfj', 'the esfj', 'esfp', 'sims', 'esfps', 'the esfp', 'stark', 'tmlt', 'estjs', 'estj', 'eagle', 'wat', 'ap', 'pony', 'estp', 'estps', 'the estp', 'cough', 'estp the', 'infj', 'infjs', 'im infj', 'am infj', 'the infj', 'of infj', 'youre infj', 'another infj', 'of infjs', 'think infjs', 'know infj', 'type infj', 'infj can', 'infj the', 'ni', 'be infj', 'infjs the', 'infj forum', 'infj type', 'infj would', 'infj male', 'infj not', 'infj dont', 'dear', 'think infj', 'infp', 'infps', 'im infp', 'am infp', 'the infp', 'of infps', 'youre infp', 'of infp', 'think inf

### Multi-class model

In [20]:
# Multi-class model
types = sorted(list(set(df.iloc[:, 0].values)))

# Train the model
estimator = LinearSVC()
estimator.fit(X_train_vec, y_train)
    
# Prediction
predicted = estimator.predict(vectorizer.transform(X_test_base["preprocessed_posts"]))
    
# Evaluation
print("Classification Report:")
print(classification_report(y_test, predicted, target_names=types, zero_division=0))

Classification Report:
              precision    recall  f1-score   support

        ENFJ       0.49      0.40      0.44        55
        ENFP       0.63      0.69      0.66       212
        ENTJ       0.67      0.61      0.64        74
        ENTP       0.63      0.65      0.64       196
        ESFJ       0.50      0.45      0.48        11
        ESFP       0.33      0.07      0.12        14
        ESTJ       0.86      0.55      0.67        11
        ESTP       0.64      0.50      0.56        28
        INFJ       0.71      0.71      0.71       436
        INFP       0.71      0.77      0.74       545
        INTJ       0.71      0.58      0.64       365
        INTP       0.69      0.74      0.72       378
        ISFJ       0.68      0.71      0.69        58
        ISFP       0.57      0.53      0.55        73
        ISTJ       0.64      0.67      0.66        61
        ISTP       0.64      0.72      0.68        86

    accuracy                           0.68      2603
   

### Gridsearch

In [18]:
# Gridsearch

parameters = {       
'penalty': ['l1', 'l2'],
'loss': ['hinge', 'squared_hinge'],
'dual': [True, False],
}

estimator = LinearSVC(max_iter=15000)

# Because of reduced data -> 10 folds instead of 3
stratified_10_fold_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
grid_search_estimator = GridSearchCV(estimator, parameters, scoring='accuracy', cv=stratified_10_fold_cv)

grid_search_estimator.fit(X_train_vec, y_train)
display(grid_search_estimator.best_params_)

40 fits failed out of a total of 80.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\AGANDO\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\AGANDO\anaconda3\lib\site-packages\sklearn\svm\_classes.py", line 257, in fit
    self.coef_, self.intercept_, n_iter_ = _fit_liblinear(
  File "C:\Users\AGANDO\anaconda3\lib\site-packages\sklearn\svm\_base.py", line 1204, in _fit_liblinear
    solver_type = _get_liblinear_solver_type(multi_class, penalty, loss, dual)
  File "C:\Users\AGANDO\anaconda3\lib\site-packages\sklearn\svm\_base.py", lin

{'dual': False, 'loss': 'squared_hinge', 'penalty': 'l2'}

### Multi-class model (with found parameters)

In [21]:
# Train the model
estimator = LinearSVC(dual=False)
estimator.fit(X_train_vec, y_train)
    
# Prediction
predicted = estimator.predict(vectorizer.transform(X_test_base["preprocessed_posts"]))
    
# Evaluation
print("Classification Report:")
print(classification_report(y_test, predicted, target_names=types, zero_division=0))

Classification Report:
              precision    recall  f1-score   support

        ENFJ       0.49      0.40      0.44        55
        ENFP       0.63      0.69      0.66       212
        ENTJ       0.67      0.61      0.64        74
        ENTP       0.63      0.65      0.64       196
        ESFJ       0.50      0.45      0.48        11
        ESFP       0.33      0.07      0.12        14
        ESTJ       0.86      0.55      0.67        11
        ESTP       0.64      0.50      0.56        28
        INFJ       0.71      0.71      0.71       436
        INFP       0.71      0.77      0.74       545
        INTJ       0.71      0.58      0.64       365
        INTP       0.69      0.74      0.72       378
        ISFJ       0.68      0.71      0.69        58
        ISFP       0.57      0.53      0.55        73
        ISTJ       0.64      0.67      0.66        61
        ISTP       0.64      0.72      0.68        86

    accuracy                           0.68      2603
   