#### Basic preprocessing
- stop-words removal

#### Basic transformation
- no dimensionality reduction
- tf-idf normalization

#### LinearSVC
- no parameter tuning
- default parameters
- OvR approach using parameter multi_class='ovr'

In [62]:
import nltk
import sklearn
import pandas as pd
import re
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import classification_report
from nltk import WordNetLemmatizer
from nltk import PorterStemmer
from nltk import word_tokenize
from nltk.corpus import wordnet

# 1. Load data

In [63]:
df = pd.read_csv('../mbti_1.csv')

In [64]:
df['posts'] = df['posts'].replace(to_replace="\|\|\|", value=" ", regex=True)
df['posts'] = df['posts'].str[1:-1]

In [65]:
df

Unnamed: 0,type,posts
0,INFJ,http://www.youtube.com/watch?v=qsXHcwe3krw htt...
1,ENTP,I'm finding the lack of me in these posts very...
2,INTP,Good one _____ https://www.youtube.com/watc...
3,INTJ,"Dear INTP, I enjoyed our conversation the ot..."
4,ENTJ,You're fired. That's another silly misconcepti...
5,INTJ,18/37 @.@ Science is not perfect. No scientis...
6,INFJ,"No, I can't draw on my own nails (haha). Those..."
7,INTJ,I tend to build up a collection of things on m...
8,INFJ,"'m not sure, that's a good question. The disti..."
9,INTP,https://www.youtube.com/watch?v=w8-egj0y8Qs I'...


# 2. Train - Test split

In [66]:
train, test = train_test_split(df, test_size=0.2, random_state=42)

In [67]:
X_train = train['posts'].values
X_test = test['posts'].values
Y_train = train['type'].values
Y_test = test['type'].values

# 3. Encoding labels

In [68]:
le = LabelEncoder()
le.fit(Y_train)

LabelEncoder()

In [69]:
le.classes_

array(['ENFJ', 'ENFP', 'ENTJ', 'ENTP', 'ESFJ', 'ESFP', 'ESTJ', 'ESTP',
       'INFJ', 'INFP', 'INTJ', 'INTP', 'ISFJ', 'ISFP', 'ISTJ', 'ISTP'],
      dtype=object)

In [70]:
y_train_enc = le.transform(Y_train)
y_train_enc

array([ 8,  1,  3, ...,  8,  9, 10])

# 4. Classification pipeline
### with feature extraction and tf-idf

In [71]:
def get_wordnet_pos(treebank_tag):
    """
    Maps treebank tags to wordnet.
    This step is needed when doing lemmatization
    in combination with pos tagging.
    :return: corresponding wordnet tag object
    """
    # Decision block for mapping treebank tags to wordnet.
    # Source: http://stackoverflow.com/a/15590384/5491423
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [72]:
class LemmaTokenizer(object):
    """
    Based on: http://scikit-learn.org/stable/modules/feature_extraction.html#customizing-the-vectorizer-classes
    """
    def __init__(self):
        self.wnl = WordNetLemmatizer()

    def __call__(self, doc):
        return [self.wnl.lemmatize(t, get_wordnet_pos(pt)) for t, pt in nltk.pos_tag(word_tokenize(doc))]


class StemTokenizer(object):
    """
    Based on: http://scikit-learn.org/stable/modules/feature_extraction.html#customizing-the-vectorizer-classes
    """
    def __init__(self):
        self.ps = PorterStemmer()

    def __call__(self, doc):
        return [self.ps.stem(t) for t in word_tokenize(doc)]

In [73]:
# SVC Pipeline
def create_svc_pipe(norm='l2'):
            
    svc_classifier = Pipeline([
        ('vectorizer', CountVectorizer(lowercase=True, 
                                       tokenizer=LemmaTokenizer(), 
                                       stop_words='english'
                                        )),
        ('tfidf', TfidfTransformer(norm=norm)),
        ('clf', LinearSVC(multi_class='ovr', verbose=100, random_state=42))])
    return svc_classifier

In [74]:
svc_pipe = create_svc_pipe()

In [100]:
print(len(svc_pipe.named_steps["vectorizer"].get_feature_names()))

155502


In [75]:
svc_pipe.fit(X_train, y_train_enc)

[LibLinear]

Pipeline(memory=None,
     steps=[('vectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
  ...ax_iter=1000,
     multi_class='ovr', penalty='l2', random_state=42, tol=0.0001,
     verbose=100))])

# 6. Testing

In [76]:
predictions = svc_pipe.predict(X_test)

In [77]:
y_test_enc = le.transform(Y_test)

In [95]:
precision_recall_fscore_support(y_test_enc, predictions, average='weighted')

  'precision', 'predicted', average, warn_for)


(0.6748822965770683, 0.6760806916426513, 0.663769424138718, None)

In [96]:
print(classification_report(y_test_enc, predictions, target_names=le.classes_))

             precision    recall  f1-score   support

       ENFJ       0.44      0.17      0.25        41
       ENFP       0.69      0.65      0.67       125
       ENTJ       0.72      0.48      0.58        44
       ENTP       0.65      0.61      0.63       135
       ESFJ       1.00      0.14      0.25         7
       ESFP       0.00      0.00      0.00         8
       ESTJ       0.67      0.29      0.40         7
       ESTP       0.71      0.33      0.45        15
       INFJ       0.66      0.69      0.67       288
       INFP       0.67      0.84      0.74       370
       INTJ       0.64      0.70      0.67       193
       INTP       0.71      0.80      0.75       293
       ISFJ       0.88      0.49      0.63        45
       ISFP       0.72      0.43      0.54        53
       ISTJ       0.70      0.32      0.44        44
       ISTP       0.71      0.55      0.62        67

avg / total       0.67      0.68      0.66      1735



  'precision', 'predicted', average, warn_for)
