In [1]:
import os
import re
import warnings
from typing import Dict, Iterable, List, Optional, Tuple

import matplotlib as mpl
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import (GridSearchCV, StratifiedKFold,
                                     cross_val_score)
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Plot styling.
# NOTE(review): sns.set() below applies seaborn's own theme and presumably
# overrides most of the ggplot style selected here -- confirm which look is
# actually intended.
mpl.style.use('ggplot')
sns.set(rc={'figure.figsize': (9, 7)})  # default figure size: 9 x 7
sns.set_style('white')
# Silence every warning for the rest of the session (no category filter is
# given, so deprecation notices are hidden as well).
warnings.filterwarnings('ignore')
# List the files available in the input directory.
print(os.listdir('../input'));

['mbti_1.csv']


## 0. Data Retrieval

In [2]:
# Read ../input/mbti_1.csv into a DataFrame
dataset = pd.read_csv(filepath_or_buffer="../input/mbti_1.csv")

# Show the (rows, columns) shape of the loaded frame
print(dataset.shape)

(8675, 2)


In [3]:
# Take the "posts" entry of the row labelled 0 as a working sample
text = dataset.loc[0, "posts"]

In [4]:
# URL matcher, compiled once when the cell is run; the leading (?i) makes
# it case-insensitive.
_URL_PATTERN = re.compile(r"""(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))""")


def remove_urls(text: str) -> str:
    """Return ``text`` with every URL-like substring replaced by one space.

    Parameters
    ----------
    text
        The string to clean.

    Returns
    -------
    str
        A copy of ``text`` in which each match of the URL pattern has been
        substituted with a single space character.
    """
    url_free = _URL_PATTERN.sub(" ", text)
    return url_free

In [5]:
# Strip URLs from the sample post and show the result.
# NOTE(review): this rebinds `text`, so the raw post is no longer available
# under that name once this cell has run.
text = remove_urls(text)
print(text)

'  and intj moments     sportscenter not top ten plays     pranks|||What has been the most life-changing experience in your life?|||       On repeat for most of today.|||May the PerC Experience immerse you.|||The last thing my INFJ friend posted on his facebook before committing suicide the next day. Rest in peace~     ENFJ7. Sorry to hear of your distress. It's only natural for a relationship to not be perfection all the time in every moment of existence. Try to figure the hard times as times of growth, as...|||84389  84390       ...|||Welcome and stuff.|||   Game. Set. Match.|||Prozac, wellbrutin, at least thirty minutes of moving your legs (and I don't mean moving them while sitting in your same desk chair), weed in moderation (maybe try edibles as a healthier alternative...|||Basically come up with three items you've determined that each type (or whichever types you want to do) would more than likely use, given each types' cognitive functions and whatnot, when left by...|||All thin

In [6]:
def tokenize(text: str) -> List[str]:
    """Split ``text`` into tokens.

    Thin wrapper that delegates to NLTK's ``word_tokenize``.

    Parameters
    ----------
    text
        The string to tokenize.

    Returns
    -------
    List[str]
        The tokens produced by ``word_tokenize``, in order of appearance.
    """
    tokens = word_tokenize(text)
    return tokens

In [7]:
# Tokenize the URL-free sample post.
# The result gets its own name so that `text` keeps holding the string:
# rebinding `text` to the token list would change its type and make this
# cell fail when re-run, because tokenize() expects a str.
tokens = tokenize(text)
print(tokens)

["'", 'and', 'intj', 'moments', 'sportscenter', 'not', 'top', 'ten', 'plays', 'pranks|||What', 'has', 'been', 'the', 'most', 'life-changing', 'experience', 'in', 'your', 'life', '?', '|||', 'On', 'repeat', 'for', 'most', 'of', 'today.|||May', 'the', 'PerC', 'Experience', 'immerse', 'you.|||The', 'last', 'thing', 'my', 'INFJ', 'friend', 'posted', 'on', 'his', 'facebook', 'before', 'committing', 'suicide', 'the', 'next', 'day', '.', 'Rest', 'in', 'peace~', 'ENFJ7', '.', 'Sorry', 'to', 'hear', 'of', 'your', 'distress', '.', 'It', "'s", 'only', 'natural', 'for', 'a', 'relationship', 'to', 'not', 'be', 'perfection', 'all', 'the', 'time', 'in', 'every', 'moment', 'of', 'existence', '.', 'Try', 'to', 'figure', 'the', 'hard', 'times', 'as', 'times', 'of', 'growth', ',', 'as', '...', '|||84389', '84390', '...', '|||Welcome', 'and', 'stuff.|||', 'Game', '.', 'Set', '.', 'Match.|||Prozac', ',', 'wellbrutin', ',', 'at', 'least', 'thirty', 'minutes', 'of', 'moving', 'your', 'legs', '(', 'and', 'I',

In [8]:
def remove_stopwords(tokens: List[str],
                     stop_words: Optional[Iterable[str]] = None) -> List[str]:
    """Drop stop words from a list of tokens.

    Parameters
    ----------
    tokens
        Tokens to filter.
    stop_words
        Words to discard. When ``None`` (the default), NLTK's English
        stop-word list, ``stopwords.words("english")``, is used.

    Returns
    -------
    List[str]
        The tokens that are not stop words, in their original order.
        Matching is case-insensitive, so "The" is removed as well as "the".
    """
    if stop_words is None:
        # `stopwords` is the NLTK corpus reader, not a container of words;
        # the actual word list has to be requested through .words().
        stop_words = stopwords.words("english")
    # Build a lowercase set once: constant-time lookups, and capitalised
    # tokens (e.g. at the start of a sentence) are matched too.
    stop_set = {word.lower() for word in stop_words}
    return [token for token in tokens if token.lower() not in stop_set]

In [9]:
# TODO: implement clean_corpus(text: str) -> List[str] (stub only; presumably it should chain remove_urls, tokenize and remove_stopwords).
    