In [1]:
import pandas as pd

df = pd.read_csv("data\mbti_1.csv")
df.head()

Unnamed: 0,type,posts
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...
1,ENTP,'I'm finding the lack of me in these posts ver...
2,INTP,'Good one _____ https://www.youtube.com/wat...
3,INTJ,"'Dear INTP, I enjoyed our conversation the o..."
4,ENTJ,'You're fired.|||That's another silly misconce...


In [2]:
def convert_posts_to_list(post: str) -> list:
    
    post = post.replace("'", "")
    post = post.replace('"', "")

    list = post.split("|||")

    return list

In [3]:
import re
import tldextract


def replace_url_with_domain(posts: str) -> str:
    
    url_list = url_list = re.findall(r'(https?://[^\s]+)', posts)

    if len(url_list) > 0:
        for url in url_list:
            domain = tldextract.extract(url).domain
            posts = posts.replace(url, domain)

    return posts

In [4]:
from nltk.corpus import stopwords
import nltk
from nltk.stem import WordNetLemmatizer
import string
from nltk.stem.porter import PorterStemmer

def removal(tokens: pd.Series):

    stopwords_list = stopwords.words("english")

    tokens = tokens.apply(lambda token: token.translate(str.maketrans('', '', string.punctuation)))
    tokens = tokens.apply(lambda token: token if token not in stopwords_list and token != '' else None).dropna()

    return tokens

In [5]:
 
def stemming(tokens: pd.Series):

    stemmer = PorterStemmer()

    return tokens.apply(lambda token: stemmer.stem(token))

In [6]:
def tokenization(text: str):
    return pd.Series(nltk.word_tokenize(text.lower()))

In [7]:
df["posts"] = df["posts"].apply(convert_posts_to_list)
df["posts"]  = df["posts"].apply(" ".join)
df["posts"] = df["posts"].apply(replace_url_with_domain)
df["posts"] = df["posts"].apply(lambda x: re.sub(r'(.)\1{3,}', r'\1', x))

In [8]:
import numpy as np
import string
df["posts"] = df["posts"].apply(lambda text: np.array(stemming(removal(tokenization(text)))))
df.head()

Unnamed: 0,type,posts
0,INFJ,"[youtub, tumblr, enfp, intj, moment, youtub, s..."
1,ENTP,"[im, find, lack, post, alarm, sex, bore, posit..."
2,INTP,"[good, one, youtub, cours, say, know, that, bl..."
3,INTJ,"[dear, intp, enjoy, convers, day, esoter, gab,..."
4,ENTJ,"[your, fire, that, anoth, silli, misconcept, a..."


In [9]:
df.to_csv("mbti_preprocessed.csv")

In [1]:
import pandas as pd

df = pd.read_csv('mbti_preprocessed.csv', index_col=[0])
df.head()

Unnamed: 0,type,posts
0,INFJ,['youtub' 'tumblr' 'enfp' 'intj' 'moment' 'you...
1,ENTP,['im' 'find' 'lack' 'post' 'alarm' 'sex' 'bore...
2,INTP,['good' 'one' 'youtub' 'cours' 'say' 'know' 't...
3,INTJ,['dear' 'intp' 'enjoy' 'convers' 'day' 'esoter...
4,ENTJ,['your' 'fire' 'that' 'anoth' 'silli' 'misconc...


In [2]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

df["encodedType"] = le.fit_transform(df["type"])
df.head()

Unnamed: 0,type,posts,encodedType
0,INFJ,['youtub' 'tumblr' 'enfp' 'intj' 'moment' 'you...,8
1,ENTP,['im' 'find' 'lack' 'post' 'alarm' 'sex' 'bore...,3
2,INTP,['good' 'one' 'youtub' 'cours' 'say' 'know' 't...,11
3,INTJ,['dear' 'intp' 'enjoy' 'convers' 'day' 'esoter...,10
4,ENTJ,['your' 'fire' 'that' 'anoth' 'silli' 'misconc...,2


In [3]:
X = df.iloc[:, 1].values
Y = df.iloc[:, 2].values

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer()
X = tfidf.fit_transform(X).toarray()


In [5]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=42069)

In [7]:
from sklearn.linear_model import LinearRegression

reg = LinearRegression()
reg.fit(X_train, Y_train)

LinearRegression()

In [9]:
Y_pred = reg.predict(X_test)

In [None]:
#TODO Evaluate 