In [17]:
import pandas as pd

# Forward slash instead of "data\mbti_1.csv": the backslash form contains the
# invalid escape sequence "\m" (a SyntaxWarning on recent Python versions) and
# only resolves on Windows, whereas "/" works on every platform.
df = pd.read_csv("data/mbti_1.csv")
df.head()

Unnamed: 0,type,posts
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...
1,ENTP,'I'm finding the lack of me in these posts ver...
2,INTP,'Good one _____ https://www.youtube.com/wat...
3,INTJ,"'Dear INTP, I enjoyed our conversation the o..."
4,ENTJ,'You're fired.|||That's another silly misconce...


In [2]:
#Function Definitions
#TODO: Move into preprocessing.py
import re
import tldextract
from nltk.corpus import stopwords
import nltk
from nltk.stem import WordNetLemmatizer
import string

#Removes non-alpha characters and preserves whitespace (me420 i'm done that's it!!! --> me im done thats it)

def split_posts(posts: str) -> str:
    """Merge a user's posts into one string by turning each "|||" separator into a space."""
    separator = "|||"
    pieces = posts.split(separator)
    return " ".join(pieces)

    
def remove_non_alpha_words(posts: str) -> str:
    """Delete every character that is neither an ASCII letter nor whitespace.

    Digits and punctuation are removed in place, so "i'm" becomes "im" and
    "me420" becomes "me". Whitespace is kept untouched.

    Args:
        posts: Text to clean.

    Returns:
        The text containing only a-z, A-Z and whitespace characters.
    """
    # Raw string: in a plain literal "\s" is an invalid escape sequence, which
    # newer Python versions warn about and will eventually reject.
    return re.sub(r'[^a-zA-Z\s]', '', posts)


#Rewrites colon-delimited emoji shortcodes as plain tokens (:smile: --> smileemoji)
def handle_emojis(posts: str) -> str:
    """Turn emoji shortcodes such as ":smile:" into the token "smileemoji".

    Only a shortcode-shaped token between the two colons is rewritten: it must
    start with a letter and may contain letters, digits, "_", "+" and "-".
    Requiring this shape keeps ordinary prose colons ("Note: this: that"),
    empty pairs ("::") and clock times ("10:30:45") untouched; a pattern that
    accepts arbitrary text between two colons would swallow those as well.

    Args:
        posts: Text that may contain emoji shortcodes.

    Returns:
        The text with each shortcode replaced by its name followed by "emoji".
    """
    # \g<1> is the unambiguous form of \1 when literal text follows the group.
    return re.sub(r':([A-Za-z][A-Za-z0-9_+-]*):', r'\g<1>emoji', posts)


#Replaces URLs with Second-Level-Domain (https://www.youtube.com/watch?v=dQw4w9WgXcQ --> youtube)
def replace_url_with_domain(posts: str) -> str:
    """Replace every http(s) URL in the text with the domain tldextract reports for it.

    Each URL is substituted exactly where it was matched, in a single pass.
    Replacing URL strings one after another with ``str.replace`` corrupts the
    text when one URL is a prefix of another (the shorter one is rewritten
    inside the longer one, which then no longer matches).

    Args:
        posts: Text that may contain URLs.

    Returns:
        The text with each URL replaced by its domain.
    """
    def _domain_of(match):
        # match.group(0) is the full URL, i.e. everything up to the next whitespace.
        return tldextract.extract(match.group(0)).domain

    return re.sub(r'https?://[^\s]+', _domain_of, posts)


#Corrects expressive lengthening / word lengthening (hellooo --> hello)
def correct_expressive_lengthening(posts: str) -> str:
    """Collapse any run of three or more identical characters into a single one.

    A run of exactly two characters ("good", "hello") is left alone. The
    threshold is three because the documented example "hellooo" contains only
    three "o"s; requiring four or more would leave it unchanged.

    Note that the run is reduced to ONE character, so "coool" becomes "col".
    "." also matches spaces and tabs, so runs of those are collapsed too.

    Args:
        posts: Text to normalise.

    Returns:
        The text with lengthened runs collapsed.
    """
    # (.) captures one character; \1{2,} demands at least two more copies of it.
    return re.sub(r'(.)\1{2,}', r'\1', posts)


def tokenize_posts(posts: str) -> list[str]:
    """Lower-case the text and split it into tokens with nltk.word_tokenize."""
    lowered = posts.lower()
    return nltk.word_tokenize(lowered)


def remove_stopwords(tokens):
    """Filter NLTK's English stopwords out of ``tokens``, except for a small keep-list.

    Words on the keep-list are never removed, even though some of them are
    ordinary stopwords.

    Args:
        tokens: Iterable of word tokens.

    Returns:
        A new list with the surviving tokens in their original order.
    """
    # Keep-list source (presumably the words that paper treats as informative):
    #https://escholarship.org/content/qt6n5652cx/qt6n5652cx.pdf
    keep = {"got", "a", "i", "il", "be", "the", "of", "do", "not", "can", "am"}

    blocked = set(stopwords.words("english")).difference(keep)

    return [tok for tok in tokens if tok not in blocked]
    
    
def lemmatize_posts(tokens):
    """Return a list with the WordNetLemmatizer lemma of every token, in order."""
    lemmatize = WordNetLemmatizer().lemmatize
    return [lemmatize(token) for token in tokens]

In [19]:
from sklearn.preprocessing import LabelEncoder

# Encode the 16 MBTI type strings as integer class labels in a new column.
# The fitted encoder is kept in `le` so the codes can be mapped back to type
# names later (le.classes_ / le.inverse_transform).
le = LabelEncoder()
df["encodedType"] = le.fit_transform(df["type"])
df.head()

Unnamed: 0,type,posts,encodedType
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...,8
1,ENTP,'I'm finding the lack of me in these posts ver...,3
2,INTP,'Good one _____ https://www.youtube.com/wat...,11
3,INTJ,"'Dear INTP, I enjoyed our conversation the o...",10
4,ENTJ,'You're fired.|||That's another silly misconce...,2


In [20]:
# Start the preprocessing column from the raw posts: the "|||" separators
# between a user's individual posts are turned into spaces.
df["preprocessed_posts"] = df["posts"].apply(split_posts)
df.head()

Unnamed: 0,type,posts,encodedType,preprocessed_posts
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...,8,'http://www.youtube.com/watch?v=qsXHcwe3krw ht...
1,ENTP,'I'm finding the lack of me in these posts ver...,3,'I'm finding the lack of me in these posts ver...
2,INTP,'Good one _____ https://www.youtube.com/wat...,11,'Good one _____ https://www.youtube.com/wat...
3,INTJ,"'Dear INTP, I enjoyed our conversation the o...",10,"'Dear INTP, I enjoyed our conversation the o..."
4,ENTJ,'You're fired.|||That's another silly misconce...,2,'You're fired. That's another silly misconcept...


In [21]:
# Replace each URL by its domain name (e.g. a youtube link becomes "youtube").
# Done while the text still contains punctuation, since the URL pattern relies on "://".
df["preprocessed_posts"] = df["preprocessed_posts"].apply(replace_url_with_domain)
df.head()

Unnamed: 0,type,posts,encodedType,preprocessed_posts
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...,8,'youtube tumblr enfp and intj moments youtube...
1,ENTP,'I'm finding the lack of me in these posts ver...,3,'I'm finding the lack of me in these posts ver...
2,INTP,'Good one _____ https://www.youtube.com/wat...,11,"'Good one _____ youtube Of course, to which..."
3,INTJ,"'Dear INTP, I enjoyed our conversation the o...",10,"'Dear INTP, I enjoyed our conversation the o..."
4,ENTJ,'You're fired.|||That's another silly misconce...,2,'You're fired. That's another silly misconcept...


In [None]:
# Turn colon-delimited emoji codes (:name:) into plain "nameemoji" tokens.
# This has to run before remove_non_alpha_words, which strips the colons.
df["preprocessed_posts"] = df["preprocessed_posts"].apply(handle_emojis)

In [22]:
# Drop digits and punctuation; only ASCII letters and whitespace remain.
df["preprocessed_posts"] = df["preprocessed_posts"].apply(remove_non_alpha_words)
df.head()

Unnamed: 0,type,posts,encodedType,preprocessed_posts
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...,8,youtube tumblr enfp and intj moments youtube ...
1,ENTP,'I'm finding the lack of me in these posts ver...,3,Im finding the lack of me in these posts very ...
2,INTP,'Good one _____ https://www.youtube.com/wat...,11,Good one youtube Of course to which I say ...
3,INTJ,"'Dear INTP, I enjoyed our conversation the o...",10,Dear INTP I enjoyed our conversation the oth...
4,ENTJ,'You're fired.|||That's another silly misconce...,2,Youre fired Thats another silly misconception ...


In [23]:
# Collapse long runs of one repeated character down to a single character.
df["preprocessed_posts"] = df["preprocessed_posts"].apply(correct_expressive_lengthening)
df.head()

Unnamed: 0,type,posts,encodedType,preprocessed_posts
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...,8,youtube tumblr enfp and intj moments youtube ...
1,ENTP,'I'm finding the lack of me in these posts ver...,3,Im finding the lack of me in these posts very ...
2,INTP,'Good one _____ https://www.youtube.com/wat...,11,Good one youtube Of course to which I say I kn...
3,INTJ,"'Dear INTP, I enjoyed our conversation the o...",10,Dear INTP I enjoyed our conversation the oth...
4,ENTJ,'You're fired.|||That's another silly misconce...,2,Youre fired Thats another silly misconception ...


In [24]:
# Lower-case and tokenize. From here on the column holds lists of tokens rather
# than strings, so the string-based cells above cannot be re-run on it without
# rebuilding the column from df["posts"] first.
df["preprocessed_posts"] = df["preprocessed_posts"].apply(tokenize_posts)
df.head()


Unnamed: 0,type,posts,encodedType,preprocessed_posts
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...,8,"[youtube, tumblr, enfp, and, intj, moments, yo..."
1,ENTP,'I'm finding the lack of me in these posts ver...,3,"[im, finding, the, lack, of, me, in, these, po..."
2,INTP,'Good one _____ https://www.youtube.com/wat...,11,"[good, one, youtube, of, course, to, which, i,..."
3,INTJ,"'Dear INTP, I enjoyed our conversation the o...",10,"[dear, intp, i, enjoyed, our, conversation, th..."
4,ENTJ,'You're fired.|||That's another silly misconce...,2,"[youre, fired, thats, another, silly, misconce..."


In [25]:
# Remove English stopwords, except the keep-list defined in remove_stopwords.
df["preprocessed_posts"] = df["preprocessed_posts"].apply(remove_stopwords)
df.head()


Unnamed: 0,type,posts,encodedType,preprocessed_posts
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...,8,"[youtube, tumblr, enfp, intj, moments, youtube..."
1,ENTP,'I'm finding the lack of me in these posts ver...,3,"[im, finding, the, lack, of, posts, alarming, ..."
2,INTP,'Good one _____ https://www.youtube.com/wat...,11,"[good, one, youtube, of, course, i, say, i, kn..."
3,INTJ,"'Dear INTP, I enjoyed our conversation the o...",10,"[dear, intp, i, enjoyed, conversation, the, da..."
4,ENTJ,'You're fired.|||That's another silly misconce...,2,"[youre, fired, thats, another, silly, misconce..."


In [26]:
# Reduce every token to its WordNet lemma (e.g. "moments" -> "moment").
df["preprocessed_posts"] = df["preprocessed_posts"].apply(lemmatize_posts)
df.head()

Unnamed: 0,type,posts,encodedType,preprocessed_posts
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...,8,"[youtube, tumblr, enfp, intj, moment, youtube,..."
1,ENTP,'I'm finding the lack of me in these posts ver...,3,"[im, finding, the, lack, of, post, alarming, s..."
2,INTP,'Good one _____ https://www.youtube.com/wat...,11,"[good, one, youtube, of, course, i, say, i, kn..."
3,INTJ,"'Dear INTP, I enjoyed our conversation the o...",10,"[dear, intp, i, enjoyed, conversation, the, da..."
4,ENTJ,'You're fired.|||That's another silly misconce...,2,"[youre, fired, thats, another, silly, misconce..."


In [27]:
import numpy as np

#Add Binary Differentiations. 

"""
E - I Extroverted - Introverted
N - S Intuitive - Observant
F - T Feeling - Thinking
P - J Prospecting - Judging
"""

#The first part of each column name is the attribute encoded as 1 (e.g. extroverted), the second part the one encoded as 0 (e.g. introverted).
#The eight trait letters are all distinct, so testing whether the letter occurs anywhere in the type code is enough to identify the trait.
df["extro_intro"] = np.where(df["type"].str.contains("E"), 1, 0)
df["intu_obs"] = np.where(df["type"].str.contains("N"), 1, 0)
df["feel_think"] = np.where(df["type"].str.contains("F"), 1, 0)
df["prosp_judg"] = np.where(df["type"].str.contains("P"), 1, 0)
df.head()

Unnamed: 0,type,posts,encodedType,preprocessed_posts,extro_intro,intu_obs,feel_think,prosp_judg
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...,8,"[youtube, tumblr, enfp, intj, moment, youtube,...",0,1,1,0
1,ENTP,'I'm finding the lack of me in these posts ver...,3,"[im, finding, the, lack, of, post, alarming, s...",1,1,0,1
2,INTP,'Good one _____ https://www.youtube.com/wat...,11,"[good, one, youtube, of, course, i, say, i, kn...",0,1,0,1
3,INTJ,"'Dear INTP, I enjoyed our conversation the o...",10,"[dear, intp, i, enjoyed, conversation, the, da...",0,1,0,0
4,ENTJ,'You're fired.|||That's another silly misconce...,2,"[youre, fired, thats, another, silly, misconce...",1,1,0,0


In [31]:
#Persist the fully preprocessed frame; the index is written as the first column.
df.to_csv("mbti_preprocessed_complete.csv")

#How to load
#NOTE: preprocessed_posts holds token lists, which CSV stores as their string representation; parse them back after loading (e.g. with ast.literal_eval).
#df = pd.read_csv("mbti_preprocessed_complete.csv", index_col=0)

In [29]:
#How to Create Train_Test_Splits 

#Determine X and Y For Example (Or any Other like X = Preprocessed_Posts and Y = Extroverted)
#X = df.iloc[:, 3].values #Preprocessed_posts
#Y = df.iloc[:, 2].values #EncodedType

#Create Train_Test_Split. IMPORTANT: Use Same test_size and random_state every time!!!! 
#from sklearn.model_selection import train_test_split
#X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=42069)

In [30]:
#Function to create Splits. TODO Also move into python file and call from there
from sklearn.model_selection import train_test_split

def create_train_test_split(X, Y):
    """Split X and Y into train and test parts with the project's fixed settings.

    The test fraction (0.3) and the random seed are hard-coded on purpose so
    that every experiment uses the same split.

    Returns:
        The tuple (X_train, X_test, Y_train, Y_test).
    """
    held_out_fraction = 0.3
    seed = 42069
    parts = train_test_split(X, Y, test_size=held_out_fraction, random_state=seed)
    # train_test_split returns a list; callers get a 4-tuple as before.
    return tuple(parts)