In [2]:
import pandas as pd
import numpy as np

# Load the MBTI dataset from a CSV file in the current working directory.
data = pd.read_csv('mbti_1.csv')
# Count missing values per column; as the last expression it is shown as the cell output.
data.isna().sum()

type     0
posts    0
dtype: int64

In [3]:
# Preview the first rows of the freshly loaded frame.
data.head()

Unnamed: 0,type,posts
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...
1,ENTP,'I'm finding the lack of me in these posts ver...
2,INTP,'Good one _____ https://www.youtube.com/wat...
3,INTJ,"'Dear INTP, I enjoyed our conversation the o..."
4,ENTJ,'You're fired.|||That's another silly misconce...


In [4]:
# Every `posts` value packs several posts into one string delimited by '|||';
# break it apart so each row carries a list of the individual posts.
data['comments_list'] = data['posts'].map(lambda joined_posts: joined_posts.split('|||'))


In [5]:
# Preview the frame again to check the newly added `comments_list` column.
data.head()

Unnamed: 0,type,posts,comments_list
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...,"['http://www.youtube.com/watch?v=qsXHcwe3krw, ..."
1,ENTP,'I'm finding the lack of me in these posts ver...,['I'm finding the lack of me in these posts ve...
2,INTP,'Good one _____ https://www.youtube.com/wat...,['Good one _____ https://www.youtube.com/wa...
3,INTJ,"'Dear INTP, I enjoyed our conversation the o...","['Dear INTP, I enjoyed our conversation the ..."
4,ENTJ,'You're fired.|||That's another silly misconce...,"['You're fired., That's another silly misconce..."


In [7]:
import pandas as pd
import nltk
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag, word_tokenize
import re

# Download the required NLTK resources:
#   - "punkt" backs word_tokenize,
#   - "averaged_perceptron_tagger" backs pos_tag (without it pos_tag raises
#     LookupError on a machine that has not fetched the tagger before),
#   - "wordnet" backs WordNetLemmatizer.
# NOTE(review): recent NLTK releases appear to look these up under the names
# "punkt_tab" and "averaged_perceptron_tagger_eng" - confirm against the
# installed NLTK version and extend the tuple if needed.
for nltk_resource in ("punkt", "averaged_perceptron_tagger", "wordnet"):
    nltk.download(nltk_resource)

# Create a WordNet Lemmatizer object
lemmatizer = WordNetLemmatizer()

# Function to lemmatize the text with correct POS tag
def lemmatize_list_with_pos(word_list):
    """Tokenize, POS-tag and lemmatize every text in ``word_list``.

    Each text is split into tokens with ``word_tokenize`` and tagged with
    ``pos_tag``. Tokens whose tag maps onto a WordNet part of speech
    (adjective, verb, noun, adverb) are passed to the module-level
    ``lemmatizer`` together with that part of speech; every other token is
    kept unchanged.

    Args:
        word_list: Iterable of strings to process.

    Returns:
        One flat list holding the (lemmatized) tokens of all texts, in the
        order they were encountered.
    """
    # pos_tag emits Penn Treebank tags, where adjectives are JJ/JJR/JJS while
    # WordNet names that part of speech 'a'. The first letter therefore has to
    # be translated, not merely lower-cased; otherwise adjectives are never
    # lemmatized as adjectives.
    penn_initial_to_wordnet = {"J": "a", "V": "v", "N": "n", "R": "r"}

    lemmatized_words = []
    for text in word_list:
        words = word_tokenize(text)  # Tokenize the text into individual words
        for word, tag in pos_tag(words):
            # tag[:1] (rather than tag[0]) stays safe if a tag is ever empty.
            wntag = penn_initial_to_wordnet.get(tag[:1])
            if wntag is None:
                lemma = word  # No WordNet equivalent: keep the token as-is
            else:
                lemma = lemmatizer.lemmatize(word, wntag)
            lemmatized_words.append(lemma)
    return lemmatized_words

# Function to remove special characters from the text
def remove_special_characters(word_list):
    """Strip every character that is not an ASCII letter, a digit or whitespace.

    Args:
        word_list: Iterable of strings.

    Returns:
        A new list with one cleaned string per input string, in the same order.
    """
    # Anything outside a-z, A-Z, 0-9 and whitespace counts as "special".
    special_chars = re.compile(r"[^a-zA-Z0-9\s]")
    return [special_chars.sub("", text) for text in word_list]

# The DataFrame is `data`; its "comments_list" column holds a list of post strings per row.

# Strip special characters from every post and keep the result in "cleaned_comments_list".
data["cleaned_comments_list"] = data["comments_list"].map(remove_special_characters)

# Tokenize and lemmatize the cleaned posts; "lemmatized_comments_list" holds one flat token list per row.
data["lemmatized_comments_list"] = data["cleaned_comments_list"].map(lemmatize_list_with_pos)

print("Lemmatization and special character removal complete.")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Furkan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Furkan\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Lemmatization and special character removal complete.


In [8]:
# Preview the frame to inspect the cleaned and lemmatized columns.
data.head()

Unnamed: 0,type,posts,comments_list,cleaned_comments_list,lemmatized_comments_list
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...,"['http://www.youtube.com/watch?v=qsXHcwe3krw, ...","[httpwwwyoutubecomwatchvqsXHcwe3krw, http41med...","[httpwwwyoutubecomwatchvqsXHcwe3krw, http41med..."
1,ENTP,'I'm finding the lack of me in these posts ver...,['I'm finding the lack of me in these posts ve...,[Im finding the lack of me in these posts very...,"[Im, find, the, lack, of, me, in, these, post,..."
2,INTP,'Good one _____ https://www.youtube.com/wat...,['Good one _____ https://www.youtube.com/wa...,[Good one httpswwwyoutubecomwatchvfHiGbolF...,"[Good, one, httpswwwyoutubecomwatchvfHiGbolFFG..."
3,INTJ,"'Dear INTP, I enjoyed our conversation the o...","['Dear INTP, I enjoyed our conversation the ...",[Dear INTP I enjoyed our conversation the ot...,"[Dear, INTP, I, enjoy, our, conversation, the,..."
4,ENTJ,'You're fired.|||That's another silly misconce...,"['You're fired., That's another silly misconce...","[Youre fired, Thats another silly misconceptio...","[Youre, fire, Thats, another, silly, misconcep..."


In [9]:
# The sixteen MBTI types; a type's position in this list is its numerical label.
mbti_types = [
    'ISTJ', 'ISFJ', 'INFJ', 'INTJ',
    'ISTP', 'ISFP', 'INFP', 'INTP',
    'ESTP', 'ESFP', 'ENFP', 'ENTP',
    'ESTJ', 'ESFJ', 'ENFJ', 'ENTJ',
]

# Lookup table from MBTI type to its integer label (0-15)
mbti_label_map = dict(zip(mbti_types, range(len(mbti_types))))

# Encode each row's type as its integer label in the "numerical-label" column
data['numerical-label'] = data['type'].map(mbti_label_map)

### Ideally, we could use a list of lists to separate every comment, but training on them as groups would take too long. I have already trained the model a lot and am only doing this to improve the current model, so I will not do it.

In [10]:
# Collapse each row's token list back into a single space-separated string.
data["lemmatized-comment"] = data["lemmatized_comments_list"].map(lambda tokens: " ".join(tokens))

In [11]:
# Discard the raw text and the intermediate list columns now that the joined text exists.
data.drop(
    columns=['posts', "comments_list", "cleaned_comments_list", "lemmatized_comments_list"],
    inplace=True,
)

In [12]:
# Preview the final frame: type, numerical label and the lemmatized text.
data.head()

Unnamed: 0,type,numerical-label,lemmatized-comment
0,INFJ,2,httpwwwyoutubecomwatchvqsXHcwe3krw http41media...
1,ENTP,11,Im find the lack of me in these post very alar...
2,INTP,7,Good one httpswwwyoutubecomwatchvfHiGbolFFGw O...
3,INTJ,3,Dear INTP I enjoy our conversation the other d...
4,ENTJ,15,Youre fire Thats another silly misconception T...


In [13]:
# Persist the processed frame to 'data1.csv'; index=False leaves the row index out of the file.
data.to_csv('data1.csv', index=False)