In [1]:
import pandas as pd
import numpy as np

# Load the raw CSV into a DataFrame
data = pd.read_csv('mbti_full_pull.csv')
# Count the missing values in every column
data.isna().sum()

author_flair_text     0
body                 55
subreddit             0
dtype: int64

In [2]:
# Drop the rows whose "body" is missing. Rebinding the returned frame (instead of
# inplace=True) keeps the step explicit and chainable.
data = data.dropna(subset=['body'])

In [3]:
# Re-count missing values per column after the rows with a missing "body" were dropped
data.isna().sum()

author_flair_text    0
body                 0
subreddit            0
dtype: int64

In [4]:
# Preview the first rows of the loaded frame
data.head()

Unnamed: 0,author_flair_text,body,subreddit
0,INTJ,Knowing you're in INTJ is a tool for you to us...,intj
1,INTJ,You are truly an enlightened mastermind.,intj
2,"INFJ, 26F",You should :) it will help if you have a down ...,infj
3,INTP,I watch a bit of everything (including hentai)...,INTP
4,INTJ,I don't know if I would count this as a pet pe...,intj


In [5]:
import pandas as pd
import nltk
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag, word_tokenize
import re

# Download the required NLTK resources: the tokenizer model used by word_tokenize,
# the WordNet data used by the lemmatizer, and the tagger model used by pos_tag
# (the tagger was previously missing, so pos_tag raised LookupError on a fresh machine).
# NOTE(review): recent NLTK releases appear to look these up as "punkt_tab" and
# "averaged_perceptron_tagger_eng" - add those names if the installed version asks for them.
for resource in ("punkt", "wordnet", "averaged_perceptron_tagger"):
    nltk.download(resource)

# Work on a copy so the frame loaded earlier (`data`) is left untouched
data_df = data.copy()

# Create a WordNet Lemmatizer object
lemmatizer = WordNetLemmatizer()

# Function to lemmatize the text with correct POS tag
def lemmatize_text_with_pos(text):
    """Lemmatize every token of ``text`` according to its part-of-speech tag.

    The text is split with ``word_tokenize`` and tagged with ``pos_tag``. A token
    whose tag maps to a WordNet part of speech is passed to
    ``lemmatizer.lemmatize`` together with that part of speech; every other
    token is kept unchanged.

    Args:
        text: The string to lemmatize.

    Returns:
        The processed tokens joined by single spaces.
    """
    # pos_tag returns Penn Treebank tags by default. Adjectives are tagged
    # JJ/JJR/JJS there, so their first letter is "j" rather than WordNet's "a";
    # comparing the first letter against "a" directly never matched, which left
    # adjectives un-lemmatized. Nouns (N*), verbs (V*) and adverbs (R*) already
    # share their first letter with the WordNet code.
    penn_initial_to_wordnet = {"j": "a", "n": "n", "v": "v", "r": "r"}

    lemmatized_words = []
    for word, tag in pos_tag(word_tokenize(text)):
        # tag[:1] (rather than tag[0]) is safe even for an empty tag string
        wntag = penn_initial_to_wordnet.get(tag[:1].lower())
        if wntag is None:
            # No WordNet equivalent for this tag: keep the token as it is
            lemmatized_words.append(word)
        else:
            lemmatized_words.append(lemmatizer.lemmatize(word, wntag))
    return " ".join(lemmatized_words)  # Join the lemmatized words back into a string

# Function to remove special characters from the text
def remove_special_characters(text):
    """Return ``text`` with every character deleted that is not an ASCII
    letter, a digit or whitespace.

    Removed characters are dropped outright (not replaced by a space), so the
    characters on either side of them end up adjacent.
    """
    return re.sub(r"[^a-zA-Z0-9\s]", "", text)

# Strip special characters from the raw "body" text and keep the result in "cleaned-comment"
cleaned_comments = data_df["body"].apply(remove_special_characters)
data_df["cleaned-comment"] = cleaned_comments

# Lemmatize the cleaned text and keep the result in "lemmatized-comment"
data_df["lemmatized-comment"] = cleaned_comments.apply(lemmatize_text_with_pos)

print("Lemmatization and special character removal complete.")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Furkan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Furkan\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Lemmatization and special character removal complete.


In [6]:
# Lower-case spellings of the 16 MBTI types that are searched for inside a flair
mbti_list = ['istj', 'isfj', 'infj', 'intj', 'istp', 'isfp', 'infp', 'intp', 'estp', 'esfp', 'enfp', 'entp', 'estj', 'esfj', 'enfj', 'entj']

# Canonical upper-case label for every lower-case keyword, derived from mbti_list
# so the two can never drift apart
mbti_replacements = {keyword: keyword.upper() for keyword in mbti_list}


# Function to replace flair with MBTI keyword if present
def replace_flair_with_mbti(flair):
    """Reduce a free-text flair to the MBTI type it mentions.

    The search is a case-insensitive substring match against ``mbti_list``.
    When the flair mentions several types, the one that appears first in the
    flair wins, so the result does not depend on the order of ``mbti_list``.

    Args:
        flair: The raw flair value.

    Returns:
        The upper-case MBTI type found in ``flair``. If no type is found, or
        ``flair`` is not a string (e.g. NaN/None), ``flair`` is returned unchanged.
    """
    if not isinstance(flair, str):
        # Missing values have no .lower(); pass them through instead of crashing
        return flair

    lowered = flair.lower()
    # (position, keyword) for every type mentioned in the flair
    hits = [(lowered.find(keyword), keyword) for keyword in mbti_list if keyword in lowered]
    if not hits:
        return flair

    # Two different four-letter keywords cannot start at the same position,
    # so the smallest position identifies a unique keyword
    _, first_keyword = min(hits)
    return mbti_replacements[first_keyword]

# Normalize every raw "author_flair_text" value and store the result in a new "flair" column
data_df['flair'] = data_df['author_flair_text'].apply(replace_flair_with_mbti)

In [7]:
# Preview the frame with the added "cleaned-comment", "lemmatized-comment" and "flair" columns
data_df.head()

Unnamed: 0,author_flair_text,body,subreddit,cleaned-comment,lemmatized-comment,flair
0,INTJ,Knowing you're in INTJ is a tool for you to us...,intj,Knowing youre in INTJ is a tool for you to use...,Knowing youre in INTJ be a tool for you to use...,INTJ
1,INTJ,You are truly an enlightened mastermind.,intj,You are truly an enlightened mastermind,You be truly an enlightened mastermind,INTJ
2,"INFJ, 26F",You should :) it will help if you have a down ...,infj,You should it will help if you have a down mo...,You should it will help if you have a down mom...,INFJ
3,INTP,I watch a bit of everything (including hentai)...,INTP,I watch a bit of everything including hentai I...,I watch a bit of everything include hentai I t...,INTP
4,INTJ,I don't know if I would count this as a pet pe...,intj,I dont know if I would count this as a pet pee...,I dont know if I would count this as a pet pee...,INTJ


In [8]:
mbti_types = ['ISTJ','ISFJ','INFJ','INTJ','ISTP','ISFP','INFP','INTP','ESTP','ESFP','ENFP','ENTP','ESTJ','ESFJ', 'ENFJ', 'ENTJ']


# Create a mapping from MBTI types to numerical labels (the position in mbti_types)
mbti_label_map = {mbti_type: idx for idx, mbti_type in enumerate(mbti_types)}

# Convert MBTI types into numerical labels and store in a new column "numerical-label"
data_df['numerical-label'] = data_df['flair'].map(mbti_label_map)

# A flair that is not one of mbti_types maps to NaN. Such rows have no usable
# label and would silently turn the column into floats, so drop them (and say so).
unlabelled = data_df['numerical-label'].isna()
if unlabelled.any():
    print(f"Dropping {int(unlabelled.sum())} rows whose flair is not a known MBTI type.")
    data_df = data_df.loc[~unlabelled].copy()
# Explicit int64 so the label dtype does not depend on the platform's default int
data_df['numerical-label'] = data_df['numerical-label'].astype('int64')

# Drop the intermediate columns. errors="ignore" lets this cell be re-run after
# the columns are already gone instead of raising KeyError.
data_df = data_df.drop(
    columns=['cleaned-comment', "body", "author_flair_text", "subreddit"],
    errors="ignore",
)

data_df.head()

Unnamed: 0,lemmatized-comment,flair,numerical-label
0,Knowing youre in INTJ be a tool for you to use...,INTJ,3
1,You be truly an enlightened mastermind,INTJ,3
2,You should it will help if you have a down mom...,INFJ,2
3,I watch a bit of everything include hentai I t...,INTP,7
4,I dont know if I would count this as a pet pee...,INTJ,3


In [9]:
# Write the DataFrame with lemmatized texts and numerical labels to a new CSV file;
# index=False leaves the DataFrame's row index out of the file
data_df.to_csv("data2.csv", index=False)