In [14]:
import pandas as pd
import nltk
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag, word_tokenize
import re

# Download the NLTK resources used further down in this cell:
#   - "punkt": tokenizer models needed by word_tokenize
#   - "averaged_perceptron_tagger": tagger model needed by pos_tag; without it
#     pos_tag raises LookupError on a machine where it was never installed
#   - "wordnet": lexical database needed by WordNetLemmatizer
# NOTE(review): newer NLTK releases appear to look these up under the names
# "punkt_tab" and "averaged_perceptron_tagger_eng" — confirm against the
# installed NLTK version and add those ids if a LookupError asks for them.
nltk.download("punkt")
nltk.download("averaged_perceptron_tagger")
nltk.download("wordnet")

# Load the data from the CSV file into a pandas DataFrame
data_df = pd.read_csv("filtered_data.csv")

# Replace NaN values in the "post-comment" column with an empty string.
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on the column selection: that chained in-place form
# triggers a FutureWarning on pandas 2.x and does not modify data_df at all
# under Copy-on-Write, which would leave NaN values for re.sub to choke on.
data_df["post-comment"] = data_df["post-comment"].fillna("")

# Create a WordNet Lemmatizer object (shared by lemmatize_text_with_pos)
lemmatizer = WordNetLemmatizer()

# Function to lemmatize the text with correct POS tag
def lemmatize_text_with_pos(text):
    """Lemmatize every token of ``text`` according to its part-of-speech tag.

    The text is tokenized with ``word_tokenize`` and tagged with ``pos_tag``.
    Tokens tagged as adjective, verb, noun or adverb are passed to the
    module-level ``lemmatizer`` with the matching WordNet POS; every other
    token (punctuation, determiners, pronouns, ...) is kept unchanged.

    Args:
        text: The string to lemmatize.

    Returns:
        The processed tokens joined with single spaces. An empty input yields
        an empty string.
    """
    # pos_tag emits Penn Treebank tags, whose first letter identifies the word
    # class: JJ* = adjective, VB* = verb, NN* = noun, RB* = adverb. WordNet
    # calls adjectives 'a', so the Treebank 'J' must be translated explicitly —
    # simply lowercasing the first letter never produces 'a' and would leave
    # every adjective un-lemmatized.
    penn_to_wordnet = {"J": "a", "V": "v", "N": "n", "R": "r"}

    lemmatized_words = []
    for word, tag in pos_tag(word_tokenize(text)):
        wntag = penn_to_wordnet.get(tag[:1].upper())
        if wntag is None:
            # No WordNet equivalent for this tag: keep the token as-is.
            lemmatized_words.append(word)
        else:
            lemmatized_words.append(lemmatizer.lemmatize(word, wntag))
    return " ".join(lemmatized_words)  # Join the tokens back into a string

# Function to remove special characters from the text
def remove_special_characters(text):
    """Strip every character that is not an ASCII letter, a digit or whitespace.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with all non-matching characters deleted; whitespace
        (including tabs and newlines) is left exactly as it was.
    """
    # The negated character class matches anything outside a-z, A-Z, 0-9 and \s;
    # each match is replaced by the empty string.
    return re.sub(r"[^a-zA-Z0-9\s]", "", text)

# Derive the text columns in two passes, each reading the column produced before it:
#   1. "post-comment"    -> "cleaned-comment"     (special characters removed)
#   2. "cleaned-comment" -> "lemmatized-comment"  (POS-aware lemmatization)
for target_column, source_column, transform in (
    ("cleaned-comment", "post-comment", remove_special_characters),
    ("lemmatized-comment", "cleaned-comment", lemmatize_text_with_pos),
):
    data_df[target_column] = data_df[source_column].apply(transform)

print("Lemmatization and special character removal complete.")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Furkan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Furkan\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Lemmatization and special character removal complete.


In [15]:
# The sixteen MBTI type codes; a code's position in this list is its numeric label.
mbti_types = [
    'ISTJ', 'ISFJ', 'INFJ', 'INTJ',
    'ISTP', 'ISFP', 'INFP', 'INTP',
    'ESTP', 'ESFP', 'ENFP', 'ENTP',
    'ESTJ', 'ESFJ', 'ENFJ', 'ENTJ',
]

# Lookup table from MBTI type code to its integer label (0-15, in list order)
mbti_label_map = dict(zip(mbti_types, range(len(mbti_types))))

# Translate the "flair" column through the lookup table into "numerical-label".
# Series.map leaves NaN wherever a flair value is not a key of the table.
data_df['numerical-label'] = data_df['flair'].map(mbti_label_map)

# Remove the columns that are not written to the output file
data_df.drop(columns=['cleaned-comment', "post-comment", "username", "url"], inplace=True)

# Persist the remaining columns (including the lemmatized text and numeric labels) as CSV
data_df.to_csv("data_clean.csv", index=False)
