# Import libraries

In [74]:
import pandas as pd
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem import PorterStemmer, WordNetLemmatizer

# Fetch every NLTK resource the notebook relies on so it runs on a fresh
# environment (Restart Kernel -> Run All):
#   - punkt: tokenizer models required by nltk.word_tokenize
#   - stopwords: English stopword list
#   - wordnet: dictionary used by WordNetLemmatizer
#   - averaged_perceptron_tagger: model used by nltk.pos_tag
# NOTE(review): newer NLTK releases may additionally ask for 'punkt_tab' /
# 'averaged_perceptron_tagger_eng' -- confirm against the installed version.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\arman\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\arman\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\arman\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

# Load CSV file into a pandas dataframe

In [75]:
# Directory that holds the project's CSV data files.
base_path = "C:/Users/arman/OneDrive/Desktop/Automatic-Privacy-Detection/Data/"

# Read the training set into the working dataframe.
train_csv = base_path + 'train_set.csv'
df = pd.read_csv(train_csv)

# Define a function to tokenize a sentence, remove stopwords, then stem and lemmatize the remaining tokens

In [76]:
# Lazily-populated cache of the NLTK objects preprocess_sentence needs.
# Building them once (instead of on every call) avoids re-reading the stopword
# corpus for each row when the function is used with Series.apply.
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Return (stop_words, stemmer, lemmatizer), creating them on first use."""
    if not _PREPROCESS_RESOURCES:
        # Build everything first so a failure leaves the cache empty rather
        # than half-filled.
        built = {
            'stop_words': frozenset(stopwords.words('english')),
            'stemmer': PorterStemmer(),
            'lemmatizer': WordNetLemmatizer(),
        }
        _PREPROCESS_RESOURCES.update(built)
    return (
        _PREPROCESS_RESOURCES['stop_words'],
        _PREPROCESS_RESOURCES['stemmer'],
        _PREPROCESS_RESOURCES['lemmatizer'],
    )


def _wordnet_pos(tag):
    """Map a tag from nltk.pos_tag to a WordNet POS constant (noun by default)."""
    by_first_letter = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
    }
    return by_first_letter.get(tag[:1], wordnet.NOUN)


def preprocess_sentence(sentence):
    """Tokenize a sentence, drop English stopwords, then stem and lemmatize.

    Parameters
    ----------
    sentence : str
        Raw text to process. Anything that is not a string (for example the
        float NaN pandas uses for a missing CSV cell, or None) and any
        empty/whitespace-only string yields an empty list instead of raising.

    Returns
    -------
    list of str
        The tokens after stopword removal, Porter stemming and WordNet
        lemmatization, in their original order.
    """
    # Missing or blank text has no tokens; bail out before touching NLTK.
    if not isinstance(sentence, str) or not sentence.strip():
        return []

    stop_words, stemmer, lemmatizer = _get_preprocess_resources()

    # tokenize the sentence
    tokens = nltk.word_tokenize(sentence)

    # remove stop words (case-insensitive match against the stopword list)
    filtered_tokens = [token for token in tokens if token.lower() not in stop_words]

    # stem the remaining tokens
    stemmed_tokens = [stemmer.stem(token) for token in filtered_tokens]

    # lemmatize the stemmed tokens
    # NOTE(review): each stemmed token is POS-tagged on its own, so the tagger
    # sees no sentence context. Kept as-is to preserve the existing output.
    lemmatized_tokens = []
    for token in stemmed_tokens:
        pos_tag = nltk.pos_tag([token])[0][1]
        lemmatized_tokens.append(lemmatizer.lemmatize(token, pos=_wordnet_pos(pos_tag)))
    return lemmatized_tokens

# Derive the tokenized, stopword-filtered, stemmed and fully preprocessed columns from the 'text' column of the dataframe

In [78]:
# Build the stopword set and the stemmer once. The previous version called
# stopwords.words('english') for every single token, re-reading the corpus and
# doing a linear list scan each time.
english_stop_words = set(stopwords.words('english'))
porter_stemmer = PorterStemmer()

# Missing text arrives from read_csv as NaN, which the tokenizer cannot
# handle; treat it as an empty string so those rows yield empty token lists.
texts = df['text'].fillna('')

# Stage 1: raw tokens.
df['tokenized_text'] = texts.apply(nltk.word_tokenize)

# Stage 2: tokens with English stopwords removed (case-insensitive).
df['stopwords_text'] = df['tokenized_text'].apply(
    lambda tokens: [tok for tok in tokens if tok.lower() not in english_stop_words]
)

# Stage 3: Porter stems of the stopword-filtered tokens. This column used to
# be filled with preprocess_sentence(), which made it an exact duplicate of
# 'processed_sentences' (stemmed AND lemmatized) rather than the stemmed stage.
df['stemmed_text'] = df['stopwords_text'].apply(
    lambda tokens: [porter_stemmer.stem(tok) for tok in tokens]
)

# Stage 4: the full pipeline (tokenize -> stopwords -> stem -> lemmatize).
df['processed_sentences'] = texts.apply(preprocess_sentence)

# Write the updated dataframe to a new CSV file

In [80]:
# Persist the dataframe, including the derived token columns, next to the
# input data; the row index is not written.
base_path = "C:/Users/arman/OneDrive/Desktop/Automatic-Privacy-Detection/Data/"
output_csv = base_path + 'processed_text.csv'
df.to_csv(output_csv, index=False)