In [None]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

In [None]:

# Fetch the NLTK resources used in this notebook: the punkt tokenizer data,
# the stop-word corpus and the WordNet corpora.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)

# Kept as the cell's final expression so its return value is still displayed.
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [None]:
# Load the dataset. The file is parsed as comma-separated; change `sep` to ';',
# '|' or another delimiter if the CSV uses a different one.
df = pd.read_csv(
    'combined_emotion.csv',
    sep=',',
    encoding='latin1',
)


In [None]:
# Shared NLP helpers used by preprocess_text.
stop_words = {*stopwords.words('english')}  # a set, for fast membership tests
stemmer, lemmatizer = PorterStemmer(), WordNetLemmatizer()

In [None]:
def preprocess_text(text):
    """Tokenize ``text`` and return its filtered, stemmed and lemmatized tokens.

    The text is lower-cased and tokenized with ``word_tokenize``. Tokens that
    are not purely alphabetic, or that appear in the module-level
    ``stop_words`` set, are dropped. The surviving tokens are then stemmed and
    lemmatized independently (both start from the filtered tokens).

    Args:
        text: The sentence to process. Non-string values (for example the NaN
            that pandas uses for an empty CSV cell, or ``None``) are treated
            as empty text instead of raising ``AttributeError``.

    Returns:
        A tuple ``(filtered_tokens, stemmed, lemmatized)`` of three lists of
        strings. All three lists are empty when ``text`` is not a string.
    """
    # Guard first: missing values have no .lower() and would otherwise abort
    # the whole DataFrame.apply call that drives this function.
    if not isinstance(text, str):
        return [], [], []

    tokens = word_tokenize(text.lower())
    filtered_tokens = [
        word for word in tokens
        if word.isalpha() and word not in stop_words
    ]
    stemmed = [stemmer.stem(word) for word in filtered_tokens]
    lemmatized = [lemmatizer.lemmatize(word) for word in filtered_tokens]
    return filtered_tokens, stemmed, lemmatized

In [None]:

# Run preprocess_text once per sentence and expand the returned 3-tuples into
# the three result columns. Building a single DataFrame from the list of
# tuples avoids constructing a separate pd.Series for every row, which is
# slow on large datasets; the resulting columns are the same.
df[['filtered_tokens', 'stemmed', 'lemmatized']] = pd.DataFrame(
    df['sentence'].map(preprocess_text).tolist(),
    index=df.index,  # keep row alignment with the source frame
    columns=['filtered_tokens', 'stemmed', 'lemmatized'],
)

In [None]:

# Preview the preprocessing results for the first few rows only; printing
# every row of the dataset would flood the notebook output.
for index, row in df.head(5).iterrows():
    print(f"Original Text: {row['sentence']}")  # raw text from the 'sentence' column
    print(f"Filtered Tokens: {row['filtered_tokens']}")
    print(f"Stemmed Tokens: {row['stemmed']}")
    print(f"Lemmatized Tokens: {row['lemmatized']}")
    print("-" * 100)

Original Text: France
Filtered Tokens: ['france']
Stemmed Tokens: ['franc']
Lemmatized Tokens: ['france']
----------------------------------------------------------------------------------------------------
Original Text: Spain
Filtered Tokens: ['spain']
Stemmed Tokens: ['spain']
Lemmatized Tokens: ['spain']
----------------------------------------------------------------------------------------------------
Original Text: Germany
Filtered Tokens: ['germany']
Stemmed Tokens: ['germani']
Lemmatized Tokens: ['germany']
----------------------------------------------------------------------------------------------------
Original Text: Spain
Filtered Tokens: ['spain']
Stemmed Tokens: ['spain']
Lemmatized Tokens: ['spain']
----------------------------------------------------------------------------------------------------
Original Text: Germany
Filtered Tokens: ['germany']
Stemmed Tokens: ['germani']
Lemmatized Tokens: ['germany']
--------------------------------------------------------------

In [None]:
# Write the processed DataFrame to a new CSV file, leaving out the row index.
df.to_csv(
    'optimized_processed_data.csv',
    index=False,
)

In [None]:
# Tell the user where the processed dataset was written.
print(
    "Processed dataset saved as 'optimized_processed_data.csv'"
)

Processed dataset saved as 'optimized_processed_data.csv'


In [None]:
import pandas as pd
import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer


# Stemmer and lemmatizer shared by the helper functions in this cell.
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()


# Fetch the NLTK resources this cell relies on. 'punkt_tab' and 'omw-1.4' are
# included to match the download cell at the top of the notebook; recent NLTK
# releases look up 'punkt_tab' when word_tokenize is called.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

def filter_tokens(sentence):
    """Tokenize ``sentence`` and drop English stop words.

    Stop-word matching is case-insensitive, but the tokens that are kept are
    returned with their original casing.

    Args:
        sentence: The text to tokenize.

    Returns:
        A list of the tokens whose lower-cased form is not an English stop word.
    """
    english_stops = set(stopwords.words('english'))
    kept = []
    for token in word_tokenize(sentence):
        if token.lower() in english_stops:
            continue
        kept.append(token)
    return kept


def stem_tokens(tokens):
    """Return a list with the stem of every token in ``tokens``.

    Each token is passed to the module-level ``stemmer``; order is preserved.
    """
    stems = []
    for token in tokens:
        stems.append(stemmer.stem(token))
    return stems

def lemmatize_tokens(tokens):
    """Return a list with the lemma of every token in ``tokens``.

    Each token is passed to the module-level ``lemmatizer``; order is preserved.
    """
    lemmas = []
    for token in tokens:
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas

# Collect sentences from the user until they type 'exit', running each one
# through the filter / stem / lemmatize helpers and recording the results.
data = []

prompt = "Enter a sentence (or type 'exit' to finish): "
while True:
    user_input = input(prompt)
    # Every prompt after the first one uses the "another sentence" wording.
    prompt = "Enter another sentence (or type 'exit' to finish): "

    # Ignore surrounding whitespace so that e.g. "exit " still ends the loop.
    command = user_input.strip().lower()
    if command == 'exit':
        break
    if not command:
        # Blank input carries no tokens; ask again instead of storing an empty row.
        continue

    filtered = filter_tokens(user_input)
    stemmed = stem_tokens(filtered)
    lemmatized = lemmatize_tokens(filtered)

    data.append({
        'sentence': user_input,
        'filtered_tokens': filtered,
        'stemmed': stemmed,
        'lemmatized': lemmatized
    })


# Turn the collected records into a DataFrame (one row per entered sentence).
df = pd.DataFrame(data)


# Print each sentence alongside its filtered, stemmed and lemmatized tokens,
# followed by a 100-character divider line.
divider = "-" * 100
for idx, record in df.iterrows():
    print(
        f"Original Text: {record['sentence']}\n"
        f"Filtered Tokens: {record['filtered_tokens']}\n"
        f"Stemmed Tokens: {record['stemmed']}\n"
        f"Lemmatized Tokens: {record['lemmatized']}"
    )
    print(divider)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Enter a sentence (or type 'exit' to finish): i dont know i feel so lost
Enter another sentence (or type 'exit' to finish): i can still lose the weight without feeling deprived
Enter another sentence (or type 'exit' to finish): exit
Original Text: i dont know i feel so lost
Filtered Tokens: ['dont', 'know', 'feel', 'lost']
Stemmed Tokens: ['dont', 'know', 'feel', 'lost']
Lemmatized Tokens: ['dont', 'know', 'feel', 'lost']
----------------------------------------------------------------------------------------------------
Original Text: i can still lose the weight without feeling deprived
Filtered Tokens: ['still', 'lose', 'weight', 'without', 'feeling', 'deprived']
Stemmed Tokens: ['still', 'lose', 'weight', 'without', 'feel', 'depriv']
Lemmatized Tokens: ['still', 'lose', 'weight', 'without', 'feeling', 'deprived']
----------------------------------------------------------------------------------------------------
