In [1]:
print("mmm");

mmm


In [2]:
print("m")

m


In [4]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle

# Download necessary NLTK resources
nltk.download("punkt")
nltk.download("stopwords")
nltk.download("wordnet")

# Load Dataset
df = pd.read_csv("sentimentdataset.csv")

# Select relevant columns
text_column = "Text"       # Text data
label_column = "Sentiment" # Sentiment labels

# 1. Text Cleaning
def clean_text(text):
    text = str(text).lower()  # Lowercasing
    text = re.sub(r"[^a-z\s]", "", text)  # Remove special characters
    text = re.sub(r"\s+", " ", text).strip()  # Remove extra spaces
    return text

df["cleaned_text"] = df[text_column].apply(clean_text)

# 2. Lemmatization
lemmatizer = WordNetLemmatizer()
def lemmatize_text(text):
    words = word_tokenize(text)
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
    return " ".join(lemmatized_words)

df["lemmatized_text"] = df["cleaned_text"].apply(lemmatize_text)

# 3. Stop Word Removal
stop_words = set(stopwords.words("english"))
def remove_stopwords(text):
    words = word_tokenize(text)
    filtered_words = [word for word in words if word not in stop_words]
    return " ".join(filtered_words)

df["final_text"] = df["lemmatized_text"].apply(remove_stopwords)

# 4. Label Encoding
label_encoder = LabelEncoder()
df["label_encoded"] = label_encoder.fit_transform(df[label_column])

# Save Label Encoder
with open("label_encoder.pkl", "wb") as file:
    pickle.dump(label_encoder, file)

# 5. TF-IDF Representation
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(df["final_text"])

# Convert TF-IDF matrix to DataFrame
tfidf_df = pd.DataFrame(X_tfidf.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

# Save TF-IDF Model and Data
with open("tfidf_vectorizer.pkl", "wb") as file:
    pickle.dump(tfidf_vectorizer, file)

df.to_csv("cleaned_data.csv", index=False)
tfidf_df.to_csv("tfidf_representation.csv", index=False)

print("Text processing and TF-IDF representation completed. Outputs saved!")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\COMP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\COMP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\COMP\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Text processing and TF-IDF representation completed. Outputs saved!


In [5]:
import pickle

# Load Label Encoder
with open("label_encoder.pkl", "rb") as file:
    label_encoder = pickle.load(file)

# Check Label Mappings
print("Label Classes:", label_encoder.classes_)  # Shows original sentiment labels
print("Encoded Values:", list(range(len(label_encoder.classes_))))  # Shows corresponding encoded values


Label Classes: [' Acceptance   ' ' Acceptance      ' ' Accomplishment ' ' Admiration '
 ' Admiration   ' ' Admiration    ' ' Adoration    ' ' Adrenaline     '
 ' Adventure ' ' Affection    ' ' Amazement ' ' Ambivalence '
 ' Ambivalence     ' ' Amusement    ' ' Amusement     ' ' Anger        '
 ' Anticipation ' ' Anticipation  ' ' Anxiety   ' ' Anxiety         '
 ' Appreciation  ' ' Apprehensive ' ' Arousal       ' ' ArtisticBurst '
 ' Awe ' ' Awe    ' ' Awe          ' ' Awe           ' ' Bad '
 ' Betrayal ' ' Betrayal      ' ' Bitter       ' ' Bitterness '
 ' Bittersweet ' ' Blessed       ' ' Boredom ' ' Boredom         '
 ' Breakthrough ' ' Calmness     ' ' Calmness      ' ' Captivation '
 ' Celebration ' ' Celestial Wonder ' ' Challenge ' ' Charm ' ' Colorful '
 ' Compassion' ' Compassion    ' ' Compassionate ' ' Confidence    '
 ' Confident ' ' Confusion ' ' Confusion    ' ' Confusion       '
 ' Connection ' ' Contemplation ' ' Contentment ' ' Contentment   '
 ' Coziness     ' ' Cre

In [6]:
import pickle

# Load TF-IDF Vectorizer
with open("tfidf_vectorizer.pkl", "rb") as file:
    tfidf_vectorizer = pickle.load(file)

# Check Vocabulary (Top 10 Features)
print("TF-IDF Vocabulary (Top 10):", list(tfidf_vectorizer.vocabulary_.keys())[:10])


TF-IDF Vocabulary (Top 10): ['enjoying', 'beautiful', 'day', 'park', 'traffic', 'wa', 'terrible', 'morning', 'finished', 'amazing']


In [7]:
# Example new text
new_text = ["I love sunny days at the park!"]

# Transform text into TF-IDF representation
new_text_tfidf = tfidf_vectorizer.transform(new_text)

# Convert to array for viewing
print("TF-IDF Representation:\n", new_text_tfidf.toarray())


TF-IDF Representation:
 [[0. 0. 0. ... 0. 0. 0.]]
