# Data Information

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
import mlxtend
import sklearn.cluster as cluster
import sklearn.neighbors
import sklearn.metrics as metrics
import re
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import string
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report,confusion_matrix, roc_curve, auc, precision_score, recall_score, f1_score, precision_recall_curve
import nltk
nltk.download('wordnet')
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer
import warnings
warnings.filterwarnings('ignore')

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Load and explore the data
#### The data is available at this source, and you can learn more about how and why this dataset is created from this paper.
Data source: https://www.kaggle.com/praveengovi/emotions-dataset-for-nlp

In [None]:
train_data = pd.read_csv('train.txt',names=['sentence','emotion'],header=None, sep=';')
test_data = pd.read_csv('test.txt',names=['sentence','emotion'],header=None, sep=';')
val_data= pd.read_csv('val.txt',names=['sentence','emotion'],header=None, sep=';')
df = pd.concat([train_data,test_data, val_data])
print('Total data:',df.shape)

In [None]:
# Null Check
train_data.isnull().sum()
test_data.isnull().sum()
val_data.isnull().sum()

In [None]:
df = df.drop_duplicates(keep="first") # Drop duplicated data and reindex the data
df_reidx = df.reset_index(drop=True)
df_reidx.shape

(19999, 2)

In [None]:
# convert the emotions to binary labels. love and joy emotions are "not-stressed ==1", and sadness, anger, fear, and surprise are "stressed == 0".
df_reidx['label']=df_reidx['emotion'].replace({'joy': "not-stressed" , 'love': "not-stressed",'surprise': "medium-stressed" ,
                                   'sadness': "stressed", 'anger': "stressed", 'fear': "stressed"})

In [None]:
# check if pos and neg sentiments
df_reidx.label.value_counts()

stressed           10879
not-stressed        8401
medium-stressed      719
Name: label, dtype: int64

In [None]:
df_reidx['length'] = df_reidx['sentence'].apply(len) # number of characters
df_reidx['length'].describe() # info()

count    19999.000000
mean        96.671784
std         55.778779
min          7.000000
25%         53.000000
50%         86.000000
75%        129.000000
max        300.000000
Name: length, dtype: float64

In [None]:
df_reidx.tail()

Unnamed: 0,sentence,emotion,label,length
19994,im having ssa examination tomorrow in the morn...,sadness,stressed,191
19995,i constantly worry about their fight against n...,joy,not-stressed,173
19996,i feel its important to share this info for th...,joy,not-stressed,80
19997,i truly feel that if you are passionate enough...,joy,not-stressed,105
19998,i feel like i just wanna buy any cute make up ...,joy,not-stressed,74


# Text Preprocessing
#### To clean the sentences,we do text preprocessing.

*   Decontracted
*   Data cleaning
Additionally,
*   Spell check
*   Lemmatization
*   Nomalization





######  lemmatization

In [None]:
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Downloading necessary NLTK packages
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

def clean_text(text):
    # Convert text to lowercase
    text = text.lower()

    # Remove punctuation
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)

    # Tokenize the text
    tokens = word_tokenize(text)

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]

    # Lemmatize the tokens
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    # Join tokens back into a string
    cleaned_text = ' '.join(tokens)

    return cleaned_text

# Apply text cleaning to 'sentence' column
df_reidx['cleaned_sentence'] = df_reidx['sentence'].apply(clean_text)

# Display the first few rows of the DataFrame
df_reidx.head()


In [None]:
print(df_reidx.columns)


Index(['sentence', 'emotion', 'label', 'length', 'cleaned_sentence'], dtype='object')


In [None]:
#stemming for extract the actual meaning of the words
from nltk.stem import PorterStemmer

def stemming(phrase):
    stemmer = PorterStemmer()
    stem_output=[]
    stemmed=[]
   for review_text in tqdm(phrase):
        stemmed = [stemmer.stem(word) for word in review_text]
        stem_output.append(stemmed)
    return stem_output

df_reidx['cleaned_sentence'] = stemming(df_reidx['cleaned_sentence'])
df_reidx['cleaned_sentence'].head()

In [None]:
def to_sentence(phrase):
    sentence=[]
    for words in tqdm(phrase):
        sentence.append((" ").join(words))
    return sentence
df_reidx['cleaned_sentence']=to_sentence(df_reidx['cleaned_sentence'])
df_reidx['cleaned_sentence'].head()

In [None]:
from transformers import pipeline

# Load the text classification pipeline with a pre-trained model (e.g., 'distilbert-base-uncased')
classifier = pipeline('text-classification', model='distilbert-base-uncased')

# Prepare your data (assuming df_reidx['sentence'] contains the text data)
sentences = df_reidx['cleaned_sentence'].tolist()

# Perform text classification on the sentences
results = classifier(sentences)

# Display the classification results
for idx, result in enumerate(results):
    print(f"Sentence: {sentences[idx]}")
    print(f"Label: {result['label']}")
    print(f"Score: {result['score']}\n")


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Sentence: laughing husband still feeling skeptical cooking nice gozelemes
Label: LABEL_1
Score: 0.5305254459381104

Sentence: feeling little resentful appeared poor planning organizer
Label: LABEL_1
Score: 0.5290473699569702

Sentence: really like feeling accomplishing something worthwhile
Label: LABEL_1
Score: 0.5247830152511597

Sentence: feel suck mad sad
Label: LABEL_1
Score: 0.5288465619087219

Sentence: im feeling particularly smug create
Label: LABEL_1
Score: 0.5216125845909119

Sentence: feel see bothered
Label: LABEL_1
Score: 0.5306596755981445

Sentence: want reader friend feel like need feel sorry
Label: LABEL_1
Score: 0.5242463946342468

Sentence: im saying feel fake
Label: LABEL_1
Score: 0.5206494927406311

Sentence: love giddy feeling finding someone little bit cute wanting know
Label: LABEL_1
Score: 0.5118481516838074

Sentence: write feel afraid silly little thought enough help
Label: LABEL_1
Score: 0.5289

In [None]:
from transformers import pipeline

# Load the text classification pipeline with a pre-trained model (e.g., 'albert-base-v2')
classifier = pipeline('text-classification', model='albert-base-v2')

# Prepare your data (assuming df_reidx['sentence'] contains the text data)
sentences = df_reidx['cleaned_sentence'].tolist()

# Perform text classification on the sentences
results = classifier(sentences)

# Display the classification results
for idx, result in enumerate(results):
    print(f"Sentence: {sentences[idx]}")
    print(f"Label: {result['label']}")
    print(f"Score: {result['score']}\n")

config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/47.4M [00:00<?, ?B/s]

Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


spiece.model:   0%|          | 0.00/760k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

Sentence: didnt feel humiliated
Label: LABEL_0
Score: 0.569797158241272

Sentence: go feeling hopeless damned hopeful around someone care awake
Label: LABEL_0
Score: 0.5457795262336731

Sentence: im grabbing minute post feel greedy wrong
Label: LABEL_0
Score: 0.5628489851951599

Sentence: ever feeling nostalgic fireplace know still property
Label: LABEL_0
Score: 0.5013441443443298

Sentence: feeling grouchy
Label: LABEL_0
Score: 0.514109194278717

Sentence: ive feeling little burdened lately wasnt sure
Label: LABEL_0
Score: 0.5857308506965637

Sentence: ive taking milligram time recommended amount ive fallen asleep lot faster also feel like funny
Label: LABEL_0
Score: 0.5457446575164795

Sentence: feel confused life teenager jaded year old man
Label: LABEL_1
Score: 0.5715424418449402

Sentence: petronas year feel petronas performed well made huge profit
Label: LABEL_1
Score: 0.5350083708763123

Sentence: feel romantic
Label: LABEL_1
Score: 0.5100115537643433

Sentence: feel like make s

# Feature Engineering

### CounterVectorize: tokenization:



In [None]:
# convert the cleaned sentences to vectors
token = RegexpTokenizer(r'[a-zA-Z0-9]+')
# a built-in stop word list for english is used
# all values of n such than min_n<=n<= max_n will be used. (1,1): only unigrams, (1,2):uni&bigram, (2,2): only bigrams
# max_df: when building the vocabulary, ignore terms that have a document frequency strictly higher than the given threshold.
# min_df: ignore terms that have a document frequency strictly lower than the given threshold.

vectorizer = CountVectorizer(stop_words='english', max_df=0.5, min_df=3, ngram_range=(1,1),tokenizer = token.tokenize)
x = vectorizer.fit_transform(df_reidx.cleaned_sentence)
y = df_reidx.label.values

print("X.shape : ",x.shape)
print("y.shape : ",y.shape)

### TF-IDF integration :


In [None]:
!pip install scikit-learn

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Create the TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.5, min_df=3, ngram_range=(1, 1), tokenizer=token.tokenize)

# Transform the data
x_tfidf = tfidf_vectorizer.fit_transform(df_reidx.cleaned_sentence)

In [None]:
train_idx, test_idx = train_test_split(np.arange(df_reidx.shape[0]), test_size=0.3,shuffle=True, random_state=42)

# Split the data
x_train_tfidf = x_tfidf[train_idx]
x_test_tfidf = x_tfidf[test_idx]

y_train = y[train_idx]

y_test = y[test_idx]
print("Number of training examples:{}".format(len(train_idx)))
print("Number of testing examples:{}\n".format(len(test_idx)))
print("Training data: X_train : {}, y_train : {}".format(x_train_tfidf.shape, y_train.shape))
print("Testing data: X_test : {}, y_test : {}".format(x_test_tfidf.shape, y_test.shape))

# Train Test split

```
# This is formatted as code
```



In [None]:
# do shuffle to make neg and pos data of data set split equaly in the test and training set
# do random_sate for making it settle when we run this code repeatedly
train_idx, test_idx = train_test_split(np.arange(df_reidx.shape[0]), test_size=0.3,shuffle=True, random_state=42)

x_train = x[train_idx]
y_train = y[train_idx]

x_test = x[test_idx]
y_test = y[test_idx]
print("Number of training examples:{}".format(len(train_idx)))
print("Number of testing examples:{}\n".format(len(test_idx)))
print("Training data: X_train : {}, y_train : {}".format(x_train.shape, y_train.shape))
print("Testing data: X_test : {}, y_test : {}".format(x_test.shape, y_test.shape))

In [None]:
x_train.shape

# Model Training

#### Logistic Regression

In [None]:
# fit a logistic regression classifier on the training data use default settings
lr_clf = LogisticRegression()
lr_clf.fit(x_train, y_train)

# make prediction on testing data
y_pred_test_lr = lr_clf.predict(x_test)
y_predprob_lr = lr_clf.predict_proba(x_test)
matrix_lr = confusion_matrix(y_test,y_pred_test_lr)
print(classification_report(y_test, y_pred_test_lr))
print("\nAccuracy for Logistic Regression model:",metrics.accuracy_score(y_test, y_pred_test_lr))
print("\n")
y_predict = lr_clf.predict(x_test)
matrix_display = ConfusionMatrixDisplay(matrix_lr).plot()

#### Naive Bayes classifier

##### BernouliNB

A binary algorithm used when the feature is present or not.

In [None]:
from sklearn.naive_bayes import BernoulliNB
nb_clf = BernoulliNB()
nb_clf.fit(x_train, y_train)
# make prediction on testing data
y_pred_test_nb = nb_clf.predict(x_test)
y_predprob_nb = nb_clf.predict_proba(x_test)
matrix_nb = confusion_matrix(y_test,y_pred_test_nb)
print(classification_report(y_test, y_pred_test_nb))
print("\nAccuracy for Bernouli Naive Bayes model:",metrics.accuracy_score(y_test, y_pred_test_nb))
print("\n")
matrix_display = ConfusionMatrixDisplay(matrix_nb).plot()

##### MultinominaliNB

It consider a feature vector where a given term represents the number of times it appears or very ofen, such as frequency.

In [None]:
mnb = MultinomialNB()
mnb.fit(x_train, y_train)
# make prediction on testing data
y_pred_test_mnb = mnb.predict(x_test)
y_predprob_mnb = mnb.predict_proba(x_test)
matrix = confusion_matrix(y_test,y_pred_test_mnb)
print(classification_report(y_test, y_pred_test_mnb))
print("\nAccuracy for multinominal Naive Bayes model:",metrics.accuracy_score(y_test, y_pred_test_mnb))
print("\n")

y_predict = mnb.predict(x_test)
cm = confusion_matrix(y_test, y_predict)
cm_display = ConfusionMatrixDisplay(cm).plot()


# Cross validataion

In [None]:
acc_score_lr = metrics.accuracy_score(y_pred_test_lr,y_test)
prec_score_lr = precision_score(y_test,y_pred_test_lr, average='macro')
recall_lr = recall_score(y_test, y_pred_test_lr,average='macro')
f1_lr = f1_score(y_test,y_pred_test_nb,average='macro')
matrix_lr = confusion_matrix(y_test,y_pred_test_lr)
print('Logistic Regression Model\n')
print(str('Accuracy: '+'{:04.2f}'.format(acc_score_lr*100))+'%')
print(str('Precision: '+'{:04.2f}'.format(prec_score_lr*100))+'%')
print(str('Recall: '+'{:04.2f}'.format(recall_lr*100))+'%')
print('F1 Score: ',f1_lr)
print(matrix_lr)

##### BernouliNB

In [None]:
acc_score_nb = metrics.accuracy_score(y_pred_test_nb,y_test)
prec_score_nb = precision_score(y_test,y_pred_test_nb, average='macro')
recall_nb = recall_score(y_test, y_pred_test_nb,average='macro')
f1_nb = f1_score(y_test,y_pred_test_nb,average='macro')
matrix_nb = confusion_matrix(y_test,y_pred_test_nb)
print('Bernouli Naive Bayes Model\n')
print(str('Accuracy: '+'{:04.2f}'.format(acc_score_nb*100))+'%')
print(str('Precision: '+'{:04.2f}'.format(prec_score_nb*100))+'%')
print(str('Recall: '+'{:04.2f}'.format(recall_nb*100))+'%')
print('F1 Score: ',f1_nb)
print(matrix_nb)

##### MultinominaliNB

In [None]:
acc_score_mnb = metrics.accuracy_score(y_pred_test_mnb,y_test)
prec_score_mnb = precision_score(y_test,y_pred_test_mnb, average='macro')
recall_mnb = recall_score(y_test, y_pred_test_mnb,average='macro')
f1_mnb = f1_score(y_test,y_pred_test_mnb,average='macro')
matrix_mnb = confusion_matrix(y_test,y_pred_test_mnb)
print('Multimominal Naive Bayes Model\n')
print(str('Accuracy: '+'{:04.2f}'.format(acc_score_mnb*100))+'%')
print(str('Precision: '+'{:04.2f}'.format(prec_score_mnb*100))+'%')
print(str('Recall: '+'{:04.2f}'.format(recall_mnb*100))+'%')
print('F1 Score: ',f1_mnb)
print(matrix_mnb)

# Explain the model prediction

Multimominal Naive Bayes Model has higher accuracy than Bernouli Naive Bayes Model.

In [None]:
test_data = df_reidx.iloc[test_idx]
test_data['pred_label'] = y_pred_test_lr
test_data.head()[['sentence','label','pred_label']]
# shows what the prediction label fit to the real label

In [None]:
# shows what the prediction label does not fit to the real label
test_data[test_data['label'] != test_data['pred_label']].head()[['sentence','label','pred_label']].head()

##### Predicted features of logistic regression model

In [None]:
feature_to_coef = {word: float("%.3f" % coef) for word, coef in zip(vectorizer.get_feature_names_out(), lr_clf.coef_[0])}

print("Top positive features:")
sorted(feature_to_coef.items(), key=lambda x: x[1], reverse=True)[:10]


In [None]:
# Create the feature_to_coef dictionary
feature_to_coef = {word: float("%.3f" % coef) for word, coef in zip(vectorizer.get_feature_names_out(), lr_clf.coef_[0])}

# Print top negative features
print("Top negative features:")
sorted_negative_features = sorted(feature_to_coef.items(), key=lambda x: x[1], reverse=False)[:10]
for feature in sorted_negative_features:
    print(feature)


##### Predicted features of BernouliNB

In [None]:
# Create the feature_to_coef dictionary for the positive class (usually class 1 in binary classification)
feature_to_coef = {word: abs(float("%.3f" % coef)) for word, coef in zip(vectorizer.get_feature_names_out(), nb_clf.feature_log_prob_[1])}

# Print top positive features
print("Top positive features:")
sorted_positive_features = sorted(feature_to_coef.items(), key=lambda x: x[1], reverse=True)[:10]
for feature in sorted_positive_features:
    print(feature)


In [None]:
# most of the words are reliable evidence of indicating negative sentiments
print("Top negative features:")
sorted(feature_to_coef.items(), key=lambda x: x[1], reverse=False)[:10]

##### Predicted features of multinomial NB

In [None]:
import math

# If you're working with binary classification, index 1 is usually the positive class
feature_to_coef = {word: float("%.3f" % math.exp(coef)) for word, coef in zip(vectorizer.get_feature_names_out(), mnb.feature_log_prob_[1])}

# Print top positive features
print("Top positive features:")
sorted_positive_features = sorted(feature_to_coef.items(), key=lambda x: x[1], reverse=True)[:10]
for feature in sorted_positive_features:
    print(feature)

In [None]:
# most of the words are reliable evidence of indicating negative sentiments
print("Top negative features:")
sorted(feature_to_coef.items(), key=lambda x: x[1], reverse=False)[:10]

In [None]:
text=['i am not feeling well', 'i want to make this project better', 'i feel aaaaaaah', 'i feel like a bit of a strange one', 'i feel overwhelmed']
test_result = lr_clf.predict(vectorizer.transform(text))
print(test_result)

NameError: ignored

POS

In [None]:
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

# Specify the path to your text file
file_path = '/content/val.txt'

# Read the content of the text file
with open(file_path, 'r', encoding='utf-8') as file:
    text = file.read()

# Tokenize the text into words
words = word_tokenize(text)

# Perform POS tagging
pos_tags = nltk.pos_tag(words)

# Print the POS tags
print(pos_tags)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


[('im', 'NN'), ('feeling', 'VBG'), ('quite', 'RB'), ('sad', 'JJ'), ('and', 'CC'), ('sorry', 'NN'), ('for', 'IN'), ('myself', 'PRP'), ('but', 'CC'), ('ill', 'VB'), ('snap', 'VBZ'), ('out', 'IN'), ('of', 'IN'), ('it', 'PRP'), ('soon', 'RB'), (';', ':'), ('sadness', 'NN'), ('i', 'VBP'), ('feel', 'VBP'), ('like', 'IN'), ('i', 'NN'), ('am', 'VBP'), ('still', 'RB'), ('looking', 'VBG'), ('at', 'IN'), ('a', 'DT'), ('blank', 'JJ'), ('canvas', 'NN'), ('blank', 'NN'), ('pieces', 'NNS'), ('of', 'IN'), ('paper', 'NN'), (';', ':'), ('sadness', 'CC'), ('i', 'VB'), ('feel', 'VBP'), ('like', 'IN'), ('a', 'DT'), ('faithful', 'JJ'), ('servant', 'NN'), (';', ':'), ('love', 'CC'), ('i', 'VB'), ('am', 'VBP'), ('just', 'RB'), ('feeling', 'VBG'), ('cranky', 'NN'), ('and', 'CC'), ('blue', 'NN'), (';', ':'), ('anger', 'CC'), ('i', 'VB'), ('can', 'MD'), ('have', 'VB'), ('for', 'IN'), ('a', 'DT'), ('treat', 'NN'), ('or', 'CC'), ('if', 'IN'), ('i', 'JJ'), ('am', 'VBP'), ('feeling', 'VBG'), ('festive', 'NN'), (';',

In [None]:
# Specify the path to your text file
file_path = '/content/val.txt'

# Define a function to generate N-grams
def generate_ngrams(text, n):
    words = text.split()  # Split the text into words
    ngrams = zip(*[words[i:] for i in range(n)])  # Create N-grams
    return [' '.join(ngram) for ngram in ngrams]

# Read the content of the text file
with open(file_path, 'r', encoding='utf-8') as file:
    text = file.read()

# Specify the desired value of N (e.g., 2 for bigrams, 3 for trigrams)
n = 2  # Change this to the desired N-gram value

# Generate and print N-grams
ngrams = generate_ngrams(text, n)
for ngram in ngrams:
    print(ngram)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
idiotic stunt
stunt you
you pulled
pulled in
in the
the other
other room
room either
either though
though i
i do
do ask
ask that
that you
you dont
dont repeat
repeat it;sadness
it;sadness ive
ive come
come to
to appreciate
appreciate in
in the
the uk
uk where
where the
the general
general lack
lack of
of chilli
chilli and
and other
other spicy
spicy foods
foods usually
usually leaves
leaves me
me feeling
feeling somewhat
somewhat appalled;anger
appalled;anger i
i feel
feel honestly
honestly sorry
sorry for
for you;sadness
you;sadness i
i was
was feeling
feeling hesitant
hesitant to
to part
part with
with any
any more
more money
money after
after my
my spendy
spendy trip
trip to
to melbourne
melbourne i
i chose
chose instead
instead to
to modify
modify my
my existing
existing copy
copy of
of a
a href
href http
http www;fear
www;fear i
i didn
didn t
t leave
leave feeling
feeling sarcastic
sarcastic and
and annoyed
annoyed a

In [None]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize

# Download necessary NLTK data
nltk.download('punkt')

# Path to the file
file_path = "/content/val.txt"

# Read the file
with open(file_path, 'r') as file:
    text = file.read()

# Tokenize the text into sentences and words
sentences = sent_tokenize(text)
words = word_tokenize(text)

# Print sentences and words for comparison
print("Sentences:")
for i, sentence in enumerate(sentences):
    print(f"Sentence {i+1}: {sentence}")

print("\nWords:")
for i, word in enumerate(words):
    print(f"Word {i+1}: {word}")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Word 36755: sure
Word 36756: its
Word 36757: just
Word 36758: apart
Word 36759: of
Word 36760: growing
Word 36761: up
Word 36762: ;
Word 36763: joy
Word 36764: i
Word 36765: feel
Word 36766: that
Word 36767: perfume
Word 36768: ought
Word 36769: to
Word 36770: last
Word 36771: all
Word 36772: day
Word 36773: long
Word 36774: and
Word 36775: never
Word 36776: having
Word 36777: to
Word 36778: reapply
Word 36779: which
Word 36780: is
Word 36781: certainly
Word 36782: not
Word 36783: the
Word 36784: case
Word 36785: with
Word 36786: dorothy
Word 36787: jessica
Word 36788: parker
Word 36789: s
Word 36790: lovely
Word 36791: ;
Word 36792: love
Word 36793: i
Word 36794: mulled
Word 36795: this
Word 36796: idea
Word 36797: over
Word 36798: in
Word 36799: my
Word 36800: head
Word 36801: as
Word 36802: much
Word 36803: as
Word 36804: i
Word 36805: loved
Word 36806: it
Word 36807: i
Word 36808: also
Word 36809: noticed
Word 36810: 

In [None]:
import nltk
from nltk.tokenize import sent_tokenize

# Download necessary NLTK data
nltk.download('punkt')

# Path to the file
file_path = "/content/val.txt"

# Read the file
with open(file_path, 'r') as file:
    text = file.read()

# Tokenize the text into sentences
sentences = sent_tokenize(text)

# Print the original text
print("Original Text:")
print(text)

# Print tokenized sentences for comparison
print("\nTokenized Sentences:")
for i, sentence in enumerate(sentences):
    print(f"Sentence {i+1}: {sentence}")


Original Text:
im feeling quite sad and sorry for myself but ill snap out of it soon;sadness
i feel like i am still looking at a blank canvas blank pieces of paper;sadness
i feel like a faithful servant;love
i am just feeling cranky and blue;anger
i can have for a treat or if i am feeling festive;joy
i start to feel more appreciative of what god has done for me;joy
i am feeling more confident that we will be able to take care of this baby;joy
i feel incredibly lucky just to be able to talk to her;joy
i feel less keen about the army every day;joy
i feel dirty and ashamed for saying that;sadness
i feel bitchy but not defeated yet;anger
i was dribbling on mums coffee table looking out of the window and feeling very happy;joy
i woke up often got up around am feeling pukey radiation and groggy;sadness
i was feeling sentimental;sadness
i walked out of there an hour and fifteen minutes later feeling like i had been beaten with a stick and then placed on the rack and stretched;sadness
i never 

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Load the tokenizer and model from Hugging Face
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased")

# Path to the file
file_path = "/content/val.txt"

# Read the file and process its content
with open(file_path, "r") as file:
    text = file.read()

# Tokenize the text and prepare input for the model
inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)

# Get predictions from the model
outputs = model(**inputs)

# Convert logits to probabilities (optional)
probabilities = torch.nn.functional.softmax(outputs.logits, dim=1)

# Print the results
print("Logits:", outputs.logits)
print("Probabilities:", probabilities)

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Logits: tensor([[-0.5346, -0.3295]], grad_fn=<AddmmBackward0>)
Probabilities: tensor([[0.4489, 0.5511]], grad_fn=<SoftmaxBackward0>)
