In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix
import nltk
from nltk.corpus import gutenberg, opinion_lexicon, stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import bigrams
from collections import Counter
import re


In [None]:
nltk.download('gutenberg')
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('opinion_lexicon')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Unzipping corpora/gutenberg.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package opinion_lexicon to /root/nltk_data...
[nltk_data]   Unzipping corpora/opinion_lexicon.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [None]:
def preprocess_text(text):
    if pd.isna(text):
        return ""
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    words = word_tokenize(str(text).lower())
    words = [lemmatizer.lemmatize(word) for word in words if word.isalpha() and word not in stop_words]
    return ' '.join(words)


In [None]:
def extract_keywords(quotes):
    stop_words = set(stopwords.words('english'))
    all_words = ' '.join(quotes).split()
    filtered_words = [word for word in all_words if word.lower() not in stop_words]
    return Counter(filtered_words)

In [None]:
def score_sentence_by_keywords(sentence, word_freq_dict):
    words = sentence.split()
    bigrams = [' '.join(bigram) for bigram in nltk.bigrams(words)]
    trigrams = [' '.join(trigram) for trigram in nltk.trigrams(words)]
    all_ngrams = words + bigrams + trigrams
    score = sum(word_freq_dict.get(ngram, 0) for ngram in all_ngrams)
    return score

In [None]:
def readability_score(sentence):
    words = sentence.split()
    num_words = len(words)
    num_sentences = 1
    num_syllables = sum(len(re.findall(r'[aeiouy]+', word)) for word in words)
    if num_words == 0:
        return 0
    flesch_kincaid = 206.835 - 1.015 * (num_words / num_sentences) - 84.6 * (num_syllables / num_words)
    return flesch_kincaid

In [None]:
def sentiment_score(sentence):
    pos_words = set(opinion_lexicon.positive())
    neg_words = set(opinion_lexicon.negative())

    additional_pos_words = {
        'adore', 'affection', 'beloved', 'cherish', 'devoted', 'enamored', 'passion', 'amorous',
        'ardent', 'attachment', 'endearment', 'infatuated', 'fondness', 'warmth', 'idolize',
        'romantic', 'sweetheart', 'devotedly', 'tender', 'affectionately', 'darling', 'precious',
        'sweetheart', 'lovely', 'beautiful', 'graceful', 'enchanting', 'delightful', 'captivating',
        'alluring', 'heartthrob', 'soulmate', 'darling', 'devotion', 'adulation', 'appreciation',
        'attraction', 'besotted', 'bliss', 'caring', 'compassion', 'darling', 'dearest', 'enamor',
        'enchant', 'endearing', 'enthusiastic', 'euphoric', 'exquisite', 'fancy', 'fervent',
        'flirt', 'flirtatious', 'fond', 'generous', 'gentle', 'giddy', 'gorgeous', 'happy',
        'heartwarming', 'honey', 'hug', 'infatuation', 'joy', 'kiss', 'kindness', 'like', 'love',
        'lovesick', 'loving', 'loyal', 'passionate', 'poetic', 'positive', 'precious', 'ravishing',
        'romanticism', 'sentimental', 'serenade', 'smitten', 'special', 'splendid', 'stunning',
        'sweetheart', 'thrilled', 'touching', 'treasure', 'valentine', 'vibrant', 'warmth', 'worship',
        'yearn'
    }

    additional_neg_words = {
        'heartbroken', 'unrequited', 'lonely', 'sorrow', 'grief', 'anguish', 'despair', 'heartache',
        'forlorn', 'abandoned', 'desolate', 'mournful', 'wretched', 'tormented', 'pining', 'lovelorn',
        'disappointed', 'unfulfilled', 'dejected', 'despondent', 'crestfallen', 'melancholy',
        'disheartened', 'regret', 'remorse', 'longing', 'yearning', 'hopeless', 'hurt', 'betrayed',
        'agony', 'alienated', 'alone', 'anguish', 'bereft', 'bitter', 'bleak', 'brokenhearted',
        'cheerless', 'cold', 'comfortless', 'crushed', 'defeated', 'depressed', 'despondent',
        'detached', 'devastated', 'dire', 'disconsolate', 'dismal', 'distressed', 'doleful',
        'dreary', 'empty', 'estranged', 'forlorn', 'forsaken', 'grieving', 'heartache', 'heartbreak',
        'heartsick', 'hopeless', 'isolated', 'lament', 'lamentable', 'longing', 'loveless',
        'lugubrious', 'miserable', 'mourn', 'mourning', 'mournful', 'neglected', 'pain', 'pathetic',
        'rejected', 'regretful', 'resigned', 'rueful', 'sad', 'sorrowful', 'suffering', 'suicidal',
        'tearful', 'tragic', 'unbearable', 'unhappy', 'unrequited', 'unsatisfied', 'upset',
        'wistful', 'woeful', 'wretched'
    }

    pos_words.update(additional_pos_words)
    neg_words.update(additional_neg_words)

    words = word_tokenize(sentence)
    score = 0
    for word in words:
        if word in pos_words:
            score += 1
        elif word in neg_words:
            score -= 1
    return score

In [None]:
def lexical_diversity_score(sentence):
    words = word_tokenize(sentence)
    if len(words) == 0:
        return 0
    unique_words = set(words)
    return len(unique_words) / len(words)

In [32]:
KEYWORD_WEIGHT = 0.5
SENTIMENT_WEIGHT = 0.3
READABILITY_WEIGHT = 0.1
LEXICAL_DIVERSITY_WEIGHT = 0.1

# Function to extract top romantic quotes
def extract_top_romantic_quotes(sentences, word_freq_dict, top_n=10):
    keyword_scores = [score_sentence_by_keywords(preprocess_text(sentence), word_freq_dict) for sentence in sentences]
    sentiment_scores = [sentiment_score(sentence) for sentence in sentences]
    readability_scores = [readability_score(sentence) for sentence in sentences]
    lexical_diversity_scores = [lexical_diversity_score(sentence) for sentence in sentences]

    combined_scores = [
        (keyword_score * KEYWORD_WEIGHT + sentiment_score * SENTIMENT_WEIGHT +
         readability_score * READABILITY_WEIGHT + lexical_diversity_score * LEXICAL_DIVERSITY_WEIGHT)
        for keyword_score, sentiment_score, readability_score, lexical_diversity_score
        in zip(keyword_scores, sentiment_scores, readability_scores, lexical_diversity_scores)
    ]

    top_indices = sorted(range(len(combined_scores)), key=lambda i: combined_scores[i], reverse=True)[:top_n]

    top_sentences = [(sentences[i], combined_scores[i]) for i in top_indices]
    return top_sentences

In [None]:
dataset = pd.read_csv('/content/drive/MyDrive/quotes.csv')
# Filter romantic quotes
word = 'love'
column_name = 'category'

# Handle NaN values
dataset = dataset.dropna(subset=[column_name])

romantic_quotes = dataset[dataset[column_name].str.contains(word, case=False, na=False)]

# Split the romantic quotes into training (80%) and test (20%) sets
train_set, remaining_romantic_quotes = train_test_split(romantic_quotes, test_size=0.2, random_state=42)

proportion_non_romantic = 0.2  # You can adjust this proportion
num_non_romantic = int(len(remaining_romantic_quotes) * proportion_non_romantic)
non_romantic_quotes = dataset[~dataset[column_name].str.contains(word, case=False, na=False)]
non_romantic_sample = non_romantic_quotes.sample(n=num_non_romantic, random_state=42)

# Combine the remaining romantic quotes with the non-romantic sample to form the test set
test_set = pd.concat([remaining_romantic_quotes, non_romantic_sample])


# Display the resulting sets
print("Training Set:")
print(len(train_set))
print("\nTest Set:")
print(len(test_set))
print("\nSample Non-Romantic Quote (row 10 in iloc):")
print(non_romantic_quotes.iloc[18])

print("\nSample Romantic Quote (row 10 in iloc):")
print(romantic_quotes.iloc[15])

Training Set:
43188

Test Set:
12957

Sample Non-Romantic Quote (row 10 in iloc):
quote         Everything you can imagine is real.
author                              Pablo Picasso
category    art, imagination, inspirational, life
Name: 2915, dtype: object

Sample Romantic Quote (row 10 in iloc):
quote       This life is what you make it. No matter what,...
author                                         Marilyn Monroe
category    attributed-no-source, friends, heartbreak, ins...
Name: 15, dtype: object


In [None]:
print(dataset.columns)


Index(['quote', 'author', 'category'], dtype='object')


In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix
import numpy as np

# Assuming the column containing the quotes is 'quote' and category is 'category'
quote_column_name = 'quote'
category_column_name = 'category'

# Preprocess text in training and test sets
train_set['processed_text'] = train_set[quote_column_name].apply(preprocess_text)
test_set['processed_text'] = test_set[quote_column_name].apply(preprocess_text)

# Extract keywords from training set
train_quotes = train_set['processed_text'].tolist()
keyword_freq = extract_keywords(train_quotes)

# Define threshold for classification
threshold = np.percentile(
    [score_sentence_by_keywords(text, keyword_freq) for text in train_set['processed_text']], 8
)

# Score each sentence in the test set
test_set['keyword_score'] = test_set['processed_text'].apply(lambda x: score_sentence_by_keywords(x, keyword_freq))
test_set['sentiment_score'] = test_set[quote_column_name].apply(sentiment_score)
test_set['readability_score'] = test_set[quote_column_name].apply(readability_score)
test_set['lexical_diversity_score'] = test_set[quote_column_name].apply(lexical_diversity_score)

test_set['combined_score'] = (
    test_set['keyword_score'] * KEYWORD_WEIGHT +
    test_set['sentiment_score'] * SENTIMENT_WEIGHT +
    test_set['readability_score'] * READABILITY_WEIGHT +
    test_set['lexical_diversity_score'] * LEXICAL_DIVERSITY_WEIGHT
)
# Classify sentences in the test set
test_set['predicted_label'] = test_set['combined_score'].apply(lambda x: 'romantic' if x > threshold else 'not romantic')

# Actual labels (binary)
test_set['actual_label'] = test_set[category_column_name].str.contains(word, case=False, na=False).apply(lambda x: 'romantic' if x else 'not romantic')

# Convert labels to binary format for metrics calculation
y_true = test_set['actual_label'].apply(lambda x: 1 if x == 'romantic' else 0)
y_pred = test_set['predicted_label'].apply(lambda x: 1 if x == 'romantic' else 0)

# Calculate Precision, Recall, F1 Score, and Confusion Matrix
precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)
conf_matrix = confusion_matrix(y_true, y_pred)

print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Confusion Matrix:\n", conf_matrix)

# Filter and print the quotes classified as romantic
#romantic_quotes = test_set[test_set['predicted_label'] == 'romantic']

#print("\nQuotes classified as romantic:")
#for idx, row in romantic_quotes.iterrows():
 #   print(f"{row[quote_column_name]} (Score: {row['combined_score']:.2f})")

Precision: 0.8671708357572232
Recall: 0.8283015373217263
F1 Score: 0.8472906403940886
Confusion Matrix:
 [[ 789 1370]
 [1854 8944]]


In [None]:
test_sentences = test_set[quote_column_name].tolist()
top_romantic_quotes = extract_top_romantic_quotes(test_sentences, keyword_freq, top_n=10)

print("\nTop Romantic Quotes:")
for quote, score in top_romantic_quotes:
    print(f"{quote} (Score: {score:.2f})\n")



Top Romantic Quotes:
All that is worthy of love [*die Liebenswürdigkeiten*], from the viewpoint of God's comprehensive love, might have been stamped and created by this act of love; man's love does not so stamp or create its objects. Man's love is restricted to recognizing the objective demand these objects make and to submitting to the gradation of rank in what is worthy of love. This gradation exists in itself, but in itself it exists "for" man, ordered to his *particular* essence. Loving can be characterized as correct or false only because a man's actual inclinations and acts of love can be in harmony with or oppose the rank-ordering of what is worthy of love. In other words, man can feel and know himself to be at one with, or separated and opposed to, the love with which God loved the idea of the world or its content before he created it, the love with which he preserves it at every instant. If a man in his actual loving, or in the order of his acts of love, in his preferences an

In [30]:
false_positives = test_set[(test_set['predicted_label'] == 'romantic') & (test_set['actual_label'] == 'not romantic')]
false_negatives = test_set[(test_set['predicted_label'] == 'not romantic') & (test_set['actual_label'] == 'romantic')]

# Print misclassified quotes
print("\nTop 15 False Positives (Non-romantic quotes incorrectly classified as romantic):")
for index, row in false_positives.head(15).iterrows():
    print(f"{row[quote_column_name]} (Predicted: romantic, Actual: not romantic)")
    print(f"Keyword Score: {row['keyword_score']:.2f}, Sentiment Score: {row['sentiment_score']:.2f}, Readability Score: {row['readability_score']:.2f}, Lexical Diversity Score: {row['lexical_diversity_score']:.2f}\n")

print("\nTop 15 False Negatives (Romantic quotes incorrectly classified as non-romantic):")
for index, row in false_negatives.head(15).iterrows():
    print(f"{row[quote_column_name]} (Predicted: not romantic, Actual: romantic)")
    print(f"Keyword Score: {row['keyword_score']:.2f}, Sentiment Score: {row['sentiment_score']:.2f}, Readability Score: {row['readability_score']:.2f}, Lexical Diversity Score: {row['lexical_diversity_score']:.2f}\n")


Top 15 False Positives (Non-romantic quotes incorrectly classified as romantic):
Fire burns blue and hot.Its fair light blinds me not.Smell of smoke is satisfying, tastes nourishing to my tongue.I think fire ageless, never old, and yet no longer young.Morning coals are cool: daylight leaves me blind.I love the fire most because of what it leaves behind. (Predicted: romantic, Actual: not romantic)
Keyword Score: 42091.00, Sentiment Score: 3.00, Readability Score: 29.57, Lexical Diversity Score: 0.86

Chöd is conventionally and misleadingly seen as analogous to, if not derived from, shamanic initiatory dismemberment visions, as well as dualistic anti-body ascetic practices. Two of the elements most commonly referenced by authors in their "identification" of Chöd and/as shamanism—the dismemberment/sacrifice of the body and "demonology"—are presented in an oversimplistic fashion. In the first instance, the numerous Buddhist precursors for the offering of the body provide ample testimony t

In [None]:
paragraph = """
Life is beautiful and filled with many wonders.
The sky is blue and the sun shines bright.
Love is what makes the world go round.
Sometimes, all you need is a little romance to brighten your day.
Friends and family bring joy to our lives.
In the end, it's the love we share that matters the most.
The ocean waves crash gently on the shore.
Birds chirp melodiously in the early morning.
Trees sway with the gentle breeze.
The mountains stand tall, witnessing the passage of time.
Rivers flow endlessly, carrying stories of old.
Stars twinkle in the night sky, reminding us of our dreams.
The moonlight bathes the earth in a soft glow.
Children play happily in the park, their laughter echoing through the air.
Flowers bloom, adding color to our world.
Music has the power to heal our souls.
Books transport us to different worlds and times.
The kindness of strangers can brighten a gloomy day.
Art captures the beauty of life in all its forms.
Good food brings people together.
Exercise keeps our bodies and minds healthy.
In the end, it’s the love we share that matters the most.
Cherish every moment with loved ones.
The journey of life is more meaningful when shared with someone special.
Let's spread love and positivity wherever we go.
"""
def extract_sentences(paragraph):
    sentences = re.split(r'(?<=[.!?]) +', paragraph)
    return sentences
z = extract_sentences(paragraph)

top_romantic_quotes = extract_top_romantic_quotes(z, keyword_freq, top_n=3)
print("\nTop Romantic Quotes:")
for quote, score in top_romantic_quotes:
    print(f"{quote} (Score: {score:.2f})\n")



Top Romantic Quotes:

Love is what makes the world go round.
Sometimes, all you need is a little romance to brighten your day. (Score: 22661.28)


Exercise keeps our bodies and minds healthy.
In the end, it’s the love we share that matters the most. (Score: 17517.81)


Let's spread love and positivity wherever we go.
 (Score: 16600.41)

