In [88]:
import pandas as pd

# Load only the columns used later in the notebook
file_path = 'quotes.csv'
df = pd.read_csv(file_path, usecols=['quote', 'author', 'category'])

# Sample only 100,000 rows if the dataset is larger (fixed seed so the sample is repeatable)
if len(df) > 100000:
    df = df.sample(n=100000, random_state=42)

# Preview the working dataset. This has to be the cell's last expression:
# a bare df.head() in the middle of a cell is evaluated but never displayed.
df.head()


In [89]:
# df.info() writes its report to stdout itself and returns None, so wrapping it
# in print() only adds a stray "None" line after the report.
df.info()
print(df.isnull().sum())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100000 entries, 179178 to 96474
Data columns (total 3 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   quote     100000 non-null  object
 1   author    99639 non-null   object
 2   category  99983 non-null   object
dtypes: object(3)
memory usage: 3.1+ MB
None
quote         0
author      361
category     17
dtype: int64


In [90]:
# Discard every row that is missing any of the three text fields
df.dropna(subset=['quote', 'category', 'author'], inplace=True)

# Lower-case each text column so later comparisons are case-insensitive
for text_column in ('quote', 'author', 'category'):
    df[text_column] = df[text_column].str.lower()

In [91]:
# Check for missing values
print(df.isnull().sum())

# Fill any remaining gaps in 'author' and 'category' with 'Unknown'.
# The fill is done on the DataFrame itself: df['col'].fillna(..., inplace=True)
# operates on a selected column object (chained assignment), which newer pandas
# warns about and which is not guaranteed to write back into df.
df = df.fillna({'author': 'Unknown', 'category': 'Unknown'})


quote       0
author      0
category    0
dtype: int64


In [92]:
# Report how many rows are exact duplicates of another row
duplicate_row_count = df.duplicated().sum()
print(duplicate_row_count)

# Drop those duplicated rows from the working frame
df.drop_duplicates(inplace=True)


274


In [93]:
# Function to clean text
import re

def clean_text(text):
    """Strip punctuation/symbols from ``text`` and return it lower-cased.

    Only ASCII letters, digits and whitespace survive; every other character
    (punctuation, apostrophes, accented letters, ...) is deleted, not replaced.
    """
    # Delete first, then lower-case, so the character filter sees the raw input
    kept_characters = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    return kept_characters.lower()

# Run every quote through clean_text, element by element
df['quote'] = df['quote'].map(clean_text)


In [94]:
# Keep only rows whose quote still has visible content after cleaning
df = df[df['quote'].str.strip() != '']

# Verify the final dataset.
# df.info() prints its own report and returns None, so it is called directly;
# wrapping it in print() would add a stray "None" line.
df.info()

# Last expression of the cell, so the preview is displayed
df.head()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 99327 entries, 179178 to 96474
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   quote     99327 non-null  object
 1   author    99327 non-null  object
 2   category  99327 non-null  object
dtypes: object(3)
memory usage: 3.0+ MB
None
                                                    quote  \
179178  the sting of her abandonment had not lessened ...   
183253  everything that falls upon the eye is an appar...   
84139   i dont hate republicans as individuals but i h...   
272877                                think more not less   
195518  some individuals have the courage to make it e...   

                                                   author  \
179178            t.j. forrester, miracles, inc.: a novel   
183253                  marilynn robinson in housekeeping   
84139                                         howard dean   
272877                                   

In [95]:
# Persist the cleaned quotes; the row index is not written to the file
cleaned_file_path = 'cleaned_quotes.csv'
df.to_csv(
    cleaned_file_path,
    index=False,
)


In [96]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set: quotes are the inputs, categories the
# targets. The seed is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df['quote'],
    df['category'],
    test_size=0.2,
    random_state=42,
)


In [97]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer with the vocabulary capped at 10,000 terms to manage memory usage
tfidf_vectorizer = TfidfVectorizer(max_features=10000)

# Learn the vocabulary from all quotes and vectorize them in a single pass
quote_texts = df['quote']
tfidf_matrix = tfidf_vectorizer.fit_transform(quote_texts)

# Show the dimensions of the resulting matrix
print(tfidf_matrix.shape)


(99327, 10000)


In [98]:
from sklearn.feature_extraction.text import CountVectorizer

def split_categories(category_text):
    """Split a category string on ', ' into its individual category tokens.

    This is a named function rather than a lambda because pickle stores
    functions by name and cannot serialize a lambda, so a vectorizer holding
    a lambda tokenizer cannot be saved.
    """
    return category_text.split(', ')


# Initialize Count Vectorizer for categories.
# token_pattern=None: the default pattern is unused once a custom tokenizer is
# supplied, and leaving it set makes scikit-learn emit a warning on fit.
count_vectorizer = CountVectorizer(tokenizer=split_categories, token_pattern=None)

# Fit and transform the categories
category_matrix = count_vectorizer.fit_transform(df['category'])

# Display the shape of the category matrix
print(category_matrix.shape)


(99327, 58938)


In [100]:
from sklearn.metrics.pairwise import cosine_similarity

import random

def recommend_quotes_by_category(query_category, top_n=5, random_sample_size=None):
    """Return quotes whose category tags best match ``query_category``.

    Reads the module-level ``count_vectorizer``, ``category_matrix`` and ``df``.

    Args:
        query_category: Category text to search for; it is vectorized with
            ``count_vectorizer``.
        top_n: Maximum number of best-scoring rows to consider. Zero or a
            negative value yields an empty list.
        random_sample_size: If given and smaller than the number of matches,
            a random subset of this size is returned instead of all matches.

    Returns:
        A list of quote strings, highest similarity first unless a random
        subset was drawn. Empty when no row has a non-zero similarity to the
        query (for example, when the category is not in the vocabulary).
    """
    # Guard: argsort()[-0:] selects every row and a negative top_n selects
    # nearly every row, so non-positive values must short-circuit.
    if top_n <= 0:
        return []

    # Convert the query category into the same token-count space as the data
    query_vector = count_vectorizer.transform([query_category])

    # Cosine similarity between the query and every row's categories
    category_similarity = cosine_similarity(query_vector, category_matrix).flatten()

    # Indices of the top_n highest scores, best first
    similar_category_indices = category_similarity.argsort()[-top_n:][::-1]

    # A score of 0 means the row shares no token with the query; without this
    # filter an unknown category would return arbitrary, unrelated quotes.
    similar_category_indices = similar_category_indices[
        category_similarity[similar_category_indices] > 0
    ]

    # Positional lookup: assumes df rows are in the same order as the rows of
    # category_matrix (i.e. the matrix was fitted on this df) -- TODO confirm
    top_similar_quotes = df.iloc[similar_category_indices]['quote'].tolist()

    # Optionally thin the matches down to a random subset
    if random_sample_size is not None and len(top_similar_quotes) > random_sample_size:
        top_similar_quotes = random.sample(top_similar_quotes, random_sample_size)

    return top_similar_quotes


In [108]:
# Example query: consider the 10 best matches for 'happy' and show at most 5 of them
query_category = 'happy'
recommended_quotes = recommend_quotes_by_category(
    query_category=query_category,
    top_n=10,
    random_sample_size=5,
)
for quote in recommended_quotes:
    print(quote)


the happy and efficient people in this world are those who accept trouble as a normal detail of human life and resolve to capitalize it when it comes along
a happy life is one which is in accordance with its own nature
if you are happy dont analyse your happiness dont ask questions and dont even think about it just live it to the fullest
make happy those who are near  and those who are far will come
im so happy to be rich  im willing to take all the consequences


In [115]:
import pandas as pd
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from tokenizer import tokenizer  # Import the tokenizer function

# Load the quotes DataFrame from its pickle file, rebinding `df`.
# NOTE(review): no visible cell writes 'quotes_df.pkl'; presumably it is produced
# elsewhere -- confirm its origin. Unpickling can execute arbitrary code, so only
# load this file if it comes from a trusted source.
quotes_pickle_path = 'quotes_df.pkl'
df = pd.read_pickle(quotes_pickle_path)

def recommend_quotes_by_category(query_category, top_n=5, random_sample_size=None):
    """Return quotes, in random order, whose categories best match ``query_category``.

    Reads the module-level ``count_vectorizer``, ``category_matrix`` and ``df``
    at call time, and uses ``cosine_similarity`` and ``random``, which this
    cell does not import itself.

    Args:
        query_category: Category text to search for; it is vectorized with
            ``count_vectorizer``.
        top_n: Maximum number of best-scoring rows to consider. Zero or a
            negative value yields an empty list.
        random_sample_size: If given and smaller than the number of matches,
            a random subset of this size is returned instead of all matches.

    Returns:
        A shuffled list of quote strings. Empty when no row has a non-zero
        similarity to the query (for example, an out-of-vocabulary category).
    """
    # Guard: argsort()[-0:] selects every row and a negative top_n selects
    # nearly every row, so non-positive values must short-circuit.
    if top_n <= 0:
        return []

    query_vector = count_vectorizer.transform([query_category])
    category_similarity = cosine_similarity(query_vector, category_matrix).flatten()

    # Indices of the top_n highest scores, best first
    similar_category_indices = category_similarity.argsort()[-top_n:][::-1]

    # A score of 0 means the row shares no token with the query; without this
    # filter an unknown category would return arbitrary, unrelated quotes.
    similar_category_indices = similar_category_indices[
        category_similarity[similar_category_indices] > 0
    ]

    # Positional lookup: assumes df rows are in the same order as the rows of
    # category_matrix (i.e. the matrix was fitted on this df) -- TODO confirm
    top_similar_quotes = df.iloc[similar_category_indices]['quote'].tolist()

    # Randomize the order so repeated calls do not always list the same quote first
    random.shuffle(top_similar_quotes)

    if random_sample_size is not None and len(top_similar_quotes) > random_sample_size:
        top_similar_quotes = random.sample(top_similar_quotes, random_sample_size)

    return top_similar_quotes

# Build both vectorizers up front: TF-IDF for the quote text (vocabulary capped
# at 10,000 terms) and token counts for the categories, using the imported
# named tokenizer function.
tfidf_vectorizer = TfidfVectorizer(max_features=10000)
count_vectorizer = CountVectorizer(tokenizer=tokenizer)

# Fit each vectorizer on its column and report the resulting matrix dimensions
tfidf_matrix = tfidf_vectorizer.fit_transform(df['quote'])
print(f"TF-IDF matrix shape: {tfidf_matrix.shape}")

category_matrix = count_vectorizer.fit_transform(df['category'])
print(f"Category matrix shape: {category_matrix.shape}")

# Write the fitted vectorizers and their matrices to disk, one file per object
# (dicts keep insertion order, so the files are written in the order listed)
artifacts = {
    'tfidf_vectorizer.joblib': tfidf_vectorizer,
    'count_vectorizer.joblib': count_vectorizer,
    'tfidf_matrix.joblib': tfidf_matrix,
    'category_matrix.joblib': category_matrix,
}
for artifact_path, artifact in artifacts.items():
    written_files = joblib.dump(artifact, artifact_path)

# Echo the value returned by the final dump as the cell's result
written_files


TF-IDF matrix shape: (495350, 10000)
Category matrix shape: (495350, 146505)


['category_matrix.joblib']