In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

In [7]:
# Toy corpus: three quotations for each of three authors, keyed by author label.
quotes_by_author = {
    'Shakespeare': [
        "To be, or not to be, that is the question.",
        "All the world's a stage, and all the men and women merely players.",
        "Some are born great, some achieve greatness, and some have greatness thrust upon them.",
    ],
    'Austen': [
        "It is a truth universally acknowledged, that a single man in possession of a good fortune must be in want of a wife.",
        "I declare after all there is no enjoyment like reading!",
        "A lady's imagination is very rapid; it jumps from admiration to love, from love to matrimony in a moment.",
    ],
    'Dickens': [
        "It was the best of times, it was the worst of times.",
        "Please, sir, I want some more.",
        "No one is useless in this world who lightens the burden of it for anyone else.",
    ],
}

# Flatten the mapping into two parallel columns (dict insertion order is kept,
# so row i of 'text' lines up with row i of 'author').
data = {
    'text': [quote for quotes in quotes_by_author.values() for quote in quotes],
    'author': [name for name, quotes in quotes_by_author.items() for _ in quotes],
}

df = pd.DataFrame(data)
print("Dataset Preview:")
print(df)

Dataset Preview:
                                                text       author
0         To be, or not to be, that is the question.  Shakespeare
1  All the world's a stage, and all the men and w...  Shakespeare
2  Some are born great, some achieve greatness, a...  Shakespeare
3  It is a truth universally acknowledged, that a...       Austen
4  I declare after all there is no enjoyment like...       Austen
5  A lady's imagination is very rapid; it jumps f...       Austen
6  It was the best of times, it was the worst of ...      Dickens
7                     Please, sir, I want some more.      Dickens
8  No one is useless in this world who lightens t...      Dickens


In [8]:
# Feature extraction for the toy corpus: raw quotations in, TF-IDF matrices out.
X, y = df['text'], df['author']

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# The vocabulary and IDF weights are learned from the training rows only;
# the held-out rows are projected onto that same vocabulary.
vectorizer = TfidfVectorizer(stop_words='english')
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)



In [5]:
# Display the training labels produced by train_test_split.
# NOTE(review): this cell's execution count (5) is lower than that of the cell
# defining y_train (8), so the shown output may come from an earlier run —
# re-run the notebook top to bottom to confirm.
y_train

0    Shakespeare
8        Dickens
2    Shakespeare
4         Austen
3         Austen
6        Dickens
Name: author, dtype: object

In [9]:
# Train a multinomial Naive Bayes classifier on the TF-IDF training features.
model = MultinomialNB()
model.fit(X_train_tfidf, y_train)

# Evaluate on the held-out rows.
y_pred = model.predict(X_test_tfidf)

print("\nModel Evaluation:")
print("Accuracy:", accuracy_score(y_test, y_pred))
# zero_division=0 reports precision as 0 for classes the model never predicts,
# instead of emitting an undefined-metric warning for each of them. This matches
# how every other classification_report call in this notebook is made.
print(
    "\nClassification Report:\n",
    classification_report(y_test, y_pred, zero_division=0),
)


Model Evaluation:
Accuracy: 0.3333333333333333

Classification Report:
               precision    recall  f1-score   support

      Austen       0.50      1.00      0.67         1
     Dickens       0.00      0.00      0.00         1
 Shakespeare       0.00      0.00      0.00         1

    accuracy                           0.33         3
   macro avg       0.17      0.33      0.22         3
weighted avg       0.17      0.33      0.22         3



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [10]:
# Spot-check the toy model on four quotations. The last one is a verbatim copy
# of a sentence in the toy corpus; the other three do not appear in it.
samples = [
    "O Romeo, Romeo! wherefore art thou Romeo?",
    "It was a bright cold day in April, and the clocks were striking thirteen.",
    "Vanity and pride are different things, though the words are often used synonymously.",
    "To be, or not to be, that is the question.",
]

# Reuse the already-fitted vectorizer so the samples share the training vocabulary.
sample_features = vectorizer.transform(samples)
predictions = model.predict(sample_features)

print("\nSample Predictions:")
for quote, predicted in zip(samples, predictions):
    print(f"\nText: {quote}\n→ Predicted Author: {predicted}")


Sample Predictions:

Text: O Romeo, Romeo! wherefore art thou Romeo?
→ Predicted Author: Austen

Text: It was a bright cold day in April, and the clocks were striking thirteen.
→ Predicted Author: Austen

Text: Vanity and pride are different things, though the words are often used synonymously.
→ Predicted Author: Austen

Text: To be, or not to be, that is the question.
→ Predicted Author: Shakespeare


### Project Implementation Pipeline

In [11]:
import pandas as pd
import numpy as np
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [12]:
# NLTK ships without its corpora. Make sure the resources this notebook relies on
# are present so the cells below also work on a fresh environment. Each resource
# is looked up locally first and only downloaded when the lookup fails.
# 'punkt_tab' is what newer NLTK releases look for when tokenizing; older
# releases use 'punkt'.
_NLTK_RESOURCES = {
    "corpora/stopwords": "stopwords",
    "corpora/wordnet": "wordnet",
    "tokenizers/punkt": "punkt",
    "tokenizers/punkt_tab": "punkt_tab",
}
for resource_path, package in _NLTK_RESOURCES.items():
    try:
        nltk.data.find(resource_path)
    except LookupError:
        nltk.download(package, quiet=True)

# Shared by preprocess(): one lemmatizer instance and a set for O(1) stop-word lookups.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

In [None]:
import pandas as pd

# Load the saved dataset; the code below relies on it having 'text' and 'author' columns.
df = pd.read_csv("gutenberg_authors_dataset.csv")
print("Total samples in full dataset:", len(df))

# The five authors kept for the classification task.
selected_authors = [
    "Jane Austen",
    "Shakespeare",
    "Herman Melville",
    "Lewis Carroll",
    "John Milton"
]

# Keep only rows written by the selected authors.
# .copy() makes the result an independent frame: a later cell adds a
# 'clean_text' column to it, and assigning into a boolean-mask slice would
# otherwise trigger pandas' SettingWithCopyWarning.
df = df[df['author'].isin(selected_authors)].copy()
print("Samples after selecting 5 authors:", len(df))
print(df['author'].value_counts())

# Plain-list views of the text and label columns.
texts = df['text'].tolist()
authors = df['author'].tolist()

print("Total Samples:", len(texts))
print(df.head())


Total samples in full dataset: 51939
Samples after selecting 5 authors: 27684
author
Jane Austen        13873
Herman Melville     7788
Shakespeare         3078
John Milton         1728
Lewis Carroll       1217
Name: count, dtype: int64
Total Samples: 27684
                                                text       author
0  [Emma by Jane Austen 1816]\n\nVOLUME I\n\nCHAP...  Jane Austen
1  She was the youngest of the two daughters of a...  Jane Austen
2  Her mother\nhad died too long ago for her to h...  Jane Austen
3  Sixteen years had Miss Taylor been in Mr. Wood...  Jane Austen
4  Between _them_ it was more the intimacy\nof si...  Jane Austen


In [14]:
# Preprocessing 
def preprocess(text):
    """Normalise one passage into a space-joined string of lemmatised words.

    The passage is lower-cased and tokenised with ``nltk.word_tokenize``. Tokens
    that are not purely alphabetic, or that are in the module-level ``stop_words``
    set, are dropped; the remaining tokens are passed through
    ``lemmatizer.lemmatize``.

    Args:
        text: The raw passage. Any non-string value (for example ``None`` or the
            NaN that pandas produces for a blank CSV cell) is treated as an
            empty passage instead of raising ``AttributeError``.

    Returns:
        str: The kept tokens joined by single spaces; ``""`` when nothing is kept.
    """
    # Guard before touching .lower(): missing values are not strings.
    if not isinstance(text, str):
        return ""

    tokens = nltk.word_tokenize(text.lower())
    return " ".join(
        lemmatizer.lemmatize(tok)
        for tok in tokens
        if tok.isalpha() and tok not in stop_words
    )

# Store the normalised form of every passage next to the raw 'text' column.
df['clean_text'] = df['text'].map(preprocess)


In [15]:
# 70/30 split of the cleaned passages. Stratifying on the author label keeps
# each author's share the same in the training and test portions.
labels = df['author']
X_train, X_test, y_train, y_test = train_test_split(
    df['clean_text'],
    labels,
    test_size=0.3,
    random_state=42,
    stratify=labels,
)

In [16]:
# Collects one (model description, test accuracy) tuple per model cell.
results = []

# Representation 1: TF-IDF weights over single words.
tfidf = TfidfVectorizer(stop_words='english')
X_train_tfidf, X_test_tfidf = tfidf.fit_transform(X_train), tfidf.transform(X_test)

# Representation 2: raw counts over single words and adjacent word pairs.
count_vec = CountVectorizer(stop_words='english', ngram_range=(1, 2))
X_train_count, X_test_count = count_vec.fit_transform(X_train), count_vec.transform(X_test)

In [17]:
# Multinomial Naive Bayes trained on the TF-IDF representation.
label = "Naive Bayes (TF-IDF)"

nb_tfidf = MultinomialNB().fit(X_train_tfidf, y_train)
y_pred_nb_tfidf = nb_tfidf.predict(X_test_tfidf)

# Record the held-out accuracy for the cross-model comparison list.
acc_nb_tfidf = accuracy_score(y_test, y_pred_nb_tfidf)
results.append((label, acc_nb_tfidf))

print(label)
print("Accuracy:", round(acc_nb_tfidf, 3))
print(classification_report(y_test, y_pred_nb_tfidf, zero_division=0))

Naive Bayes (TF-IDF)
Accuracy: 0.834
                 precision    recall  f1-score   support

Herman Melville       0.86      0.81      0.83      2337
    Jane Austen       0.80      0.99      0.88      4162
    John Milton       0.99      0.29      0.45       518
  Lewis Carroll       1.00      0.09      0.17       365
    Shakespeare       0.99      0.77      0.87       924

       accuracy                           0.83      8306
      macro avg       0.93      0.59      0.64      8306
   weighted avg       0.86      0.83      0.81      8306



In [18]:
# Multinomial Naive Bayes trained on the unigram+bigram count representation.
label = "Naive Bayes (CountVectorizer)"

nb_count = MultinomialNB().fit(X_train_count, y_train)
y_pred_nb_count = nb_count.predict(X_test_count)

# Record the held-out accuracy for the cross-model comparison list.
acc_nb_count = accuracy_score(y_test, y_pred_nb_count)
results.append((label, acc_nb_count))

print(label)
print("Accuracy:", round(acc_nb_count, 3))
print(classification_report(y_test, y_pred_nb_count, zero_division=0))

Naive Bayes (CountVectorizer)
Accuracy: 0.895
                 precision    recall  f1-score   support

Herman Melville       0.91      0.86      0.88      2337
    Jane Austen       0.87      0.99      0.93      4162
    John Milton       0.92      0.77      0.84       518
  Lewis Carroll       1.00      0.36      0.53       365
    Shakespeare       0.99      0.84      0.91       924

       accuracy                           0.90      8306
      macro avg       0.94      0.76      0.82      8306
   weighted avg       0.90      0.90      0.89      8306



In [19]:
# Logistic regression trained on the TF-IDF representation.
# max_iter is raised to 1000 iterations for the solver.
label = "Logistic Regression (TF-IDF)"

log_tfidf = LogisticRegression(max_iter=1000).fit(X_train_tfidf, y_train)
y_pred_log_tfidf = log_tfidf.predict(X_test_tfidf)

# Record the held-out accuracy for the cross-model comparison list.
acc_log_tfidf = accuracy_score(y_test, y_pred_log_tfidf)
results.append((label, acc_log_tfidf))

print(label)
print("Accuracy:", round(acc_log_tfidf, 3))
print(classification_report(y_test, y_pred_log_tfidf, zero_division=0))

Logistic Regression (TF-IDF)
Accuracy: 0.886
                 precision    recall  f1-score   support

Herman Melville       0.86      0.87      0.87      2337
    Jane Austen       0.88      0.97      0.92      4162
    John Milton       0.93      0.64      0.76       518
  Lewis Carroll       0.98      0.56      0.71       365
    Shakespeare       0.97      0.80      0.88       924

       accuracy                           0.89      8306
      macro avg       0.92      0.77      0.83      8306
   weighted avg       0.89      0.89      0.88      8306



In [20]:
# Logistic regression trained on the unigram+bigram count representation.
# max_iter is raised to 1000 iterations for the solver.
label = "Logistic Regression (CountVectorizer)"

log_count = LogisticRegression(max_iter=1000).fit(X_train_count, y_train)
y_pred_log_count = log_count.predict(X_test_count)

# Record the held-out accuracy for the cross-model comparison list.
acc_log_count = accuracy_score(y_test, y_pred_log_count)
results.append((label, acc_log_count))

print(label)
print("Accuracy:", round(acc_log_count, 3))
print(classification_report(y_test, y_pred_log_count, zero_division=0))

Logistic Regression (CountVectorizer)
Accuracy: 0.885
                 precision    recall  f1-score   support

Herman Melville       0.90      0.83      0.86      2337
    Jane Austen       0.86      0.98      0.91      4162
    John Milton       0.95      0.70      0.80       518
  Lewis Carroll       0.95      0.65      0.77       365
    Shakespeare       0.96      0.80      0.87       924

       accuracy                           0.89      8306
      macro avg       0.92      0.79      0.85      8306
   weighted avg       0.89      0.89      0.88      8306



In [21]:
# Spot-check: Naive Bayes (CountVectorizer) on four hand-picked quotations.
# Note: the samples are vectorised as-is; they are not passed through preprocess().
samples = [
    "O Romeo, Romeo! wherefore art thou Romeo?",
    "It was a bright cold day in April, and the clocks were striking thirteen.",
    "Vanity and pride are different things, though the words are often used synonymously.",
    "To be, or not to be, that is the question.",
]

# Project the samples onto the vocabulary learned from the training split.
sample_features = count_vec.transform(samples)
predictions = nb_count.predict(sample_features)

print("\nSample Predictions:")
for quote, predicted in zip(samples, predictions):
    print(f"\nText: {quote}\n→ Predicted Author: {predicted}")

    


Sample Predictions:

Text: O Romeo, Romeo! wherefore art thou Romeo?
→ Predicted Author: Shakespeare

Text: It was a bright cold day in April, and the clocks were striking thirteen.
→ Predicted Author: Jane Austen

Text: Vanity and pride are different things, though the words are often used synonymously.
→ Predicted Author: Jane Austen

Text: To be, or not to be, that is the question.
→ Predicted Author: Jane Austen


In [22]:
# Spot-check: Logistic Regression (CountVectorizer) on the same `samples` list
# used for the Naive Bayes spot-check.
sample_features = count_vec.transform(samples)
predictions = log_count.predict(sample_features)

print("\nSample Predictions:")
for quote, predicted in zip(samples, predictions):
    print(f"\nText: {quote}\n→ Predicted Author: {predicted}")


Sample Predictions:

Text: O Romeo, Romeo! wherefore art thou Romeo?
→ Predicted Author: Shakespeare

Text: It was a bright cold day in April, and the clocks were striking thirteen.
→ Predicted Author: Herman Melville

Text: Vanity and pride are different things, though the words are often used synonymously.
→ Predicted Author: Jane Austen

Text: To be, or not to be, that is the question.
→ Predicted Author: Jane Austen
