In [1]:
from langchain_community.document_loaders import CSVLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.chat_models import ChatOllama
from langchain.chains import RetrievalQA
from langchain.vectorstores import FAISS
from langchain.schema import Document
import pandas as pd
from langchain.prompts import PromptTemplate

In [2]:
data_path = "train.csv"  # Replace with your CSV file path
df = pd.read_csv(data_path)

# Ensure your CSV file has 'text', 'author', and 'id' columns
# assert "text" in df.columns, "'text' column not found in CSV file"
# assert "author" in df.columns, "'author' column not found in CSV file"
# assert "id" in df.columns, "'id' column not found in CSV file"

# Convert each row into a Document object
documents = [
    Document(
        page_content=row["text"],
        metadata={"author": row["author"], "id": row["id"]}
    )
    for _, row in df.iterrows()
]



In [3]:
documents[0]

Document(metadata={'author': 'EAP', 'id': 'id26305'}, page_content='This process, however, afforded me no means of ascertaining the dimensions of my dungeon; as I might make its circuit, and return to the point whence I set out, without being aware of the fact; so perfectly uniform seemed the wall.')

In [6]:
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=300, chunk_overlap=50)
doc_splits = text_splitter.split_documents(documents)

# Use a pre-trained SentenceTransformer model for embeddings
embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")  # Smaller, fast model





In [8]:
# Create FAISS Vectorstore
vectorstore = FAISS.from_documents(doc_splits, embedding_model)

# Initialize the retrieval chain
retriever = vectorstore.as_retriever(serch_k=15)

In [21]:
llm = ChatOllama(model="llama3.1", format="json", temperature=0.2)


# Define a prompt template for the classification task
classification_prompt = PromptTemplate(
    template="""
You are an expert in stylometric analysis. Your task is to predict the author of a given text based on writing style. Consider stylistic features such as word choice, sentence structure, punctuation usage, and overall writing patterns to identify the author.

The input text below has a distinct writing style. Using the stylistic features from the text and your knowledge of the authors, classify the text's author.

Text: {text}

Here are the authors to choose from: {authors_list}

If the text doesn't closely resemble the style of any of the authors above, return "None".

Answer in the following format:
{{"author": "predicted_author_name"}}
""",
    input_variables=["text", "authors_list"]
)





In [22]:
# Define the RAG pipeline for author classification with the prompt template
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type="stuff",  # Directly using the stuff chain for this example
    return_source_documents=False
)



In [23]:
import json

# Process workflow to classify the author
def classify_author(input_text, authors_list):
    # Prepare the input prompt for classification with the list of authors
    prompt = classification_prompt.format(text=input_text, authors_list=authors_list)
    
    # Retrieve relevant context and generate the prediction
    prediction = qa_chain.run(prompt)

    # Try to parse the result as JSON if it's a string
    try:
        prediction_dict = json.loads(prediction)
    except json.JSONDecodeError:
        prediction_dict = {"author": prediction}  # If the result is not in JSON format, treat it as a plain string

    # Check if the prediction is in the authors list; if not, classify as "None"
    if prediction_dict["author"] not in authors_list:
        prediction_dict["author"] = "None"

    # Return the predicted author
    return prediction_dict




In [24]:
# Example usage
authors_list = ", ".join(df["author"].unique())  # List of authors in the dataset

sample_text = "And the children's children, and the newcomers' children, grew up."

final_prediction = classify_author(sample_text, authors_list)

In [25]:

sample_text = "The rigging was found to be ill fitted, and greatly strained; and on the third day of the blow, about five in the afternoon, our mizzen mast, in a heavy lurch to windward, went by the board."
print(f"Predicted Author: {final_prediction}")

final_prediction = classify_author(sample_text, authors_list)


Predicted Author: {'author': 'MWS'}


In [26]:
from sklearn.metrics import accuracy_score
# Load the test data
test_data_path = "test.csv"  # Replace with your test CSV file path
test_df = pd.read_csv(test_data_path)

# List of authors in the dataset
authors_list = ", ".join(df["author"].unique())

# Predict the author for each text in the test data
test_df["predicted_author"] = test_df["text"].apply(lambda x: classify_author(x, authors_list))



In [27]:
test_df["predicted_author"]

0      {'author': 'HPL'}
1      {'author': 'MWS'}
2      {'author': 'EAP'}
3      {'author': 'EAP'}
4      {'author': 'HPL'}
             ...        
995    {'author': 'EAP'}
996    {'author': 'EAP'}
997    {'author': 'MWS'}
998    {'author': 'EAP'}
999    {'author': 'EAP'}
Name: predicted_author, Length: 1000, dtype: object

In [28]:
test_df['predicted_author'] = test_df['predicted_author'].astype(str)

all_strings = test_df['predicted_author'].apply(lambda x: isinstance(x, str)).all()

print("Are all values in 'predicted_author' strings?", all_strings)


Are all values in 'predicted_author' strings? True


In [29]:
import re
def extract_author(value):
    if isinstance(value, str) and "'author':" in value:
        match = re.search(r"'author': '(\w+)'", value)
        if match:
            return match.group(1)
    return None  # Return None if no match

test_df['predicted_author_final'] = test_df['predicted_author'].apply(extract_author)

In [32]:
test_df['predicted_author_final']

0      HPL
1      MWS
2      EAP
3      EAP
4      HPL
      ... 
995    EAP
996    EAP
997    MWS
998    EAP
999    EAP
Name: predicted_author_final, Length: 1000, dtype: object

In [33]:
accuracy = accuracy_score(test_df["author"],test_df['predicted_author_final'])
print(f"Accuracy: {accuracy * 100:.2f}%")


Accuracy: 54.40%


In [50]:
from sklearn.metrics import  classification_report

print(classification_report(y_pred=test_df['predicted_author'], y_true=test_df['author']))

              precision    recall  f1-score   support

         EAP       0.53      0.62      0.57       390
         HPL       0.63      0.51      0.56       280
         MWS       0.51      0.49      0.50       330

    accuracy                           0.54      1000
   macro avg       0.56      0.54      0.54      1000
weighted avg       0.55      0.54      0.54      1000



In [36]:
test_df=test_df.drop(columns=['predicted_author'])

In [37]:
test_df

Unnamed: 0,id,text,author,predicted_author_final
0,id15695,The gigantic magnitude and the immediately ava...,EAP,HPL
1,id07954,Shall I disturb this calm by mingling in the w...,MWS,MWS
2,id16303,He had seen so many customs and witnessed so g...,MWS,EAP
3,id07932,We went up stairs into the chamber where the b...,EAP,EAP
4,id20875,Over those horrors the evil moon now hung very...,HPL,HPL
...,...,...,...,...
995,id11614,"We had been sitting in the dark, and Dupin now...",EAP,EAP
996,id03682,"His coat tail is very far longer his pipe, his...",EAP,EAP
997,id15691,As I spoke I fixed my eyes upon his countenanc...,MWS,MWS
998,id21676,"On this occasion, my sister was not alone; nor...",MWS,EAP


In [38]:
test_df.to_csv('test_predicted.csv', index=False)