In [2]:
from langchain.document_loaders import TextLoader, PyPDFLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OllamaEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain_ollama import ChatOllama
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

from IPython.display import Markdown, HTML
from langchain.retrievers import BM25Retriever
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain.vectorstores import FAISS


## Basic RAG

In [6]:
# --- Baseline pipeline: load -> split -> embed -> index -> QA chain ---

# Read the source text (swap in PyPDFLoader when the input is a PDF).
loader = TextLoader("docs/StockWatson.txt")
docs = loader.load()

# Cut the text into large chunks that overlap heavily with their neighbours.
splitter = CharacterTextSplitter(
    chunk_size=3000,
    chunk_overlap=800,
)
chunks = splitter.split_documents(docs)

# Embed every chunk with the Ollama embedding model and store the vectors in FAISS.
embeddings = OllamaEmbeddings(model="nomic-embed-text")
faiss_index = FAISS.from_documents(chunks, embeddings)

# Answer questions by "stuffing" the retrieved chunks into a single prompt.
qa = RetrievalQA.from_chain_type(
    llm=ChatOllama(model="gemma3:12b"),
    chain_type="stuff",
    retriever=faiss_index.as_retriever(
        search_type="similarity",
        search_kwargs={"k": 10, "fetch_k": 50},
    ),
)



Created a chunk of size 3340, which is longer than the specified 3000
Created a chunk of size 3491, which is longer than the specified 3000
Created a chunk of size 3408, which is longer than the specified 3000
Created a chunk of size 3289, which is longer than the specified 3000
Created a chunk of size 3172, which is longer than the specified 3000
Created a chunk of size 3369, which is longer than the specified 3000
Created a chunk of size 3253, which is longer than the specified 3000
Created a chunk of size 3313, which is longer than the specified 3000
Created a chunk of size 3200, which is longer than the specified 3000
Created a chunk of size 3125, which is longer than the specified 3000
Created a chunk of size 3331, which is longer than the specified 3000
Created a chunk of size 3127, which is longer than the specified 3000
Created a chunk of size 3570, which is longer than the specified 3000
Created a chunk of size 3258, which is longer than the specified 3000
Created a chunk of s

KeyboardInterrupt: 

In [None]:
# Ask the baseline chain a sample question and dump the raw response dict.
answer = qa.invoke(
    "What are the assumptions of Linear Regression and how to test them?"
)
print(answer)

In [None]:
# Render the generated answer text as Markdown.
Markdown(answer["result"])

## Better Chunking

In [3]:

# 1. Load the source text.
loader = TextLoader("docs/StockWatson.txt")
docs = loader.load()

# 2. Split recursively: try paragraph breaks first, then line breaks, then
#    spaces, and finally single characters, so chunks stay within chunk_size.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1800, chunk_overlap=200,
    separators=["\n\n", "\n", " ", ""]
)
chunks = splitter.split_documents(docs)

# 3. Embed the chunks and build the FAISS index.
embeddings = OllamaEmbeddings(model = 'nomic-embed-text')

faiss_index = FAISS.from_documents(chunks, embeddings)

# 4. Similarity retriever returning the 5 nearest chunks.
#    `fetch_k` is the FAISS name for the size of the candidate pool; the
#    previous key `top_k` is not a FAISS search parameter and was silently
#    ignored. NOTE: FAISS only consults `fetch_k` when a metadata filter (or
#    MMR search) is used, so it does not change plain similarity results.
retriever = faiss_index.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 5, "fetch_k": 20},
)

# 5. Retrieval-QA chain: retrieved chunks are "stuffed" into one prompt.
qa = RetrievalQA.from_chain_type(
    llm=ChatOllama(model = "gemma3:12b"), 
    chain_type="stuff", 
    retriever=retriever)


  embeddings = OllamaEmbeddings(model = 'nomic-embed-text')


In [4]:
# Run the sample question through the recursive-splitter chain and render
# the generated answer as Markdown.
answer = qa.invoke(
    "What are the assumptions of Linear Regression and how to test them? Explain each point in detail."
)
Markdown(answer["result"])

Okay, let's break down the assumptions of linear regression and how to test them. The assumptions are critical because they underpin the validity of the statistical inferences (hypothesis tests, confidence intervals) you draw from your regression model. Violations can lead to incorrect conclusions.

**Key Concepts First: The Linear Regression Model**

Before diving into assumptions, let's quickly recap the basic linear regression model:

Y<sub>i</sub> = β<sub>0</sub> + β<sub>1</sub>X<sub>1i</sub> + β<sub>2</sub>X<sub>2i</sub> + ... + β<sub>k</sub>X<sub>ki</sub> + ε<sub>i</sub>

Where:

*   Y<sub>i</sub>: The dependent variable for observation i
*   X<sub>1i</sub>, X<sub>2i</sub>, ..., X<sub>ki</sub>: The independent (predictor) variables for observation i
*   β<sub>0</sub>: The intercept (constant term)
*   β<sub>1</sub>, β<sub>2</sub>, ..., β<sub>k</sub>:  The coefficients (slopes) representing the effect of each independent variable on the dependent variable.
*   ε<sub>i</sub>: The error term (residual) for observation i, representing the difference between the actual value of Y and the value predicted by the model.

**The Assumptions (and How to Test Them)**

Here’s a detailed breakdown of the standard linear regression assumptions, their significance, and methods to test them.  I've divided them into "Essential" and "Technical" because the former are absolutely critical for the validity of most standard inference, and the latter are more technical and often relate to specific properties of estimators.

**I. Essential Assumptions (Critical for Inference)**

1.  **Linearity:**
    *   **What it means:**  The relationship between the independent variables and the dependent variable is linear. This doesn’t mean that the relationship is a straight line for *every* single observation. It means that the *average* or expected value of Y changes linearly with changes in the independent variables.
    *   **How to test:**
        *   **Scatterplots:** Create scatterplots of each independent variable against the dependent variable. Look for non-linear patterns (curves, exponential growth, etc.).
        *   **Residual Plots:**  Plot the residuals (the difference between the actual and predicted values) against each independent variable.  A random scatter of points around zero suggests linearity.  Patterns (e.g., a curved shape) indicate non-linearity.
        *   **Transformations:** If non-linearity is suspected, try transforming variables (e.g., taking logarithms, squares, cube roots).
2.  **Independence of Errors:**
    *   **What it means:** The error terms for different observations are independent of each other. In other words, the error for one observation doesn't influence the error for another.  This is *especially* critical for time-series data.
    *   **How to test:**
        *   **Durbin-Watson Test:** A formal test for autocorrelation (correlation between error terms).  A value close to 2 suggests no autocorrelation. Values significantly below or above 2 suggest autocorrelation. (Note: the interpretation of the Durbin-Watson statistic can be complex and is highly dependent on the characteristics of the data and the regression model.)
        *   **Residual Autocorrelation Plot:** Plot the residuals against their order in the data. A pattern here suggests autocorrelation.
        *   **Time Series Plots:** If you have time series data, plot the actual values of Y and the residuals over time. Look for patterns.
3.  **Homoskedasticity (Constant Variance of Errors):**
    *   **What it means:** The variance of the error term is constant for all values of the independent variables.  "Heteroskedasticity" means that the variance is *not* constant – it changes with the independent variables.
    *   **How to test:**
        *   **Residual Plots:** Plot the residuals against the predicted values or against independent variables.  Look for a "funnel shape" or other patterns that suggest the variance is changing.
        *   **Breusch-Pagan Test or White Test:** Formal statistical tests for heteroskedasticity.  These tests have null hypotheses of homoskedasticity.
4.  **Normality of Errors:**
    *   **What it means:** The error terms are normally distributed. This assumption is less critical for large sample sizes (due to the Central Limit Theorem), but it’s important for small samples or when making precise inferences.
    *   **How to test:**
        *   **Histograms or Density Plots of Residuals:** Visually check if the residuals resemble a normal distribution.
        *   **Q-Q Plot (Quantile-Quantile Plot):**  Plots the quantiles of the residuals against the quantiles of a normal distribution. If the residuals are normally distributed, the points will fall close to a straight line.
        *   **Jarque-Bera Test:** A formal test for normality.

**II. Technical Assumptions (Affect Efficiency & Properties)**

1.  **Zero Mean of Errors:**
    *   **What it means:**  The expected value of the error term is zero.  This is usually met if the intercept is appropriately estimated.
    *   **How to test:**  This is typically checked implicitly through the regression process itself. A poorly estimated intercept can suggest this assumption is violated.
2.  **No Multicollinearity:**
    *   **What it means:**  The independent variables are not highly correlated with each other. High multicollinearity can inflate standard errors, making it difficult to determine the individual effects of the variables.
    *   **How to test:**
        *   **Correlation Matrix:** Calculate the correlation coefficients between all pairs of independent variables.  High correlations (e.g., > 0.7 or 0.8) suggest multicollinearity.
        *   **Variance Inflation Factor (VIF):** A measure of how much the variance of an estimated coefficient is inflated due to multicollinearity.  A VIF greater than 5 or 10 often indicates a problem.

**Important Considerations:**

*   **Remedial Measures:** If assumptions are violated, you can take steps to address the problems, such as transforming variables, adding new variables, or using different estimation techniques.
*   **Severity of Violations:** Not all violations are equally serious. Some violations (e.g., mild non-normality in a large sample) may have little impact on inferences, while others (e.g., severe heteroskedasticity) can lead to incorrect conclusions.
*   **Software:**  Most statistical software packages (e.g., R, Stata, SPSS, Python with libraries like statsmodels) provide tools to test these assumptions and diagnose potential problems.



Let me know if you'd like a deeper dive into any specific assumption or testing method!

In [None]:
from langchain.chains import RetrievalQA
from langchain.document_loaders import TextLoader
from langchain.retrievers import BM25Retriever
from langchain.retrievers.contextual_compression import ContextualCompressionRetriever
# `HuggingFaceRerank` does not exist in LangChain; the cross-encoder reranker
# is `CrossEncoderReranker` wrapping a `HuggingFaceCrossEncoder` model
# (the latter needs the `sentence-transformers` package at runtime).
from langchain.retrievers.document_compressors import CrossEncoderReranker
from langchain.retrievers.ensemble import EnsembleRetriever
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain_community.cross_encoders import HuggingFaceCrossEncoder
# Use the maintained `langchain_ollama` integrations (the notebook's first
# cell already imports ChatOllama from there) instead of the deprecated
# `langchain.chat_models` / `langchain.embeddings` re-exports.
from langchain_ollama import ChatOllama, OllamaEmbeddings

# Number of candidates each first-stage retriever contributes, and how many
# documents survive reranking. The retrievers default to k=4, which would give
# the reranker at most 8 candidates and make a top-10 cut a no-op.
CANDIDATES_PER_RETRIEVER = 20
RERANK_TOP_N = 10

# 1. Load the source document(s) from disk.
# TextLoader reads a plain-text file; load() returns a list of Document objects.
loader = TextLoader("docs/StockWatson.txt")
docs = loader.load()


# 2. Split Documents into overlapping chunks.
# We split on paragraph breaks first, then lines, then spaces, falling back to characters.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1800,
    chunk_overlap=200,
    separators=["\n\n", "\n", " ", ""]
)
chunks = splitter.split_documents(docs)


# 3. Build two retrievers: BM25 for keyword matches…
bm25 = BM25Retriever.from_documents(chunks)
bm25.k = CANDIDATES_PER_RETRIEVER
# …and FAISS for semantic embedding search.
embeddings = OllamaEmbeddings(model="nomic-embed-text")
faiss_index = FAISS.from_documents(chunks, embeddings)
faiss_ret = faiss_index.as_retriever(search_kwargs={"k": CANDIDATES_PER_RETRIEVER})


# 4. Combine BM25 + FAISS using Reciprocal Rank Fusion (equal weights by default).
ensemble = EnsembleRetriever(retrievers=[bm25, faiss_ret])


# 5. Set up a Hugging Face cross-encoder reranker.
# This lightweight cross-encoder scores each (query, passage) pair; only the
# RERANK_TOP_N highest-scoring passages are kept.
reranker = CrossEncoderReranker(
    model=HuggingFaceCrossEncoder(model_name="cross-encoder/ms-marco-MiniLM-L-6-v2"),
    top_n=RERANK_TOP_N,
)


# 6. Wrap retrieval + reranking in a ContextualCompressionRetriever.
# On each query it:
#   a) asks the ensemble retriever for candidate documents,
#   b) passes them to reranker.compress_documents(...) to score and trim them,
#   c) hands the trimmed list to the QA chain.
cc_retriever = ContextualCompressionRetriever(
    base_retriever=ensemble,
    base_compressor=reranker
)


# 7. Assemble the final RetrievalQA chain with the “stuff” combiner:
#  • pulls context via cc_retriever
#  • concatenates all returned chunks (“stuff”)
#  • sends the combined prompt + question to the LLM
#  • returns the LLM’s answer plus the source documents
qa_hf_rerank = RetrievalQA.from_chain_type(
    llm=ChatOllama(model="gemma3:12b"),
    retriever=cc_retriever,
    chain_type="stuff",
    verbose=True,
    return_source_documents=True
)


# Example usage (`.invoke` replaces the deprecated direct-call form `chain({...})`):
question = "What investment strategies does Stock-Watson recommend for inflation hedging?"
result = qa_hf_rerank.invoke({"query": question})
print("Answer:", result["result"])
print("Sources:", [doc.metadata.get("source") for doc in result["source_documents"]])


In [10]:
# NOTE(review): `qa_hybrid` is created in the multi-query cell further down
# (In [11]); on a fresh kernel that cell has to run before this one.
answer = qa_hybrid.invoke(
    "What are the assumptions of Linear Regression and how to test them? Explain each point in detail."
)
Markdown(answer["result"])



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


Okay, let's break down the assumptions of linear regression and how to test them.  Understanding these assumptions is *critical* because violating them can lead to inaccurate predictions, misleading conclusions, and generally unreliable models.  I'll be detailed, covering the assumptions, why they matter, and practical tests.

**1. Linearity:**

* **What it means:**  The relationship between the independent variable(s) (X) and the dependent variable (Y) is linear.  This means when you plot X against Y, the points should roughly form a straight line. It's not about perfect linearity (some "wiggle room" is expected), but a *general* linear trend.  It also implies that the relationship is linear at *all* levels of X.
* **Why it matters:**  If the relationship is non-linear (e.g., exponential, quadratic, etc.), a linear model will be a poor fit.  It will under- or over-predict certain areas of the data, leading to a high residual error.
* **How to Test:**
    * **Scatter Plots:** The most basic and visual way.  Plot Y vs. X.  Look for curves, bends, or other non-linear patterns.  It's easy to spot major deviations.
    * **Residual Plots:** *This is the most important test.*  Plot the residuals (the difference between the actual Y and the predicted Y) against the predicted Y values (Ŷ) or against the independent variables (X).
        * **Linearity Test for One Predictor:** Plot residuals versus one predictor at a time. A random scattering of points around zero indicates linearity. A pattern (e.g., a curve, a funnel shape, a fan shape) suggests non-linearity.
        * **Adding Squared Terms:** If a residual plot shows a curve, consider adding a squared term (X²) to your model.  This can help capture a curvilinear relationship.  If the curved pattern disappears in the residual plot, it supports the transformation.
    * **Transformation:** Apply transformations to the variables (e.g., log transformation, square root transformation) to linearize the relationship. Then re-run the regression and check the residuals.

**2. Independence of Errors (Residuals):**

* **What it means:** The error terms (residuals) for each observation are independent of each other.  One error doesn't "influence" another.  This is particularly important in time series data or data with a spatial component.  Autocorrelation (correlation between errors) violates this assumption.
* **Why it matters:**  Autocorrelation leads to underestimated standard errors.  This means your p-values will be artificially low, and you're more likely to incorrectly reject the null hypothesis (saying a variable is significant when it isn't).
* **How to Test:**
    * **Durbin-Watson Test:**  A common statistical test that produces a value between 0 and 4.
        * 2 ≈ No autocorrelation (ideal)
        * Close to 0: Positive autocorrelation (errors are positively correlated with previous errors)
        * Close to 4: Negative autocorrelation (errors are negatively correlated with previous errors)
        * *Important:* The Durbin-Watson test can be misleading with multiple independent variables.
    * **Plot Residuals vs. Order of Observation:**  If you have time series data, plot the residuals against the order of the observations.  A pattern (e.g., alternating positive and negative residuals) indicates autocorrelation.
    * **Correlogram (ACF/PACF):** For time series, a correlogram plots the autocorrelation function (ACF) and partial autocorrelation function (PACF). These can help you determine the order of autoregressive models that might be needed to address autocorrelation.

**3. Homoscedasticity (Constant Variance of Errors):**

* **What it means:** The variance of the error terms is constant across all levels of the independent variables.  In simpler terms, the "spread" of the residuals is roughly the same for all predicted values.
* **Why it matters:**  Heteroscedasticity (non-constant variance) leads to inefficient estimates of the standard errors. Your confidence intervals will be too narrow or too wide, and your hypothesis tests will be unreliable.
* **How to Test:**
    * **Residual Plots:**  Again, vital.  Plot residuals vs. predicted Y values (Ŷ) or vs. each independent variable.  Look for a "fan" shape or a funnel shape.  This indicates that the variance of the residuals is changing with the level of the predictor.
    * **Breusch-Pagan Test / White Test:**  These are formal statistical tests for heteroscedasticity.  They involve regressing the squared residuals on the independent variables.  A significant p-value indicates heteroscedasticity.
    * **Spread-Level Plot:** Plot the absolute values of residuals against the predicted values.

**4. Normality of Errors:**

* **What it means:**  The error terms are normally distributed.  This assumption is less critical than the others, especially with larger sample sizes (due to the Central Limit Theorem).
* **Why it matters:**  While not as critical, violated normality can affect the accuracy of p-values and confidence intervals, especially with smaller samples.
* **How to Test:**
    * **Histogram / Q-Q Plot:**  Plot a histogram of the residuals or use a quantile-quantile (Q-Q) plot.  A Q-Q plot compares the quantiles of the residuals to the quantiles of a normal distribution.  If the residuals are normally distributed, the points on the Q-Q plot should fall close to a straight line.
    * **Shapiro-Wilk Test / Kolmogorov-Smirnov Test:**  These are formal statistical tests for normality. A significant p-value indicates non-normality. *Be careful with these tests*, as they can be overly sensitive to minor deviations from normality, especially with large samples.

**5. No or Little Multicollinearity (for multiple regression):**

* **What it means:**  The independent variables are not highly correlated with each other.
* **Why it matters:**  High multicollinearity makes it difficult to determine the individual effect of each predictor variable. It can lead to unstable coefficient estimates (small changes in the data can lead to large changes in the coefficients), inflated standard errors, and unreliable hypothesis tests.
* **How to Test:**
    * **Correlation Matrix:** Calculate the correlation matrix of the independent variables.  Values close to +1 or -1 indicate high correlation.
    * **Variance Inflation Factor (VIF):**  A VIF measures how much the variance of a coefficient estimate is inflated due to multicollinearity. A VIF greater than 10 is generally considered a cause for concern.
    * **Tolerance:**  Tolerance is the reciprocal of VIF (Tolerance = 1/VIF).  Values close to 0 indicate high multicollinearity.



**Important Considerations and Remedial Actions:**

* **Not all assumptions need to be perfectly met:**  Linear regression is *robust* to some violations, especially with larger sample sizes.
* **Addressing Violations:**
    * **Non-Linearity:** Transformations (log, square root), adding polynomial terms, or using a non-linear model.
    * **Autocorrelation:** Include lagged variables, use time series models (ARIMA).
    * **Heteroscedasticity:** Use weighted least squares, transform variables, or use robust standard errors.
    * **Non-Normality:**  Consider transformations, use non-parametric methods.
    * **Multicollinearity:** Remove one of the correlated variables, combine them into a single variable, or use regularization techniques.

**Tools for Testing Assumptions:**

* **Statistical Software:**  R, Python (with libraries like `statsmodels` and `scikit-learn`), SPSS, Stata.  These packages have built-in functions for many of the tests mentioned above.
* **Excel:** Can be used for simple scatter plots and correlation calculations, but less suitable for formal hypothesis tests.

To help me tailor my advice further, could you tell me:

*   Are you working with a specific dataset or problem?
*   What statistical software are you using?

## Multi-Query Retrieval and Cross-Encoder Reranking

In [11]:
from langchain.retrievers.contextual_compression import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import CohereRerank
# EnsembleRetriever is used below but was only imported in an unexecuted cell;
# import it here so this cell runs on a fresh kernel.
from langchain.retrievers.ensemble import EnsembleRetriever

# 1. Load the text file into memory as a LangChain Document.
loader = TextLoader("docs/StockWatson.txt")
docs = loader.load()

# 2. Split the document into overlapping chunks of up to 1800 characters each.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1800,
    chunk_overlap=200,
    separators=["\n\n", "\n", " ", ""]
)
chunks = splitter.split_documents(docs)

# 3. Build a BM25 retriever for exact keyword matching over the chunks.
bm25 = BM25Retriever.from_documents(chunks)
llm = ChatOllama(model="gemma3:12b")

# 4. Build a FAISS retriever for semantic similarity over the same chunks.
embeddings = OllamaEmbeddings(model="nomic-embed-text")
faiss_index = FAISS.from_documents(chunks, embeddings)
faiss_ret = faiss_index.as_retriever()

# 5. Ensemble the two retrievers using Reciprocal Rank Fusion.
# EnsembleRetriever merges BM25 and FAISS hits into one unified ranking.
ensemble = EnsembleRetriever(retrievers=[bm25, faiss_ret])

# 6. Initialize the MultiQueryRetriever:
multi_q = MultiQueryRetriever.from_llm(
    llm=llm,
    # The language model used to rewrite queries.
    retriever=ensemble,
    # A single BaseRetriever (here, our EnsembleRetriever) that will be called for each rewrite.
    include_original=True,
    # Include the original user query in addition to LLM-generated rewrites.
)
# Note: There is no `num_queries` parameter. By default, the internal prompt generates three query variants.
# If you need a different number of rewrites, supply a custom `prompt` BasePromptTemplate that specifies your desired count.

# 7. Build the RetrievalQA chain (no reranking) using the “stuff” combiner.
qa_hybrid = RetrievalQA.from_chain_type(
    llm=llm,
    # The Ollama-based chat model that writes the answer.
    retriever=multi_q,
    # The multi-query retriever over the BM25 + FAISS ensemble.
    chain_type="stuff",
    # Concatenate all retrieved chunks into a single prompt.
    verbose=True,
    # Log chain start/finish while running.
    return_source_documents=True
    # Also return the documents that were fed to the LLM.
)

# 8. Rerank the multi-query results with Cohere.
# CohereRerank needs the `cohere` package (the recorded run of this cell failed
# with ImportError without it) and a Cohere API key (COHERE_API_KEY).
reranker = CohereRerank(top_n=10, model="rerank-english-v2.0")
cc_retriever = ContextualCompressionRetriever(
    # The base retriever must be a retriever, not the `qa_hybrid` chain.
    base_retriever=multi_q,
    base_compressor=reranker
)

# 9. QA chain over the reranked documents. RetrievalQA cannot be constructed
# from `llm=` directly; `from_chain_type` builds the document-combining chain.
qa_rerank = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=cc_retriever
)

  reranker = CohereRerank(top_n=10, model="rerank-english-v2.0")


ImportError: Could not import cohere python package. Please install it with `pip install cohere`.

In [None]:
# Ask the reranked chain the sample question and render its answer as Markdown.
answer = qa_rerank.invoke(
    "What are the assumptions of Linear Regression and how to test them? Explain each point in detail."
)
Markdown(answer["result"])