Extracting Data

In [7]:
!pip install PyPDF2 pdfplumber pandas



In [8]:
import PyPDF2
import pdfplumber
import pandas as pd
import os

# Folder holding the source PDFs (relative path)
pdf_folder = "Research Papers/"

# Names of the PDF files inside pdf_folder that will be processed, one per line
pdf_files = [
    "ACT Bohlmeijer-et-al.-2010-Efficacy-of-an-early-intervention-based-on-ACT.pdf",
    "ACT Info.pdf",
    "ACt-vs-CBT-for-Anxiety.pdf",
    "CBT metaanalysis for sex offenders.pdf",
    "CBT metaanalysis in prisons.pdf",
    "CorrectionalEducationandRecidivism-TowardAToolforReduction.pdf",
    "Dodson_et_al_2011-libre.pdf",
    "Does_incarceration_based_drug_treatment.pdf",
    "drug court metaanalysis.pdf",
    "Meta-analysis-of-CBT-Landenberger-Lipsey_CBT_JEC-paper.pdf",
    "metaanalysys of CBT.pdf",
    "metaanlysis of vocational training to reduce recidivism.pdf",
    "vocational training metaanalysis.pdf",
    "volokh 2011 faith based program metaanlysis.pdf",
]

# Function to extract text from PDFs with debugging
def extract_text_from_pdf(pdf_path):
    """Return the text of the PDF at ``pdf_path``, or ``None`` if nothing could be read.

    PyPDF2 is tried first. pdfplumber is used as a fallback when PyPDF2
    produces no text *or raises while parsing the file*, so a parser error in
    PyPDF2 no longer prevents the fallback from running.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The concatenated page texts, each followed by a newline, or ``None``
        when the file cannot be opened, is encrypted, or yields no text with
        either library. Every failure is reported with ``print`` rather than
        raised, so a batch run continues with the next file.
    """
    # A file that cannot even be opened is a hard failure: pdfplumber would
    # not be able to read it either, so report and stop here.
    try:
        file = open(pdf_path, "rb")
    except OSError as e:
        print(f" Error reading {pdf_path}: {str(e)}")
        return None

    text = ""
    with file:
        try:
            reader = PyPDF2.PdfReader(file)

            # Check if the file is encrypted
            if reader.is_encrypted:
                print(f"Encrypted file: {pdf_path} (Skipping)")
                return None

            # Try extracting text using PyPDF2; pages without text are skipped
            page_texts = [page.extract_text() for page in reader.pages]
            text = "".join(chunk + "\n" for chunk in page_texts if chunk)
        except Exception as e:
            # Discard any partial result so pdfplumber re-reads the whole file.
            print(f" PyPDF2 failed on {pdf_path}: {str(e)} (trying pdfplumber)")
            text = ""

    # If PyPDF2 fails or finds nothing, try pdfplumber
    if not text.strip():
        try:
            with pdfplumber.open(pdf_path) as pdf:
                page_texts = [page.extract_text() for page in pdf.pages]
            text = "".join(chunk + "\n" for chunk in page_texts if chunk)
        except Exception as e:
            print(f" Error reading {pdf_path}: {str(e)}")
            return None

    if not text.strip():
        print(f" No text extracted from: {pdf_path}")
        return None

    return text

# Run the extractor over every listed PDF, keeping each (file name, text) outcome
extraction_results = [
    (name, extract_text_from_pdf(os.path.join(pdf_folder, name)))
    for name in pdf_files
]

# Usable text goes into `documents`; everything else is recorded as a failure
documents = {name: body for name, body in extraction_results if body}
failed_files = [name for name, body in extraction_results if not body]

# One row per successfully extracted document
df = pd.DataFrame(list(documents.items()), columns=["Document", "Text"])

# Write the extracted text to a CSV file inside the PDF folder
csv_path = os.path.join(pdf_folder, "extracted_texts.csv")
df.to_csv(csv_path, index=False)

# Report how many documents were extracted and which files failed
print(f"\n✅ Successfully extracted {len(df)} files")
if failed_files:
    print(f"⚠️ Failed to extract: {failed_files}")

# Preview the first rows of the extracted text
df.head()



✅ Successfully extracted 14 files


Unnamed: 0,Document,Text
0,ACT Bohlmeijer-et-al.-2010-Efficacy-of-an-earl...,Shorter communication\nEfﬁcacy of an early int...
1,ACT Info.pdf,About ACT\nAbout ACT\nPsychological Inflexibil...
2,ACt-vs-CBT-for-Anxiety.pdf,Randomized Clinical Trial of Cognitive Behavio...
3,CBT metaanalysis for sex offenders.pdf,Sexual Offender Treatment Effectiveness Within...
4,CBT metaanalysis in prisons.pdf,www.thelancet.com/psychiatry Vol 8 Septemb...


In [15]:
pip install --upgrade pip setuptools wheel

Collecting setuptools
  Using cached setuptools-76.0.0-py3-none-any.whl.metadata (6.7 kB)
Collecting wheel
  Downloading wheel-0.45.1-py3-none-any.whl.metadata (2.3 kB)
Using cached setuptools-76.0.0-py3-none-any.whl (1.2 MB)
Downloading wheel-0.45.1-py3-none-any.whl (72 kB)
Installing collected packages: wheel, setuptools
Successfully installed setuptools-76.0.0 wheel-0.45.1
Note: you may need to restart the kernel to use updated packages.


In [16]:
pip install spacy

Collecting spacy
  Using cached spacy-3.8.2.tar.gz (1.3 MB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'error'
Note: you may need to restart the kernel to use updated packages.


  error: subprocess-exited-with-error
  
  × pip subprocess to install build dependencies did not run successfully.
  │ exit code: 1
  ╰─> [101 lines of output]
      Ignoring numpy: markers 'python_version < "3.9"' don't match your environment
      Collecting setuptools
        Using cached setuptools-76.0.0-py3-none-any.whl.metadata (6.7 kB)
      Collecting cython<3.0,>=0.25
        Using cached Cython-0.29.37-py2.py3-none-any.whl.metadata (3.1 kB)
      Collecting cymem<2.1.0,>=2.0.2
        Using cached cymem-2.0.11-cp313-cp313-win_amd64.whl.metadata (8.8 kB)
      Collecting preshed<3.1.0,>=3.0.2
        Using cached preshed-3.0.9.tar.gz (14 kB)
        Installing build dependencies: started
        Installing build dependencies: finished with status 'done'
        Getting requirements to build wheel: started
        Getting requirements to build wheel: finished with status 'done'
        Preparing metadata (pyproject.toml): started
        Preparing metadata (pyproject.toml): f

In [17]:
pip install scikit-learn==1.6.1 pandas==2.2.3 numpy==2.2.3 matplotlib==3.10.1 spacy==3.8.2

Collecting scikit-learn==1.6.1
  Using cached scikit_learn-1.6.1-cp313-cp313-win_amd64.whl.metadata (15 kB)
Collecting matplotlib==3.10.1
  Using cached matplotlib-3.10.1-cp313-cp313-win_amd64.whl.metadata (11 kB)
Collecting spacy==3.8.2
  Using cached spacy-3.8.2.tar.gz (1.3 MB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'error'
Note: you may need to restart the kernel to use updated packages.


  error: subprocess-exited-with-error
  
  × pip subprocess to install build dependencies did not run successfully.
  │ exit code: 1
  ╰─> [101 lines of output]
      Ignoring numpy: markers 'python_version < "3.9"' don't match your environment
      Collecting setuptools
        Using cached setuptools-76.0.0-py3-none-any.whl.metadata (6.7 kB)
      Collecting cython<3.0,>=0.25
        Using cached Cython-0.29.37-py2.py3-none-any.whl.metadata (3.1 kB)
      Collecting cymem<2.1.0,>=2.0.2
        Using cached cymem-2.0.11-cp313-cp313-win_amd64.whl.metadata (8.8 kB)
      Collecting preshed<3.1.0,>=3.0.2
        Using cached preshed-3.0.9.tar.gz (14 kB)
        Installing build dependencies: started
        Installing build dependencies: finished with status 'done'
        Getting requirements to build wheel: started
        Getting requirements to build wheel: finished with status 'done'
        Preparing metadata (pyproject.toml): started
        Preparing metadata (pyproject.toml): f

In [13]:
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Custom stop words (to remove unwanted terms like 'et', 'al')
custom_stopwords = {"et", "al", "pm", "id", "use", "new"}

# Preprocessing function
def preprocess_text(text):
    """Normalise raw extracted text into lower-case, space-separated words.

    Steps:
      1. Unicode NFKC normalisation, so compatibility characters such as the
         typographic ligature "ﬁ" become plain letters ("fi") instead of being
         stripped out of the middle of a word.
      2. Every run of non-ASCII-letter characters (digits, punctuation,
         whitespace, ...) is replaced by a single space, so words separated
         only by punctuation ("and/or", "meta-analysis") stay separate words
         rather than being glued together.
      3. Lower-casing and removal of the words in ``custom_stopwords``.

    Args:
        text: The raw document text (a ``str``).

    Returns:
        The cleaned text as a single string of words joined by one space;
        an empty string if no words remain.
    """
    import unicodedata  # local import keeps this cell self-contained

    # Fold ligatures / compatibility forms into their plain-letter equivalents
    text = unicodedata.normalize("NFKC", text)
    # Replace (not delete) anything that is not a letter; this also collapses whitespace
    text = re.sub(r'[^a-zA-Z]+', ' ', text)
    return " ".join(word for word in text.lower().split() if word not in custom_stopwords)

# Clean every document before vectorising
df["Cleaned_Text"] = df["Text"].apply(preprocess_text)

# Convert text to a document-term matrix (English stop words removed,
# vocabulary capped at 2000 terms)
vectorizer = CountVectorizer(stop_words="english", max_features=2000)
doc_term_matrix = vectorizer.fit_transform(df["Cleaned_Text"])

# Model settings kept in one place so the topic table always matches the model
N_TOPICS = 5
N_TOP_WORDS = 15

# Fit LDA; random_state is fixed so repeated runs give the same topics
lda_model = LatentDirichletAllocation(n_components=N_TOPICS, max_iter=15, random_state=42, learning_method="online")
lda_topics = lda_model.fit_transform(doc_term_matrix)

# Get top words for each topic, strongest word first.
# argsort() sorts ascending, so the order is reversed before taking the top N;
# row 0 of the resulting table is therefore each topic's highest-weighted word.
words = vectorizer.get_feature_names_out()
topics = {
    f"Topic {i+1}": [words[idx] for idx in weights.argsort()[::-1][:N_TOP_WORDS]]
    for i, weights in enumerate(lda_model.components_)
}

# Convert to DataFrame (one column per topic)
lda_df = pd.DataFrame(topics)

# Display improved topics
display(lda_df)

ModuleNotFoundError: No module named 'sklearn'