In [3]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/question/q.txt


In [4]:
# Step 1: Install required packages
!pip install PyMuPDF rake-nltk yake scikit-learn

# Step 2: Import libraries
import nltk
# Download all required NLTK resources
nltk.download(['stopwords', 'punkt', 'punkt_tab', 'wordnet', 'omw-1.4'])

import fitz  # PyMuPDF
import re
from rake_nltk import Rake
from yake import KeywordExtractor
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords

# Step 3: Document scraping function
def scrape_pdf_text(path):
    doc = fitz.open(path)
    text = ""
    for page in doc:
        text += page.get_text()
    return text



[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /usr/share/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /usr/share/nltk_data...


**Load the question from the file.**

In [16]:

# Example document (replace with your document path)
text = scrape_pdf_text("/kaggle/input/question1/q1.txt")  # Use any PDF in Kaggle input

text

"To prepare for this discussion, watch the nearly 10-\nminute videoBiotech Academy: A catalyst for change. \n \nhttps://www.youtube.com/watch?v=cY66NX1174s\n \n \n Create an initial post that addresses the following:\nShare a minimum of three things you observed showing\nhow teachers at Biotech Academy instigate and\nmaintain student support and success. Be sure to\nconsider what characteristics they have to have and\nwhat behaviors and practices they need to present to\ntheir students.\n \nDescribe the impact collaboration between teachers,\nparents, and students has on student success as well\nas having a spirit of cooperation instead of\ncompetition between students. \n \nDiscuss two to three observations you made that show\nrelevancy between Biotech Academy instruction &\nsupports and what students need in post-secondary\neducation and employment. Be sure to address how\nhaving such relevancy for student learning impacts\ntheir ultimate success.\n \nDiscuss how traditional school e

Pre-process text (sort of normalization equivalent)

In [17]:
# Step 4: Text preprocessing
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    stop_words = set(stopwords.words('english'))
    words = nltk.word_tokenize(text)
    return ' '.join([word for word in words if word not in stop_words and len(word) > 2])

cleaned_text = preprocess_text(text)
cleaned_text

'prepare discussion watch nearly minute videobiotech academy catalyst change httpswwwyoutubecomwatchvcy66nx1174s create initial post addresses following share minimum three things observed showing teachers biotech academy instigate maintain student support success sure consider characteristics behaviors practices need present students describe impact collaboration teachers parents students student success well spirit cooperation instead competition students discuss two three observations made show relevancy biotech academy instruction supports students need postsecondary education employment sure address relevancy student learning impacts ultimate success discuss traditional school environments might learn biotech academy employ school culture potentially increase graduation rates matriculation postsecondary education employment atrisk students provide specific feedback anyone may reading paper regarding relevancy biotech academy postsecondary education employment assume persons viewpo

In [18]:
# Step 5: Keyword extraction implementations

# 5.1 RAKE Implementation
rake = Rake()
rake.extract_keywords_from_text(cleaned_text)
rake_keywords = rake.get_ranked_phrases_with_scores()[:10]

In [19]:
rake.get_ranked_phrases_with_scores()

[(21025.0,
  'prepare discussion watch nearly minute videobiotech academy catalyst change httpswwwyoutubecomwatchvcy66nx1174s create initial post addresses following share minimum three things observed showing teachers biotech academy instigate maintain student support success sure consider characteristics behaviors practices need present students describe impact collaboration teachers parents students student success well spirit cooperation instead competition students discuss two three observations made show relevancy biotech academy instruction supports students need postsecondary education employment sure address relevancy student learning impacts ultimate success discuss traditional school environments might learn biotech academy employ school culture potentially increase graduation rates matriculation postsecondary education employment atrisk students provide specific feedback anyone may reading paper regarding relevancy biotech academy postsecondary education employment assume p

In [21]:
# 5.2 YAKE Implementation
yake = KeywordExtractor(lan="en", top=20)
yake_keywords = yake.extract_keywords(cleaned_text)
yake_keywords

[('postsecondary education employment', 0.00016939364285747637),
 ('traditional school environments', 0.00037389667683901005),
 ('relevancy biotech academy', 0.00041210500562966976),
 ('matriculation postsecondary education', 0.000547683780067102),
 ('education employment atrisk', 0.000547683780067102),
 ('education employment assume', 0.000547683780067102),
 ('academy postsecondary education', 0.0005798259634228021),
 ('biotech academy postsecondary', 0.0007114628475933112),
 ('similar demonstrated biotech', 0.0007243035098468239),
 ('create initial post', 0.0007365455463335732),
 ('biotech academy instigate', 0.0007668888198067515),
 ('biotech academy instruction', 0.0007668888198067515),
 ('learn biotech academy', 0.0007668888198067515),
 ('biotech academy employ', 0.0007668888198067515),
 ('demonstrated biotech academy', 0.0007668888198067515),
 ('show relevancy biotech', 0.0007784146928525921),
 ('academy catalyst change', 0.0007798554005465419),
 ('school environments adopting', 

In [22]:
# 5.3 TF-IDF Implementation (Scikit-learn)
def tfidf_extractor(text, n=30):
    vectorizer = TfidfVectorizer(ngram_range=(1, 2))
    tfidf_matrix = vectorizer.fit_transform([text])
    feature_names = vectorizer.get_feature_names_out()
    return sorted(zip(feature_names, tfidf_matrix.sum(0).A1), 
                  key=lambda x: x[1], reverse=True)[:n]

tfidf_keywords = tfidf_extractor(cleaned_text)
tfidf_keywords


[('academy', 0.27824333745610097),
 ('biotech', 0.23186944788008415),
 ('biotech academy', 0.23186944788008415),
 ('students', 0.23186944788008415),
 ('education', 0.13912166872805048),
 ('education employment', 0.13912166872805048),
 ('employment', 0.13912166872805048),
 ('postsecondary', 0.13912166872805048),
 ('postsecondary education', 0.13912166872805048),
 ('relevancy', 0.13912166872805048),
 ('school', 0.13912166872805048),
 ('student', 0.13912166872805048),
 ('success', 0.13912166872805048),
 ('teachers', 0.13912166872805048),
 ('discuss', 0.09274777915203367),
 ('environments', 0.09274777915203367),
 ('learning', 0.09274777915203367),
 ('need', 0.09274777915203367),
 ('provide', 0.09274777915203367),
 ('regarding', 0.09274777915203367),
 ('relevancy biotech', 0.09274777915203367),
 ('school environments', 0.09274777915203367),
 ('sure', 0.09274777915203367),
 ('three', 0.09274777915203367),
 ('traditional', 0.09274777915203367),
 ('traditional school', 0.09274777915203367),
 (

In [23]:
# Step 6: Display results
print("RAKE Keywords:", [phrase for score, phrase in rake_keywords])
print("\nYAKE Keywords:", [kw[0] for kw in yake_keywords])
print("\nTF-IDF Keywords:", [kw[0] for kw in tfidf_keywords])

RAKE Keywords: ['prepare discussion watch nearly minute videobiotech academy catalyst change httpswwwyoutubecomwatchvcy66nx1174s create initial post addresses following share minimum three things observed showing teachers biotech academy instigate maintain student support success sure consider characteristics behaviors practices need present students describe impact collaboration teachers parents students student success well spirit cooperation instead competition students discuss two three observations made show relevancy biotech academy instruction supports students need postsecondary education employment sure address relevancy student learning impacts ultimate success discuss traditional school environments might learn biotech academy employ school culture potentially increase graduation rates matriculation postsecondary education employment atrisk students provide specific feedback anyone may reading paper regarding relevancy biotech academy postsecondary education employment assum

In [15]:

# Step 7: Title generation
def generate_title(keywords_list):
    # Simple strategy: Take first keyword from each method
    return ' '.join([keywords_list[0][0], keywords_list[1][0], keywords_list[2][0]])

title = generate_title([
    [phrase for score, phrase in rake_keywords],
    [kw[0] for kw in yake_keywords],
    [kw[0] for kw in tfidf_keywords]
])

print("\nGenerated Title:", title)


Generated Title: write research paper contains following define describe cloudbased collaboration google docs cloudbased tool used document sharing discuss pros cons using google docs businessbased documents compare contrast use google docs microsoft 365 word docs businessbased documents research paper requirements paper four pages long including title reference pages use times new roman size font throughout paper apply apa 7th edition style include three major sections title page main body references minimum two scholarly journal articles besides textbook required writing demonstrate thorough understanding materials address required elements writing use exceptional language skillfully communicates meaning readers clarity fluency virtually errorfree note plagiarism check required apa7 format include references within 8hrs docs businessbased documents docs


End.