In [1]:
pip install nltk




In [6]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

# Fetch the NLTK resources needed by word_tokenize and the stopword filter.
nltk.download('punkt')
# NLTK >= 3.8.2 loads its tokenizer models from 'punkt_tab' instead of 'punkt'.
# Requesting both keeps word_tokenize working across NLTK versions; on older
# releases the unknown package is reported and skipped without raising.
nltk.download('punkt_tab')
nltk.download('stopwords')

# Sample corpus: five short sentences about AI. Each sentence is treated as a
# separate document when the TF-IDF matrix is built further down.
sentences = [
    "Artificial intelligence (AI) is a field of computer science.",
    "Machine learning is a subset of AI that focuses on training models to make predictions.",
    "Deep learning is a type of machine learning that uses neural networks with multiple layers.",
    "Neural networks are composed of interconnected nodes called neurons.",
    "Recurrent neural networks (RNNs) are commonly used in natural language processing tasks."
]

# Tokenization and preprocessing
def preprocess(sentence):
    """Normalize a sentence into a space-separated string of stemmed tokens.

    The sentence is lowercased and tokenized. Tokens that are not purely
    alphanumeric (punctuation, brackets, ...) or that are English stopwords
    are discarded, and every remaining token is reduced to its Porter stem.

    Args:
        sentence: Raw input text.

    Returns:
        The surviving stems joined by single spaces (empty string if none).
    """
    english_stops = set(stopwords.words('english'))
    porter = PorterStemmer()
    lowered = sentence.lower()
    # Filter and stem in a single pass over the token stream.
    stems = [
        porter.stem(word)
        for word in word_tokenize(lowered)
        if word.isalnum() and word not in english_stops
    ]
    return ' '.join(stems)

# Normalize every sample sentence before vectorizing
preprocessed_sentences = list(map(preprocess, sentences))

# Fit a TF-IDF model on the cleaned sentences (one matrix row per sentence)
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(preprocessed_sentences)

# Vocabulary terms, looked up later by TF-IDF column index
feature_names = vectorizer.get_feature_names_out()

# Extract top keywords or phrases based on TF-IDF scores
def get_top_keywords(doc, top_n=5):
    """Return the highest-scoring TF-IDF terms of one document.

    Reads the module-level ``tfidf_matrix`` and ``feature_names``.

    Args:
        doc: Row index of the document in ``tfidf_matrix``.
        top_n: Maximum number of terms to return. ``None`` returns every
            term with a non-zero score; zero or a negative value returns
            an empty list.

    Returns:
        A list of plain ``str`` terms ordered by descending TF-IDF score.
        Terms with equal scores keep the order in which the sparse row
        stores them.
    """
    if top_n is not None and top_n <= 0:
        return []

    # Pull the row's columns and scores out once instead of doing one sparse
    # scalar lookup per column.
    row = tfidf_matrix[doc].tocoo()
    present = row.data != 0  # ignore explicitly stored zeros, like nonzero() does
    scored = zip(row.col[present], row.data[present])

    # sorted() is stable, so ties stay in storage order.
    ranked = sorted(scored, key=lambda pair: pair[1], reverse=True)

    # str(...) turns numpy.str_ items into built-in strings so the printed
    # list reads ['term', ...] regardless of the installed NumPy version.
    return [str(feature_names[col]) for col, _ in ranked[:top_n]]

# Report the top-ranked keywords of every sample sentence (1-based numbering)
for i in range(len(sentences)):
    print(f"Keywords for Sentence {i+1}: {get_top_keywords(i)}")


Keywords for Sentence 1: ['scienc', 'comput', 'field', 'intellig', 'artifici']
Keywords for Sentence 2: ['predict', 'make', 'model', 'train', 'focus']
Keywords for Sentence 3: ['learn', 'layer', 'multipl', 'type', 'deep']
Keywords for Sentence 4: ['neuron', 'call', 'node', 'interconnect', 'compos']
Keywords for Sentence 5: ['task', 'process', 'languag', 'natur', 'commonli']


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
