In [43]:
import numpy as np
from numpy.linalg import svd
import pandas as pd

In [37]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from string import punctuation

nltk.download('punkt')
nltk.download('stopwords')

def read_text(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        data = file.read()
    return data

def tokenize_and_stem(text):
    sentences = nltk.sent_tokenize(text)
    stop_words = set(stopwords.words('english') + list(punctuation))
    ps = PorterStemmer()

    sentences_tokens = []
    all_tokens = set()

    for sentence in sentences:
        word_tokens = word_tokenize(sentence)
        filtered_sentence = [ps.stem(w.lower()) for w in word_tokens if (not w.lower() in stop_words and w.isalpha())]
        sentences_tokens.append(filtered_sentence)
        all_tokens.update(set(filtered_sentence))

    return sentences_tokens, all_tokens



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Asus\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Asus\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [25]:
file_path = 'C:/Users/Asus/Desktop/text.txt'
text = read_text(file_path)
sentences_tokens, all_tokens = tokenize_and_stem(text)

# # Print the result
# for sentence_tokens in sentences_tokens:
#     print(sentence_tokens)



In [26]:
all_tokens = sorted(list(all_tokens))
words_data = np.zeros((len(all_tokens), len(sentences_tokens)))

In [27]:
words_data.shape

(683, 166)

In [28]:
import numpy as np

# Fill the matrix
for i, token in enumerate(all_tokens):
    for j, sentence_tokens in enumerate(sentences_tokens):
        if token in sentence_tokens:
            words_data[i, j] = 1


method1


In [38]:
U , S, V = svd(words_data)

In [40]:
words_and_values = [(word, U[idx, 0]) for idx, word in enumerate(all_tokens)]

# Sort the list based on the absolute values of the matrix values
sorted_words_and_values = sorted(words_and_values, key=lambda x: abs(x[1]), reverse=True)

# Extract the top 5 words
top_5_words = [word for word, _ in sorted_words_and_values[:5]]

# Print the top 5 words
print("Top 5 most important words:")
for word in top_5_words:
    print(word)

Top 5 most important words:
product
featur
new
competit
compani


In [41]:
sentences_and_values = [(idx, V.T[idx, 0]) for idx in range(V.shape[1])]

# Sort the list based on the absolute values of the matrix values
sorted_sentences_and_values = sorted(sentences_and_values, key=lambda x: abs(x[1]), reverse=True)

# Extract the top 5 sentences
top_5_sentences_indices = [idx for idx, _ in sorted_sentences_and_values[:5]]

# Print the top 5 sentences
print("Top 5 most important sentences:")
for idx in top_5_sentences_indices:
    print(sentences[idx])

Top 5 most important sentences:
The example illustrates
the real business pressures on companies: the need for speed, the
concern about costs, the competition that may force the company
to change its offerings, and the need to satisfy several classes of
customers—investors, distributors, and, of course, the people who
will actually use the product.
• A competing company adds new features to its products, producing
competitive pressures to match that offering, but to do even more in
order to get ahead of the competition.
FEATURITIS: A DEADLY TEMPTATION
In every successful product there lurks the carrier of an insidious
disease called “featuritis,” with its main symptom being “creeping featurism.” The disease seems to have been first identified and
named in 1976, but its origins probably go back to the earliest technologies, buried far back in the eons prior to the dawn of history.
Even relatively stable home products, such as automobiles, kitchen appliances, television sets, and
compute

method2

In [70]:
import numpy as np
import pandas as pd

def get_permutation_matrix(col_1, col_2, size):
    I = np.eye(size)
    temt = I[:, col_1].copy()
    I[:, col_1] = I[:, col_2].copy()
    I[:, col_2] = temt
    return I

def get_norm(x):
    return np.sqrt(np.sum(np.square(x)))

def householder(X):
    v = column_converter(X)
    size_of_v = v.shape[1]
    e1 = np.zeros_like(v)
    e1[0, 0] = 1
    vector = get_norm(v) * e1
    if v[0, 0] < 0:
        vector = -vector
    u = (v + vector).astype(np.float32)
    a = np.identity(size_of_v)
    b1 = (2 * np.matmul(np.transpose(u), u))
    b2 = np.matmul(u, np.transpose(u))
    b = b1 / b2
    H = a - b
    H[H == -np.inf] = 0
    return H

def column_converter(x):
    x.shape = (1, x.shape[0])
    return x



In [71]:
sentence_columns = [f'sentence_{i}' for i in range(len(sentences))]
sorted_sentence_columns = sentence_columns.copy()
words_dataframe = pd.DataFrame(words_data, columns=sentence_columns)


In [72]:

U, S, V = np.linalg.svd(words_data, full_matrices=False)
C, D = U, np.diag(S).dot(V)

# List to store the top k important sentences for different values of k
top_sentences_list = []

# Iterate for k = 5, 6, 7
for k_value in range(5, 8):
    P_list = []
    
    for i in range(k_value):
        arr = D[:, i:].copy()
        col_sum = np.apply_along_axis(np.linalg.norm, axis=0, arr=arr)
        max_col = np.argmax(col_sum)
        P = get_permutation_matrix(i, max_col + i, D.shape[1])
        sorted_sentence_columns[i], sorted_sentence_columns[max_col + i] = sorted_sentence_columns[max_col + i], sorted_sentence_columns[i]
        D = D.dot(P).copy()
        Q = householder(D[:, i].reshape((-1, 1)))
        D_a = np.round(Q.dot(D[:, i:]), 12)
        D[:, i:] = D_a.copy()
        P_list.append(P)
    
    words_dataframe_sorted = words_dataframe.copy()

    for P in P_list:
        words_dataframe_sorted = words_dataframe_sorted @ P

    words_dataframe_sorted.columns = sorted_sentence_columns
    # Get the most important sentence for the current k
    most_important_sentence = words_dataframe_sorted.idxmax(axis=1)[0]
    top_sentences_list.append((k_value, most_important_sentence))

# Print the results
for k_value, most_important_sentence in top_sentences_list:
    print(f"For k={k_value}, the most important sentence is:")
    sentence_index = int(most_important_sentence.split('_')[1])  # Extract the index from the column name
    print(sentences[sentence_index])
    print("\n")

For k=5, the most important sentence is:
Many companies claim to aspire to this philosophy, but few are able to follow
it.


For k=6, the most important sentence is:
Many companies claim to aspire to this philosophy, but few are able to follow
it.


For k=7, the most important sentence is:
Many companies claim to aspire to this philosophy, but few are able to follow
it.


