In [1]:
import os
import pandas as pd
from PyPDF2 import PdfReader
from datetime import datetime

# Path of the PDFs folder
pdfs_folder = "All_PDFs"

# Initialize empty lists for each column
file_names = []
file_sizes_kb = []
dates_created = []
dates_modified = []
dates_accessed = []
num_pages = []

# Loop through each PDF file in the folder
for file_name in os.listdir(pdfs_folder):
    if file_name.endswith(".pdf"):
        file_path = os.path.join(pdfs_folder, file_name)
        
        # Check if the file is empty
        if os.path.getsize(file_path) == 0:
            print(f"Skipping empty file: {file_name}")
            continue
        
        # Extract information from the non-empty PDF file
        with open(file_path, "rb") as f:
            try:
                pdf = PdfReader(f)
                file_size_bytes = os.path.getsize(file_path)
                file_size_kb = file_size_bytes/1024
                date_created = os.path.getctime(file_path)
                date_modified = os.path.getmtime(file_path)
                date_accessed = os.path.getatime(file_path)
                pages = len(pdf.pages)
            except Exception as e:
                print(f"Error reading file: {file_name} - {str(e)}")
                continue
        
        # Convert timestamp to datetime object and format it
        date_created = datetime.fromtimestamp(date_created).strftime('%d-%m-%Y %H:%M:%S')
        date_modified = datetime.fromtimestamp(date_modified).strftime('%d-%m-%Y %H:%M:%S')
        date_accessed = datetime.fromtimestamp(date_accessed).strftime('%d-%m-%Y %H:%M:%S')
        # Append the extracted information to the respective lists
        file_names.append(file_name)
        file_sizes_kb.append(file_size_kb)
        dates_created.append(date_created)
        dates_modified.append(date_modified)
        dates_accessed.append(date_accessed)
        num_pages.append(pages)

# Create a DataFrame from the collected information
data = {
    "File Name": file_names,
    "File Size (kb)": file_sizes_kb,
    "Date Created": dates_created,
    "Date Modified": dates_modified,
    "Date Accessed": dates_accessed,
    "Number of Pages": num_pages
}
df = pd.DataFrame(data)

# Display the DataFrame
df


NumberObject(b'-') invalid; use 0 instead


Skipping empty file: attachment (1).pdf
Skipping empty file: attachment (5).pdf
Skipping empty file: attachment (6).pdf
Error reading file: attachment (7).pdf - 'NumberObject' object is not subscriptable
Skipping empty file: attachment.pdf
Error reading file: ePAN_CYWPM4396M.pdf - File has not been decrypted
Error reading file: national-audit-pressure-ulcer-prevalence-england-cross-sectional-study.pdf - [Errno 22] Invalid argument


Multiple definitions in dictionary at byte 0x6439e for key /Info
Multiple definitions in dictionary at byte 0x643ab for key /Info
Multiple definitions in dictionary at byte 0x643b8 for key /Info
Multiple definitions in dictionary at byte 0x3076e for key /Info
Multiple definitions in dictionary at byte 0x3077a for key /Info
Multiple definitions in dictionary at byte 0x30786 for key /Info


Error reading file: UTIITSL_ePAN_CYWPM4396M_03042023_213849.pdf - File has not been decrypted


Unnamed: 0,File Name,File Size (kb),Date Created,Date Modified,Date Accessed,Number of Pages
0,03-IH-comprehensive-intellectual-humility.pdf,98.156250,21-05-2023 21:28:42,05-04-2023 15:49:53,22-05-2023 08:59:44,3
1,04-lit-review-Measuring-Intellectual-Humility-...,631.642578,21-05-2023 21:28:42,05-04-2023 15:49:54,22-05-2023 08:59:44,11
2,0_Task Scheduling CSP.pdf,2907.464844,21-05-2023 21:28:42,16-04-2023 11:54:27,22-05-2023 08:59:44,9
3,1-Intellectual Humility An Introduction to the...,6906.728516,21-05-2023 21:28:42,05-04-2023 15:50:22,22-05-2023 08:59:44,369
4,1-s2.0-S0141933118304629-main.pdf,1779.131836,21-05-2023 21:28:42,01-11-2022 13:35:09,22-05-2023 08:59:44,14
...,...,...,...,...,...,...
411,week4.pdf,3177.144531,21-05-2023 21:18:03,24-02-2023 11:04:29,22-05-2023 08:59:49,94
412,Week5_lessson.pdf,189.061523,21-05-2023 21:18:03,16-10-2022 20:21:58,22-05-2023 08:59:49,4
413,Wireless Compact Laser Vibrometer EDITED (1).pdf,1663.701172,21-05-2023 21:18:03,19-08-2022 02:27:09,22-05-2023 08:59:49,12
414,Wireless Compact Laser Vibrometer EDITED.pdf,1663.701172,21-05-2023 21:28:44,19-08-2022 02:27:05,22-05-2023 08:59:49,12


In [2]:
from datetime import datetime, timedelta

df['Date Accessed'] = pd.to_datetime(df['Date Accessed'], format='%d-%m-%Y %H:%M:%S')
df['Date Modified'] = pd.to_datetime(df['Date Modified'], format='%d-%m-%Y %H:%M:%S')


print(f"Number of pdfs = {df.shape[0]}")
total_memory = df["File Size (kb)"].sum() / 1024

print(f"Total memory occupied by pdfs = {total_memory} MB")

time_gap = datetime.now() - timedelta(days=182)
filtered_df = df[df['Date Accessed'] < time_gap]
filtered_df2 = df[df['Date Modified'] < time_gap]


count = filtered_df.shape[0]
count2 = filtered_df2.shape[0]

print("Number of pdfs not opened since 6 months = ", count)
print("Number of pdfs not modified since 6 months = ", count2)

Number of pdfs = 416
Total memory occupied by pdfs = 958.0769453048706 MB
Number of pdfs not opened since 6 months =  0
Number of pdfs not modified since 6 months =  272


In [3]:
import hashlib

# Function to calculate the hash value of a file
def calculate_file_hash(file_path):
    with open(file_path, "rb") as f:
        data = f.read()
        hash_value = hashlib.md5(data).hexdigest()
    return hash_value

# Dictionary to store duplicate files
duplicate_files = {}

# Iterate over the PDF files and calculate hash values
for filename in os.listdir(pdfs_folder):
    file_path = os.path.join(pdfs_folder, filename)
    if os.path.isfile(file_path):
        hash_value = calculate_file_hash(file_path)
        if hash_value in duplicate_files:
            duplicate_files[hash_value].append(filename)
        else:
            duplicate_files[hash_value] = [filename]

# Display duplicate file information
num_duplicate_sets = len(duplicate_files)
total_duplicate_files = sum(len(files) - 1 for files in duplicate_files.values())

print(f"Number of duplicate sets: {num_duplicate_sets}")
print(f"Total number of duplicate files (excluding first files): {total_duplicate_files}")

for files in duplicate_files.values():
    if len(files) > 1:
        print("Duplicate files: ", files)


Number of duplicate sets: 382
Total number of duplicate files (excluding first files): 45
Duplicate files:  ['AI4CPS_Search_1 (1).pdf', 'AI4CPS_Search_1.pdf']
Duplicate files:  ['attachment (1).pdf', 'attachment (5).pdf', 'attachment (6).pdf', 'attachment.pdf']
Duplicate files:  ['attachment (10).pdf', 'attachment (9).pdf']
Duplicate files:  ['attachment (2).pdf', 'attachment (3).pdf']
Duplicate files:  ['BTP-1 Consent Form (1).pdf', 'BTP-1 Consent Form.pdf']
Duplicate files:  ['CCU_NAG_2022-08-24_OLWZVS (1).pdf', 'CCU_NAG_2022-08-24_OLWZVS (2).pdf', 'CCU_NAG_2022-08-24_OLWZVS.pdf']
Duplicate files:  ['CentuRITon_Idea_AuReal_PPT (1).pdf', 'CentuRITon_Idea_AuReal_PPT.pdf']
Duplicate files:  ['DOC-20230207-WA0000. (1).pdf', 'DOC-20230207-WA0000..pdf']
Duplicate files:  ['downloadRegistrationCard (1).pdf', 'downloadRegistrationCard.pdf']
Duplicate files:  ['ePAN_CYWPM4396M.pdf', 'UTIITSL_ePAN_CYWPM4396M_03042023_213849.pdf']
Duplicate files:  ['FAQ_2023_Application-Requirements (1).pdf', 

In [4]:
# List to store the indices of rows to be removed
rows_to_remove = []

# Iterate over the duplicate files and find the indices of the rows to be removed
for files in duplicate_files.values():
    if len(files) > 1:
        duplicate_filenames = files[1:]  # Exclude the first file in each set
        for index, row in df.iterrows():
            if row['File Name'] in duplicate_filenames:
                rows_to_remove.append(index)

# Remove the rows from the DataFrame
df = df.drop(rows_to_remove)

# Reset the index of the DataFrame
df = df.reset_index(drop=True)

# Display the updated DataFrame
df

Unnamed: 0,File Name,File Size (kb),Date Created,Date Modified,Date Accessed,Number of Pages
0,03-IH-comprehensive-intellectual-humility.pdf,98.156250,21-05-2023 21:28:42,2023-04-05 15:49:53,2023-05-22 08:59:44,3
1,04-lit-review-Measuring-Intellectual-Humility-...,631.642578,21-05-2023 21:28:42,2023-04-05 15:49:54,2023-05-22 08:59:44,11
2,0_Task Scheduling CSP.pdf,2907.464844,21-05-2023 21:28:42,2023-04-16 11:54:27,2023-05-22 08:59:44,9
3,1-Intellectual Humility An Introduction to the...,6906.728516,21-05-2023 21:28:42,2023-04-05 15:50:22,2023-05-22 08:59:44,369
4,1-s2.0-S0141933118304629-main.pdf,1779.131836,21-05-2023 21:28:42,2022-11-01 13:35:09,2023-05-22 08:59:44,14
...,...,...,...,...,...,...
370,Untitled document.pdf,17.873047,21-05-2023 21:18:03,2022-11-12 12:27:02,2023-05-22 08:59:49,1
371,week4.pdf,3177.144531,21-05-2023 21:18:03,2023-02-24 11:04:29,2023-05-22 08:59:49,94
372,Week5_lessson.pdf,189.061523,21-05-2023 21:18:03,2022-10-16 20:21:58,2023-05-22 08:59:49,4
373,Wireless Compact Laser Vibrometer EDITED (1).pdf,1663.701172,21-05-2023 21:18:03,2022-08-19 02:27:09,2023-05-22 08:59:49,12


In [5]:
import send2trash

# Dictionary to store duplicate files
duplicate_files = {}

# Iterate over the PDF files and calculate hash values
for filename in os.listdir(pdfs_folder):
    file_path = os.path.join(pdfs_folder, filename)
    if os.path.isfile(file_path):
        hash_value = calculate_file_hash(file_path)
        if hash_value in duplicate_files:
            duplicate_files[hash_value].append(file_path)
        else:
            duplicate_files[hash_value] = [file_path]

# Move duplicate files to the recycle bin
for files in duplicate_files.values():
    if len(files) > 1:
        # Keep the first file and move the rest to the recycle bin
        files_to_move = files[1:]
        for file_to_move in files_to_move:
            try:
                send2trash.send2trash(file_to_move)
                print(f"Moved file to recycle bin: {file_to_move}")
            except Exception as e:
                print(f"Error moving file to recycle bin: {file_to_move} - {str(e)}")

print("Duplicate files moved to the recycle bin successfully.")


Moved file to recycle bin: All_PDFs\AI4CPS_Search_1.pdf
Moved file to recycle bin: All_PDFs\attachment (5).pdf
Moved file to recycle bin: All_PDFs\attachment (6).pdf
Moved file to recycle bin: All_PDFs\attachment.pdf
Moved file to recycle bin: All_PDFs\attachment (9).pdf
Moved file to recycle bin: All_PDFs\attachment (3).pdf
Moved file to recycle bin: All_PDFs\BTP-1 Consent Form.pdf
Moved file to recycle bin: All_PDFs\CCU_NAG_2022-08-24_OLWZVS (2).pdf
Moved file to recycle bin: All_PDFs\CCU_NAG_2022-08-24_OLWZVS.pdf
Moved file to recycle bin: All_PDFs\CentuRITon_Idea_AuReal_PPT.pdf
Moved file to recycle bin: All_PDFs\DOC-20230207-WA0000..pdf
Moved file to recycle bin: All_PDFs\downloadRegistrationCard.pdf
Moved file to recycle bin: All_PDFs\UTIITSL_ePAN_CYWPM4396M_03042023_213849.pdf
Moved file to recycle bin: All_PDFs\FAQ_2023_Application-Requirements.pdf
Moved file to recycle bin: All_PDFs\Final_Schedule_for_SIH_2022_Software_25_to_26th_August_2022_2 (2).pdf
Moved file to recycle bin

In [6]:
from datetime import datetime, timedelta

df['Date Accessed'] = pd.to_datetime(df['Date Accessed'], format='%d-%m-%Y %H:%M:%S')
df['Date Modified'] = pd.to_datetime(df['Date Modified'], format='%d-%m-%Y %H:%M:%S')


print(f"Number of pdfs = {df.shape[0]}")
total_memory = df["File Size (kb)"].sum() / 1024

print(f"Total memory occupied by pdfs = {total_memory} MB")

time_gap = datetime.now() - timedelta(days=182)
filtered_df = df[df['Date Accessed'] < time_gap]
filtered_df2 = df[df['Date Modified'] < time_gap]


count = filtered_df.shape[0]
count2 = filtered_df2.shape[0]

print("Number of pdfs not opened since 6 months = ", count)
print("Number of pdfs not modified since 6 months = ", count2)



Number of pdfs = 375
Total memory occupied by pdfs = 906.2167615890503 MB
Number of pdfs not opened since 6 months =  0
Number of pdfs not modified since 6 months =  241


In [7]:
# Dictionary to store duplicate files
duplicate_files = {}

# Iterate over the PDF files and calculate hash values
for filename in os.listdir(pdfs_folder):
    file_path = os.path.join(pdfs_folder, filename)
    if os.path.isfile(file_path):
        hash_value = calculate_file_hash(file_path)
        if hash_value in duplicate_files:
            duplicate_files[hash_value].append(filename)
        else:
            duplicate_files[hash_value] = [filename]

# Display duplicate file information
num_duplicate_sets = len(duplicate_files)
total_duplicate_files = sum(len(files) - 1 for files in duplicate_files.values())

print(f"Number of duplicate sets: {num_duplicate_sets}")
print(f"Total number of duplicate files (excluding first files): {total_duplicate_files}")

for files in duplicate_files.values():
    if len(files) > 1:
        print("Duplicate files: ", files)


Number of duplicate sets: 382
Total number of duplicate files (excluding first files): 0


In [8]:
dfparent=df.copy(deep=True)
df=dfparent[dfparent["Number of Pages"]<60].copy(deep=True)
dfbigfile=dfparent[dfparent["Number of Pages"]>=60].copy(deep=True)

In [9]:
from tqdm import tqdm
import re
import pytesseract
from PIL import Image
import time

# Function to extract text from a PDF file using OCR
def extract_text_from_pdf(file_path):
    start_time = time.time()
    with open(file_path, "rb") as f:
        pdf = PdfReader(f)
        text = ""
        for page in pdf.pages:
            if time.time() - start_time > 30:
                return None  # Interrupt extraction if it takes more than 30 seconds
            if hasattr(page, 'Resources') and "/XObject" in page.Resources.keys():
                x_objects = page.Resources['/XObject'].getObject()
                for obj in x_objects:
                    if x_objects[obj]['/Subtype'] == '/Image':
                        # Convert image to PIL Image object
                        img = Image.frombytes(
                            x_objects[obj]['/ColorSpace'] if '/ColorSpace' in x_objects[obj] else '/DeviceRGB',
                            (x_objects[obj]['/Width'], x_objects[obj]['/Height']),
                            x_objects[obj].getData()
                        )
                        # Perform OCR on the image
                        extracted_text = pytesseract.image_to_string(img)
                        # Remove invalid characters from extracted text
                        cleaned_text = re.sub(r"[^\x00-\x7F]+", "", extracted_text)
                        text += cleaned_text
            else:
                extracted_text = page.extract_text()
                cleaned_text = re.sub(r"[^\x00-\x7F]+", "", extracted_text)
                text += cleaned_text
        
        return text

# Get the total number of PDF files
total_files = len(df)

# Initialize progress bar
pbar = tqdm(total=total_files, desc="Processing PDFs", unit="file")

# Extract text for each PDF file and update the DataFrame
for index, row in df.iterrows():
    file_name = row["File Name"]
    file_path = os.path.join(pdfs_folder, file_name)
    try:
        text = extract_text_from_pdf(file_path)
        if text is not None:
            df.at[index, "Text Extracted"] = text
            df.at[index, "Text_Ext_Status"] = "Done"
        
        else:
            df.at[index, "Text_Ext_Status"] = "Interrupted due to too much time"
    except Exception as e:
        print(f"Error extracting text from file: {file_name} - {str(e)}")
        df.at[index, "Text Extracted"] = ""
        df.at[index, "Text_Ext_Status"] = "Error"
    
    # Update the progress bar
    pbar.update(1)

# Close the progress bar
pbar.close()

# Display the updated DataFrame
df

Processing PDFs:   1%|▏         | 5/347 [00:02<03:02,  1.87file/s]FloatObject (b'0.00-10498687') invalid; use 0.0 instead
FloatObject (b'0.00-10498687') invalid; use 0.0 instead
FloatObject (b'0.00-10498687') invalid; use 0.0 instead
FloatObject (b'0.00-10498687') invalid; use 0.0 instead
FloatObject (b'0.00-10498687') invalid; use 0.0 instead
FloatObject (b'0.00-10498687') invalid; use 0.0 instead
FloatObject (b'0.00-10498687') invalid; use 0.0 instead
FloatObject (b'0.00-10498687') invalid; use 0.0 instead
FloatObject (b'0.00-10498687') invalid; use 0.0 instead
FloatObject (b'0.00-10498687') invalid; use 0.0 instead
FloatObject (b'0.00-10498687') invalid; use 0.0 instead
FloatObject (b'0.00-10498687') invalid; use 0.0 instead
FloatObject (b'0.00-10498687') invalid; use 0.0 instead
FloatObject (b'0.00-10498687') invalid; use 0.0 instead
FloatObject (b'0.00-10498687') invalid; use 0.0 instead
FloatObject (b'0.00-10498687') invalid; use 0.0 instead
FloatObject (b'0.00-10498687') invalid

Unnamed: 0,File Name,File Size (kb),Date Created,Date Modified,Date Accessed,Number of Pages,Text Extracted,Text_Ext_Status
0,03-IH-comprehensive-intellectual-humility.pdf,98.156250,21-05-2023 21:28:42,2023-04-05 15:49:53,2023-05-22 08:59:44,3,\n \n \n \n \n \...,Done
1,04-lit-review-Measuring-Intellectual-Humility-...,631.642578,21-05-2023 21:28:42,2023-04-05 15:49:54,2023-05-22 08:59:44,11,Measur ing Intellectual Humility in an Online...,Done
2,0_Task Scheduling CSP.pdf,2907.464844,21-05-2023 21:28:42,2023-04-16 11:54:27,2023-05-22 08:59:44,9,. \n11 elo it \nntg u dtar ttme eustraaliE \nf...,Done
4,1-s2.0-S0141933118304629-main.pdf,1779.131836,21-05-2023 21:28:42,2022-11-01 13:35:09,2023-05-22 08:59:44,14,Microprocessors and Microsystems 70 (2019) 1...,Done
5,1-s2.0-S0169131706002043-main.pdf,486.779297,21-05-2023 21:28:42,2022-12-10 20:36:46,2023-05-22 08:59:44,9,Controlling green sand mould properties using ...,Done
...,...,...,...,...,...,...,...,...
369,Unsafe_Patient_Handling.5.pdf,1093.536133,21-05-2023 21:18:03,2022-08-09 16:48:12,2023-05-22 08:59:49,1,ajn@wolterskluwer.com AJN January 2019 Vol...,Done
370,Untitled document.pdf,17.873047,21-05-2023 21:18:03,2022-11-12 12:27:02,2023-05-22 08:59:49,1,Mini\nStatements\nattesting\nreceiving\nprize\...,Done
372,Week5_lessson.pdf,189.061523,21-05-2023 21:18:03,2022-10-16 20:21:58,2023-05-22 08:59:49,4,Tasks for 12/09/2022 and 19/09/2022 : \n(Prob...,Done
373,Wireless Compact Laser Vibrometer EDITED (1).pdf,1663.701172,21-05-2023 21:18:03,2022-08-19 02:27:09,2023-05-22 08:59:49,12,Wireless Compact Laser Vibrometer\nIdea/A ppro...,Done


In [10]:
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm

# Function to extract keywords from text
def extract_keywords(text):
    # Tokenize the text into individual words
    tokens = nltk.word_tokenize(text)
    
    # Perform any preprocessing or filtering on the tokens as needed
    
    # Remove punctuation and convert to lowercase
    tokens = [token.lower() for token in tokens if token.isalnum()]
    
    # Check if the tokens are empty or contain only stop words
    stop_words = set(nltk.corpus.stopwords.words("english"))
    valid_tokens = [token for token in tokens if token not in stop_words]
    if not valid_tokens:
        return []
    
    # Extract the keywords using TF-IDF algorithm
    tfidf = TfidfVectorizer()
    tfidf_matrix = tfidf.fit_transform([" ".join(valid_tokens)])  # Join valid tokens back into a single string
    feature_names = tfidf.get_feature_names_out()
    keyword_indices = tfidf_matrix[0].nonzero()[1]
    keywords = [feature_names[idx] for idx in keyword_indices]
    
    return keywords

# Get the total number of PDF files
total_files = len(df)

# Initialize progress bar
pbar = tqdm(total=total_files, desc="Extracting Keywords", unit="file")

# Apply keyword extraction to each row in the DataFrame
for index, row in df.iterrows():
    text = row["Text Extracted"]
    
    if isinstance(text, str):  # Check if text is a string
        if text.strip() == "":
            keywords_str = ""
        else:
            keywords = extract_keywords(text)
            keywords_str = ", ".join(keywords) if keywords else ""  # Convert the list of keywords to a comma-separated string if not empty
    else:
        keywords_str = ""
    
    df.at[index, "Keywords"] = keywords_str
    
    # Update the progress bar
    pbar.update(1)

# Close the progress bar
pbar.close()


def word_splt(text):
    if isinstance(text, float): 
        words_list = None
        
    else: 
        words_list = text.split(", ")
    
    return words_list

df["Keywords"] = df["Keywords"].apply(word_splt)


# Display the updated DataFrame
df


Extracting Keywords: 100%|██████████| 347/347 [00:04<00:00, 72.38file/s] 


Unnamed: 0,File Name,File Size (kb),Date Created,Date Modified,Date Accessed,Number of Pages,Text Extracted,Text_Ext_Status,Keywords
0,03-IH-comprehensive-intellectual-humility.pdf,98.156250,21-05-2023 21:28:42,2023-04-05 15:49:53,2023-05-22 08:59:44,3,\n \n \n \n \n \...,Done,"[openness, insignificant, scores, tally, aster..."
1,04-lit-review-Measuring-Intellectual-Humility-...,631.642578,21-05-2023 21:28:42,2023-04-05 15:49:54,2023-05-22 08:59:44,11,Measur ing Intellectual Humility in an Online...,Done,"[afliation, education, race, gender, ng, inclu..."
2,0_Task Scheduling CSP.pdf,2907.464844,21-05-2023 21:28:42,2023-04-16 11:54:27,2023-05-22 08:59:44,9,. \n11 elo it \nntg u dtar ttme eustraaliE \nf...,Done,"[mr, ddtal, mn, b2, ruus, puoaor, pvoosox, poc..."
4,1-s2.0-S0141933118304629-main.pdf,1779.131836,21-05-2023 21:28:42,2022-11-01 13:35:09,2023-05-22 08:59:44,14,Microprocessors and Microsystems 70 (2019) 1...,Done,"[member, ference, reviewed, articles, magazine..."
5,1-s2.0-S0169131706002043-main.pdf,486.779297,21-05-2023 21:28:42,2022-12-10 20:36:46,2023-05-22 08:59:44,9,Controlling green sand mould properties using ...,Done,"[fines, 254, 243, 38, recycling, conservation,..."
...,...,...,...,...,...,...,...,...,...
369,Unsafe_Patient_Handling.5.pdf,1093.536133,21-05-2023 21:18:03,2022-08-09 16:48:12,2023-05-22 08:59:49,1,ajn@wolterskluwer.com AJN January 2019 Vol...,Done,"[etc, fax, 212, 10001, ny, york, new, floor, 1..."
370,Untitled document.pdf,17.873047,21-05-2023 21:18:03,2022-11-12 12:27:02,2023-05-22 08:59:49,1,Mini\nStatements\nattesting\nreceiving\nprize\...,Done,"[punbh22313399079, xx9051, raj, raunak, finali..."
372,Week5_lessson.pdf,189.061523,21-05-2023 21:18:03,2022-10-16 20:21:58,2023-05-22 08:59:49,4,Tasks for 12/09/2022 and 19/09/2022 : \n(Prob...,Done,"[54, meshgrid, numpy, need, matrices, coeffici..."
373,Wireless Compact Laser Vibrometer EDITED (1).pdf,1663.701172,21-05-2023 21:18:03,2022-08-19 02:27:09,2023-05-22 08:59:49,12,Wireless Compact Laser Vibrometer\nIdea/A ppro...,Done,"[feedback, sales, finally, framework, manufact..."


In [11]:
from sklearn.metrics.pairwise import cosine_similarity

df['Text Extracted'] = df['Text Extracted'].fillna('').astype(str)

vectorizer = TfidfVectorizer()
text_vectors = vectorizer.fit_transform(df['Text Extracted'])

def search_files(query, top_k=10):
    query_vector = vectorizer.transform([query])
    similarities = cosine_similarity(query_vector, text_vectors).flatten()
    indices = similarities.argsort()[::-1][:top_k]
    return df.iloc[indices]['File Name'].tolist()

In [13]:
search_query = 'education'
relevant_files = search_files(search_query)
print(relevant_files)

['MKSree CV 210122.pdf', 'Modern_Deedy (2).pdf', 'Modern_Deedy (1).pdf', 'MKSree_CV (7).pdf', 'MKSree_CV (9).pdf', 'MKSree_CV (8).pdf', 'MKSree_CV (6).pdf', 'MKSree_CV (4) (1).pdf', 'MKSree_CV (3).pdf', 'MKSree_CV (2).pdf']


In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

# Convert the text data into TF-IDF vectors
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(df['Text Extracted'])

# Perform clustering using K-means
num_clusters = 5  # Number of clusters to create
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
kmeans.fit(tfidf_matrix)

# Assign cluster labels to the DataFrame
df['Cluster Label'] = kmeans.labels_

# Display the updated DataFrame
df

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer

df2 = pd.DataFrame()


# Combine text, file name, and file size for clustering
df2['Text'] = df['Text Extracted']
df2['File Name'] = df['File Name'].apply(lambda x: os.path.splitext(x)[0])  # Remove file extension
df2['File Size'] = df['File Size (kb)']
cluster_data = df2[['Text', 'File Name', 'File Size']]


# Define the column transformer to handle different variable types
column_transformer = make_column_transformer(
    (TfidfVectorizer(), 'Text'),
    (StandardScaler(), ['File Size'])
)

# Create the pipeline for clustering
pipeline = make_pipeline(column_transformer, KMeans(n_clusters=num_clusters, random_state=42))

# Fit the pipeline to the data
pipeline.fit(cluster_data)

# Add the cluster labels to the DataFrame
df['Cluster'] = pipeline.predict(cluster_data)





In [18]:
import shutil

# Create a directory to store the clusters
clusters_dir = pdfs_folder
os.makedirs(clusters_dir, exist_ok=True)

# Iterate over the clusters
for cluster_id in range(num_clusters):
    # Create a directory for each cluster
    cluster_dir = os.path.join(clusters_dir, f"Cluster_{cluster_id}")
    os.makedirs(cluster_dir, exist_ok=True)

    # Get the PDFs in the current cluster
    cluster_pdf_names = df[df["Cluster Label"] == cluster_id]["File Name"]

    # Move the PDFs to the cluster directory
    for pdf_name in cluster_pdf_names:
        src_path = os.path.join(pdfs_folder, pdf_name)
        dst_path = os.path.join(cluster_dir, pdf_name)
        shutil.move(src_path, dst_path)

#---------------------------------------------------------------------
cluster_dir = os.path.join(clusters_dir, f"Cluster_Bigfiles")
os.makedirs(cluster_dir, exist_ok=True)

# Get the PDFs in the current cluster
cluster_pdf_names = dfbigfile["File Name"]

# Move the PDFs to the cluster directory
for pdf_name in cluster_pdf_names:
    src_path = os.path.join(pdfs_folder, pdf_name)
    dst_path = os.path.join(cluster_dir, pdf_name)
    shutil.move(src_path, dst_path)
#-----------------------------------------------------------------




In [16]:
# Define the output directory where the PDFs will be moved
output_directory = pdfs_folder

# Iterate over each cluster
for cluster_id in range(num_clusters):
    # Create a folder for the cluster
    cluster_folder = os.path.join(output_directory, f"Cluster_{cluster_id}")
    os.makedirs(cluster_folder, exist_ok=True)

    # Filter the DataFrame to get the PDFs belonging to the current cluster
    cluster_df = df[df['Cluster Label'] == cluster_id]

    # Move the PDF files from the cluster folder to their original locations
    for _, row in cluster_df.iterrows():
        try: 
            file_name = row['File Name']
            original_path = os.path.join(pdfs_folder, file_name)
            new_path = os.path.join(cluster_folder, file_name)
            shutil.move(new_path, original_path)
        except FileNotFoundError:
            pass


#---------------------------------------------
cluster_folder = os.path.join(output_directory, f"Cluster_Bigfiles")
os.makedirs(cluster_folder, exist_ok=True)

# Filter the DataFrame to get the PDFs belonging to the current cluster
cluster_df = dfbigfile

# Move the PDF files from the cluster folder to their original locations
for _, row in cluster_df.iterrows():
    try: 
        file_name = row['File Name']
        original_path = os.path.join(pdfs_folder, file_name)
        new_path = os.path.join(cluster_folder, file_name)
        shutil.move(new_path, original_path)
    except FileNotFoundError:
        pass
#-------------------------------------------------------------------------




# Delete empty folders in the output directory
for cluster_id in range(num_clusters):
    cluster_folder = os.path.join(output_directory, f"Cluster_{cluster_id}")
    if os.path.isdir(cluster_folder) and len(os.listdir(cluster_folder)) == 0:
        os.rmdir(cluster_folder)

#-----------------------------------------------------------------------------
cluster_folder = os.path.join(output_directory, f"Cluster_Bigfiles")
if os.path.isdir(cluster_folder) and len(os.listdir(cluster_folder)) == 0:
    os.rmdir(cluster_folder)
#-------------------------------------------------------------------------------
