In [1]:
import pandas as pd
import os
from glob import glob
from pdfminer.high_level import extract_text
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Import directory paths from secret config file
from config import pdf_directory, text_directory, output_directory

def extract_text_from_pdf(pdf_path):
    text = extract_text(pdf_path)
    # Replace or remove unwanted characters
    text = text.replace('\x0c', '')  # Removes the form feed character
    # Add more replacements if needed
    return text

def save_text_to_file(text, file_path):
    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(text)

# Lemmatizer
def lemmatized_tokenizer(text):
    lemmatizer = WordNetLemmatizer()
    tokens = word_tokenize(text)
    lemmatized_tokens = [lemmatizer.lemmatize(word.lower()) for word in tokens if word.isalpha() and len(word) > 3]
    return lemmatized_tokens

In [2]:
# Make directories if they do not yet exist
os.makedirs(text_directory, exist_ok=True)

# Process each PDF file
for pdf_path in glob(os.path.join(pdf_directory, '*.pdf')):
    pdf_text = extract_text_from_pdf(pdf_path)
    # Create a text file name based on the PDF file name
    text_file_name = os.path.splitext(os.path.basename(pdf_path))[0] + '.txt'
    text_file_path = os.path.join(text_directory, text_file_name)
    save_text_to_file(pdf_text, text_file_path)

# Find all text files in the directory
file_paths = glob(f'{text_directory}/*')
file_paths

['/Users/pergolicious/Scripts/Playground/text_mining_project_v1/data/pdfs_converted_to_text/IST 769- Advanced Database Syllabus.txt',
 '/Users/pergolicious/Scripts/Playground/text_mining_project_v1/data/pdfs_converted_to_text/IST 687 Fall 2020 Syllabus.txt',
 '/Users/pergolicious/Scripts/Playground/text_mining_project_v1/data/pdfs_converted_to_text/IST 736 Fall 2020 (2U) Syllabus.txt',
 '/Users/pergolicious/Scripts/Playground/text_mining_project_v1/data/pdfs_converted_to_text/IST 707 Fall 2021 Syllabus_Introne.txt',
 '/Users/pergolicious/Scripts/Playground/text_mining_project_v1/data/pdfs_converted_to_text/IST718-FA.txt',
 '/Users/pergolicious/Scripts/Playground/text_mining_project_v1/data/pdfs_converted_to_text/IST 772 Fall 2021 Syllabus_Crowston.txt',
 '/Users/pergolicious/Scripts/Playground/text_mining_project_v1/data/pdfs_converted_to_text/IST 664 Fall 2021 Syllabus_Stanton.txt',
 '/Users/pergolicious/Scripts/Playground/text_mining_project_v1/data/pdfs_converted_to_text/IST 652 syl

In [3]:
# Initialize an empty dictionary to store the content of each file
corpus_content = {}

# Read each file and store its content in the dictionary
for file_path in file_paths:
    file_name = file_path.split('/')[-1]
    with open(file_path, 'r') as file:
        corpus_content[file_name] = file.read()

# Convert the dictionary into a DataFrame
corpus_df = pd.DataFrame(list(corpus_content.items()), columns=['Label', 'Text'])
print(corpus_df.info())
corpus_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9 entries, 0 to 8
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Label   9 non-null      object
 1   Text    9 non-null      object
dtypes: object(2)
memory usage: 276.0+ bytes
None


Unnamed: 0,Label,Text
0,IST 769- Advanced Database Syllabus.txt,COURSE SYLLABUS \nIST769: Advanced Big Data M...
1,IST 687 Fall 2020 Syllabus.txt,1 \n\nIST687 Applied Data Science \nSchool of ...
2,IST 736 Fall 2020 (2U) Syllabus.txt,COURSE SYLLABUS \nIST 736 Text Mining \n\n1 \n...
3,IST 707 Fall 2021 Syllabus_Introne.txt,COURSE SYLLABUS \nIST 407/707 Data Analytics \...
4,IST718-FA.txt,IST 718: Big Data Analytics \n\nCourse informa...


In [4]:
# TF-IDF vectorizer
vectorizer = TfidfVectorizer(stop_words='english', tokenizer=lemmatized_tokenizer)

# Fit and transform
X_vectorized = vectorizer.fit_transform(corpus_df['Text'])

# Get dataframe
tfidf_df = pd.DataFrame(X_vectorized.todense(), columns=vectorizer.get_feature_names_out(), index=corpus_df['Label'])
tfidf_df.reset_index(inplace=True)

# Pattern to extract class type (IST / SCM) followed by course number
pattern = r'([A-Z]{3})\W*(\d{3})'

# Replace current label text with regex extraction
tfidf_df['Label'] = tfidf_df['Label'].str.extract(pattern).agg(' '.join, axis=1)

print(tfidf_df.info(show_counts=True, verbose=True))
tfidf_df



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9 entries, 0 to 8
Data columns (total 2402 columns):
 #     Column              Non-Null Count  Dtype  
---    ------              --------------  -----  
 0     Label               9 non-null      object 
 1     aaron               9 non-null      float64
 2     abbreviated         9 non-null      float64
 3     abide               9 non-null      float64
 4     ability             9 non-null      float64
 5     able                9 non-null      float64
 6     absence             9 non-null      float64
 7     absent              9 non-null      float64
 8     absolutely          9 non-null      float64
 9     abused              9 non-null      float64
 10    academic            9 non-null      float64
 11    academically        9 non-null      float64
 12    accept              9 non-null      float64
 13    acceptable          9 non-null      float64
 14    accepted            9 non-null      float64
 15    accepts             9 n



Unnamed: 0,Label,aaron,abbreviated,abide,ability,able,absence,absent,absolutely,abused,...,yankee,yarn,year,york,yoshua,zaharia,zero,zhang,zipf,zoom
0,IST 769,0.0,0.0,0.01026,0.0,0.009282,0.028047,0.017787,0.0,0.0,...,0.0,0.024221,0.01026,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,IST 687,0.0,0.0,0.008718,0.0,0.007887,0.0,0.0,0.0,0.0,...,0.0,0.0,0.008718,0.0,0.0,0.0,0.0,0.0,0.0,0.017383
2,IST 736,0.0,0.0,0.009763,0.0,0.017665,0.0,0.0,0.0,0.0,...,0.0,0.0,0.009763,0.014955,0.0,0.0,0.0,0.023048,0.0,0.0
3,IST 707,0.0,0.0,0.011873,0.0,0.010741,0.016228,0.0,0.0,0.0,...,0.0,0.0,0.011873,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,IST 718,0.029403,0.0,0.012455,0.019078,0.022536,0.0,0.021593,0.0,0.0,...,0.0,0.0,0.012455,0.0,0.029403,0.029403,0.029403,0.0,0.0,0.0
5,IST 772,0.0,0.015731,0.0,0.010207,0.024114,0.0,0.0,0.0,0.011552,...,0.0,0.0,0.0,0.020414,0.0,0.0,0.0,0.0,0.0,0.026573
6,IST 664,0.0,0.0,0.006136,0.0,0.005551,0.008387,0.021275,0.0,0.010637,...,0.0,0.0,0.012272,0.018797,0.0,0.0,0.0,0.0,0.014485,0.0
7,IST 652,0.0,0.0,0.003738,0.005725,0.010145,0.010218,0.0,0.017648,0.00648,...,0.008824,0.0,0.003738,0.011451,0.0,0.0,0.0,0.0,0.0,0.0
8,IST 659,0.0,0.0,0.008038,0.024626,0.007272,0.021974,0.0,0.0,0.0,...,0.0,0.0,0.008038,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
# Save to csv
tfidf_df.to_csv(f'{output_directory}/tfidf_matrix_df.csv', index=False)