In [189]:
import os
import re
import string
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

In [190]:
#base file path
base_path = '/home/ec2-user/SageMaker/data/apps_pc_ocr/'

In [191]:
# data dict stores doc labels and text of txt docs
data = {}

In [213]:
#given that the path follows the following format: apps_pc_ocr/APP_NUMBER/DOCUMENT_LABEL/DOC_SOURCE/file.txt' 

# iterate over each APP_NUMBER folder
for app_number in os.listdir(base_path):
    app_path = os.path.join(base_path, app_number)
   
    # iterate over each DOCUMENT_LABEL folder
    for document_label in os.listdir(app_path):
        doc_label_path = os.path.join(app_path, document_label)
       
        # iterate over each DOC_SOURCE folder
        for doc_source in os.listdir(doc_label_path):
            doc_source_path = os.path.join(doc_label_path, doc_source)
           
            # get the first text file in the DOC_SOURCE folder
            text_files = [f for f in os.listdir(doc_source_path) if f.endswith('.txt')]
            if text_files:
                first_text_file = text_files[0]
                with open(os.path.join(doc_source_path, first_text_file), 'r', encoding='utf-8', errors='ignore') as file:
                    lines = file.readlines()
                    # strip newline characters and join all lines
                    text = ' '.join([line.strip() for line in lines])
                    # strip punctuation
                    text = text.translate(str.maketrans('', '', string.punctuation))
                    # strip numbers with fewer than 3 digits
                    text = re.sub(r'\b\d{1,2}\b', '', text)
                    # Limit the text to the first 100 char and last 100 char
                    first100 = text[:100]
                    last100 = text[-100:]
                    text = first100 + last100
                    
                    #code below is combining all of the text docs from each doc-label-class into one doc
                    #effectively diminishes the IDF
                    #should drop each doc into df first, then run TF-IDF on the df
                    
                    # append the text to the corresponding document label in the dictionary
                    if document_label in data:
                        data[document_label].append(text)
                    else:
                        data[document_label] = [text]

In [214]:
# dict to df
df = pd.DataFrame([(label, len(texts), ' '.join(texts)) for label, texts in data.items()], columns=['Document Label', 'Count', 'Texts'])

In [215]:
#list of custom stop words
stop_words = ['united', 'states', 'trademark' , 'office', 'patent', 'the', 'and', 'in', 're', 'docket', 'us', 'page']

In [216]:
# initialize the tfidf
vectorizer = TfidfVectorizer(ngram_range=(3,7), stop_words=stop_words)

In [217]:
# list to store the representative words for each document label
representative_words = []

In [218]:
#num of words/phrases for each doc label
n=5

# Perform TF-IDF analysis for each group of texts
for texts in df['Texts']:
    X = vectorizer.fit_transform([texts])
    feature_names = vectorizer.get_feature_names_out()
    # Get the top n phrases for each group of texts
    top_phrases = [feature_names[i] for i in X.sum(axis=0).A1.argsort()[-n:]]
    representative_words.append(top_phrases)

In [209]:
# add representative words to the df
df['Representative Words'] = representative_words

In [210]:
#split the n phrases into separate columns
df2 = pd.concat([df[['Document Label','Count']], pd.DataFrame(df['Representative Words'].to_list(), columns=[f'Phrase {i+1}' for i in range(n)])], axis=1)

In [211]:
print(df2)

   Document Label  Count                                        Phrase 1  \
0            A.NE  21862                                begin on of this   
1             ADS   3682                                approved for use   
2            A.PE   2872                        commissioner for patents   
3            ABST  12878                                of disclosure an   
4          (A...)   7936                                begin on of this   
5              XT  16360                             box 1450 alexandria   
6            WFEE   4020  application fee transmittal application number   
7            A.NA   8168                                begin on of this   
8            OATH   3094                                  filing date of   
9             DRW   1298                                   seq id seq id   
10           TRNA   3132                              deposit account no   
11         (PA..)   3348                                approved for use   
12       P.2

In [212]:
df2.to_csv('tfidf_doc_phrases.csv', index=False)

In [178]:
!pip install xlwt
import xlwt

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Collecting xlwt
  Downloading xlwt-1.3.0-py2.py3-none-any.whl (99 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m100.0/100.0 kB[0m [31m19.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: xlwt
Successfully installed xlwt-1.3.0


In [201]:
df2.to_excel('tfidf_doc_phrases.xlsx', index=False)

In [None]:
#reverse order of words?
#last 100?
