# In [7]:
import pandas as pd
import re
import collections
import csv

from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize

class Preprocessor:
    """Normalise tab-separated document/query lines into cleaned token strings.

    An instance holds a stop-word collection and a rare-word list, both read
    from plain-text files with one word per line, and uses a Porter stemmer
    to stem the tokens it keeps.
    """

    # Class-level defaults are kept so that ``Preprocessor.stop_words``,
    # ``Preprocessor.rare_words`` and ``Preprocessor.ps`` still resolve.
    # ``__init__`` rebinds the two word collections on the instance so that
    # loading files for one instance never leaks into another.
    stop_words = []
    ps = PorterStemmer()
    rare_words = []

    def __init__(self, stop_words_file_path, rare_words_file_path):
        """Load the stop-word and rare-word files.

        Args:
            stop_words_file_path: path of a text file, one stop word per line.
            rare_words_file_path: path of a text file, one rare word per line.
        """
        self.stop_words = set(self._read_words(stop_words_file_path))
        self.rare_words = []
        self.load_rare_words(rare_words_file_path)

    @staticmethod
    def _read_words(path):
        """Return the non-empty, whitespace-stripped lines of *path* as ``str``."""
        # Text mode matters here: the tokens compared against these words are
        # ``str``, and a ``bytes`` entry would never compare equal to them.
        with open(path, "r", encoding="utf8") as handle:
            return [entry for entry in (raw.strip() for raw in handle) if entry]

    def load_rare_words(self, file):
        """Add the words listed in *file* (a path, one word per line) to ``rare_words``."""
        # Rebind rather than append in place so the class-level default list
        # is never mutated, even when ``__init__`` was bypassed.
        self.rare_words = list(self.rare_words) + self._read_words(file)

    def get_unique_tokens(self, file):
        """Return the distinct processed tokens of the file at path *file*.

        Every whitespace-separated token is passed through a regex
        substitution, lower-cased and stripped; tokens consisting only of
        digits are dropped. The result keeps first-seen order.

        Returns:
            list of str.
        """
        # NOTE(review): this pattern deletes runs made only of letters, so a
        # purely alphabetic word collapses to '' and only the non-letter
        # residue survives. Confirm this is intended.
        letters_only = re.compile(r'\b[^\W\d_]+\b')

        vocab_tokens = []
        seen = set()  # O(1) membership test; the list keeps the order
        with open(file, "r", encoding="utf8") as input_file:
            for line in input_file:
                for word in line.split():
                    word = letters_only.sub('', word).lower().strip()
                    if word.isdigit() or word in seen:
                        continue
                    seen.add(word)
                    vocab_tokens.append(word)
        return vocab_tokens

    def get_unique_tokens_preprocessed(self, file):
        """Return the set of whitespace-separated tokens in the file at path *file*."""
        unique = set()
        with open(file, "r", encoding="utf8") as input_file:
            for line in input_file:
                unique.update(line.split())
        return unique

    def get_preprocessed_docs(self, file, keyword, keep_rare_words=True, with_stemming=True):
        """Read all lines from the open file object *file* and parse them.

        Args:
            file: an object providing ``readlines()``.
            keyword: substring a document id must contain to be kept.
            keep_rare_words: when False, tokens found in ``rare_words`` are dropped.
            with_stemming: when True, kept tokens are stemmed.

        Returns:
            list of ``(docid, preprocessed_text)`` tuples, see ``parse_docs``.
        """
        return self.parse_docs(file.readlines(), keyword, keep_rare_words, with_stemming)

    def parse_docs(self, lines, keyword, keep_rare_words, with_stemming=True):
        """Preprocess the text field of every line whose id contains *keyword*.

        Each line must hold at least three tab-separated fields, read here as
        document id, url and text (the text keeps any further tabs). Blank
        lines are skipped.

        Returns:
            list of ``(docid, preprocessed_text)`` tuples.

        Raises:
            ValueError: if a non-blank line has fewer than three fields.
        """
        data = []
        for line in lines:
            if not line.strip():
                continue  # e.g. a trailing empty line in the dump
            docid, _url, text = line.split('\t', 2)
            if keyword in docid:
                data.append((docid, self.preprocess_line(text, keep_rare_words, with_stemming)))
        return data

    def parse_queries(self, lines):
        """Preprocess the title field of every query line.

        Each line must hold at least four tab-separated fields, read here as
        id, url, title and a remainder that is ignored. Blank lines are
        skipped.

        Returns:
            list of ``(docid, preprocessed_title)`` tuples.

        Raises:
            ValueError: if a non-blank line has fewer than four fields.
        """
        data = []
        for line in lines:
            if not line.strip():
                continue
            docid, _url, title, _rest = line.split('\t', 3)
            data.append((docid, self.preprocess_line(title)))
        return data

    def preprocess_line(self, line, keep_rare_words=True, with_stemming=True):
        """Return *line* reduced to its cleaned, space-joined tokens.

        Steps: remove punctuation, split on whitespace, lower-case, drop stop
        words and tokens with more than one digit, optionally stem, optionally
        drop rare words, and finally apply ``clean_digits``. A line with no
        tokens yields ``''``.
        """
        # NOTE(review): ``clean_url`` runs after punctuation removal, which
        # has already removed ':', '.' and '/', so its patterns cannot match
        # here. The order is kept to leave token output unchanged.
        rare = () if keep_rare_words else set(self.rare_words)
        words_to_keep = []
        for word in self.remove_punctuation(line).split():
            word = self.clean_url(word).lower().strip()
            if word in self.stop_words or self.has_more_digits(word):
                continue
            if with_stemming:
                word = self.ps.stem(word)
            # The rare-word check is applied to the stemmed form.
            if keep_rare_words or word not in rare:
                words_to_keep.append(word)
        return self.clean_digits(' '.join(words_to_keep))

    def clean_digits(self, line):
        """Drop tokens that are not dominated by ordinary characters.

        Digits, '-' and '/' count as "numeric"; a token is kept only when it
        has strictly more other characters than numeric ones.
        """
        words_to_keep = []
        for word in line.split():
            numeric = sum(1 for ch in word if ch.isdigit() or ch in '-/')
            if len(word) - numeric > numeric:
                words_to_keep.append(word)
        return ' '.join(words_to_keep)

    def clean_url(self, text):
        """Remove three known URL prefixes from *text*."""
        text = re.sub(r'http://www.ncbi.nlm.nih.gov/pubmed/?', '', text)
        text = re.sub(r'http://nutritionfacts.org/topics/', '', text)
        text = re.sub(r'http://www.ncbi.nlm.nih.gov/pmc/articles/', '', text)

        return text

    def remove_punctuation(self, txt):
        """Turn '-' and '/' into spaces, then delete every character that is
        not an ASCII letter, a digit or whitespace."""
        txt = txt.replace('-', ' ').replace('/', ' ')
        # Raw string: '\s' in a normal literal is an invalid escape sequence.
        return re.sub(r'[^A-Za-z0-9\s]+', '', txt)

    def has_more_digits(self, txt):
        """Return True when *txt* contains more than one digit character."""
        return sum(1 for ch in txt if ch.isdigit()) > 1


# In [8]:
# stop_words_file_path=r'C:\Users\hechen\Desktop\nfcorpus\raw\stopwords.large' 
# rare_words_file_path=r'C:\Users\hechen\Desktop\rare_tokens.txt'