In [1]:
# importing libraries
import pandas as pd
from nltk.stem.snowball import SnowballStemmer
import spacy
import glob

In [2]:
# NLP helpers shared by the cells below:
#   stemmer - NLTK Snowball stemmer for English
#   nlp     - spaCy pipeline loaded from the 'en_core_web_sm' package
stemmer = SnowballStemmer(language="english")
nlp = spacy.load('en_core_web_sm')

In [3]:
# function to stem and lemmatize the "paragraph" column of a DataFrame

def clean(df, stemmer, nlp, stem=True, tokenize=True):
    """Add stemmed and lemmatized copies of ``df["paragraph"]`` to ``df`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``"paragraph"`` column. It is modified in place.
    stemmer : object
        Anything exposing ``stem(word) -> str``.
    nlp : callable
        Called once per paragraph; must return an iterable of tokens that
        expose a ``lemma_`` attribute.
    stem : bool, default True
        When true, write the ``"stemmed"`` column: each whitespace-separated
        word of the paragraph passed through ``stemmer.stem`` and re-joined
        with single spaces.
    tokenize : bool, default True
        When true, write the ``"tokenized"`` column: the ``lemma_`` of every
        token returned by ``nlp``, joined with single spaces. Despite the
        column name, the values are lemmas rather than raw tokens.

    Returns
    -------
    None
        The new columns are attached to ``df`` itself.

    Raises
    ------
    KeyError
        If a column is requested and ``df`` has no ``"paragraph"`` column.
    """
    # Nothing requested: leave the frame untouched (and do not require the
    # "paragraph" column to exist).
    if not (stem or tokenize):
        return

    # Empty CSV cells are read back as NaN (a float), which has no .split()
    # and cannot be fed to nlp(); treat missing values as empty text and
    # coerce any other non-string cell to its string form.
    paragraphs = df["paragraph"].map(
        lambda value: "" if pd.isna(value) else str(value)
    )

    # Snowball stemming, word by word.
    if stem:
        df["stemmed"] = paragraphs.apply(
            lambda text: " ".join(stemmer.stem(word) for word in text.split())
        )

    # Lemmatization through the supplied pipeline.
    if tokenize:
        df["tokenized"] = paragraphs.apply(
            lambda text: " ".join(token.lemma_ for token in nlp(text))
        )
    

In [4]:
# test_run: clean a sample CSV and show the first paragraph next to its
# stemmed and lemmatized forms
test_file = pd.read_csv("test_file.csv")
clean(test_file, stemmer, nlp)
print(f"original: {test_file['paragraph'].iloc[0]}")
print(f"stemmed: {test_file['stemmed'].iloc[0]}")
print(f"tokenized: {test_file['tokenized'].iloc[0]}")

original: Approved by the Superintendent of Financial Services for use as the standard Owners Policy on or
after June 1 2016
stemmed: approv by the superintend of financi servic for use as the standard owner polici on or after june 1 2016
tokenized: approve by the Superintendent of Financial Services for use as the standard Owners Policy on or 
 after June 1 2016


In [5]:
# Stem and lemmatize every CSV in the Auto folder and write the result back to
# the same path, so each input file is overwritten with the extra columns.
# NOTE(review): the absolute path is machine-specific; when nothing matches,
# glob.glob returns an empty list and this loop silently does nothing.
auto_files = glob.glob("/home/majime/programming/github/information-retrieval-assignments/assignment 1/csvs/Auto/*.csv")
for file in auto_files:
    df = pd.read_csv(file)
    clean(df, stemmer, nlp)  # adds the "stemmed" / "tokenized" columns to df in place
    df.to_csv(file, index=False)  # index=False: do not write the row index as a column


In [6]:
# Stem and lemmatize every CSV in the Property folder and write the result back
# to the same path, so each input file is overwritten with the extra columns.
# NOTE(review): the absolute path is machine-specific; when nothing matches,
# glob.glob returns an empty list and this loop silently does nothing.
property_files = glob.glob("/home/majime/programming/github/information-retrieval-assignments/assignment 1/csvs/Property/*.csv")
for file in property_files:
    df = pd.read_csv(file)
    clean(df, stemmer, nlp)  # adds the "stemmed" / "tokenized" columns to df in place
    df.to_csv(file, index=False)  # index=False: do not write the row index as a column