# Text Cleaning/Prep for Classical ML Model Training

In [21]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px

import nltk
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from bs4 import BeautifulSoup
import re,string,unicodedata
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score,f1_score
from sklearn.model_selection import train_test_split
from string import punctuation
from nltk import pos_tag
from nltk.corpus import wordnet

In [22]:
# Fetch every NLTK resource this notebook relies on so it runs on a fresh environment:
#   stopwords          -> stop-word list used when cleaning sentences
#   wordnet, omw-1.4   -> data backing WordNetLemmatizer
#   punkt / punkt_tab  -> Punkt sentence models required by word_tokenize
#                         (older NLTK releases look for "punkt", newer ones for "punkt_tab")
# nltk.download reports (rather than raises) when a resource id is unavailable.
for resource in ("stopwords", "wordnet", "omw-1.4", "punkt", "punkt_tab"):
    nltk.download(resource)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/patrickc410/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/patrickc410/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/patrickc410/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [23]:
# Vocabulary of tokens to discard: NLTK's English stop words plus every
# ASCII punctuation character (each character is its own entry).
punctuation = list(string.punctuation)
stop = set(stopwords.words('english')) | set(punctuation)

In [24]:
def strip_html(text):
    """Return the text content of ``text`` with HTML markup parsed away.

    The input is parsed with BeautifulSoup's "html.parser" backend and the
    result of ``get_text()`` is returned.
    """
    return BeautifulSoup(text, "html.parser").get_text()

# Removing bracketed spans such as "[1]" or "[citation needed]"
def remove_between_square_brackets(text):
    """Delete every ``[...]`` span (brackets included) from ``text``.

    Args:
        text: Input string.

    Returns:
        ``text`` with each bracketed span removed. Surrounding whitespace is
        left as-is.
    """
    # Raw string: "\[" inside a plain literal is an invalid escape sequence.
    return re.sub(r'\[[^]]*\]', '', text)


# Removing URLs
def remove_urls(text):
    """Delete every run of non-whitespace characters that starts with ``http``.

    This used to be a second ``remove_between_square_brackets`` definition,
    which silently replaced the bracket-stripping one above; it now has its
    own name so both steps can run.

    Args:
        text: Input string.

    Returns:
        ``text`` with each ``http...`` token removed.
    """
    return re.sub(r'http\S+', '', text)


# Removing the stopwords from text
def remove_stopwords(text):
    """Lower-case ``text`` and keep only alphabetic, non-stop-word tokens.

    Tokens come from whitespace splitting. A token is kept only if its
    lower-cased form is not in the module-level ``stop`` set and consists
    solely of alphabetic characters.

    NOTE(review): because of the ``isalpha`` check, words with attached
    punctuation (e.g. "kingdom." or "provision,") are dropped entirely rather
    than having the punctuation stripped -- confirm this is intended.

    Args:
        text: Input string.

    Returns:
        The surviving lower-cased tokens joined by single spaces.
    """
    # str.split() with no arguments already discards surrounding whitespace,
    # so no additional strip() is needed per token.
    lowered = (token.lower() for token in text.split())
    return " ".join(
        token for token in lowered if token not in stop and token.isalpha()
    )


# Removing the noisy text
def denoise_text(text):
    """Run the full cleaning pipeline on one raw sentence.

    Steps, in order: strip HTML markup, drop ``[...]`` spans, drop URLs, then
    lower-case and filter stop words / non-alphabetic tokens.

    Args:
        text: Raw sentence string.

    Returns:
        The cleaned, lower-cased, space-joined sentence (possibly empty).
    """
    text = strip_html(text)
    text = remove_between_square_brackets(text)
    text = remove_urls(text)
    text = remove_stopwords(text)
    return text

In [25]:
def add_tokens_col(row: pd.Series) -> pd.Series:
    """Tokenize the row's cleaned sentence and store the result under "tokens".

    Reads ``row["sent_cleaned"]``, passes it to ``word_tokenize`` and assigns
    the output to ``row["tokens"]``. The same row object is modified and
    returned, which makes the function usable with a row-wise
    ``DataFrame.apply``.
    """
    row["tokens"] = word_tokenize(row["sent_cleaned"])
    return row

In [26]:
def add_lemmatized_col(row: pd.Series) -> pd.Series:
    """Lemmatize the row's tokens and store the joined string under "sent_lemm".

    Each entry of ``row["tokens"]`` is passed through
    ``WordNetLemmatizer.lemmatize`` with its default arguments; the results
    are joined with single spaces and written to ``row["sent_lemm"]``. The
    same row object is modified and returned.
    """
    lemmatizer = WordNetLemmatizer()
    row["sent_lemm"] = " ".join(map(lemmatizer.lemmatize, row["tokens"]))
    return row

In [27]:
# Load the annotated training sentences; the first CSV column becomes the index.
train_csv = "./data/train_auto_annotations_UPDATED_cleaned.csv"
df = pd.read_csv(train_csv, index_col=0)
print(f"dataframe shape: {df.shape}")
df.head()

dataframe shape: (31799, 10)


Unnamed: 0,sent-index,sent,svo_dist,apv,scv,hv,svo_dist_norm,svo_dist_norm_disc5,svo_dist_norm_disc10,svo_dist_norm_disc20
0,0_66252-0-0-0,The Local Government Act 1985 was an Act of Pa...,11,1,1,1,0.733333,0.6,0.7,0.7
1,0_66252-0-0-1,Its main effect was to abolish the county coun...,31,0,0,1,0.775,0.6,0.7,0.75
2,0_66252-0-1-0,"The main provision, section 1 stated that ""the...",12,1,2,1,0.48,0.4,0.4,0.45
3,0_66252-0-2-0,It came into effect on 1 April 1986.,6,1,1,1,0.666667,0.6,0.6,0.65
4,0_66252-0-2-1,The metropolitan boroughs and London boroughs ...,16,1,1,1,0.695652,0.6,0.6,0.65


In [28]:
# 1) Clean each raw sentence into "sent_cleaned".
df["sent_cleaned"] = df["sent"].apply(denoise_text)

# 2) Row-wise: add "tokens", then "sent_lemm"; finally drop rows whose
#    lemmatized sentence ended up empty after cleaning.
df = (
    df.apply(add_tokens_col, axis="columns")
    .apply(add_lemmatized_col, axis="columns")
    .loc[lambda frame: frame["sent_lemm"] != ""]
)
print(f"dataframe shape: {df.shape}")
df.head()



dataframe shape: (31740, 13)


Unnamed: 0,sent-index,sent,svo_dist,apv,scv,hv,svo_dist_norm,svo_dist_norm_disc5,svo_dist_norm_disc10,svo_dist_norm_disc20,sent_cleaned,tokens,sent_lemm
0,0_66252-0-0-0,The Local Government Act 1985 was an Act of Pa...,11,1,1,1,0.733333,0.6,0.7,0.7,local government act act parliament united,"[local, government, act, act, parliament, united]",local government act act parliament united
1,0_66252-0-0-1,Its main effect was to abolish the county coun...,31,0,0,1,0.775,0.6,0.7,0.75,main effect abolish county councils metropolit...,"[main, effect, abolish, county, councils, metr...",main effect abolish county council metropolita...
2,0_66252-0-1-0,"The main provision, section 1 stated that ""the...",12,1,2,1,0.48,0.4,0.4,0.45,main section stated greater london metropolita...,"[main, section, stated, greater, london, metro...",main section stated greater london metropolita...
3,0_66252-0-2-0,It came into effect on 1 April 1986.,6,1,1,1,0.666667,0.6,0.6,0.65,came effect april,"[came, effect, april]",came effect april
4,0_66252-0-2-1,The metropolitan boroughs and London boroughs ...,16,1,1,1,0.695652,0.6,0.6,0.65,metropolitan boroughs london boroughs got powe...,"[metropolitan, boroughs, london, boroughs, got...",metropolitan borough london borough got power ...


In [29]:
# Persist the processed frame (original columns plus sent_cleaned, tokens, sent_lemm).
# NOTE(review): "tokens" holds Python lists; CSV stores them as their string
# representation, so they need re-parsing after read_csv -- confirm downstream usage.
df.to_csv("./data/train_auto_annotations_UPDATED_cleaned2.csv")