In [16]:
# Notebook Parameters
#
# fold_pth          : directory prefix prepended to both file names below.
# input_data_name   : CSV file read into `df`.
# output_data_name  : CSV file the final DataFrame is written to.

# The two literals are joined exactly as before, so the value still contains
# the "//" between the project directory and "data/".
fold_pth = (
    "N:/Code/GITHUB/Analytics_Day/Analytics_Day_Fall2022/"
    "/data/"
)

input_data_name, output_data_name = "output_df.csv", "sentiment_results.csv"


In [17]:
# Functions

def sent_anal(text):
    """
    Score a single string with the module-level `classifier` pipeline.

    var_name: String, Single Input String to classify
    return: List, [label, score] taken from the first prediction returned
            by `classifier`
    """
    # truncation=True is forwarded to the pipeline's tokenizer so that inputs
    # longer than the model's maximum sequence length are cut down instead of
    # failing inside the model.
    sent_res = classifier(text, truncation=True)
    top_prediction = sent_res[0]
    return [top_prediction["label"], top_prediction["score"]]

def remove_urls(text):
    """
    var_name: String, Single Input String containing URL links
    return: String, Single Input String with the URL links removed

    Every URL-like or file-name-like token (http/https links, www.* hosts,
    tokens around ".com", and tokens ending in .pdf/.bat/.jpg/.png) is
    replaced by a single space.
    """
    # The dot before "jpg" and "png" is escaped: unescaped it matched ANY
    # character, so ordinary words such as "bigjpg" were deleted.
    # The former trailing alternatives "https?://\S+\.jpg" and
    # "https?://\S+\.bat" were unreachable because "https?://\S+" comes first
    # and already matches them, so they are omitted.
    url_pattern = re.compile(
        r'https?://\S+|www\.\S+|com\.\S+|\S+\.com|\S+\.pdf|\S+\.bat|\S+\.jpg|\S+\.png'
    )
    return url_pattern.sub(" ", text)
    
def remove_parentheses(text):
    """
    var_name: String, Single Input String containing parentheses
    return: String, Single Input String with the parentheses removed

    Round and square brackets are each replaced by a space, then every run
    of spaces is collapsed to one space.
    """
    # Same four single-character patterns as before, applied in the same order.
    for bracket in (r'\(', r'\)', r'\[', r'\]'):
        text = re.compile(bracket).sub(" ", text)
    collapsed = re.sub(r' +', " ", text)
    return collapsed # PARENTHESES_MASK
    
def remove_html(text):
    """
    var_name: String, Single Input String containing HTML tags
    return: String, Single Input String with HTML tags removed

    Each opening, closing or self-closing tag (e.g. <p>, </div>, <br/>,
    <a href="...">) is replaced by a single space.
    """
    # The previous pattern was a single space replaced by a single space,
    # i.e. a no-op. A tag must start with a letter (optionally after "/"),
    # so a bare "<" or ">" in running text such as "a < b" is left alone.
    html_pattern = re.compile(r'</?[A-Za-z][^<>]*>')
    return html_pattern.sub(r' ', text)

def remove_new_line(text):
    r"""
    var_name: String, Single Input String containing line breaks
    return: String, Single Input String with line breaks replaced by a space

    Handles three forms of line break:
      * the two-character escape sequence backslash + "n" left in the text,
      * a real newline (LF) or CRLF,
      * a backslash immediately followed by a real newline (the only form
        the previous pattern matched).
    """
    # First alternative: literal "\n" text. Second: optional backslash,
    # optional carriage return, then an actual newline character.
    return re.sub(r"\\n|\\?\r?\n", ' ', text)

def remove_non_alpha(text):
    """
    var_name: String, Single Input String containing non-alpha characters
    return: String, Single Input String with non-alpha characters removed

    The input is converted with str(), every run of characters outside
    A-Z / a-z becomes one space, and leading/trailing spaces are stripped.
    """
    # A second substitution with the pattern "[ +]" used to follow. That is a
    # character class (space OR plus), not "one or more spaces", and by then
    # the text holds only letters and single spaces, so it changed nothing.
    return re.sub("[^A-Za-z]+", ' ', str(text)).strip()

def remove_phonenumbers(text):
    """
    var_name: String, Single Input String containing phonenumbers
    return: String, Single Input String with phone numbers removed

    Three formats are replaced by a single space, wherever they occur:
      * [+CC ](AAA) PPP-NNNN style, with space, dot or dash separators,
      * 7-digit PPP-NNNN style,
      * 10 consecutive digits.
    """
    # The previous patterns were anchored with ^ and $, so a number was only
    # removed when it was the entire string (or, for the 10-digit form, at
    # the very start). The digit look-arounds require the number not to be
    # glued to other digits, so e.g. "2010-2015" is not half-eaten.
    # Raw strings avoid invalid-escape warnings for \+ and \d.
    phone_patterns = (
        r"(?<!\d)(?:\+\d{1,2}\s)?\(?\d{3}\)?[\s.-]\d{3}[\s.-]\d{4}(?!\d)",
        r"(?<!\d)\d{3}[\s.-]\d{4}(?!\d)",
        r"(?<!\d)\d{10}(?!\d)",
    )
    for pattern in phone_patterns:
        text = re.sub(pattern, " ", text)
    return text

def remove_stopwords(text):
    """
    var_name: String, Single Input String containing stopwords
    return: String, Single Input String containing no stopwords

    The text is split on single spaces and every token found in the
    module-level `stopwords` collection is dropped; the remaining tokens
    (including empty ones produced by repeated spaces) are re-joined with
    single spaces.
    """
    kept_tokens = []
    for token in text.split(" "):
        if token in stopwords:
            continue
        kept_tokens.append(token)
    return " ".join(kept_tokens)

def preprocess_text(text):
    """
    var_name: String, Single Input String in raw form
    return: String, Single Input String in processed form

    Missing values (None or a float NaN) yield an empty string; any other
    non-string value is converted with str(). The text then passes, in
    order, through: double-quote removal, the " and" rewrite,
    remove_parentheses, remove_phonenumbers, remove_urls, remove_html,
    remove_new_line and remove_non_alpha, and is finally lower-cased and
    stripped.
    """
    # `text != text` is only true for NaN. Without this guard the first
    # re.sub below raises TypeError on a missing value.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    text = str(text)

    text = re.sub(r"\"", " ", text)
    # NOTE(review): this also fires on words that merely start with "and"
    # after a space (" android" -> " and roid"). Left unchanged because the
    # intent is not visible here - confirm whether a word boundary is wanted.
    text = re.sub(r" and", " and ", text)
    text = remove_parentheses(text)
    text = remove_phonenumbers(text)
    text = remove_urls(text)
    text = remove_html(text)
    text = remove_new_line(text)
    text = remove_non_alpha(text)
    # remove_non_alpha already leaves only letters and single spaces, so these
    # replacements are a safety net in case that helper changes.
    text = text.replace("\n", " ").replace("\xa0", " ").replace("*", " ")
    return text.lower().strip()

def lemmatize_words(text):
    """
    var_name: String, Single Input String of whitespace-separated words
    return: String, the words after `lemmatizer.lemmatize`, joined by
            single spaces
    """
    words = text.split()
    lemmas = map(lemmatizer.lemmatize, words)
    return " ".join(lemmas)


In [20]:
# Data Import

import pandas as pd
from transformers import pipeline
import time, re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
  
# Shared objects read as globals by the helper functions defined in the
# functions cell: lemmatize_words -> lemmatizer, remove_stopwords -> stopwords,
# sent_anal -> classifier.
lemmatizer = WordNetLemmatizer()
# NOTE: this rebinds `stopwords` from the object imported from nltk.corpus to
# the value returned by .words("english"); the imported object is no longer
# reachable under this name until the import line is executed again.
stopwords = stopwords.words("english")
# Sentiment pipeline built from the named pretrained model.
classifier = pipeline("sentiment-analysis", model = "siebert/sentiment-roberta-large-english")

# Read the input CSV named in the parameters cell into the working DataFrame.
df = pd.read_csv(fold_pth + input_data_name)



In [21]:
# Data Preprocess

# Timing log: one [column_name, start_timestamp] entry per derived column.
s_list = []
print("STARTED PREPROCESSING TEXT")

# Each stage lists (target_col, source_col, func) steps that are run in order,
# followed by the message printed when the stage is finished. "drop_empty"
# names a column whose empty-string rows are removed once the stage is done.
preprocess_stages = [
    {
        "steps": [
            ("clean_rComments", "rComments", preprocess_text),
            ("clean_rComments_tags", "rComments_tags", preprocess_text),
        ],
        "drop_empty": "clean_rComments",
        "done": "COMPLETED PREPROCESS :: STARTED REMOVING STOPWORDS",
    },
    {
        "steps": [
            ("clean_rComments_nostops", "clean_rComments", remove_stopwords),
            ("clean_rComments_tags_nostops", "clean_rComments_tags", remove_stopwords),
        ],
        "drop_empty": None,
        "done": "COMPLETED REMOVING STOPWORDS :: STARTED LEMMATIZATION",
    },
    {
        "steps": [
            ("clean_rComments_lemma", "clean_rComments", lemmatize_words),
            ("clean_rComments_tags_lemma", "clean_rComments_tags", lemmatize_words),
            ("clean_rComments_nostops_lemma", "clean_rComments_nostops", lemmatize_words),
            ("clean_rComments_tags_nostops_lemma", "clean_rComments_tags_nostops", lemmatize_words),
        ],
        "drop_empty": None,
        "done": "COMPLETED LEMMATIZATION",
    },
]

for stage in preprocess_stages:
    for target_col, source_col, func in stage["steps"]:
        # NOTE(review): `s` is the START time of the step (epoch seconds), and
        # that is what gets printed and logged - not the elapsed time. Kept
        # as-is so existing consumers of s_list see the same values.
        s = time.time()
        df[target_col] = df[source_col].apply(func)
        print("CHECKPOINT: " + target_col, s)
        s_list.append([target_col, s])

    if stage["drop_empty"] is not None:
        # .copy() makes the filtered frame independent of the original, so the
        # column assignments in later stages do not trigger pandas'
        # SettingWithCopyWarning.
        df = df[df[stage["drop_empty"]] != ""].copy()

    print(stage["done"])

##### Columns Created
# clean_rComments
# clean_rComments_tags

# clean_rComments_nostops
# clean_rComments_tags_nostops

# clean_rComments_lemma
# clean_rComments_tags_lemma

# clean_rComments_nostops_lemma
# clean_rComments_tags_nostops_lemma


STARTED PREPROCESSING TEXT
CHECKPOINT: clean_rComments 1670459111.9211745
CHECKPOINT: clean_rComments_tags 1670459135.354621
COMPLETED PREPROCESS :: STARTED REMOVING STOPWORDS
CHECKPOINT: clean_rComments_nostops 1670459160.535451
CHECKPOINT: clean_rComments_tags_nostops 1670459172.4941287
COMPLETED REMOVING STOPWORDS :: STARTED LEMMATIZATION
CHECKPOINT: clean_rComments_lemma 1670459185.5493476
CHECKPOINT: clean_rComments_tags_lemma 1670459212.3769488
CHECKPOINT: clean_rComments_nostops_lemma 1670459240.0439713
CHECKPOINT: clean_rComments_tags_nostops_lemma 1670459253.8232596
COMPLETED LEMMATIZATION


In [22]:
# Sentiment Analysis

print("STARTED SENTIMENT ANALYSIS")

# Column suffixes to score, in execution order. For each suffix the text is
# read from "clean_<suffix>" and the [label, score] result of sent_anal is
# written to "STUDSENT_<suffix>".
# NOTE(review): "rComments_tags_nostops_lemma" is not scored, matching the
# original cell - confirm whether that omission is intentional.
sentiment_suffixes = [
    "rComments",
    "rComments_tags",
    "rComments_lemma",
    "rComments_tags_lemma",
    "rComments_nostops",
    "rComments_tags_nostops",
    "rComments_nostops_lemma",
]

for position, suffix in enumerate(sentiment_suffixes):
    clean_col = "clean_" + suffix
    sent_col = "STUDSENT_" + suffix

    # Start timestamp of this step; this value (not a duration) is what is
    # printed and logged.
    s = time.time()

    # Only non-empty texts are sent to the classifier; the mask is computed
    # once and reused for both the read and the write. Previously the last
    # column ("rComments_nostops_lemma") skipped this mask and also scored
    # empty strings; it now follows the same rule as the other columns.
    non_empty = df[clean_col] != ""

    if position == 0:
        print("NUMBER OF RECORDS:", int(non_empty.sum()))

    df.loc[non_empty, sent_col] = df.loc[non_empty, clean_col].apply(sent_anal)
    print("CHECKPOINT: " + sent_col, s)
    s_list.append([sent_col, s])


STARTED SENTIMENT ANALYSIS
NUMBER OF RECORDS: 286333


KeyboardInterrupt: 

In [None]:
# Save Data
#
# Write the working DataFrame to the output CSV named in the parameters cell.
# No extra options are passed, so pandas' defaults apply.
df.to_csv(path_or_buf=fold_pth + output_data_name)