## <span style="font-family: 'Bebas Neue'; font-size:1.2em;">To be Implemented</span>

<span style="font-family: 'Bebas Neue'; font-size:1.2em;">Could not run due to memory issues</span>

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import pandas as pd

In [None]:
import re
from nltk.corpus import words
import string
# Punctuation characters, kept as a set for constant-time membership tests.
punct = set(string.punctuation)

# Vocabulary used by clean_text to keep only English words. The NLTK "words"
# corpus must be downloaded once per environment; fetch it on demand rather
# than failing with a LookupError on a fresh machine.
try:
    eng_set = set(words.words())
except LookupError:
    import nltk

    nltk.download('words')
    eng_set = set(words.words())

# Patterns are compiled once here instead of on every clean_text call.
# NOTE: the last range (U+24C2..U+1F251) is very wide and also removes many
# non-emoji characters (e.g. CJK ideographs); that is harmless here because
# only vocabulary words survive the final filter anyway.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # Emoticons
    "\U0001F300-\U0001F5FF"  # Symbols & pictographs
    "\U0001F680-\U0001F6FF"  # Transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # Flags
    "\U00002702-\U000027B0"  # Dingbats
    "\U000024C2-\U0001F251"  # Enclosed characters
    "]+",
    flags=re.UNICODE,
)
_MENTION_PATTERN = re.compile(r"@\w+")
_DIGIT_PATTERN = re.compile(r"\d+")


def clean_text(text: str, vocabulary=None) -> str:
    """
    Normalise a post and keep only the words found in an English vocabulary.

    The text is lower-cased, then emojis, double quotation marks, @mentions
    and digits are removed. Each remaining whitespace-separated token has
    leading/trailing punctuation stripped and is kept only if it is in the
    vocabulary.

    Args:
        text (str): The input text. Non-string values are converted with
            ``str()``; ``None`` and NaN (missing values) yield an empty string.
        vocabulary: Optional container of lower-case words to keep. Defaults
            to the module-level ``eng_set``.

    Returns:
        str: The kept words joined by single spaces (possibly empty).
    """
    # Missing values must not turn into the literal words "none" / "nan".
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if not isinstance(text, str):
        text = str(text)
    if vocabulary is None:
        vocabulary = eng_set

    text = text.lower()
    text = _EMOJI_PATTERN.sub("", text)
    text = text.replace('"', "")
    # Mentions are removed before punctuation stripping so the whole handle
    # goes away, not just its leading "@".
    text = _MENTION_PATTERN.sub("", text)
    text = _DIGIT_PATTERN.sub("", text)

    # Strip punctuation from token edges so "hello," still matches "hello";
    # tokens made only of punctuation collapse to "" and are filtered out.
    tokens = (token.strip(string.punctuation) for token in text.split())
    return " ".join(token for token in tokens if token in vocabulary)

In [None]:
import gdown

# Google Drive location of the dataset and the local file it is saved to.
url = 'https://drive.google.com/uc?id=1DcWCYXWuFmWNqlqfJT8L1OpaYuuzaI9e'
output = 'final_df.csv'
gdown.download(url=url, output=output, quiet=False)

# Load the downloaded CSV and attach a cleaned copy of each post's text.
df = pd.read_csv(output)
df = df.assign(clean_text=df['text'].apply(clean_text))

#### <span style="font-family: 'Bebas Neue'; font-size:1.2em;">Labelling toxic and non-toxic posts</span>

In [None]:
# Column with the cleaned posts, and the Hugging Face checkpoint used to
# label posts as toxic / non-toxic.
text_column, model_name = 'clean_text', 's-nlp/roberta_toxicity_classifier'

# Load the sequence-classification model and the tokenizer of the same checkpoint.
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
# Series.to_list() already returns a plain Python list, so no str/Series
# unwrapping is needed. Every entry is coerced to str before it is handed
# to the tokenizer.
text = [str(t) for t in df.clean_text.to_list()]

In [None]:
# Encode every cleaned post in one tokenizer call, with padding and
# truncation enabled, and get the result back as PyTorch tensors.
# NOTE(review): this encodes the whole dataset at once, which may add to the
# memory issues noted at the top of the notebook — consider batching.
inputs = tokenizer(
    text,
    return_tensors="pt",
    truncation=True,
    padding=True,
)

In [None]:
# Make predictions in fixed-size batches. Running the model on the whole
# dataset in a single forward pass holds the activations for every post in
# memory at once; slicing the already-tokenized inputs bounds that cost.
batch_size = 32
batch_logits = []
with torch.no_grad():
    for start in range(0, inputs["input_ids"].shape[0], batch_size):
        # Take the same row range from every tensor the tokenizer produced.
        batch = {key: tensor[start:start + batch_size] for key, tensor in inputs.items()}
        outputs = model(**batch)  # after the loop this holds the last batch only
        batch_logits.append(outputs.logits)

# One row of logits per post; predictions is the index of the highest logit.
logits = torch.cat(batch_logits) if batch_logits else torch.empty((0, model.config.num_labels))
predictions = torch.argmax(logits, dim=1)

#### <span style="font-family: 'Bebas Neue'; font-size:1.2em;">Once the comments are labelled, label them further at a finer granularity</span>

In [None]:
text_column = 'text'
model_name = 'unitary/toxic-bert'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Coerce every entry to str, as the first labelling stage does: the raw column
# may hold non-string values (e.g. NaN for missing posts).
raw_texts = [str(t) for t in df[text_column].tolist()]

# Tokenize and predict in fixed-size batches so that neither the padded
# inputs nor the model activations for the whole dataset are held in memory
# at the same time.
batch_size = 32
batch_logits = []
with torch.no_grad():
    for start in range(0, len(raw_texts), batch_size):
        inputs = tokenizer(
            raw_texts[start:start + batch_size],
            padding=True,
            truncation=True,
            return_tensors="pt",
        )
        outputs = model(**inputs)  # after the loop this holds the last batch only
        batch_logits.append(outputs.logits)

# One row of logits per post; predictions is the index of the highest logit.
# NOTE(review): unitary/toxic-bert appears to be a multi-label model, so a
# per-label threshold on sigmoid(logits) may be more appropriate than argmax
# — confirm against the model card.
logits = torch.cat(batch_logits) if batch_logits else torch.empty((0, model.config.num_labels))
predictions = torch.argmax(logits, dim=1)

# Add predicted labels to the DataFrame
# df[label_column] = [1 if label != -1 else 0 for label in predictions.tolist()]

#### Look at the discourse of toxicity through time

In [8]:
#To be implemented