In [1]:
import gdown
import pandas as pd

file_url = "https://drive.google.com/uc?id=1-gETJqZ4bZLE28ax8gbs-WX46OLWsY4F"
output_file_path = "/content/my_file.ext"
gdown.download(file_url, output_file_path, quiet=False)
database = pd.read_csv(output_file_path)
df = database

Downloading...
From: https://drive.google.com/uc?id=1-gETJqZ4bZLE28ax8gbs-WX46OLWsY4F
To: /content/my_file.ext
100%|██████████| 816M/816M [00:07<00:00, 111MB/s]


In [2]:
df.shape

(1804874, 45)

In [3]:
import pandas as pd

toxic_comments = df[df['target'] >= 0.5]
non_toxic_comments = df[df['target'] < 0.5]
num_rows_per_class = 100000

most_toxic_comments = toxic_comments.nlargest(num_rows_per_class, 'target')
sampled_toxic = toxic_comments.sample(n=num_rows_per_class - len(most_toxic_comments), random_state=42)
sampled_non_toxic = non_toxic_comments.sample(n=num_rows_per_class, random_state=42)
final_df = pd.concat([sampled_toxic, most_toxic_comments, sampled_non_toxic])

final_df = final_df.sample(frac=1, random_state=42).reset_index(drop=True)
print("Final DataFrame shape:", final_df.shape)

Final DataFrame shape: (200000, 45)


In [4]:
columns_to_drop = ['asian', 'atheist', 'bisexual', 'black', 'buddhist', 'christian', 'female',
                   'heterosexual', 'hindu', 'homosexual_gay_or_lesbian', 'intellectual_or_learning_disability',
                   'jewish', 'latino', 'male', 'muslim', 'other_disability', 'other_gender', 'other_race_or_ethnicity',
                   'other_religion', 'other_sexual_orientation', 'physical_disability', 'psychiatric_or_mental_illness',
                   'transgender', 'white', "created_date",	"publication_id", "article_id", "parent_id"]
final_df = final_df.drop(columns=columns_to_drop, axis=1)
final_df.shape

(200000, 17)

In [5]:
final_df["comment_text"] = final_df["comment_text"].str.lower()
url_pattern = r"https?://\S+|www\.\S+"
url2 = r'https?:\/\/.*[\r\n]*'
url3 = r'[_"\-;%()|+&=*%.,!?:#$@\[\]/]'
index_has_url = final_df["comment_text"].str.contains(url2)
text_has_url = final_df.loc[index_has_url, "comment_text"]
import re
sample = text_has_url.iloc[14]
print(f"Sample:\n {sample}\n")
print(f"Remove URL:\n {re.sub(url_pattern, '', sample)}")

Sample:
 *sigh*  i'm thinking that is about 1.9 billion roubles too much to spend on that ridiculous "sport".  it is like an awful mashup of a 1980s aerobics workout together with a low-rent figure skating competition, accompanied by some painfully bad music that shouldn't be allowed anywhere near the name "rock 'n' roll":
https://www.youtube.com/watch?v=l4j5zwl_xre

Remove URL:
 *sigh*  i'm thinking that is about 1.9 billion roubles too much to spend on that ridiculous "sport".  it is like an awful mashup of a 1980s aerobics workout together with a low-rent figure skating competition, accompanied by some painfully bad music that shouldn't be allowed anywhere near the name "rock 'n' roll":



In [6]:
def preprocess(final_df):
    contractions = {
        "ain't": "am not, is not, are not, has not, have not",
        "aren't": "are not",
        "can't": "cannot",
        "could've": "could have",
        "couldn't": "could not",
        "didn't": "did not",
        "doesn't": "does not",
        "don't": "do not",
        "gonna": "going to",
        "gotta": "got to, have to",
        "hadn't": "had not",
        "hasn't": "has not",
        "he'd": "he had, he would",
        "he'll": "he will",
        "here's": "here is",
        "how'd": "how did",
        "how's": "how is",
        "I'd": "I had, I would",
        "I'll": "I will",
        "I'm": "I am",
        "I've": "I have",
        "it's": "it is",
        "let's": "let us",
        "ma'am": "madam",
        "might've": "might have",
        "mightn't": "might not",
        "must've": "must have",
        "mustn't": "must not",
        "my": "mine",
        "needn't": "need not",
        "'o clock": "of the clock",
        "o'er": "over",
        "o's": "of the",
        "oughtn't": "ought not",
        "shan't": "shall not",
        "she'd": "she had, she would",
        "she'll": "she will",
        "she's": "she is",
        "should've": "should have",
        "shouldn't": "should not",
        "somethin'": "something",
        "that's": "that is",
        "they'd": "they had, they would",
        "they'll": "they will",
        "they're": "they are",
        "this's": "this is",
        "those's": "those are",
        "'tis": "it is",
        "twas": "it was",
        "wanna": "want to",
        "we'd": "we had, we would",
        "we'll": "we will",
        "we're": "we are",
        "what're": "what are",
        "what's": "what is",
        "when's": "when is",
        "where's": "where is",
        "who's": "who is",
        "would've": "would have",
        "wouldn't": "would not",
        "you'd": "you had, you would",
        "you'll": "you will",
        "you're": "you are",
    }

    for contraction, expanded in contractions.items():
        final_df["comment_text"] = final_df["comment_text"].str.replace(rf'\b{contraction}\b', expanded)
        final_df["comment_text"] = final_df["comment_text"].str.lower()
        final_df["comment_text"] = final_df["comment_text"].str.replace(r"https?://\S+|www\.\S+", " ")
        final_df["comment_text"] = final_df["comment_text"].str.replace(r'https?:\/\/.*[\r\n]*', " ")
        final_df["comment_text"] = final_df["comment_text"].str.replace(r'\<a href', " ")
        final_df["comment_text"] = final_df["comment_text"].str.replace(r'&amp;', " ")
        final_df["comment_text"] = final_df["comment_text"].str.replace(r'[_"\-;%()|+&=*%.,!?:#$@\[\]/]', " ")
        final_df["comment_text"] = final_df["comment_text"].str.replace(r'<br />', " ")
        final_df["comment_text"] = final_df["comment_text"].str.replace(r'\'', " ")
        final_df["comment_text"] = final_df["comment_text"].str.replace(r"'", " ")
    return final_df
final_df = preprocess(final_df)

  final_df["comment_text"] = final_df["comment_text"].str.replace(rf'\b{contraction}\b', expanded)
  final_df["comment_text"] = final_df["comment_text"].str.replace(r"https?://\S+|www\.\S+", " ")
  final_df["comment_text"] = final_df["comment_text"].str.replace(r'https?:\/\/.*[\r\n]*', " ")
  final_df["comment_text"] = final_df["comment_text"].str.replace(r'\<a href', " ")
  final_df["comment_text"] = final_df["comment_text"].str.replace(r'[_"\-;%()|+&=*%.,!?:#$@\[\]/]', " ")
  final_df["comment_text"] = final_df["comment_text"].str.replace(r'\'', " ")


In [7]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    words = text.split()
    filtered_words = [word for word in words if word.lower() not in stop_words]
    return ' '.join(filtered_words)

final_df['comment_text'] = final_df['comment_text'].apply(remove_stopwords)

import pandas as pd
import string
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')

def remove_punctuation(text):
    translator = str.maketrans('', '', string.punctuation)
    return text.translate(translator)
final_df['comment_text'] = final_df['comment_text'].apply(remove_punctuation)


def tokenize_text(text):
    return word_tokenize(text)
final_df['comment_text'] = final_df['comment_text'].apply(tokenize_text)

import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

stemmer = PorterStemmer()

def stem_text(tokens):
    stemmed_tokens = [stemmer.stem(token) for token in tokens]
    return stemmed_tokens
final_df['stemmed_text'] = final_df['comment_text'].apply(stem_text)
# print(final_df[['id', 'comment_text', 'stemmed_text']])

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [8]:
final_df.shape

(200000, 18)

In [9]:
final_df.to_csv('preprocessed_database_200000_0.5.csv', index=False)

In [10]:
import pandas as pd
total_comments = len(final_df)
toxic_comments = len(final_df[final_df['target'] >= 0.5])

percentage_toxic = (toxic_comments / total_comments) * 100
percentage_non_toxic = 100 - percentage_toxic

print(f"Percentage of toxic comments: {percentage_toxic:.2f}%")
print(f"Percentage of non-toxic comments: {percentage_non_toxic:.2f}%")

Percentage of toxic comments: 50.00%
Percentage of non-toxic comments: 50.00%
