In [None]:
!pip install contractions
!pip install word2number
!pip install Unidecode

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting contractions
  Downloading contractions-0.1.73-py2.py3-none-any.whl (8.7 kB)
Collecting textsearch>=0.0.21
  Downloading textsearch-0.0.24-py2.py3-none-any.whl (7.6 kB)
Collecting anyascii
  Downloading anyascii-0.3.2-py3-none-any.whl (289 kB)
     289.9/289.9 kB 6.2 MB/s eta 0:00:00
Collecting pyahocorasick
  Downloading pyahocorasick-2.0.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.whl (103 kB)
     103.2/103.2 kB 11.5 MB/s eta 0:00:00
Installing collected packages: pyahocorasick, anyascii, textsearch, contractions
Successfully installed anyascii-0.3.2 contractions-0.1.73 pyahocorasick-2.0.0 textsearch-0.0.24
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/sim

# Offensive Text Detector

## Useful functions

In [None]:
import numpy as np
import pandas as pd
import csv
from sklearn.feature_extraction.text import CountVectorizer # bag-of-words
import offensive_preprocessing as preprocess

# Default locations of the word lists behind the has_badword / has_slur flags.
_BAD_WORDS_PATH = './content/Bad_Word_Data.txt'
_SLURS_PATH = './content/Slur_Data.txt'


def _load_lexicon(path: str):
    """Build the lookup for the word list at `path`, closing the file afterwards."""
    # Assumes preprocess.make_dict reads the whole file before returning (its
    # result is only queried with .get() afterwards) -- TODO confirm.
    with open(path) as lexicon_file:
        return preprocess.make_dict(lexicon_file)


def _has_listed_token(tokens, lexicon) -> int:
    """Return 1 if any token is mapped to 1 in `lexicon`, otherwise 0."""
    return int(any(lexicon.get(token) == 1 for token in tokens))


def offensiveDataframe(filename: str, include_bag_of_words: bool, include_bad_words: bool, include_slur: bool,
                       *, bad_words_path: str = _BAD_WORDS_PATH, slurs_path: str = _SLURS_PATH):
    """Extract a feature DataFrame from a labelled CSV dataset.

    Every non-empty CSV row is read as ``text, label``. The text is run through
    ``preprocess.text_preprocessing`` and the resulting tokens are checked
    against the bad-word and slur word lists.

    Args:
        filename: Path of the CSV dataset (read as UTF-8, a BOM is tolerated).
        include_bag_of_words: Append one count column per vocabulary word.
        include_bad_words: Add the ``has_badword`` column (0/1).
        include_slur: Add the ``has_slur`` column (0/1).
        bad_words_path: Word list used for ``has_badword``.
        slurs_path: Word list used for ``has_slur``.

    Returns:
        The DataFrame with columns ``text``, the enabled flag columns and
        ``is_offensive`` (the label exactly as it appears in the CSV). When
        ``include_bag_of_words`` is true, a ``(dataframe, bow_labels)`` tuple is
        returned instead, the dataframe being extended with the count columns.

    Raises:
        IndexError: If a non-empty row has fewer than two fields.
    """
    # A word list is only needed (and only opened) when its flag is requested.
    bad_words = _load_lexicon(bad_words_path) if include_bad_words else None
    slurs = _load_lexicon(slurs_path) if include_slur else None

    columns = ["text"]
    if include_bad_words:
        columns.append("has_badword")
    if include_slur:
        columns.append("has_slur")
    columns.append("is_offensive")

    rows = []       # one dict per dataset row, keyed by column name
    documents = []  # preprocessed texts, input of the bag-of-words model

    # newline="" lets the csv module handle line endings inside quoted fields.
    with open(filename, "r", encoding='utf-8-sig', newline="") as file_ptr:
        for record in csv.reader(file_ptr):
            if not record:
                # csv.reader yields [] for blank lines; there is nothing to extract.
                continue

            text = record[0]
            tokens = list(preprocess.text_preprocessing(text))
            documents.append(" ".join(tokens))

            # Keying by column name keeps every value under its own header,
            # whichever optional columns are enabled.
            row = {"text": text, "is_offensive": record[1]}
            if include_bad_words:
                row["has_badword"] = _has_listed_token(tokens, bad_words)
            if include_slur:
                row["has_slur"] = _has_listed_token(tokens, slurs)
            rows.append(row)

    # Build the frame once instead of growing it row by row.
    offensive_df = pd.DataFrame(rows, columns=columns)

    if include_bag_of_words:
        bow_labels, bow_counts = _get_bagOWords(documents)
        bow_features = pd.DataFrame(bow_counts, columns=bow_labels)

        # NOTE: a vocabulary word equal to an existing column name (e.g. "text")
        # produces a duplicated column label after this concat.
        offensive_df = pd.concat((offensive_df, bow_features), axis=1)

        return offensive_df, bow_labels

    return offensive_df


def user_offensive_text(user_input: str, user_offensive_labels: list, include_bag_of_words: bool, include_bad_words: bool, include_slur: bool,
                        *, bad_words_path: str = _BAD_WORDS_PATH, slurs_path: str = _SLURS_PATH):
    """Extract a one-row feature DataFrame from a user-supplied string.

    Args:
        user_input: Raw text to featurise.
        user_offensive_labels: Bag-of-words column names to emit, in order
            (presumably the labels returned by ``offensiveDataframe``).
        include_bag_of_words: Append one count column per given label.
        include_bad_words: Add the ``has_badword`` column (0/1).
        include_slur: Add the ``has_slur`` column (0/1).
        bad_words_path: Word list used for ``has_badword``.
        slurs_path: Word list used for ``has_slur``.

    Returns:
        A single-row DataFrame with ``text``, the enabled flag columns and,
        when requested, one column per label holding how many times that word
        occurs in the preprocessed input (0 when absent).
    """
    tokens = list(preprocess.text_preprocessing(user_input))

    row = {"text": user_input}
    if include_bad_words:
        row["has_badword"] = _has_listed_token(tokens, _load_lexicon(bad_words_path))
    if include_slur:
        row["has_slur"] = _has_listed_token(tokens, _load_lexicon(slurs_path))
    offensive_df = pd.DataFrame([row])

    if not include_bag_of_words:
        return offensive_df

    try:
        found_labels, found_counts = _get_bagOWords([" ".join(tokens)])
        counts_by_word = dict(zip(found_labels, found_counts[0]))
    except ValueError:
        # CountVectorizer refuses to fit when the text yields no vocabulary
        # (e.g. empty input); that simply means no label occurs.
        counts_by_word = {}

    # Occurrence counts (not just presence) so the values are on the same scale
    # as the count columns produced for the training data.
    user_offensive_features = [int(counts_by_word.get(label, 0)) for label in user_offensive_labels]

    user_offensive_df = pd.DataFrame([user_offensive_features], columns=list(user_offensive_labels))
    return pd.concat((offensive_df, user_offensive_df), axis=1)

def _get_bagOWords(data: list):
    """Fit a CountVectorizer on `data` and return its vocabulary and counts.

    Returns a ``(feature_names, counts)`` pair: the names reported by the
    fitted vectorizer and the document-term matrix converted to a dense array.
    """
    vectorizer = CountVectorizer()
    document_term_matrix = vectorizer.fit_transform(data)

    feature_names = vectorizer.get_feature_names_out()
    return (feature_names, document_term_matrix.toarray())

""" append offensive element to list """
def _offensive_append(offensive_df, features: list):
    offensive_df.loc[len(offensive_df.index)] = features

## Preprocessing and Train/Test Split

In [None]:
import pandas as pd
import offensive_parser as parser

# Only the bad-word and slur flags are used as features here (bag-of-words is
# switched off). NOTE(review): the parser defined earlier in this notebook
# returns a (dataframe, labels) tuple when bag-of-words is enabled -- assumes
# offensive_parser matches it.
offensive_df = parser.offensiveDataframe("./content/Offensive_Dataset_Team.csv", False, True, True)
offensive_df.head()

In [None]:
# Training Split

from sklearn.model_selection import train_test_split

# Target: the offensive / not-offensive label.
y = offensive_df['is_offensive']
# Features: every remaining column except the raw text.
x = offensive_df.drop(columns=['is_offensive', 'text'])

# Hold out 25% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=42,
)

## Training

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

print("Results for Logistic Regression on Bad Words and Slurs Words:\n")

# Fit the classifier on the training split (fit() returns the estimator itself).
lr = LogisticRegression().fit(X_train, y_train)

# Predicted labels and class probabilities for the held-out rows.
y_pred = lr.predict(X_test)
pred = lr.predict_proba(X_test)

# Mean accuracy on each split.
train_accuracy = lr.score(X_train, y_train)
test_accuracy = lr.score(X_test, y_test)

print("Accuracy on train = %0.4f " % train_accuracy)
print("Accuracy on test = %0.4f " % test_accuracy, "\n")

# Per-class precision / recall / F1 on the test split.
print(classification_report(y_test, y_pred))