<a href="https://colab.research.google.com/github/philliewright/Hatespeech_CW2/blob/main/utils_hs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [22]:
%%writefile "/content/drive/My Drive/Projects_Portfolio/hatespeech_detection/utils_hs.py"

import re
from collections import Counter

import pandas as pd
from google.colab import drive
# Mount Google Drive at /content/drive. This is a module-level statement, so
# it runs every time this file is executed or imported.
drive.mount('/content/drive')

def load_csv(file_path):
    """
    Read a CSV file into a DataFrame, rejecting empty files.

    Args:
        file_path: Path (or any object accepted by ``pd.read_csv``) of the
            CSV file to load.

    Returns:
        The loaded ``pandas.DataFrame``; it always has at least one row.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
        ValueError: If the file has no content or no data rows.
        Exception: For any other failure while reading; the original error
            is attached as ``__cause__``.
    """
    # Only the read itself is wrapped, so that the emptiness check below is
    # not swallowed by the generic handler and re-reported as "unknown".
    try:
        data = pd.read_csv(file_path)
    except FileNotFoundError as err:
        raise FileNotFoundError("The specified file does not exist") from err
    except pd.errors.EmptyDataError as err:
        # Zero-byte file: pandas cannot even find a header row.
        raise ValueError("The CSV file is empty") from err
    except Exception as err:
        raise Exception(f"An unknown error occurred: {err}") from err

    if data.empty:
        raise ValueError("The CSV file is empty")
    return data

def report_missing_values(data):
    """
    Print how many null entries each column of ``data`` contains.

    A header line is printed first, followed by one ``column: count`` line
    per column. Nothing is returned.
    """
    null_counts = data.isnull().sum()
    report_lines = ["Missing values in each column:"]
    report_lines.extend(
        f"{column}: {missing_value_count}"
        for column, missing_value_count in null_counts.items()
    )
    print("\n".join(report_lines))

def tokenise_text(data):
    """
    Tokenise the text in the ``clean_text`` column.

    ``oh_label`` is cast to ``int``, ``clean_text`` is cast to ``str``, and a
    new ``tokens`` column is added holding the lowercased,
    whitespace-separated words of each text. Missing text yields an empty
    token list.

    Args:
        data: DataFrame with ``oh_label`` and ``clean_text`` columns. It is
            modified in place.

    Returns:
        The same DataFrame on success, or ``None`` if any step fails (the
        error is printed and ``data`` is left unmodified).
    """
    try:
        # Per the original author's note, the label arrives as float after
        # import, so it is converted back to int.
        labels = data['oh_label'].astype(int)

        # Fill missing text before casting: astype(str) alone would turn NaN
        # into the literal word "nan", which would then be counted as a token.
        text = data['clean_text'].fillna('').astype(str)

        # Lowercase for consistency, then split on whitespace.
        tokens = text.apply(lambda value: value.lower().split())

        # Assign only after every step succeeded, so a failure above does
        # not leave the frame half-converted.
        data['oh_label'] = labels
        data['clean_text'] = text
        data['tokens'] = tokens
        print("Tokenisation successful")
        return data
    except Exception as e:
        print(f"Tokenisation error: {e}")
        return None

def lemmatize_text(data):
    """
    Lemmatise the token lists in the ``tokens`` column.

    Each word of every token list is passed through
    ``WordNetLemmatizer.lemmatize`` and the results are stored in a new
    ``lemmatized`` column on ``data`` (modified in place).

    Returns the same DataFrame on success. On any error the message is
    printed and ``None`` is returned instead.

    NOTE(review): ``WordNetLemmatizer`` is not imported in the visible part
    of this file -- presumably it is NLTK's; confirm the import exists.
    """
    try:
        lemmatizer = WordNetLemmatizer()

        def lemmatize_tokens(tokens):
            # One lemma per token, preserving order.
            return [lemmatizer.lemmatize(token) for token in tokens]

        data['lemmatized'] = data['tokens'].apply(lemmatize_tokens)
        print("Lemmatisation successful")
        return data
    except Exception as error:
        print(f"An error occurred during lemmatisation: {error}")
        return None

def _lemmas_of(entry):
    """Return the words of one ``lemmatized`` cell as an iterable of strings."""
    # A list/tuple cell already holds the words. Stringifying it would make
    # the counter see repr fragments such as "['bad'," instead of "bad".
    if isinstance(entry, (list, tuple)):
        return entry
    return str(entry).split()


def word_frequency_analysis(data, top_n=50):
    """
    Print the most common lemmatised words for each ``oh_label`` class.

    Args:
        data: DataFrame with an ``oh_label`` column (values 1 and 0 are
            reported) and a ``lemmatized`` column whose cells are either
            lists of words or whitespace-separated strings.
        top_n: How many of the most frequent words to print per class.

    Returns:
        None. The results are printed, label 1 first and then label 0.
    """
    for label in (1, 0):
        lemma_cells = data.loc[data['oh_label'] == label, 'lemmatized']
        counts = Counter(
            word for cell in lemma_cells for word in _lemmas_of(cell)
        )
        print(f"Most common words for oh_label = {label}:")
        print(counts.most_common(top_n))

# Matches any decimal digit; compiled once and reused for every word.
_DIGIT_PATTERN = re.compile(r'\d')


def remove_numbers(word_list):
    """
    Remove every word that contains a digit from a list of words.

    Args:
        word_list: Iterable of strings.

    Returns:
        A new list with the words that contain no digit, in original order.
    """
    return [word for word in word_list if not _DIGIT_PATTERN.search(word)]


def add_lemmatized_no_numbers(data):
    """
    Add a ``lemmatized_no_numbers`` column derived from ``lemmatized``.

    This replaces a module-level statement that operated on an undefined
    ``all_data`` variable and therefore raised ``NameError`` on import.

    Args:
        data: DataFrame whose ``lemmatized`` column holds lists of words. It
            is modified in place.

    Returns:
        The same DataFrame with the new column added.
    """
    data['lemmatized_no_numbers'] = data['lemmatized'].apply(remove_numbers)
    return data



# Word prefixes treated as marking a URL.
_URL_PREFIXES = ('http', 'www')


def remove_urls(word_list):
    """
    Remove every word that looks like a URL from a list of words.

    A word is treated as a URL when it starts with ``http`` or ``www``
    (case-sensitive).

    Args:
        word_list: Iterable of strings.

    Returns:
        A new list with the remaining words, in original order.
    """
    return [word for word in word_list if not word.startswith(_URL_PREFIXES)]


def add_lemmatized_clean(data):
    """
    Add a ``lemmatized_clean`` column derived from ``lemmatized_no_numbers``.

    This replaces a module-level statement that operated on an undefined
    ``all_data`` variable and therefore raised ``NameError`` on import.

    Args:
        data: DataFrame whose ``lemmatized_no_numbers`` column holds lists of
            words. It is modified in place.

    Returns:
        The same DataFrame with the new column added.
    """
    data['lemmatized_clean'] = data['lemmatized_no_numbers'].apply(remove_urls)
    return data



Overwriting /content/drive/My Drive/Projects_Portfolio/hatespeech_detection/utils_hs.py


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
