## This notebook cleans the chat message CSV files in order to extract the sentences used in notebooks/test_text_performance.ipynb to test the performance of our text model

In [5]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [15]:
cols = ['account_id', 'alliance_id', 'timestamp', 'date', 'raw_message',
       'filtered_message', 'filtered', 'filtered_content', 'risk',
       'filter_detected_language', 'is_family_friendly', 'GENERAL_RISK',
       'BULLYING', 'VIOLENCE', 'RELATIONSHIP_SEXUAL_CONTENT', 'VULGARITY',
       'DRUGS_ALCOHOL', 'IN_APP', 'ALARM', 'FRAUD', 'HATE_SPEECH', 'RELIGIOUS',
       'WEBSITE', 'CHILD_GROOMING', 'PUBLIC_THREAT', 'EXTREMISM', 'SUBVERSIVE',
       'SENTIMENT', 'POLITICS']
negatives = ['risk', 'is_family_friendly', 'GENERAL_RISK',
       'BULLYING', 'VIOLENCE', 'RELATIONSHIP_SEXUAL_CONTENT', 'VULGARITY',
       'DRUGS_ALCOHOL', 'IN_APP', 'ALARM', 'FRAUD', 'HATE_SPEECH', 'RELIGIOUS',
       'WEBSITE', 'CHILD_GROOMING', 'PUBLIC_THREAT', 'EXTREMISM', 'SUBVERSIVE',
       'SENTIMENT', 'POLITICS']
       
def dropUnusefulCols(messages):
    """Drop the columns that are not useful for the analysis"""
    ls = ['filter_detected_language']
    return messages.drop(columns=ls)


def isfloat(value):
    try:
        float(value)
        return True
    except ValueError:
        return False


def returnNonNumeric(col):
    """Return the non-numeric values of a column"""
    ls = list(col.value_counts().index)
    # numeric in floatFormat
    isFloat = [isfloat(elem) for elem in ls]
    # numeric in intFormat
    isInt = [str(elem).isnumeric() for elem in ls]
    notNumeric = []
    for i in range(len(ls)):
        if not(isFloat[i] or isInt[i]):
            notNumeric.append(ls[i])
    return notNumeric

def retrieveBadIndices(messages):
    """Retrieve the indices of the lines that contain non-numerical values"""
    badIndices = []
    for col in negatives:
        column = messages[col]
        nonNumeric = returnNonNumeric(column)
        # Retrieve indices of lines that are not numerical (problematic)
        badIndex = list(column[column.isin(nonNumeric)].index)
        badIndices += badIndex
    return badIndices


def cleanData(messages):
    """Clean the data by removing the lines that contain non-numerical values"""

    bad =retrieveBadIndices(messages)

    messages= messages[~messages.index.isin(bad)]
    for col in negatives:
        messages[col] = messages[col].astype('float64')
    return messages

def cleanMessage2(messages):
    """Clean message_2.csv by removing the lines that contain non-numerical values"""
    messages = messages.dropna()
    messages = cleanData(messages)
    stringColumns = ['account_id','alliance_id','raw_message','filtered_message']
    for col in stringColumns:
        messages[col] = messages[col].astype("string")
    messages = messages.drop(columns= ['filter_detected_language'])
    messages['filtered'] = messages['filtered'] == '0'
    return messages

In [16]:
messages_2 = pd.read_csv("../data/chat_messages_2.csv")
messages_1 = pd.read_csv("../data/chat_messages_1.csv")

# Drop the columns that are not useful for the analysis, and the lines that contain non-numerical values
messages_2 = cleanMessage2(messages_2)
messages_1 = cleanMessage2(messages_1)

messages_2.to_csv("../data/cleaned_messages_2.csv", index=False)
messages_1.to_csv("../data/cleaned_messages_1.csv", index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  messages[col] = messages[col].astype('float64')
