In [1]:
import re
from datasets import load_dataset
import spacy
from spacy.util import minibatch
import pandas as pd
import random

In [3]:
#Get male words list
male = open("Data/GenderWordData/male_word_file.txt", "r")
male_data = male.read()
male_list = male_data.replace('\n', ' ').split(" ")
male.close()

#Get female words list
female = open("Data/GenderWordData/female_word_file.txt", "r")
female_data = female.read()
female_list = female_data.replace('\n', ' ').split(" ")
female.close()

In [4]:
#Function to classify overall gender majority in text

def classify_text(text_lst,male_list, female_list):
    #get rid of punctuation
    results = []
    for i in range(0,len(text_lst)):
        text = re.sub(r'[^\w\s]', '', text_lst[i])    
        words = text.split(' ')
        male_count = 0
        female_count = 0
        for word in words:
            word_filter = word.strip().lower()
            if word_filter != '':
                if word_filter in male_list:
                    male_count = male_count + 1
                elif word_filter in female_list:
                    female_count = female_count + 1
        if female_count > male_count:
            results.append(1)
        elif male_count > female_count:
            results.append(0)
        else:
            results.append(-1)
    return results

In [5]:
#Load Dataset
dataset = load_dataset("sentiment140")
train = dataset["train"]
test = dataset["test"]

Reusing dataset sentiment140 (/Users/sagar/.cache/huggingface/datasets/sentiment140/sentiment140/1.0.0/f81c014152931b776735658d8ae493b181927de002e706c4d5244ecb26376997)


  0%|          | 0/2 [00:00<?, ?it/s]

In [6]:
#Label Gender
train_classified = classify_text(train["text"],male_list,female_list)
test_classified = classify_text(train["text"],male_list,female_list)

In [7]:
#Classify train set
train_df = pd.DataFrame(list(zip(train["text"],train["sentiment"],train_classified)),
                    columns = ["Text","Label", "Gender"])
train_df

Unnamed: 0,Text,Label,Gender
0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",0,-1
1,is upset that he can't update his Facebook by ...,0,0
2,@Kenichan I dived many times for the ball. Man...,0,-1
3,my whole body feels itchy and like its on fire,0,-1
4,"@nationwideclass no, it's not behaving at all....",0,-1
...,...,...,...
1599995,Just woke up. Having no school is the best fee...,4,-1
1599996,TheWDB.com - Very cool to hear old Walt interv...,4,-1
1599997,Are you ready for your MoJo Makeover? Ask me f...,4,-1
1599998,Happy 38th Birthday to my boo of alll time!!! ...,4,-1


In [41]:
#Classify test set
test_df = pd.DataFrame(list(zip(test["text"],test["sentiment"],test_classified)),
                    columns = ["Text","Label", "Gender"])
test_df

Unnamed: 0,Text,Label,Gender
0,@stellargirl I loooooooovvvvvveee my Kindle2. ...,4,-1
1,Reading my kindle2... Love it... Lee childs i...,4,0
2,"Ok, first assesment of the #kindle2 ...it fuck...",4,-1
3,@kenburbary You'll love your Kindle2. I've had...,4,-1
4,@mikefish Fair enough. But i have the Kindle2...,4,-1
...,...,...,...
493,Ask Programming: LaTeX or InDesign?: submitted...,2,-1
494,"On that note, I hate Word. I hate Pages. I hat...",0,-1
495,Ahhh... back in a *real* text editing environm...,4,-1
496,"Trouble in Iran, I see. Hmm. Iran. Iran so far...",0,-1


In [9]:
#Combine datasets
data = pd.concat([train_df,test_df])

#Get rid of ungendered senetences 
data = data[data.Gender !=-1]

#Get rid of neutral sentiments
data = data[data.Label != 2]

#Use 0-1 sentiment labbeling
data['Label'] = data['Label'].replace([4],1)

data = data.reset_index()


In [15]:
#Load Toxicity Models
toxicClassifierModelSpacy =  spacy.load("Transformer/Models/ToxicTextClassifierModel/model")
toxicSpanModelSpacy = spacy.load("Transformer/Models/ToxicTextSpanDetectionModel/model")




In [44]:
#Function to mask toxic elements
def removeToxicity(text,model,mask):
    doc = model(text)
    if len(doc.ents) > 0:
        for word in doc.ents[0]:
            text = text.replace(word.text, mask)
    return text


In [None]:
data["Text"] = data["Text"].apply(removeToxicity, args = (toxicSpanModelSpacy,"[TOXIC]"))

In [None]:
#save?