In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.preprocessing import PowerTransformer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

import statsmodels.api as sm
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_error as mae
# Limit how many rows pandas renders before it truncates a displayed frame or series.
pd.set_option("display.max_rows", 50)

In [2]:
# Notebook-wide constants. Neither is referenced in the cells visible here.
# presumably the seed passed as random_state to splits/models — TODO confirm
RAND_STATE: int = 42
# presumably the test fraction handed to train_test_split — TODO confirm
TT_RATIO: float = 0.3

In [3]:
# Read the raw CSV (path is relative to the current working directory) into
# the working frame, then show it as the cell's result.
df = pd.read_csv("sexism_data.csv")
df

Unnamed: 0,id,dataset,text,toxicity,sexist,of_id
0,0,other,MENTION3481 i didn't even know random was an o...,0.118180,False,-1
1,1,other,Bottom two should've gone! #mkr,0.251850,False,-1
2,2,callme,MENTION3111 MENTION3424 ladyboner deserves so ...,0.113331,False,-1
3,3,other,She shall now be known as Sourpuss #MKR #KatAn...,0.531153,False,-1
4,4,other,Tarah W threw a bunch of women under the bus s...,0.118718,False,-1
...,...,...,...,...,...,...
13626,13630,callme,this reminds me of the MENTION3079 situation; ...,0.147044,False,-1
13627,13631,other,#mkr I love Annie and loyld there like a real ...,0.213106,False,-1
13628,13632,other,No u. http://t.co/zOr0eWahSS,0.324702,False,-1
13629,13633,other,#mkr the way kat looks at Annie is like she's ...,0.563036,False,-1


In [6]:
# Discard the metadata columns, leaving only `text` and `sexist`.
# drop() is used without inplace=True and with errors='ignore' so the cell can
# be re-executed safely: the previous in-place version raised KeyError on a
# second run because the columns had already been removed.
# Trade-off: a misspelled column name is now skipped silently.
df = df.drop(columns=['id', 'dataset', 'toxicity', 'of_id'], errors='ignore')
df

Unnamed: 0,text,sexist
0,MENTION3481 i didn't even know random was an o...,False
1,Bottom two should've gone! #mkr,False
2,MENTION3111 MENTION3424 ladyboner deserves so ...,False
3,She shall now be known as Sourpuss #MKR #KatAn...,False
4,Tarah W threw a bunch of women under the bus s...,False
...,...,...
13626,this reminds me of the MENTION3079 situation; ...,False
13627,#mkr I love Annie and loyld there like a real ...,False
13628,No u. http://t.co/zOr0eWahSS,False
13629,#mkr the way kat looks at Annie is like she's ...,False


In [19]:
# Per-column check for missing values: True means the column holds at least one NaN/None.
# NOTE(review): the recorded output lists a `normalized_text` column that none of
# the visible cells creates — confirm which cell adds it so a fresh
# Restart & Run All reproduces this result.
df.isna().any()

text               False
sexist             False
normalized_text    False
dtype: bool

In [20]:
# Remove rows that are exact duplicates across all columns (pandas keeps the
# first occurrence by default). The index is not reset, so surviving rows keep
# their original labels.
df = df.drop_duplicates()
# End on the bare frame so Jupyter renders its rich table; print(df) produced
# column-wrapped plain text that is hard to read for wide text columns.
df

                                                    text  sexist  \
0      MENTION3481 i didn't even know random was an o...   False   
1                       Bottom two should've gone!  #mkr   False   
2      MENTION3111 MENTION3424 ladyboner deserves so ...   False   
3      She shall now be known as Sourpuss #MKR #KatAn...   False   
4      Tarah W threw a bunch of women under the bus s...   False   
...                                                  ...     ...   
13626  this reminds me of the MENTION3079 situation; ...   False   
13627  #mkr I love Annie and loyld there like a real ...   False   
13628                       No u. http://t.co/zOr0eWahSS   False   
13629  #mkr the way kat looks at Annie is like she's ...   False   
13630  #mkr omg the kiss stains on the "dirty" mirror...   False   

                                         normalized_text  
0              mention3481 didnt even know random option  
1                            bottom two shouldv gone mkr  
2     

In [21]:
import nltk
# Fetch the NLTK data packages used by the preprocessing cell further down
# (presumably 'punkt' for word_tokenize and 'stopwords' for stopwords.words —
# TODO confirm). The recorded log shows both were already up to date.
nltk.download('punkt')
# NOTE(review): 'stopwords' is downloaded a second time in the preprocessing cell.
nltk.download('stopwords')

# NOTE(review): `re` is not used in any of the visible cells.
import re

[nltk_data] Downloading package punkt to /Users/mm527x/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/mm527x/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [22]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import string

# Make sure the stop-word corpus is available, then hold the English list as a
# set so preprocess_text can test membership by hash lookup.
nltk.download('stopwords')
stop_words = {word for word in stopwords.words('english')}

# One Porter stemmer instance, shared by every preprocess_text call.
stemmer = PorterStemmer()

# Translation table that deletes every ASCII punctuation character. Built once
# here instead of being rebuilt on every preprocess_text call.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def preprocess_text(text):
    """Normalize one raw text into a space-joined string of stemmed tokens.

    Steps, in order: lowercase, delete ASCII punctuation, tokenize with
    ``word_tokenize``, drop tokens present in ``stop_words``, stem the
    remaining tokens with ``stemmer``, and join them with single spaces.

    Args:
        text: Raw text. Missing or non-string values (for example ``None`` or
            a float NaN coming from pandas) are treated as empty text instead
            of raising ``AttributeError`` on ``.lower()``.

    Returns:
        str: The processed text, or ``''`` for missing/non-string input.

    NOTE(review): punctuation is stripped *before* the stop-word filter, so
    contractions lose their apostrophe first ("didn't" -> "didnt") and are
    compared against ``stop_words`` in that form; the recorded output still
    contains tokens such as "didnt" and "dont". URLs are likewise fused into a
    single token ("httptcozor0ewahss"). Confirm whether both are intended.
    """
    # Guard: pandas represents missing text as NaN (a float), which has no
    # string methods.
    if not isinstance(text, str):
        return ''

    # Lowercase and remove punctuation in one pass over the string.
    cleaned = text.lower().translate(_PUNCT_TABLE)

    # Tokenize, filter stop words, and stem the survivors.
    stemmed = [
        stemmer.stem(token)
        for token in word_tokenize(cleaned)
        if token not in stop_words
    ]
    return ' '.join(stemmed)

# Run preprocess_text over every raw text, in row order, and store the results
# in a new 'processed_text' column.
df['processed_text'] = [preprocess_text(raw) for raw in df['text']]

# Write the whole frame to disk without the index column.
df.to_csv('processed_dataset.csv', index=False)


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/mm527x/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [23]:
# Show the frame after preprocessing.
# NOTE(review): in the recorded output, `normalized_text` and `processed_text`
# hold identical values in every visible row, and `normalized_text` is not
# created by any visible cell — confirm whether one of the two is redundant.
df

Unnamed: 0,text,sexist,normalized_text,processed_text
0,MENTION3481 i didn't even know random was an o...,False,mention3481 didnt even know random option,mention3481 didnt even know random option
1,Bottom two should've gone! #mkr,False,bottom two shouldv gone mkr,bottom two shouldv gone mkr
2,MENTION3111 MENTION3424 ladyboner deserves so ...,False,mention3111 mention3424 ladybon deserv much cr...,mention3111 mention3424 ladybon deserv much cr...
3,She shall now be known as Sourpuss #MKR #KatAn...,False,shall known sourpuss mkr katandr failedfoodi,shall known sourpuss mkr katandr failedfoodi
4,Tarah W threw a bunch of women under the bus s...,False,tarah w threw bunch women bu could get wadhwa ...,tarah w threw bunch women bu could get wadhwa ...
...,...,...,...,...
13626,this reminds me of the MENTION3079 situation; ...,False,remind mention3079 situat sorri actual dont ca...,remind mention3079 situat sorri actual dont ca...
13627,#mkr I love Annie and loyld there like a real ...,False,mkr love anni loyld like real life disney coup...,mkr love anni loyld like real life disney coup...
13628,No u. http://t.co/zOr0eWahSS,False,u httptcozor0ewahss,u httptcozor0ewahss
13629,#mkr the way kat looks at Annie is like she's ...,False,mkr way kat look anni like she stear soul creepi,mkr way kat look anni like she stear soul creepi
