In [14]:
import pandas as pd
import numpy as np

# This notebook contains the preprocessing steps of the datasets
## The datasets were chosen to cover both binary and multi-class text classification

### Preprocessing Function

**First, we define a preprocessing function that will then be applied to each dataset**

In [15]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Fetch every NLTK resource the preprocessor below relies on, so the notebook
# also runs on a machine with an empty nltk_data directory.
nltk.download('punkt')      # tokenizer models used by word_tokenize
nltk.download('punkt_tab')  # NOTE(review): newer NLTK releases load this instead of 'punkt' - confirm against the installed version
nltk.download('stopwords')  # stop-word lists used by stopwords.words('english')
nltk.download('wordnet')    # lexical database required by WordNetLemmatizer


def _preprocessing_resources():
    """Return the shared ``(stop_words, lemmatizer)`` pair, building it on first use.

    Reading the stop-word corpus and constructing a lemmatizer are identical
    for every document, so both are created once and cached on this function
    object instead of being rebuilt for every row passed to ``preprocessor``.
    """
    cached = getattr(_preprocessing_resources, '_cached', None)
    if cached is None:
        cached = (frozenset(stopwords.words('english')), WordNetLemmatizer())
        _preprocessing_resources._cached = cached
    return cached


def preprocessor(text):
    """Normalize a raw document into a cleaned, space-separated token string.

    The text is lowercased, stripped of punctuation, whitespace-normalized,
    tokenized, filtered against the English stop-word list, lemmatized, and
    finally re-joined with single spaces.

    Parameters
    ----------
    text : str
        Raw document. A missing value (``None`` or float NaN) is returned
        unchanged so that a later ``dropna`` can discard the row.

    Returns
    -------
    str
        The processed text; an empty string when no token survives.
    """
    # Missing cells arrive as None/NaN and have no .lower(); pass them through
    # instead of failing the whole DataFrame.apply call.
    if text is None or (isinstance(text, float) and text != text):
        return text

    text = text.lower()

    # Delete every character that is neither a word character nor whitespace.
    # Punctuation is removed rather than replaced by a space, so hyphenated
    # words are merged (e.g. "on-stage" becomes "onstage").
    text = re.sub(r'[^\w\s]', '', text)
    # Collapse newlines, tabs and repeated spaces into single spaces.
    text = re.sub(r'\s+', ' ', text).strip()

    tokens = word_tokenize(text)

    stop_words, lemmatizer = _preprocessing_resources()
    lemmatized_tokens = [
        lemmatizer.lemmatize(word) for word in tokens if word not in stop_words
    ]

    # Rejoin words into a single string
    return ' '.join(lemmatized_tokens)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/ibragimzhussup/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ibragimzhussup/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### BBC News Categorization
**Now we apply our text preprocessor to our datasets**

In [16]:
# Load the raw BBC news articles (relative path into the project's data folder).
bbc_data = pd.read_csv("../data/bbc_data.csv")

In [17]:
# Preview the raw frame before any preprocessing is applied.
bbc_data

Unnamed: 0,data,labels
0,Musicians to tackle US red tape Musicians gro...,entertainment
1,"U2s desire to be number one U2, who have won ...",entertainment
2,Rocker Doherty in on-stage fight Rock singer ...,entertainment
3,Snicket tops US box office chart The film ada...,entertainment
4,"Oceans Twelve raids box office Oceans Twelve,...",entertainment
...,...,...
2220,Warning over Windows Word files Writing a Mic...,tech
2221,Fast lifts rise into record books Two high-sp...,tech
2222,Nintendo adds media playing to DS Nintendo is...,tech
2223,Fast moving phone viruses appear Security fir...,tech


In [18]:
# Run every raw article through the preprocessor and keep the cleaned string
# next to the original in a new `text` column.
bbc_data['text'] = bbc_data['data'].map(preprocessor)
bbc_data

Unnamed: 0,data,labels,text
0,Musicians to tackle US red tape Musicians gro...,entertainment,musician tackle u red tape musician group tack...
1,"U2s desire to be number one U2, who have won ...",entertainment,u2s desire number one u2 three prestigious gra...
2,Rocker Doherty in on-stage fight Rock singer ...,entertainment,rocker doherty onstage fight rock singer pete ...
3,Snicket tops US box office chart The film ada...,entertainment,snicket top u box office chart film adaptation...
4,"Oceans Twelve raids box office Oceans Twelve,...",entertainment,ocean twelve raid box office ocean twelve crim...
...,...,...,...
2220,Warning over Windows Word files Writing a Mic...,tech,warning window word file writing microsoft wor...
2221,Fast lifts rise into record books Two high-sp...,tech,fast lift rise record book two highspeed lift ...
2222,Nintendo adds media playing to DS Nintendo is...,tech,nintendo add medium playing d nintendo releasi...
2223,Fast moving phone viruses appear Security fir...,tech,fast moving phone virus appear security firm w...


In [19]:
# Build the modelling frame: cleaned text, the original category name, and an
# integer id per category (factorize numbers categories in order of first
# appearance).
clean_bbc = bbc_data[['text', 'labels']].copy()
clean_bbc['label_ids'] = clean_bbc['labels'].factorize()[0]

# Drop rows with missing values and rows whose text is empty after
# preprocessing. An empty string is written as an empty CSV field and is read
# back as NaN, so it would resurface as a missing value in the next notebook.
clean_bbc = clean_bbc.dropna()
clean_bbc = clean_bbc[clean_bbc['text'].ne('')]
clean_bbc

Unnamed: 0,text,labels,label_ids
0,musician tackle u red tape musician group tack...,entertainment,0
1,u2s desire number one u2 three prestigious gra...,entertainment,0
2,rocker doherty onstage fight rock singer pete ...,entertainment,0
3,snicket top u box office chart film adaptation...,entertainment,0
4,ocean twelve raid box office ocean twelve crim...,entertainment,0
...,...,...,...
2220,warning window word file writing microsoft wor...,tech,4
2221,fast lift rise record book two highspeed lift ...,tech,4
2222,nintendo add medium playing d nintendo releasi...,tech,4
2223,fast moving phone virus appear security firm w...,tech,4


**Now we save the clean version to import in the next notebooks**

In [20]:
# index=False keeps the DataFrame index out of the file, so only the selected columns are written.
clean_bbc.to_csv("../data/clean_bbc_classification.csv", index=False)

### Sarcasm Detection 

In [21]:
# lines=True reads the file as one JSON object per line rather than a single JSON document.
sarcasm_data = pd.read_json("../data/Sarcasm_Headlines_Dataset_v2.json", lines=True)

In [22]:
# Preview the raw headlines before any preprocessing is applied.
sarcasm_data

Unnamed: 0,is_sarcastic,headline,article_link
0,1,thirtysomething scientists unveil doomsday clo...,https://www.theonion.com/thirtysomething-scien...
1,0,dem rep. totally nails why congress is falling...,https://www.huffingtonpost.com/entry/donna-edw...
2,0,eat your veggies: 9 deliciously different recipes,https://www.huffingtonpost.com/entry/eat-your-...
3,1,inclement weather prevents liar from getting t...,https://local.theonion.com/inclement-weather-p...
4,1,mother comes pretty close to using word 'strea...,https://www.theonion.com/mother-comes-pretty-c...
...,...,...,...
28614,1,jews to celebrate rosh hashasha or something,https://www.theonion.com/jews-to-celebrate-ros...
28615,1,internal affairs investigator disappointed con...,https://local.theonion.com/internal-affairs-in...
28616,0,the most beautiful acceptance speech this week...,https://www.huffingtonpost.com/entry/andrew-ah...
28617,1,mars probe destroyed by orbiting spielberg-gat...,https://www.theonion.com/mars-probe-destroyed-...


In [23]:
# Run every headline through the preprocessor and keep the cleaned string
# next to the original in a new `text` column.
sarcasm_data['text'] = sarcasm_data['headline'].map(preprocessor)
sarcasm_data

Unnamed: 0,is_sarcastic,headline,article_link,text
0,1,thirtysomething scientists unveil doomsday clo...,https://www.theonion.com/thirtysomething-scien...,thirtysomething scientist unveil doomsday cloc...
1,0,dem rep. totally nails why congress is falling...,https://www.huffingtonpost.com/entry/donna-edw...,dem rep totally nail congress falling short ge...
2,0,eat your veggies: 9 deliciously different recipes,https://www.huffingtonpost.com/entry/eat-your-...,eat veggie 9 deliciously different recipe
3,1,inclement weather prevents liar from getting t...,https://local.theonion.com/inclement-weather-p...,inclement weather prevents liar getting work
4,1,mother comes pretty close to using word 'strea...,https://www.theonion.com/mother-comes-pretty-c...,mother come pretty close using word streaming ...
...,...,...,...,...
28614,1,jews to celebrate rosh hashasha or something,https://www.theonion.com/jews-to-celebrate-ros...,jew celebrate rosh hashasha something
28615,1,internal affairs investigator disappointed con...,https://local.theonion.com/internal-affairs-in...,internal affair investigator disappointed cons...
28616,0,the most beautiful acceptance speech this week...,https://www.huffingtonpost.com/entry/andrew-ah...,beautiful acceptance speech week came queer ko...
28617,1,mars probe destroyed by orbiting spielberg-gat...,https://www.theonion.com/mars-probe-destroyed-...,mar probe destroyed orbiting spielberggates sp...


In [24]:
# Keep only the cleaned text and the binary target, discarding rows with
# missing values.
clean_sarcasm = sarcasm_data[['text', 'is_sarcastic']].dropna()

# Also discard headlines that are empty after preprocessing. An empty string
# is written as an empty CSV field and is read back as NaN, so it would
# resurface as a missing value in the next notebook.
clean_sarcasm = clean_sarcasm[clean_sarcasm['text'].ne('')]
clean_sarcasm

Unnamed: 0,text,is_sarcastic
0,thirtysomething scientist unveil doomsday cloc...,1
1,dem rep totally nail congress falling short ge...,0
2,eat veggie 9 deliciously different recipe,0
3,inclement weather prevents liar getting work,1
4,mother come pretty close using word streaming ...,1
...,...,...
28614,jew celebrate rosh hashasha something,1
28615,internal affair investigator disappointed cons...,1
28616,beautiful acceptance speech week came queer ko...,0
28617,mar probe destroyed orbiting spielberggates sp...,1


In [25]:
# index=False keeps the DataFrame index out of the file, so only the selected columns are written.
clean_sarcasm.to_csv("../data/clean_sarcasm_classification.csv", index=False)