For text data preprocessing, we followed the following steps:
- Remove punctuation signs to remove ambiguity between words such as ‘terrible!’ and ‘terrible’.
- Convert text to lowercase.
- Remove stop words which create noise in the dataset.
- Remove numbers in the text
- Remove non-english words/ non-meaningful text such as symbols, emails, urls.
- Extract root words from every word in the data (Lemmatizing). 
- Vectorise the data into numeric tensors. 


In [175]:
import pandas as pd
import numpy as np
import json
import string
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.stem import WordNetLemmatizer
import re

## Read in the dataset

In [188]:
#financial news 
f_news_df=pd.read_csv("Financial_News.csv",encoding='ISO-8859-1', header = None)
#Name columns
f_news_df.columns=["Class", "News"]
#clean dataset and drop rows with na values
f_news_df=f_news_df.dropna()
f_news_df.head()
print(f_news_df.shape)

(4846, 2)


In [189]:
#function for text preprocessing
def text_preprocessing(text):
    
    stop_words = ENGLISH_STOP_WORDS
    wordnet_lemmatizer=WordNetLemmatizer()
    
    #lowercase
    text=text.lower()
    #punctuations
    text=text.translate(str.maketrans('', '', string.punctuation))
    #numbers
    translation_table = str.maketrans('', '', string.digits)
    text=text.translate(translation_table)
    #URL
    text=text.replace(r'\s*https?://\S+(\s+|$)', ' ').strip()
    #stopwords
    text=' '.join([word for word in text.split() if word not in (stop_words)])
    #non-english
    text=' '.join([word for word in text.split() if word not in (eng_words)])
    #tokenise
    text=nltk.word_tokenize(text)
    #lemmatization
    text=([wordnet_lemmatizer.lemmatize(w) for w in text])
    
    return text

#function for vectorisation (pass in all the text data for training after splitting into train and test sets)
def text_vect(X_train, X_test):
    
    vectorizer = TfidfVectorizer()
    vectorize_text = vectoirizer.fit(X_train)
    vect_X_train = vectoirizer.transform(X_train)
    vect_X_test = vectoirizer.transform(X_test)
    return vect_X_train, vect_X_test

In [190]:
f_news_df['News'] = f_news_df['News'].apply(lambda x:text_preprocessing(x))

In [191]:
f_news_df.head()

Unnamed: 0,Class,News
0,neutral,"[gran, plan]"
1,neutral,"[technopolis, plan, stage, meter, company, tec..."
2,negative,"[elcoteq, ten, employee, tallinn, earlier, lay..."
3,positive,"[expected, material]"
4,positive,"[updated, year, basware, target, longterm, sal..."


In [194]:
print(f_news_df.shape)

(4846, 2)


In [None]:
# split into train test sets
X = f_news_df['News']
y = f_news_df['Class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
vect_X_train, vect_X_test = text_vect(X_train, X_test)

In [201]:
#sarcasm headlines
s_headlines_df=pd.read_json('Sarcasm_Headlines.json',lines=True)
s_headlines_df=pd.DataFrame(s_headlines_df)
#drop the first column on article link
s_headlines_df=s_headlines_df.iloc[: , 1:]
#rename columns (Class: 1 = sarcarstic, 0= not)
s_headlines_df.rename(columns={'headline':'Headline', 'is_sarcastic':'Class'}, inplace = True)
s_headlines_df=s_headlines_df.dropna()
s_headlines_df.head()
print(s_headlines_df.shape)

(26709, 2)


In [202]:
s_headlines_df['Headline'] = s_headlines_df['Headline'].apply(lambda x:text_preprocessing(x))

In [203]:
s_headlines_df.head()

Unnamed: 0,Headline,Class
0,"[versace, sue, shopper]",0
1,"[roseanne, catch]",0
2,"[mom, closest]",1
3,"[boehner, want, debtreduction, idea]",1
4,"[jk, rowling, wish]",0


In [204]:
print(s_headlines_df.shape)

(26709, 2)


In [None]:
# split into train test sets
X = s_headlines_df['Headline']
y = s_headlines_df['Class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
vect_X_train, vect_X_test = text_vect(X_train, X_test)

In [218]:
#squid game
squidgame_df=pd.read_csv("Squid_Game.csv")
#Name columns
#clean dataset and drop rows with na values
squidgame_df=squidgame_df.dropna()
#drop the unused column
squidgame_df.drop(squidgame_df.columns[[0,1,2,4]], axis=1, inplace=True)
squidgame_df.head()
squidgame_df.rename(columns={'Review Rating':'Class', 'Review_body':'Review'}, inplace = True)

In [219]:
squidgame_df['Review'] = squidgame_df['Review'].apply(lambda x:text_preprocessing(x))

In [220]:
squidgame_df.head()

Unnamed: 0,Class,Review
0,10/10,"[korean, tv, netflix, japanese, alice, centred..."
1,9/10,"[game, game, played, heard, alreadyif, spoiler..."
2,10/10,"[korea, centred, seong, gihun, invited, game, ..."
3,5/10,"[korean, ive, movie, ive, ppl, korean, casuals..."
4,7/10,"[wtf, needed, attempting, having, expectation,..."


In [225]:
#check unique values
squidgame_df['Class'].unique()
#convert ratings into 3 classes
squidgame_df['Class'] = squidgame_df['Class'].replace(['1/10','2/10','3/10'],'0')
squidgame_df['Class'] = squidgame_df['Class'].replace(['4/10','5/10','6/10','7/10'],'1')
squidgame_df['Class'] = squidgame_df['Class'].replace(['8/10','9/10','10/10'],'2')

In [226]:
squidgame_df.head()

Unnamed: 0,Class,Review
0,2,"[korean, tv, netflix, japanese, alice, centred..."
1,2,"[game, game, played, heard, alreadyif, spoiler..."
2,2,"[korea, centred, seong, gihun, invited, game, ..."
3,1,"[korean, ive, movie, ive, ppl, korean, casuals..."
4,1,"[wtf, needed, attempting, having, expectation,..."


In [227]:
print(squidgame_df.shape)

(1185, 2)


In [None]:
# split into train test sets
X = squidgame_df['Review']
y = squidgame_df['Class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
vect_X_train, vect_X_test = text_vect(X_train, X_test)