For text data preprocessing, we followed the following steps:
- Remove punctuation signs to remove ambiguity between words such as ‘terrible!’ and ‘terrible’.
- Convert text to lowercase.
- Remove stop words which create noise in the dataset.
- Remove numbers in the text
- Remove non-english words/ non-meaningful text such as symbols, emails, urls.
- Extract root words from every word in the data (Lemmatizing). 
- Vectorise the data into numeric tensors. 


In [371]:
import pandas as pd
import numpy as np
import json
import string
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import nltk
from nltk.stem import WordNetLemmatizer
import re
from pathlib import Path   

## Read in the dataset

In [372]:
#financial news 
f_news_df=pd.read_csv("data\Financial_News.csv",encoding='ISO-8859-1', header = None)
#Name columns
f_news_df.columns=["Class", "News"]
#clean dataset and drop rows with na values
f_news_df=f_news_df.dropna()
f_news_df.head()
print(f_news_df.shape)

(4846, 2)


In [373]:
#function for text preprocessing
def text_preprocessing(text):
    stop_words = ENGLISH_STOP_WORDS
    wordnet_lemmatizer=WordNetLemmatizer()
    #punctuations
    text=text.translate(str.maketrans('', '', string.punctuation))
    #numbers
    translation_table = str.maketrans('', '', string.digits)
    text=text.translate(translation_table)
    #URL
    text=text.replace(r'\s*https?://\S+(\s+|$)', ' ').strip()
    #stopwords
    text=' '.join([word for word in text.split() if word not in (stop_words)])
    #lemmatization
    text=''.join([wordnet_lemmatizer.lemmatize(w) for w in text])
    return text

In [374]:
f_news_df['News']=f_news_df['News'].apply(lambda x: x.lower())
f_news_df['News']=f_news_df['News'].apply(lambda x:text_preprocessing(x))

In [375]:
#convert ratings into 3 classes
f_news_df['Class'] = f_news_df['Class'].replace(['negative'],'0')
f_news_df['Class'] = f_news_df['Class'].replace(['neutral'],'1')
f_news_df['Class'] = f_news_df['Class'].replace(['positive'],'2')

In [376]:
f_news_df.head()

Unnamed: 0,Class,News
0,1,according gran company plans production russia...
1,1,technopolis plans develop stages area square m...
2,0,international electronic industry company elco...
3,2,new production plant company increase capacity...
4,2,according company s updated strategy years bas...


In [377]:
print(f_news_df.shape)

(4846, 2)


In [378]:
#function to tokenise the strings
def text_vectorisation(text):
    # tokenising the text data
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(text)
    #encode the training data sentences into sequences
    text_sequences = tokenizer.texts_to_sequences(text)
    print("Vocab length", len(tokenizer.word_index) + 1)
    #max length of the training sequence
    max_length = np.max(list(map(lambda x: len(x), text_sequences)))
    print("max length:" ,max_length)
    #padding the training sequence to ensure same length
    text_padded = pad_sequences(text_sequences, padding='post',maxlen=max_length)
    return text_padded
    

In [379]:
#tokenise
train_text_padded = text_vectorisation(f_news_df['News'])

Vocab length 9242
max length: 38


In [380]:
train_text_padded

array([[  42, 3037,    3, ...,    0,    0,    0],
       [ 629,  196,  663, ...,    0,    0,    0],
       [ 163,  466,   71, ...,    0,    0,    0],
       ...,
       [  14,   10,  126, ...,    0,    0,    0],
       [   9,    7,   75, ...,    0,    0,    0],
       [   7,   11,   80, ...,    0,    0,    0]])

In [381]:
# split into train test sets
X = f_news_df['News']
y = f_news_df['Class']
y = pd.get_dummies(f_news_df['Class']).values
X_train, X_test, y_train, y_test = train_test_split(train_text_padded, y, test_size=0.2, random_state=42)

In [382]:
np.savetxt('data/train/fnews_Xtrain_.csv', X_train, delimiter=",")
np.savetxt('data/test/fnews_Xtest_.csv', X_test, delimiter=",")
np.savetxt('data/train/fnews_ytrain_.csv', y_train, delimiter=",")
np.savetxt('data/test/fnews_ytest_.csv', y_test, delimiter=",")

In [309]:
# split into train test sets
# X = f_news_df['News']
# y = f_news_df['Class']
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [179]:
# filepath1 = Path('data/train/fnews_Xtrain.csv')   
# X_train.to_csv(filepath1)
# filepath2 = Path('data/test/fnews_Xtest.csv')   
# X_test.to_csv(filepath2)
# filepath3 = Path('data/train/fnews_ytrain.csv')   
# y_train.to_csv(filepath3)
# filepath4 = Path('data/test/fnews_ytest.csv')   
# y_test.to_csv(filepath4)

In [394]:
#squid game
squidgame_df=pd.read_csv("data/Squid_Game.csv")
#Name columns
#clean dataset and drop rows with na values
squidgame_df=squidgame_df.dropna()
#drop the unused column
squidgame_df.drop(squidgame_df.columns[[0,1,2,4]], axis=1, inplace=True)
squidgame_df.head()
squidgame_df.rename(columns={'Review Rating':'Class', 'Review_body':'Review'}, inplace = True)

In [395]:
squidgame_df['Review'] = squidgame_df['Review'].apply(lambda x:text_preprocessing(x))

In [396]:
squidgame_df.head()

Unnamed: 0,Class,Review
0,10/10,Just finished SQUID GAME Korean TV thats globa...
1,9/10,Well multiple games actually accurate Now look...
2,10/10,This runaway hit South Korea centred Seong Gih...
3,5/10,I went knowing absolutely korean series And an...
4,7/10,I honestly dont know wtf main character needed...


In [397]:
#check unique values
squidgame_df['Class'].unique()
#convert ratings into 3 classes
squidgame_df['Class'] = squidgame_df['Class'].replace(['1/10','2/10','3/10', '4/10'],'0')
squidgame_df['Class'] = squidgame_df['Class'].replace(['5/10'],'1')
squidgame_df['Class'] = squidgame_df['Class'].replace(['6/10','7/10','8/10','9/10','10/10'],'2')

In [398]:
squidgame_df.head()

Unnamed: 0,Class,Review
0,2,Just finished SQUID GAME Korean TV thats globa...
1,2,Well multiple games actually accurate Now look...
2,2,This runaway hit South Korea centred Seong Gih...
3,1,I went knowing absolutely korean series And an...
4,2,I honestly dont know wtf main character needed...


In [399]:
print(squidgame_df.shape)

(1185, 2)


In [400]:
#tokenise
train_text_padded = text_vectorisation(squidgame_df['Review'])

Vocab length 9066
max length: 831


In [401]:
# split into train test sets
X = squidgame_df['Review']
y = squidgame_df['Class']
y = pd.get_dummies(squidgame_df['Class']).values
X_train, X_test, y_train, y_test = train_test_split(train_text_padded, y, test_size=0.2, random_state=42)

In [403]:
np.savetxt('data/train/squidgame_Xtrain_.csv', X_train, delimiter=",")
np.savetxt('data/test/squidgame_Xtest_.csv', X_test, delimiter=",")
np.savetxt('data/train/squidgame_ytrain_.csv', y_train, delimiter=",")
np.savetxt('data/test/squidgame_ytest_.csv', y_test, delimiter=",")

In [196]:
# filepath1 = Path('data/train/squidgame_Xtrain.csv')   
# X_train.to_csv(filepath1)
# filepath2 = Path('data/test/squidgame_Xtest.csv')   
# X_test.to_csv(filepath2)
# filepath3 = Path('data/train/squidgame_ytrain.csv')   
# y_train.to_csv(filepath3)
# filepath4 = Path('data/test/squidgame_ytest.csv')   
# y_test.to_csv(filepath4)