For text data preprocessing, we followed the following steps:
- Remove punctuation signs to remove ambiguity between words such as ‘terrible!’ and ‘terrible’.
- Convert text to lowercase.
- Remove stop words which create noise in the dataset.
- Remove numbers in the text
- Remove non-english words/ non-meaningful text such as symbols, emails, urls.
- Extract root words from every word in the data (Lemmatizing). 
- Vectorise the data into numeric tensors. 


In [88]:
import pandas as pd
import numpy as np
import json
import string
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import nltk
from nltk.stem import WordNetLemmatizer
import re
from pathlib import Path   
from transformers import MarianMTModel, MarianTokenizer

## Functions

In [184]:
#function for text preprocessing
def text_preprocessing(text):
    stop_words = ENGLISH_STOP_WORDS
    wordnet_lemmatizer=WordNetLemmatizer()
    #punctuations
    text=text.translate(str.maketrans('', '', string.punctuation))
    #numbers
    translation_table = str.maketrans('', '', string.digits)
    text=text.translate(translation_table)
    #URL
    text=text.replace(r'\s*https?://\S+(\s+|$)', ' ').strip()
    #stopwords
    text=' '.join([word for word in text.split() if word not in (stop_words)])
    #lemmatization
    text=''.join([wordnet_lemmatizer.lemmatize(w) for w in text])
    return text

In [110]:
#function to tokenise the strings (LSTM)
def text_vectorisation(text):
    max_size = 10000
    # tokenising the text data
    tokenizer = Tokenizer(num_words = max_size,oov_token="<oov>")
    tokenizer.fit_on_texts(text)
    #encode the training data sentences into sequences
    text_sequences = tokenizer.texts_to_sequences(text)
    print("Vocab length", len(tokenizer.word_index) + 1)
    #max length of the training sequence
    max_length = np.max(list(map(lambda x: len(x), text_sequences)))
    print("max length:" ,max_length)
    #padding the training sequence to ensure same length
    text_padded = pad_sequences(text_sequences, padding='post',maxlen=max_length)
    return text_padded 

## Read in the dataset

## Financial News Headline

In [89]:
#financial news 
f_news_df=pd.read_csv("data\Financial_News.csv",encoding='ISO-8859-1', header = None)
#Name columns
f_news_df.columns=["Class", "News"]
#clean dataset and drop rows with na values
f_news_df=f_news_df.dropna()
f_news_df.head()
print(f_news_df.shape)

(4846, 2)


In [91]:
f_news_df['News']=f_news_df['News'].apply(lambda x: x.lower())
f_news_df['News']=f_news_df['News'].apply(lambda x:text_preprocessing(x))

In [92]:
#convert ratings into 3 classes
f_news_df['Class'] = f_news_df['Class'].replace(['negative'],'0')
f_news_df['Class'] = f_news_df['Class'].replace(['neutral'],'1')
f_news_df['Class'] = f_news_df['Class'].replace(['positive'],'2')

In [93]:
f_news_df

Unnamed: 0,Class,News
0,1,according gran company plans production russia...
1,1,technopolis plans develop stages area square m...
2,0,international electronic industry company elco...
3,2,new production plant company increase capacity...
4,2,according company s updated strategy years bas...
...,...,...
4841,0,london marketwatch share prices ended lower lo...
4842,1,rinkuskiai s beer sales fell cent million litr...
4843,0,operating profit fell eur mn eur mn including ...
4844,0,net sales paper segment decreased eur mn secon...


In [94]:
print(f_news_df.shape)

(4846, 2)


In [96]:
#tokenise
train_text_padded = text_vectorisation(f_news_df['News'])

Vocab length 9243
max length: 38


In [97]:
train_text_padded

array([[  43, 3038,    4, ...,    0,    0,    0],
       [ 630,  197,  664, ...,    0,    0,    0],
       [ 164,  467,   72, ...,    0,    0,    0],
       ...,
       [  15,   11,  127, ...,    0,    0,    0],
       [  10,    8,   76, ...,    0,    0,    0],
       [   8,   12,   81, ...,    0,    0,    0]])

#### Requires the 'glove.6B.100d.txt' file to be downloaded from Standford first : https://nlp.stanford.edu/projects/glove/

In [98]:
#glove word embedding (transfer learning)

embeddings_dictionary = dict()
embedding_dim = 100
vocab_length =9243
with open('glove.6B.100d.txt',encoding="utf8") as fp:
    for line in fp.readlines():
        records = line.split()
        word = records[0]
        vector_dimensions = np.asarray(records[1:], dtype='float32')
        embeddings_dictionary [word] = vector_dimensions
        
max_size = 10000
# tokenising the text data
tokenizer = Tokenizer(num_words = max_size,oov_token="<oov>")
tokenizer.fit_on_texts(f_news_df['News'])
embedding_matrix = np.zeros((vocab_length, embedding_dim))
for word, index in tokenizer.word_index.items():
    embedding_vector = embeddings_dictionary.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector
        
embedding_matrix

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.31106001,  0.47663999,  0.12986   , ...,  0.073349  ,
         0.27316001, -0.79343998],
       ...,
       [-0.36792001,  0.16948   ,  0.53179997, ...,  0.41356999,
         0.072457  ,  0.58749002],
       [ 0.46875   , -0.12616   ,  0.14973   , ..., -0.027374  ,
        -0.097867  , -0.092297  ],
       [ 0.37041   ,  1.05400002,  0.22189   , ..., -0.038925  ,
         0.28915   ,  1.27030003]])

In [99]:
embedding_matrix.shape

(9243, 100)

In [467]:
#save the embedded weights
np.savetxt('data/weights/weights_fnews_.csv',embedding_matrix, delimiter=",")

In [13]:
# split into train test sets
X = f_news_df['News']
y = f_news_df['Class']
y = pd.get_dummies(f_news_df['Class']).values
X_train, X_test, y_train, y_test = train_test_split(train_text_padded, y, test_size=0.2, random_state=42)

In [17]:
np.savetxt('data/train/fnews_Xtrain_.csv', X_train, delimiter=",")
np.savetxt('data/test/fnews_Xtest_.csv', X_test, delimiter=",")
np.savetxt('data/train/fnews_ytrain_.csv', y_train, delimiter=",")
np.savetxt('data/test/fnews_ytest_.csv', y_test, delimiter=",")
np.savetxt('data/test/fnews_X_.csv', train_text_padded, delimiter=",")
np.savetxt('data/test/fnews_y_.csv', y, delimiter=",")

## Original Financial News Dataset + Back Translated Dataset

In [126]:
fnews_df_bt = pd.read_csv('Back Translation - Financial News Headlines.csv')
fnews_df_bt = fnews_df_bt.drop('News', axis=1)
fnews_df_bt.rename(columns={'News.1':'News'}, inplace = True)

#convert ratings into 3 classes
fnews_df_bt['Class'] = fnews_df_bt['Class'].replace(['negative'],'0')
fnews_df_bt['Class'] = fnews_df_bt['Class'].replace(['neutral'],'1')
fnews_df_bt['Class'] = fnews_df_bt['Class'].replace(['positive'],'2')

fnews_df_bt

Unnamed: 0,Class,News
0,1,"According to Gran, the company does not intend..."
1,1,Technopolis plans to develop in stages of an a...
2,0,The International Elcoteq Electronics Society ...
3,2,"With the new production factory, the company w..."
4,2,According to the company's updated strategy fo...
...,...,...
4841,0,London Marketwatch - The equity price ended in...
4842,1,Rinkuskiai beer sales fell 6.5% to 4.16 millio...
4843,0,The operating profit fell to 35.4 mins from 68...
4844,0,Net sales of the paper segment decreased to 22...


In [127]:
#Merge both the original dataset and the back translated dataset
frames = [f_news_df, fnews_df_bt]
f_news_df2 = pd.concat(frames)
f_news_df2

Unnamed: 0,Class,News
0,1,according gran company plans production russia...
1,1,technopolis plans develop stages area square m...
2,0,international electronic industry company elco...
3,2,new production plant company increase capacity...
4,2,according company s updated strategy years bas...
...,...,...
4841,0,London Marketwatch - The equity price ended in...
4842,1,Rinkuskiai beer sales fell 6.5% to 4.16 millio...
4843,0,The operating profit fell to 35.4 mins from 68...
4844,0,Net sales of the paper segment decreased to 22...


In [128]:
#preprocess the new text data 
f_news_df2['News']=f_news_df2['News'].apply(lambda x: x.lower())
f_news_df2['News']=f_news_df2['News'].apply(lambda x:text_preprocessing(x))

In [129]:
#tokenise
train_text_padded2 = text_vectorisation(f_news_df2['News'])

Vocab length 10855
max length: 58


In [130]:
#glove word embedding (transfer learning)

embeddings_dictionary = dict()
embedding_dim = 100
vocab_length =10855
with open('glove.6B.100d.txt',encoding="utf8") as fp:
    for line in fp.readlines():
        records = line.split()
        word = records[0]
        vector_dimensions = np.asarray(records[1:], dtype='float32')
        embeddings_dictionary [word] = vector_dimensions
        
max_size = 10000
# tokenising the text data
tokenizer = Tokenizer(num_words = max_size,oov_token="<oov>")
tokenizer.fit_on_texts(f_news_df2['News'])
embedding_matrix = np.zeros((vocab_length, embedding_dim))
for word, index in tokenizer.word_index.items():
    embedding_vector = embeddings_dictionary.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector
        
embedding_matrix

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.31106001,  0.47663999,  0.12986   , ...,  0.073349  ,
         0.27316001, -0.79343998],
       ...,
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.30566001,  0.23728999,  0.022027  , ...,  0.10943   ,
         0.17835   , -1.0352    ],
       [-0.068443  ,  0.27186   ,  0.36601999, ...,  0.38561001,
         0.1999    , -0.24925999]])

In [131]:
embedding_matrix.shape

(10855, 100)

In [132]:
#save the embedded weights
np.savetxt('data/weights/weights_fnewsbt_.csv',embedding_matrix, delimiter=",")

In [135]:
#conduct stratified sampling
# split into train test sets
X = f_news_df2['News']
y = f_news_df2['Class']
y = pd.get_dummies(f_news_df2['Class']).values
X_train, X_test, y_train, y_test = train_test_split(train_text_padded2, y, test_size=0.2,stratify=y, random_state=42)

In [136]:
np.savetxt('data/train/fnews_Xtrainbt_.csv', X_train, delimiter=",")
np.savetxt('data/test/fnews_Xtestbt_.csv', X_test, delimiter=",")
np.savetxt('data/train/fnews_ytrainbt_.csv', y_train, delimiter=",")
np.savetxt('data/test/fnews_ytestbt_.csv', y_test, delimiter=",")

## Squidgame Reviews

In [111]:
#squid game
squidgame_df=pd.read_csv("data/Squid_Game.csv")
#Name columns
#clean dataset and drop rows with na values
squidgame_df=squidgame_df.dropna()
#drop the unused column
squidgame_df.drop(squidgame_df.columns[[0,1,2,4]], axis=1, inplace=True)
squidgame_df.head()
squidgame_df.rename(columns={'Review Rating':'Class', 'Review_body':'Review'}, inplace = True)

In [112]:
squidgame_df['Review'] = squidgame_df['Review'].apply(lambda x:text_preprocessing(x))

In [113]:
squidgame_df.head()

Unnamed: 0,Class,Review
0,10/10,Just finished SQUID GAME Korean TV thats globa...
1,9/10,Well multiple games actually accurate Now look...
2,10/10,This runaway hit South Korea centred Seong Gih...
3,5/10,I went knowing absolutely korean series And an...
4,7/10,I honestly dont know wtf main character needed...


In [114]:
#check unique values
squidgame_df['Class'].unique()
#convert ratings into 3 classes
squidgame_df['Class'] = squidgame_df['Class'].replace(['1/10','2/10','3/10', '4/10'],'0')
squidgame_df['Class'] = squidgame_df['Class'].replace(['5/10'],'1')
squidgame_df['Class'] = squidgame_df['Class'].replace(['6/10','7/10','8/10','9/10','10/10'],'2')

In [74]:
#storing a version where labels are cleaned
filepath = Path('data/Squid_Game_Cleaned.csv')   
squidgame_df.to_csv(filepath)

In [73]:
squidgame_df.head()

Unnamed: 0,Class,Review
0,2,"Just finished up SQUID GAME, the Korean TV sho..."
1,2,"Well multiple games actually, but even that yo..."
2,2,This runaway hit from South Korea is centred o...
3,1,I went into this show knowing absolutely nothi...
4,2,I honestly don't know wtf the main character n...


In [447]:
print(squidgame_df.shape)

(1185, 2)


In [449]:
#tokenise
train_text_padded = text_vectorisation(squidgame_df['Review'])

Vocab length 9067
max length: 831


In [450]:
train_text_padded

array([[  12,  406,   28, ...,    0,    0,    0],
       [ 423,  688,   16, ...,    0,    0,    0],
       [  27, 3840,  407, ...,    0,    0,    0],
       ...,
       [ 808,    3,   15, ...,    0,    0,    0],
       [ 541,   61,  180, ...,    0,    0,    0],
       [   3,   89,    3, ...,    0,    0,    0]])

#### Requires the 'glove.6B.100d.txt' file to be downloaded from Standford first : https://nlp.stanford.edu/projects/glove/

In [451]:
#trying out glove word embedding (transfer learning)

embeddings_dictionary = dict()
embedding_dim = 100
vocab_length =9067
with open('glove.6B.100d.txt',encoding="utf8") as fp:
    for line in fp.readlines():
        records = line.split()
        word = records[0]
        vector_dimensions = np.asarray(records[1:], dtype='float32')
        embeddings_dictionary [word] = vector_dimensions
        
max_size = 10000
# tokenising the text data
tokenizer = Tokenizer(num_words = max_size,oov_token="<oov>")
tokenizer.fit_on_texts(squidgame_df['Review'])
embedding_matrix = np.zeros((vocab_length, embedding_dim))
for word, index in tokenizer.word_index.items():
    embedding_vector = embeddings_dictionary.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector
        
embedding_matrix

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.69441998,  0.21122999,  0.17011   , ..., -0.64331001,
        -0.2412    ,  0.92161   ],
       ...,
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.35752001,  0.63306999, -0.11818   , ..., -0.53319001,
        -0.12426   ,  0.80176002],
       [-0.22854   ,  0.29302001, -0.018262  , ..., -0.15219   ,
         0.35202   , -0.0081126 ]])

In [452]:
#save the embedded weights
np.savetxt('data/weights/weights_squidgame_.csv',embedding_matrix, delimiter=",")

In [401]:
# split into train test sets
X = squidgame_df['Review']
y = squidgame_df['Class']
y = pd.get_dummies(squidgame_df['Class']).values
X_train, X_test, y_train, y_test = train_test_split(train_text_padded, y, test_size=0.2, random_state=42)

In [403]:
np.savetxt('data/train/squidgame_Xtrain_.csv', X_train, delimiter=",")
np.savetxt('data/test/squidgame_Xtest_.csv', X_test, delimiter=",")
np.savetxt('data/train/squidgame_ytrain_.csv', y_train, delimiter=",")
np.savetxt('data/test/squidgame_ytest_.csv', y_test, delimiter=",")

## Original dataset + Back Translated dataset

In [185]:
squidgame_df_bt = pd.read_csv('Back Translation - Squid Game.csv')
squidgame_df_bt = squidgame_df_bt.drop('Review', axis=1)
squidgame_df_bt.rename(columns={'Review.1':'Review'}, inplace = True)

squidgame_df_bt

Unnamed: 0,Class,Review
0,2,"I just finished the Squid game, the Korean tel..."
1,2,"Well, several games actually, but even so that..."
2,2,This South Korea flight is centered on Seong G...
3,1,I entered this show by knowing absolutely noth...
4,2,"Honestly, I don't know that the main character..."
...,...,...
1180,2,It's nails that bites all the time and I'm the...
1181,2,It is a well -made version of stories and film...
1182,2,Woman and I really enjoyed the series. The sce...
1183,2,Everything is perfect on this series. Do not w...


In [186]:
#Merge both the original dataset and the back translated dataset
frames = [squidgame_df, squidgame_df_bt]
squidgame_df2 = pd.concat(frames)
squidgame_df2

Unnamed: 0,Class,Review
0,2,Just finished SQUID GAME Korean TV thats globa...
1,2,Well multiple games actually accurate Now look...
2,2,This runaway hit South Korea centred Seong Gih...
3,1,I went knowing absolutely korean series And an...
4,2,I honestly dont know wtf main character needed...
...,...,...
1180,2,It's nails that bites all the time and I'm the...
1181,2,It is a well -made version of stories and film...
1182,2,Woman and I really enjoyed the series. The sce...
1183,2,Everything is perfect on this series. Do not w...


In [187]:
#preprocess the new text data 
squidgame_df2['Review']=squidgame_df2['Review'].apply(lambda x: x.lower())
squidgame_df2['Review']=squidgame_df2['Review'].apply(lambda x:text_preprocessing(x))

In [188]:
#tokenise
train_text_padded2 = text_vectorisation(squidgame_df2['Review'])

Vocab length 10483
max length: 758


In [189]:
#glove word embedding (transfer learning)

embeddings_dictionary = dict()
embedding_dim = 100
vocab_length =10483
with open('glove.6B.100d.txt',encoding="utf8") as fp:
    for line in fp.readlines():
        records = line.split()
        word = records[0]
        vector_dimensions = np.asarray(records[1:], dtype='float32')
        embeddings_dictionary [word] = vector_dimensions
        
max_size = 10000
# tokenising the text data
tokenizer = Tokenizer(num_words = max_size,oov_token="<oov>")
tokenizer.fit_on_texts(squidgame_df2['Review'])
embedding_matrix = np.zeros((vocab_length, embedding_dim))
for word, index in tokenizer.word_index.items():
    embedding_vector = embeddings_dictionary.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector
        
embedding_matrix

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.74901998, -0.19963001,  0.24901   , ..., -0.32662001,
         0.31207001,  0.58199   ],
       ...,
       [ 0.1227    , -0.14037   ,  0.10969   , ...,  0.39884001,
        -0.14473   , -0.010557  ],
       [-0.32306999, -0.22499999,  0.22830001, ..., -0.13102999,
        -0.24221   ,  0.53162003],
       [-0.28795001,  0.92475998, -0.53697997, ..., -0.10077   ,
         0.1893    ,  0.69529003]])

In [190]:
embedding_matrix.shape

(10483, 100)

In [191]:
#save the embedded weights
np.savetxt('data/weights/weights_squidgamebt_.csv',embedding_matrix, delimiter=",")

In [192]:
#conduct stratified sampling
# split into train test sets
X = squidgame_df2['Review']
y = squidgame_df2['Class']

#slight cleaning 
squidgame_df2['Class'] = squidgame_df2['Class'].astype('int')

y = pd.get_dummies(squidgame_df2['Class']).values
X_train, X_test, y_train, y_test = train_test_split(train_text_padded2, y, test_size=0.2,stratify=y, random_state=42)

In [193]:
np.savetxt('data/train/squidgame_Xtrainbt_.csv', X_train, delimiter=",")
np.savetxt('data/test/squidgame_Xtestbt_.csv', X_test, delimiter=",")
np.savetxt('data/train/squidgame_ytrainbt_.csv', y_train, delimiter=",")
np.savetxt('data/test/squidgame_ytestbt_.csv', y_test, delimiter=",")