For text data preprocessing, we followed the following steps:
- Remove punctuation signs to remove ambiguity between words such as ‘terrible!’ and ‘terrible’.
- Convert text to lowercase.
- Remove stop words which create noise in the dataset.
- Remove numbers in the text
- Remove non-english words/ non-meaningful text such as symbols, emails, urls.
- Extract root words from every word in the data (Lemmatizing). 
- Vectorise the data into numeric tensors. 


In [322]:
import pandas as pd
import numpy as np
import json
import string
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import nltk
from nltk.stem import WordNetLemmatizer
import re
from pathlib import Path   
from transformers import MarianMTModel, MarianTokenizer

## Read in the dataset

In [323]:
#financial news 
f_news_df=pd.read_csv("data\Financial_News.csv",encoding='ISO-8859-1', header = None)
#Name columns
f_news_df.columns=["Class", "News"]
#clean dataset and drop rows with na values
f_news_df=f_news_df.dropna()
f_news_df.head()
print(f_news_df.shape)

(4846, 2)


In [324]:
#function for text preprocessing
def text_preprocessing(text):
    stop_words = ENGLISH_STOP_WORDS
    wordnet_lemmatizer=WordNetLemmatizer()
    #punctuations
    text=text.translate(str.maketrans('', '', string.punctuation))
    #numbers
    translation_table = str.maketrans('', '', string.digits)
    text=text.translate(translation_table)
    #URL
    text=text.replace(r'\s*https?://\S+(\s+|$)', ' ').strip()
    #stopwords
    text=' '.join([word for word in text.split() if word not in (stop_words)])
    #lemmatization
    text=''.join([wordnet_lemmatizer.lemmatize(w) for w in text])
    return text

In [325]:
f_news_df['News']=f_news_df['News'].apply(lambda x: x.lower())
f_news_df['News']=f_news_df['News'].apply(lambda x:text_preprocessing(x))

In [326]:
#convert ratings into 3 classes
f_news_df['Class'] = f_news_df['Class'].replace(['negative'],'0')
f_news_df['Class'] = f_news_df['Class'].replace(['neutral'],'1')
f_news_df['Class'] = f_news_df['Class'].replace(['positive'],'2')

In [327]:
#back translation
french_model = 'Helsinki-NLP/opus-mt-en-fr'
# Get the tokenizer
french_model_tkn = MarianTokenizer.from_pretrained(french_model)
# Load the pretrained model based on the name
french_model = MarianMTModel.from_pretrained(french_model)

In [328]:
eng_model = 'Helsinki-NLP/opus-mt-fr-en'
# Get the tokenizer
eng_model_tkn = MarianTokenizer.from_pretrained(eng_model)
# Load the pretrained model based on the name
eng_model = MarianMTModel.from_pretrained(eng_model)

In [329]:
### formating the text

def format_batch_texts(language_code, batch_texts):
    formated_batch = ">>{}<< {}".format(language_code, batch_texts)
    return formated_batch

def combine_texts(original_texts, back_translated_batch):
    return set(back_translated_batch) - set(original_texts)

### translation
def perform_translation(batch_texts, model, tokenizer, language="fr"):
    formated_batch_texts = format_batch_texts(language, batch_texts)
    translated = model.generate(**tokenizer(formated_batch_texts, return_tensors="pt", padding= 'max_length'))
    translated_texts = [tokenizer.decode(t, skip_special_tokens=True) for t in translated]
    translated_texts = [sub.replace('["', '') for sub in translated_texts]
    translated_texts = [sub.replace('"]', '') for sub in translated_texts]
    return translated_texts

def backtranslation(batch_texts, label, original_language="en", temporary_language="fr"):
    # Translate from english to french
    tmp_translated_batch = perform_translation(batch_texts, french_model, french_model_tkn, temporary_language)
    # Translate Back to english
    back_translated_batch = perform_translation(tmp_translated_batch, eng_model, eng_model_tkn, original_language)
    # Return The Final Result
    x = combine_texts(batch_texts, back_translated_batch)
    return x, label

In [330]:
# Execute the function for Data Augmentation
for ind in f_news_df.index:
    x, label = backtranslation(f_news_df['News'][ind],f_news_df['Class'][ind])
    insert_row = {'Class': int(label),  'News':x}
    f_news_df = pd.concat([f_news_df, pd.DataFrame([insert_row])])































































































IndexError: index out of range in self

In [331]:
f_news_df

Unnamed: 0,Class,News
0,1,according gran company plans production russia...
1,1,technopolis plans develop stages area square m...
2,0,international electronic industry company elco...
3,2,new production plant company increase capacity...
4,2,according company s updated strategy years bas...
...,...,...
0,2,{The pioneer libraries of the municipalities o...
0,2,{['Pre-tax earnings jumped by EUR million']}
0,2,{Rate of profitability compared to the previou...
0,2,{The company has received orders of a value of...


In [333]:
filepath = Path('fnews_backtrans.csv')   
f_news_df.to_csv(filepath)

In [7]:
print(f_news_df.shape)

(4846, 2)


In [463]:
#function to tokenise the strings (LSTM)
def text_vectorisation(text):
    max_size = 10000
    # tokenising the text data
    tokenizer = Tokenizer(num_words = max_size,oov_token="<oov>")
    tokenizer.fit_on_texts(text)
    #encode the training data sentences into sequences
    text_sequences = tokenizer.texts_to_sequences(text)
    print("Vocab length", len(tokenizer.word_index) + 1)
    #max length of the training sequence
    max_length = np.max(list(map(lambda x: len(x), text_sequences)))
    print("max length:" ,max_length)
    #padding the training sequence to ensure same length
    text_padded = pad_sequences(text_sequences, padding='post',maxlen=max_length)
    return text_padded
    

In [464]:
#tokenise
train_text_padded = text_vectorisation(f_news_df['News'])

Vocab length 9243
max length: 38


In [465]:
train_text_padded

array([[  43, 3038,    4, ...,    0,    0,    0],
       [ 630,  197,  664, ...,    0,    0,    0],
       [ 164,  467,   72, ...,    0,    0,    0],
       ...,
       [  15,   11,  127, ...,    0,    0,    0],
       [  10,    8,   76, ...,    0,    0,    0],
       [   8,   12,   81, ...,    0,    0,    0]])

In [466]:
#glove word embedding (transfer learning)

embeddings_dictionary = dict()
embedding_dim = 100
vocab_length =9243
with open('glove.6B.100d.txt',encoding="utf8") as fp:
    for line in fp.readlines():
        records = line.split()
        word = records[0]
        vector_dimensions = np.asarray(records[1:], dtype='float32')
        embeddings_dictionary [word] = vector_dimensions
        
max_size = 10000
# tokenising the text data
tokenizer = Tokenizer(num_words = max_size,oov_token="<oov>")
tokenizer.fit_on_texts(f_news_df['News'])
embedding_matrix = np.zeros((vocab_length, embedding_dim))
for word, index in tokenizer.word_index.items():
    embedding_vector = embeddings_dictionary.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector
        
embedding_matrix

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.31106001,  0.47663999,  0.12986   , ...,  0.073349  ,
         0.27316001, -0.79343998],
       ...,
       [-0.36792001,  0.16948   ,  0.53179997, ...,  0.41356999,
         0.072457  ,  0.58749002],
       [ 0.46875   , -0.12616   ,  0.14973   , ..., -0.027374  ,
        -0.097867  , -0.092297  ],
       [ 0.37041   ,  1.05400002,  0.22189   , ..., -0.038925  ,
         0.28915   ,  1.27030003]])

In [467]:
#save the embedded weights
np.savetxt('data/weights/weights_fnews_.csv',embedding_matrix, delimiter=",")

In [468]:
# split into train test sets
X = f_news_df['News']
y = f_news_df['Class']
y = pd.get_dummies(f_news_df['Class']).values
X_train, X_test, y_train, y_test = train_test_split(train_text_padded, y, test_size=0.2, random_state=42)

In [423]:
np.savetxt('data/train/fnews_Xtrain_.csv', X_train, delimiter=",")
np.savetxt('data/test/fnews_Xtest_.csv', X_test, delimiter=",")
np.savetxt('data/train/fnews_ytrain_.csv', y_train, delimiter=",")
np.savetxt('data/test/fnews_ytest_.csv', y_test, delimiter=",")
np.savetxt('')

In [309]:
# split into train test sets
# X = f_news_df['News']
# y = f_news_df['Class']
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [179]:
# filepath1 = Path('data/train/fnews_Xtrain.csv')   
# X_train.to_csv(filepath1)
# filepath2 = Path('data/test/fnews_Xtest.csv')   
# X_test.to_csv(filepath2)
# filepath3 = Path('data/train/fnews_ytrain.csv')   
# y_train.to_csv(filepath3)
# filepath4 = Path('data/test/fnews_ytest.csv')   
# y_test.to_csv(filepath4)

In [442]:
#squid game
squidgame_df=pd.read_csv("data/Squid_Game.csv")
#Name columns
#clean dataset and drop rows with na values
squidgame_df=squidgame_df.dropna()
#drop the unused column
squidgame_df.drop(squidgame_df.columns[[0,1,2,4]], axis=1, inplace=True)
squidgame_df.head()
squidgame_df.rename(columns={'Review Rating':'Class', 'Review_body':'Review'}, inplace = True)

In [443]:
squidgame_df['Review'] = squidgame_df['Review'].apply(lambda x:text_preprocessing(x))

In [444]:
squidgame_df.head()

Unnamed: 0,Class,Review
0,10/10,Just finished SQUID GAME Korean TV thats globa...
1,9/10,Well multiple games actually accurate Now look...
2,10/10,This runaway hit South Korea centred Seong Gih...
3,5/10,I went knowing absolutely korean series And an...
4,7/10,I honestly dont know wtf main character needed...


In [445]:
#check unique values
squidgame_df['Class'].unique()
#convert ratings into 3 classes
squidgame_df['Class'] = squidgame_df['Class'].replace(['1/10','2/10','3/10', '4/10'],'0')
squidgame_df['Class'] = squidgame_df['Class'].replace(['5/10'],'1')
squidgame_df['Class'] = squidgame_df['Class'].replace(['6/10','7/10','8/10','9/10','10/10'],'2')

In [446]:
squidgame_df.head()

Unnamed: 0,Class,Review
0,2,Just finished SQUID GAME Korean TV thats globa...
1,2,Well multiple games actually accurate Now look...
2,2,This runaway hit South Korea centred Seong Gih...
3,1,I went knowing absolutely korean series And an...
4,2,I honestly dont know wtf main character needed...


In [447]:
print(squidgame_df.shape)

(1185, 2)


In [449]:
#tokenise
train_text_padded = text_vectorisation(squidgame_df['Review'])

Vocab length 9067
max length: 831


In [450]:
train_text_padded

array([[  12,  406,   28, ...,    0,    0,    0],
       [ 423,  688,   16, ...,    0,    0,    0],
       [  27, 3840,  407, ...,    0,    0,    0],
       ...,
       [ 808,    3,   15, ...,    0,    0,    0],
       [ 541,   61,  180, ...,    0,    0,    0],
       [   3,   89,    3, ...,    0,    0,    0]])

In [451]:
#trying out glove word embedding (transfer learning)

embeddings_dictionary = dict()
embedding_dim = 100
vocab_length =9067
with open('glove.6B.100d.txt',encoding="utf8") as fp:
    for line in fp.readlines():
        records = line.split()
        word = records[0]
        vector_dimensions = np.asarray(records[1:], dtype='float32')
        embeddings_dictionary [word] = vector_dimensions
        
max_size = 10000
# tokenising the text data
tokenizer = Tokenizer(num_words = max_size,oov_token="<oov>")
tokenizer.fit_on_texts(squidgame_df['Review'])
embedding_matrix = np.zeros((vocab_length, embedding_dim))
for word, index in tokenizer.word_index.items():
    embedding_vector = embeddings_dictionary.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector
        
embedding_matrix

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.69441998,  0.21122999,  0.17011   , ..., -0.64331001,
        -0.2412    ,  0.92161   ],
       ...,
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.35752001,  0.63306999, -0.11818   , ..., -0.53319001,
        -0.12426   ,  0.80176002],
       [-0.22854   ,  0.29302001, -0.018262  , ..., -0.15219   ,
         0.35202   , -0.0081126 ]])

In [452]:
#save the embedded weights
np.savetxt('data/weights/weights_squidgame_.csv',embedding_matrix, delimiter=",")

In [401]:
# split into train test sets
X = squidgame_df['Review']
y = squidgame_df['Class']
y = pd.get_dummies(squidgame_df['Class']).values
X_train, X_test, y_train, y_test = train_test_split(train_text_padded, y, test_size=0.2, random_state=42)

In [403]:
np.savetxt('data/train/squidgame_Xtrain_.csv', X_train, delimiter=",")
np.savetxt('data/test/squidgame_Xtest_.csv', X_test, delimiter=",")
np.savetxt('data/train/squidgame_ytrain_.csv', y_train, delimiter=",")
np.savetxt('data/test/squidgame_ytest_.csv', y_test, delimiter=",")

In [196]:
# filepath1 = Path('data/train/squidgame_Xtrain.csv')   
# X_train.to_csv(filepath1)
# filepath2 = Path('data/test/squidgame_Xtest.csv')   
# X_test.to_csv(filepath2)
# filepath3 = Path('data/train/squidgame_ytrain.csv')   
# y_train.to_csv(filepath3)
# filepath4 = Path('data/test/squidgame_ytest.csv')   
# y_test.to_csv(filepath4)