In [42]:
!pip install datasets==3.6.0 --quiet

In [43]:
from datasets import load_dataset, concatenate_datasets, Dataset
import pandas as pd
import os


In [44]:
# Read every CSV export found in the Drive folder into its own DataFrame.
base_path = '/content/drive/MyDrive/multi-language-nlp'

# os.listdir returns entries in arbitrary order, so sort for a reproducible
# concatenation order; skip sub-folders and non-CSV files so a stray notebook
# or checkpoint directory in the folder cannot break pd.read_csv.
csv_names = sorted(
    name for name in os.listdir(base_path)
    if name.lower().endswith('.csv')
    and os.path.isfile(os.path.join(base_path, name))
)
if not csv_names:
  # Fail here with a clear message instead of a cryptic error at pd.concat later.
  raise FileNotFoundError(f'No CSV files found in {base_path}')

dfs = [pd.read_csv(os.path.join(base_path, name)) for name in csv_names]

In [45]:
# Stack the per-file frames. ignore_index=True produces one clean 0..n-1 index
# instead of repeating each file's own row numbers.
malayalam_df = pd.concat(dfs, ignore_index=True)
# preserve_index=False keeps the pandas index out of the Dataset, so no stray
# `__index_level_0__` column is created.
malayalam = Dataset.from_pandas(malayalam_df, preserve_index=False)

In [46]:
# English corpus: the `sentiment140` dataset. trust_remote_code=True allows the
# dataset repository's own loading script to run.
english = load_dataset(
    path="sentiment140",
    trust_remote_code=True,
)

In [47]:
#tamil = pd.read_excel("/content/MADTRAS Dataset.xlsx")

In [48]:
# French corpus (book reviews prepared for sentiment analysis, per the repository name).
french = load_dataset(
    path="CATIE-AQ/french_book_reviews_fr_prompt_sentiment_analysis",
    trust_remote_code=True,
)

In [49]:
# Chinese corpus: the "chinese" configuration of the multilingual-sentiments dataset.
chinese = load_dataset(
    path="tyqiangz/multilingual-sentiments",
    name="chinese",
    trust_remote_code=True,
)

In [50]:
# Hindi corpus for sentiment analysis.
hindi = load_dataset(
    path="OdiaGenAI/sentiment_analysis_hindi",
    trust_remote_code=True,
)

In [51]:
# Second Malayalam source, loaded from the Hub in addition to the Drive CSVs.
malayalam2 = load_dataset(
    path="wlkla/Malayalam_first_ready_for_sentiment",
    trust_remote_code=True,
)

In [52]:
# Display the splits, column names and row counts of the second Malayalam dataset.
malayalam2

DatasetDict({
    train: Dataset({
        features: ['label', 'query'],
        num_rows: 6739
    })
})

In [53]:
# Mixed English/Spanish book-review corpus; the Spanish rows are selected in the next cell.
spanish = load_dataset(
    path='azherali/enlish_spanish_amazon-books-reviews',
    trust_remote_code=True,
)

In [54]:
# Keep only the rows whose `language` code is 'es'.
# NOTE(review): not idempotent — once a later cell overwrites `language` with
# 'spanish', re-running this cell would filter out every row.
spanish = spanish.filter(lambda row: row['language'] == 'es')

Now that we've loaded the datasets needed to build the multilingual pipeline, the next step is to standardise them so that they all share the same columns, and then merge them into a single dataset. We also see that while languages such as Chinese and English have a lot of text to work with, languages such as Hindi, Malayalam and French have far fewer labelled examples that are ready for sentiment analysis. We will address this imbalance in the modelling part.

In [55]:
# Tag every row of each corpus with a human-readable language name.
# For `spanish` this replaces the existing `language` value used by the filter above.
spanish = spanish.map(function=lambda _row: {'language': 'spanish'})
english = english.map(function=lambda _row: {'language': 'english'})
malayalam = malayalam.map(function=lambda _row: {'language': 'malayalam'})
malayalam2 = malayalam2.map(function=lambda _row: {'language': 'malayalam'})
french = french.map(function=lambda _row: {'language': 'french'})
chinese = chinese.map(function=lambda _row: {'language': 'chinese'})
hindi = hindi.map(function=lambda _row: {'language': 'hindi'})

Map:   0%|          | 0/9240 [00:00<?, ? examples/s]

In [56]:
def standardise_dataframe(dataset, data_column, label):
  """Bring a dataset onto the shared `input` / `labels` / `language` schema.

  Args:
    dataset: object exposing `rename_column` and `select_columns`
      (the callers pass Hugging Face datasets).
    data_column: name of the column that holds the text; renamed to 'input'.
    label: name of the column that holds the raw label; renamed to 'labels'.

  Returns:
    The renamed dataset restricted to the columns 'input', 'labels' and
    'language', in that order. A 'language' column must already exist.
  """
  with_input = dataset.rename_column(data_column, 'input')
  with_labels = with_input.rename_column(label, 'labels')
  kept_columns = ['input', 'labels', 'language']
  return with_labels.select_columns(kept_columns)


In [57]:
# Rename each corpus's text/label columns to the shared schema, working on the
# `train` split of the Hub datasets.
# NOTE(review): `malayalam2` is passed without selecting ['train'], unlike the
# other Hub datasets — confirm whether that is intentional.
spanish = standardise_dataframe(spanish['train'], data_column='review_body', label='stars')
english = standardise_dataframe(english['train'], data_column='text', label='sentiment')
malayalam = standardise_dataframe(malayalam, data_column='clean_content', label='sentiment')
malayalam2 = standardise_dataframe(malayalam2, data_column='query', label='label')
french = standardise_dataframe(french['train'], data_column='inputs', label='targets')
chinese = standardise_dataframe(chinese['train'], data_column='text', label='label')
hindi = standardise_dataframe(hindi['train'], data_column='text', label='label')

In [58]:
# Spot-check one raw English label. Indexing the row first reads a single
# example instead of materialising the whole `labels` column.
english[1]['labels']

0

In [59]:
def standardise_labels(example, star_scale=False):
  """Map one row's raw `labels` value onto a shared integer class id.

  Output classes: 0 = negative, 1 = positive, 2 = neutral / unknown.

  The previous version used 0.5 for neutral and then cast with `int(...)`,
  which truncated 0.5 to 0 and silently merged every neutral/unknown row into
  the negative class. Neutral now has its own id (2); all negative (0) and
  positive (1) outputs are unchanged.

  Args:
    example: a row (dict-like) containing a 'labels' entry.
    star_scale: set to True when the integer labels are 1-5 star ratings, so
      that 1 and 2 stars map to negative instead of being passed through as
      class ids. With `Dataset.map`, pass it via
      `fn_kwargs={'star_scale': True}`. Defaults to False (legacy inference).

  Returns:
    dict with a single key 'label' holding the integer class id.
  """
  NEGATIVE, POSITIVE, NEUTRAL = 0, 1, 2
  label_value = example['labels']

  if isinstance(label_value, int):
    if not star_scale and label_value in (0, 1, 2):
      # Already looks like a class id: keep it unchanged.
      # NOTE(review): this assumes the source dataset orders its 0/1/2 ids the
      # same way as the output classes — verify per dataset.
      mapped_label = label_value
    elif label_value <= 2:
      mapped_label = NEGATIVE
    elif label_value == 3:
      mapped_label = NEUTRAL
    else:  # 4 or 5 (or any larger value)
      mapped_label = POSITIVE

  elif isinstance(label_value, str):
    label_map = {"neg": NEGATIVE, 'Negative': NEGATIVE,
                 "neu": NEUTRAL, 'unknown_state': NEUTRAL,
                 "pos": POSITIVE, 'Positive': POSITIVE}
    # Unrecognised strings are treated as neutral / unknown.
    mapped_label = label_map.get(label_value, NEUTRAL)

  else:
    # Missing or non-int, non-str labels (e.g. None, float) count as unknown.
    mapped_label = NEUTRAL

  # int() normalises bool inputs (True/False) to plain integers.
  return {'label': int(mapped_label)}

In [60]:
# Convert each corpus's raw `labels` column into the shared `label` column;
# the raw column is dropped once the mapped value has been written.
spanish = spanish.map(function=standardise_labels, remove_columns=['labels'])
english = english.map(function=standardise_labels, remove_columns=['labels'])
malayalam = malayalam.map(function=standardise_labels, remove_columns=['labels'])
malayalam2 = malayalam2.map(function=standardise_labels, remove_columns=['labels'])
french = french.map(function=standardise_labels, remove_columns=['labels'])
chinese = chinese.map(function=standardise_labels, remove_columns=['labels'])
hindi = hindi.map(function=standardise_labels, remove_columns=['labels'])

Map:   0%|          | 0/9240 [00:00<?, ? examples/s]

In [61]:
# Merge the standardised corpora into one multilingual dataset.
# NOTE(review): `malayalam2` is standardised above but is not part of this
# list — confirm whether leaving it out is intentional.
df = concatenate_datasets([
    spanish,
    english,
    malayalam,
    french,
    chinese,
    hindi,
])

In [62]:
# Display the merged dataset's columns and total row count.
df

Dataset({
    features: ['input', 'language', 'label'],
    num_rows: 2009268
})