# Probing Pre-trained Multi-lingual Models for Sentiment Analysis on Hotel Reviews

In [None]:
# Importing packages
import pandas as pd
import numpy as np
import sys
import sklearn
import matplotlib.pyplot as plt
%matplotlib inline
from transformers import BertTokenizer, BertModel
import torch

print("done loading")

done loading


In [None]:
# Load data
english = pd.read_csv("english_reviews.csv")
spanish = pd.read_csv("spanish_reviews.csv")
chinese = pd.read_csv("chinese_reviews.csv")

# English data

Pre-processing

In [None]:
# english data has values 1-5. Need to make it binary, so 1-2 will be negative, 3 removed and 4-5 positive.
# Binary, so 0 is negative, 1 is positive

# Remove rows where the label is 3 (neutral)
english_new = english[english.iloc[:, 1] != 3]

# Change labels 1 or 2 -> 0 (negative), 4 or 5 -> 1 (positive)
english_new.iloc[:, 1] = english_new.iloc[:, 1].apply(lambda x: 0 if x <= 2 else 1)

# Get data of specific labels
positive_reviews = english_new[english_new['Rating'] == 1]
negative_reviews = english_new[english_new['Rating'] == 0]

# Limit to 1000 reviews each
positive_reviews = positive_reviews.sample(n=1000, random_state=42)
negative_reviews = negative_reviews.sample(n=1000, random_state=42)

# Concatenate
balanced_data = pd.concat([positive_reviews, negative_reviews])

# Shuffle the combined data
english_new = balanced_data.sample(frac=1, random_state=42).reset_index(drop=True)

print(english_new.head())
print(len(english_new))

                                              Review  Rating
0  think twice booking breezes, just got breezes ...       0
1  excellent value money just little finding hote...       1
2  new enter modern hotel anonymous doorway calla...       0
3  hotel wife just returned 14 days inna grand ba...       1
4  stay no choice located shinjuku near train sta...       0
2000


# mBERT

In [None]:
from transformers import pipeline, AutoTokenizer, AutoModel

english_new2= english_new.copy()

# Initialize the tokenizer and model for mBERT
tokenizer = AutoTokenizer.from_pretrained('bert-base-multilingual-cased')
model = AutoModel.from_pretrained('bert-base-multilingual-cased')

# Get [CLS] token embedding for a given sentence
def get_cls_embedding(sentence):
    # Tokenize the sentence and return PyTorch tensors
    inputs = tokenizer(sentence, return_tensors="pt", truncation=True, padding=True, max_length=128)

    # Get model outputs (embedding for each token in the sentence)
    with torch.no_grad():
        outputs = model(**inputs)

    # Extract the [CLS] token's embedding (first token)
    cls_embedding = outputs.last_hidden_state[0][0].numpy()  # Convert to NumPy array
    return cls_embedding

# Apply the function to the first column
english_new2['CLS_embedding'] = english_new2.iloc[:, 0].apply(get_cls_embedding)

# `english_new2['CLS_embedding']` contains the embeddings
print(english_new2.head())

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

                                              Review  Rating  \
0  think twice booking breezes, just got breezes ...       0   
1  excellent value money just little finding hote...       1   
2  new enter modern hotel anonymous doorway calla...       0   
3  hotel wife just returned 14 days inna grand ba...       1   
4  stay no choice located shinjuku near train sta...       0   

                                       CLS_embedding  
0  [0.07050472, 0.03498006, -0.08864682, 0.080741...  
1  [0.122898206, 0.05775193, 0.09020899, -0.00633...  
2  [0.10109732, 0.07443803, 0.06628121, -0.011516...  
3  [0.07140106, 0.024289127, 0.0345903, -0.026506...  
4  [-0.01329894, 0.08950708, -0.060090587, 0.0147...  


Getting X and y for training

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix

X_english = english_new2.iloc[:, 2]  # CLS embedding
y_english = english_new2.iloc[:, 1]  # Sentiment labels

X_english = np.array(X_english.tolist())  # Convert to NumPy array

# Split the data into train and test set
X_train, X_test, y_train, y_test = train_test_split(X_english, y_english, test_size=0.2, random_state=42)

# Initialize logistic regression model
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

# Evaluate the performance
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print(f"Precision: {precision_score(y_test, y_pred)}")
print(f"Recall: {recall_score(y_test, y_pred)}")
print(f"F1 score: {f1_score(y_test, y_pred)}")
print(f"Confusion matrix: {confusion_matrix(y_test, y_pred)}")

Accuracy: 0.8425
Precision: 0.8641304347826086
Recall: 0.8071065989847716
F1 score: 0.8346456692913385
Confusion matrix: [[178  25]
 [ 38 159]]


# XLM-R

In [None]:
english_new2= english_new.copy()

# Initialize the tokenizer and model for XLM-R
tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-base')
model = AutoModel.from_pretrained('xlm-roberta-base')

# Function to get [CLS] token embedding for a given sentence
def get_cls_embedding(sentence):
    # Tokenize the sentence and return PyTorch tensors
    inputs = tokenizer(sentence, return_tensors="pt", truncation=True, padding=True, max_length=128)

    # Get model outputs (embedding for each token in the sentence)
    with torch.no_grad():
        outputs = model(**inputs)

    # Extract the [CLS] token's embedding (first token)
    cls_embedding = outputs.last_hidden_state[0][0].numpy()  # Convert to NumPy array
    return cls_embedding

# Apply the function to the first column
english_new2['CLS_embedding'] = english_new2.iloc[:, 0].apply(get_cls_embedding)

# `english_new2['CLS_embedding']` contains the embeddings
print(english_new2.head())

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

                                              Review  Rating  \
0  think twice booking breezes, just got breezes ...       0   
1  excellent value money just little finding hote...       1   
2  new enter modern hotel anonymous doorway calla...       0   
3  hotel wife just returned 14 days inna grand ba...       1   
4  stay no choice located shinjuku near train sta...       0   

                                       CLS_embedding  
0  [0.07147053, 0.09678139, 0.06532947, -0.034413...  
1  [0.09937637, 0.084749095, 0.061825365, -0.0058...  
2  [0.06929007, 0.11022584, 0.06681558, -0.021043...  
3  [0.066286325, 0.07663056, 0.049377903, -0.0072...  
4  [0.08022203, 0.08846704, 0.060778756, -0.02170...  


In [None]:
X_english = english_new2.iloc[:, 2]  # CLS embedding
y_english = english_new2.iloc[:, 1]  # Sentiment labels

X_english = np.array(X_english.tolist())  # Convert to NumPy array

# Split the data into train and test set
X_train, X_test, y_train, y_test = train_test_split(X_english, y_english, test_size=0.2, random_state=42)

# Initialize logistic regression model
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

# Evaluate the performance
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print(f"Precision: {precision_score(y_test, y_pred)}")
print(f"Recall: {recall_score(y_test, y_pred)}")
print(f"F1 score: {f1_score(y_test, y_pred)}")
print(f"Confusion matrix: {confusion_matrix(y_test, y_pred)}")

Accuracy: 0.9
Precision: 0.9243243243243243
Recall: 0.868020304568528
F1 score: 0.8952879581151832
Confusion matrix: [[189  14]
 [ 26 171]]


# mBART

In [None]:
english_new2= english_new.copy()

# Load mBART
tokenizer = AutoTokenizer.from_pretrained('facebook/mbart-large-50-many-to-many-mmt')
model = AutoModel.from_pretrained('facebook/mbart-large-50-many-to-many-mmt')

# Function to get [CLS] token embedding for a given sentence
def get_cls_embedding(sentence):
    # Tokenize the sentence and return PyTorch tensors
    inputs = tokenizer(sentence, return_tensors="pt", truncation=True, padding=True, max_length=128)

    # Get model outputs (embedding for each token in the sentence)
    with torch.no_grad():
        outputs = model(**inputs)

    # Extract the [CLS] token's embedding (first token)
    cls_embedding = outputs.last_hidden_state[0][0].numpy()  # Convert to NumPy array
    return cls_embedding

# Apply the function to the first column
english_new2['CLS_embedding'] = english_new2.iloc[:, 0].apply(get_cls_embedding)

# `english_new2['CLS_embedding']` contains the embeddings
print(english_new2.head())

tokenizer_config.json:   0%|          | 0.00/529 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/649 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

                                              Review  Rating  \
0  think twice booking breezes, just got breezes ...       0   
1  excellent value money just little finding hote...       1   
2  new enter modern hotel anonymous doorway calla...       0   
3  hotel wife just returned 14 days inna grand ba...       1   
4  stay no choice located shinjuku near train sta...       0   

                                       CLS_embedding  
0  [1.0167671, 0.39027333, -1.079385, -0.79432875...  
1  [1.0671965, 0.5104162, -1.1184089, -0.7852819,...  
2  [0.9918766, 0.45207736, -1.3687835, -0.7687571...  
3  [0.6574287, 0.3549424, -0.8083833, -0.87768316...  
4  [1.1680708, 0.35562834, -0.87934947, -0.950330...  


In [None]:
X_english = english_new2.iloc[:, 2]  # CLS embedding
y_english = english_new2.iloc[:, 1]  # Sentiment labels

X_english = np.array(X_english.tolist())  # Convert to NumPy array

# Split the data into train and test set
X_train, X_test, y_train, y_test = train_test_split(X_english, y_english, test_size=0.2, random_state=42)

# Initialize logistic regression model
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

# Evaluate the performance
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print(f"Precision: {precision_score(y_test, y_pred)}")
print(f"Recall: {recall_score(y_test, y_pred)}")
print(f"F1 score: {f1_score(y_test, y_pred)}")
print(f"Confusion matrix: {confusion_matrix(y_test, y_pred)}")

Accuracy: 0.9025
Precision: 0.9157894736842105
Recall: 0.883248730964467
F1 score: 0.8992248062015504
Confusion matrix: [[187  16]
 [ 23 174]]


# Mandarin Chinese data

Pre-processing

In [None]:
# Get data of specific labels
positive_reviews = chinese[chinese['label'] == 1]
negative_reviews = chinese[chinese['label'] == 0]

# Limit to 2000 reviews each
positive_reviews = positive_reviews.sample(n=2000, random_state=42)
negative_reviews = negative_reviews.sample(n=2000, random_state=42)

# Concatenate
balanced_data = pd.concat([positive_reviews, negative_reviews])

# Shuffle the combined data
chinese_new = balanced_data.sample(frac=1, random_state=42).reset_index(drop=True)
chinese_new = chinese_new[:2000]

print(chinese_new.head())

   label                                             review
0      1                              酒店还是不错，不过携程的价格还不够好一点。
1      0                    第一次，也是最后一次，服务与星级不成正比，房间设施也非常一般！
2      1  这是时隔两年第二次入住这个宾馆了。环境依旧很好，第一次和朋友入住时一致认为很灵，第二次陪老妈...
3      0  1、两天不作房间清洁，服务员告诉我，忘记了；2、只要洗澡，卫生间就漏水；3、免费的Inter...
4      0  住过所有三星酒店感觉最差的一家.1,周边环境极差,2.二楼的卡啦OK感觉很暧昧,电梯上有很"...


# mBERT

In [None]:
# Initialize the tokenizer and model for mBERT
tokenizer = AutoTokenizer.from_pretrained('bert-base-multilingual-cased')
model = AutoModel.from_pretrained('bert-base-multilingual-cased')

# Function to get [CLS] token embedding for a given sentence
def get_cls_embedding(sentence):
    # Tokenize the sentence and return PyTorch tensors
    inputs = tokenizer(sentence, return_tensors="pt", truncation=True, padding=True, max_length=128)

    # Get model outputs (embedding for each token in the sentence)
    with torch.no_grad():
        outputs = model(**inputs)

    # Extract the [CLS] token's embedding (first token)
    cls_embedding = outputs.last_hidden_state[0][0].numpy()  # Convert to NumPy array
    return cls_embedding


chinese_new_copy = chinese_new.copy()

# Apply the function to the second column
chinese_new_copy['CLS_embedding'] = chinese_new_copy.iloc[:, 1].apply(get_cls_embedding)

# `chinese_new_copy['CLS_embedding']` contains the embeddings
print(chinese_new_copy.head())

   label                                             review  \
0      1                              酒店还是不错，不过携程的价格还不够好一点。   
1      0                    第一次，也是最后一次，服务与星级不成正比，房间设施也非常一般！   
2      1  这是时隔两年第二次入住这个宾馆了。环境依旧很好，第一次和朋友入住时一致认为很灵，第二次陪老妈...   
3      0  1、两天不作房间清洁，服务员告诉我，忘记了；2、只要洗澡，卫生间就漏水；3、免费的Inter...   
4      0  住过所有三星酒店感觉最差的一家.1,周边环境极差,2.二楼的卡啦OK感觉很暧昧,电梯上有很"...   

                                       CLS_embedding  
0  [-0.084317625, 0.024835369, 0.053243082, 0.219...  
1  [-0.22353387, 0.28405857, -0.12228805, 0.17710...  
2  [-0.20742114, 0.15495782, 0.12652019, 0.013782...  
3  [-0.03661175, 0.05029537, -0.01501677, 0.04467...  
4  [0.116759725, -0.3048196, 0.2627409, -0.016275...  


In [None]:
X_chinese = chinese_new_copy.iloc[:, 2]  # CLS embedding
y_chinese = chinese_new_copy.iloc[:, 0]  # Sentiment labels

X_chinese = np.array(X_chinese.tolist())  # Convert to NumPy array

# Split the data into train and test set
X_train, X_test, y_train, y_test = train_test_split(X_chinese, y_chinese, test_size=0.2, random_state=42)

# Initialize logistic regression model
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

# Evaluate the performance
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print(f"Precision: {precision_score(y_test, y_pred)}")
print(f"Recall: {recall_score(y_test, y_pred)}")
print(f"F1 score: {f1_score(y_test, y_pred)}")
print(f"Confusion matrix: {confusion_matrix(y_test, y_pred)}")

Accuracy: 0.7625
Precision: 0.7936507936507936
Recall: 0.7281553398058253
F1 score: 0.759493670886076
Confusion matrix: [[155  39]
 [ 56 150]]


# XLM-R

In [None]:
# Initialize the tokenizer and model for XLM-R
tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-base')
model = AutoModel.from_pretrained('xlm-roberta-base')

# Function to get [CLS] token embedding for a given sentence
def get_cls_embedding(sentence):
    # Tokenize the sentence and return PyTorch tensors
    inputs = tokenizer(sentence, return_tensors="pt", truncation=True, padding=True, max_length=128)

    # Get model outputs (embedding for each token in the sentence)
    with torch.no_grad():
        outputs = model(**inputs)

    # Extract the [CLS] token's embedding (first token)
    cls_embedding = outputs.last_hidden_state[0][0].numpy()  # Convert to NumPy array
    return cls_embedding


chinese_new_copy = chinese_new.copy()

# Apply the function
chinese_new_copy['CLS_embedding'] = chinese_new_copy.iloc[:, 1].apply(get_cls_embedding)

# chinese_new_copy['CLS_embedding'] contains the embeddings
print(chinese_new_copy.head())

   label                                             review  \
0      1                              酒店还是不错，不过携程的价格还不够好一点。   
1      0                    第一次，也是最后一次，服务与星级不成正比，房间设施也非常一般！   
2      1  这是时隔两年第二次入住这个宾馆了。环境依旧很好，第一次和朋友入住时一致认为很灵，第二次陪老妈...   
3      0  1、两天不作房间清洁，服务员告诉我，忘记了；2、只要洗澡，卫生间就漏水；3、免费的Inter...   
4      0  住过所有三星酒店感觉最差的一家.1,周边环境极差,2.二楼的卡啦OK感觉很暧昧,电梯上有很"...   

                                       CLS_embedding  
0  [0.21132883, 0.14600998, 0.10044359, -0.058698...  
1  [0.15243456, 0.071038656, 0.07884487, -0.01282...  
2  [0.13953419, 0.05961455, 0.08232439, -0.013806...  
3  [0.117984496, 0.08286513, 0.057191987, -0.0654...  
4  [0.11744339, 0.091990285, 0.08524982, -0.01401...  


In [None]:
X_chinese = chinese_new_copy.iloc[:, 2]  # CLS embedding
y_chinese = chinese_new_copy.iloc[:, 0]  # Sentiment labels

X_chinese = np.array(X_chinese.tolist())  # Convert to NumPy array

# Split the data into train and test set
X_train, X_test, y_train, y_test = train_test_split(X_chinese, y_chinese, test_size=0.2, random_state=42)

# Initialize logistic regression model
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

# Evaluate the performance
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print(f"Precision: {precision_score(y_test, y_pred)}")
print(f"Recall: {recall_score(y_test, y_pred)}")
print(f"F1 score: {f1_score(y_test, y_pred)}")
print(f"Confusion matrix: {confusion_matrix(y_test, y_pred)}")

Accuracy: 0.8425
Precision: 0.8864864864864865
Recall: 0.7961165048543689
F1 score: 0.8388746803069054
Confusion matrix: [[173  21]
 [ 42 164]]


# mBART

In [None]:
# Load mBART
tokenizer = AutoTokenizer.from_pretrained('facebook/mbart-large-50-many-to-many-mmt')
model = AutoModel.from_pretrained('facebook/mbart-large-50-many-to-many-mmt')

# Function to get [CLS] token embedding for a given sentence
def get_cls_embedding(sentence):
    # Tokenize the sentence and return PyTorch tensors
    inputs = tokenizer(sentence, return_tensors="pt", truncation=True, padding=True, max_length=128)

    # Get model outputs (embedding for each token in the sentence)
    with torch.no_grad():
        outputs = model(**inputs)

    # Extract the [CLS] token's embedding (first token)
    cls_embedding = outputs.last_hidden_state[0][0].numpy()  # Convert to NumPy array
    return cls_embedding


chinese_new_copy = chinese_new.copy()

# Apply the function
chinese_new_copy['CLS_embedding'] = chinese_new_copy.iloc[:, 1].apply(get_cls_embedding)

# chinese_new_copy['CLS_embedding'] contains the embeddings
print(chinese_new_copy.head())

   label                                             review  \
0      1                              酒店还是不错，不过携程的价格还不够好一点。   
1      0                    第一次，也是最后一次，服务与星级不成正比，房间设施也非常一般！   
2      1  这是时隔两年第二次入住这个宾馆了。环境依旧很好，第一次和朋友入住时一致认为很灵，第二次陪老妈...   
3      0  1、两天不作房间清洁，服务员告诉我，忘记了；2、只要洗澡，卫生间就漏水；3、免费的Inter...   
4      0  住过所有三星酒店感觉最差的一家.1,周边环境极差,2.二楼的卡啦OK感觉很暧昧,电梯上有很"...   

                                       CLS_embedding  
0  [0.86533076, 0.45918104, -0.8063595, -1.030598...  
1  [1.0130745, 0.52224183, -0.9112447, -0.5969363...  
2  [1.1121508, 1.0001096, 0.071922615, -1.0577495...  
3  [1.1682708, 0.8721357, -0.46876866, -0.4884133...  
4  [1.5721874, 0.7377444, -0.4971084, -0.88967603...  


In [None]:
X_chinese = chinese_new_copy.iloc[:, 2]  # CLS embedding
y_chinese = chinese_new_copy.iloc[:, 0]  # Sentiment labels

X_chinese = np.array(X_chinese.tolist())  # Convert to NumPy array

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_chinese, y_chinese, test_size=0.2, random_state=42)

# Initialize logistic regression model
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

# Evaluate the performance
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print(f"Precision: {precision_score(y_test, y_pred)}")
print(f"Recall: {recall_score(y_test, y_pred)}")
print(f"F1 score: {f1_score(y_test, y_pred)}")
print(f"Confusion matrix: {confusion_matrix(y_test, y_pred)}")

Accuracy: 0.8375
Precision: 0.8405797101449275
Recall: 0.8446601941747572
F1 score: 0.8426150121065376
Confusion matrix: [[161  33]
 [ 32 174]]


# Spanish data

Pre-processing

In [None]:
spanish_new = spanish.copy()

# Remove all instances where label is 3, cause those reviews are neutral and not needed
spanish_new = spanish_new[spanish_new.iloc[:, 6] != 3]

# Shuffle and reset index
spanish_new = spanish_new.sample(frac=1, random_state=42).reset_index(drop=True)

# The amount of data being used
spanish_new = spanish_new[:2000]

# mBERT

In [None]:
# Initialize the tokenizer and model for mBERT
tokenizer = AutoTokenizer.from_pretrained('bert-base-multilingual-cased')
model = AutoModel.from_pretrained('bert-base-multilingual-cased')

# Function to get [CLS] token embedding for a given sentence
def get_cls_embedding(sentence):
    # Tokenize the sentence and return PyTorch tensors
    inputs = tokenizer(sentence, return_tensors="pt", truncation=True, padding=True, max_length=128)

    # Get model outputs (embedding for each token in the sentence)
    with torch.no_grad():
        outputs = model(**inputs)

    # Extract the [CLS] token's embedding (first token)
    cls_embedding = outputs.last_hidden_state[0][0].numpy()  # Convert to NumPy array
    return cls_embedding

# Apply the function
spanish_new['CLS_embedding'] = spanish_new.iloc[:, 3].apply(get_cls_embedding)

# `english_new2['CLS_embedding']` contains the embeddings
print(spanish_new.head())

   Unnamed: 0                            title  rating  \
0        5639                         Repetiré       5   
1        3717               Si puedo no vuelvo       1   
2        7610  Muy buen hotel, recien abierto.       5   
3        6519                   Una sola noche       5   
4         291  No apto para comidas de empresa       2   

                                         review_text  \
0  La habitación es cómoda, con los espacios sufi...   
1  Antiguo..algo desfasado...el servicio parece q...   
2  Bastante cerca del centro de sevilla, servicio...   
3  Es la segunda vez que nos alojamos  en este ho...   
4  El hotel es bonito y tiene una de las mejores ...   

                                    location                    hotel  label  \
0      Seville_Province_of_Seville_Andalucia    Hotel_Adriano_Sevilla      1   
1  Torredonjimeno_Province_of_Jaen_Andalucia              Hotel_Twist      0   
2      Seville_Province_of_Seville_Andalucia   Eurostars_Guadalquivir     

In [None]:
X_spanish = spanish_new.iloc[:, 7]  # CLS embedding
y_spanish = spanish_new.iloc[:, 6]  # Sentiment labels

X_spanish = np.array(X_spanish.tolist())  # Convert to NumPy array

# Split the data into train and test set
X_train, X_test, y_train, y_test = train_test_split(X_spanish, y_spanish, test_size=0.2, random_state=42)

# Initialize logistic regression model
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

# Evaluate the performance
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print(f"Precision: {precision_score(y_test, y_pred)}")
print(f"Recall: {recall_score(y_test, y_pred)}")
print(f"F1 score: {f1_score(y_test, y_pred)}")
print(f"Confusion matrix: {confusion_matrix(y_test, y_pred)}")

Accuracy: 0.9175
Precision: 0.9035532994923858
Recall: 0.9270833333333334
F1 score: 0.9151670951156813
Confusion matrix: [[189  19]
 [ 14 178]]


# XLM-R

In [None]:
# Initialize the tokenizer and model for XLM-R
tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-base')
model = AutoModel.from_pretrained('xlm-roberta-base')

# Function to get [CLS] token embedding for a given sentence
def get_cls_embedding(sentence):
    # Tokenize the sentence and return PyTorch tensors
    inputs = tokenizer(sentence, return_tensors="pt", truncation=True, padding=True, max_length=128)

    # Get model outputs (embedding for each token in the sentence)
    with torch.no_grad():
        outputs = model(**inputs)

    # Extract the [CLS] token's embedding (first token)
    cls_embedding = outputs.last_hidden_state[0][0].numpy()  # Convert to NumPy array
    return cls_embedding

# Apply the function
spanish_new['CLS_embedding'] = spanish_new.iloc[:, 3].apply(get_cls_embedding)

# `english_new2['CLS_embedding']` contains the embeddings
print(spanish_new.head())

   Unnamed: 0                            title  rating  \
0        5639                         Repetiré       5   
1        3717               Si puedo no vuelvo       1   
2        7610  Muy buen hotel, recien abierto.       5   
3        6519                   Una sola noche       5   
4         291  No apto para comidas de empresa       2   

                                         review_text  \
0  La habitación es cómoda, con los espacios sufi...   
1  Antiguo..algo desfasado...el servicio parece q...   
2  Bastante cerca del centro de sevilla, servicio...   
3  Es la segunda vez que nos alojamos  en este ho...   
4  El hotel es bonito y tiene una de las mejores ...   

                                    location                    hotel  label  \
0      Seville_Province_of_Seville_Andalucia    Hotel_Adriano_Sevilla      1   
1  Torredonjimeno_Province_of_Jaen_Andalucia              Hotel_Twist      0   
2      Seville_Province_of_Seville_Andalucia   Eurostars_Guadalquivir     

In [None]:
X_spanish = spanish_new.iloc[:, 7]  # CLS embedding
y_spanish = spanish_new.iloc[:, 6]  # Sentiment labels

X_spanish = np.array(X_spanish.tolist())  # Convert to NumPy array

# Split the data into train and test set
X_train, X_test, y_train, y_test = train_test_split(X_spanish, y_spanish, test_size=0.2, random_state=42)

# Initialize logistic regression model
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

# Evaluate the performance
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print(f"Precision: {precision_score(y_test, y_pred)}")
print(f"Recall: {recall_score(y_test, y_pred)}")
print(f"F1 score: {f1_score(y_test, y_pred)}")
print(f"Confusion matrix: {confusion_matrix(y_test, y_pred)}")

Accuracy: 0.9675
Precision: 0.9890710382513661
Recall: 0.9427083333333334
F1 score: 0.9653333333333334
Confusion matrix: [[206   2]
 [ 11 181]]


# mBART

In [None]:
# Load mBART
tokenizer = AutoTokenizer.from_pretrained('facebook/mbart-large-50-many-to-many-mmt')
model = AutoModel.from_pretrained('facebook/mbart-large-50-many-to-many-mmt')

# Function to get [CLS] token embedding for a given sentence
def get_cls_embedding(sentence):
    # Tokenize the sentence and return PyTorch tensors
    inputs = tokenizer(sentence, return_tensors="pt", truncation=True, padding=True, max_length=128)

    # Get model outputs (embedding for each token in the sentence)
    with torch.no_grad():
        outputs = model(**inputs)

    # Extract the [CLS] token's embedding (first token)
    cls_embedding = outputs.last_hidden_state[0][0].numpy()  # Convert to NumPy array
    return cls_embedding

# Apply the function
spanish_new['CLS_embedding'] = spanish_new.iloc[:, 3].apply(get_cls_embedding)

# `english_new2['CLS_embedding']` contains the embeddings
print(spanish_new.head())

   Unnamed: 0                            title  rating  \
0        5639                         Repetiré       5   
1        3717               Si puedo no vuelvo       1   
2        7610  Muy buen hotel, recien abierto.       5   
3        6519                   Una sola noche       5   
4         291  No apto para comidas de empresa       2   

                                         review_text  \
0  La habitación es cómoda, con los espacios sufi...   
1  Antiguo..algo desfasado...el servicio parece q...   
2  Bastante cerca del centro de sevilla, servicio...   
3  Es la segunda vez que nos alojamos  en este ho...   
4  El hotel es bonito y tiene una de las mejores ...   

                                    location                    hotel  label  \
0      Seville_Province_of_Seville_Andalucia    Hotel_Adriano_Sevilla      1   
1  Torredonjimeno_Province_of_Jaen_Andalucia              Hotel_Twist      0   
2      Seville_Province_of_Seville_Andalucia   Eurostars_Guadalquivir     

In [17]:
X_spanish = spanish_new.iloc[:, 7]  # CLS embedding
y_spanish = spanish_new.iloc[:, 6]  # Sentiment labels

X_spanish = np.array(X_spanish.tolist())  # Convert to NumPy array

# Split the data into train and test set
X_train, X_test, y_train, y_test = train_test_split(X_spanish, y_spanish, test_size=0.2, random_state=42)

# Initialize logistic regression model
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

# Evaluate the performance
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print(f"Precision: {precision_score(y_test, y_pred)}")
print(f"Recall: {recall_score(y_test, y_pred)}")
print(f"F1 score: {f1_score(y_test, y_pred)}")
print(f"Confusion matrix: {confusion_matrix(y_test, y_pred)}")

Accuracy: 0.9675
Precision: 0.9735449735449735
Recall: 0.9583333333333334
F1 score: 0.9658792650918635
Confusion matrix: [[203   5]
 [  8 184]]


Nu andere modellen erbij pakken om te vergelijken