# Inference

Here we run inference for the 4 countries we want to analyze: Germany, Italy, Spain and France. In particular, we use our fine-tuned models. We keep only the comments our models are most confident about, i.e. those with a score above 90%, so that our interpretation rests on as little noise as possible while still leaving a good number of comments to analyze.

In [1]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AutoModelForSequenceClassification, AutoTokenizer, pipeline
import pandas as pd
from google.colab import files
from google.colab import drive
import zipfile
import os

In [2]:
# Open Colab's file upload dialog.
# Select all the files needed for inference here!
# If they were already uploaded, just press 'Cancel Upload'.
# `uploaded` keeps whatever files.upload() returns; the later cells shown in
# this notebook read the CSV from disk instead of using this variable.
uploaded = files.upload()

Saving spanish_combined_ready_for_inference.csv to spanish_combined_ready_for_inference.csv


In [3]:
# Mount Google Drive at /content/drive to access the fine-tuned models.
# (`drive` is also imported in the first cell; repeating the import here is
# harmless.)
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Set the base path. It is prepended to both the inference CSV file name and
# the Drive model directory when they are loaded below.
# On a local machine use a relative path, for example
# path = 'NLP labelled data preview/english set/'
# On Google Colab use this path
# '/content/'
path = '/content/'

In [5]:
# Define the language. It selects both the input file
# ('<language>_combined_ready_for_inference.csv') and the model directory
# ('<language>_model') in the loading cell below.
# NOTE(review): presumably one of 'german', 'spanish', 'french', 'italian',
# matching the per-language inference cells - confirm.
language = 'spanish'

In [6]:
# Read the comments of the selected language and work out where the matching
# fine-tuned model lives on the mounted Drive.
inference_comments_df = pd.read_csv(f'{path}{language}_combined_ready_for_inference.csv')
model_path = f'{path}drive/MyDrive/{language}_model'

In [11]:
# GERMAN
#
# Classifies every comment with the fine-tuned 3-class model, keeps only the
# predictions with a score above 0.90 and downloads them as 'german_results.csv'.
# NOTE(review): `model_path` is built from `language` in the loading cell, so
# this cell presumably expects language = 'german' - confirm before running.

model = AutoModelForSequenceClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Make sure the comments are strings
inference_comments = [str(comment) for comment in inference_comments_df['Comment'].tolist()]

# Define sentiment classes (position = class index predicted by the model)
sentiment_classes = ['negative', 'neutral', 'positive']

# Run the model in mini-batches. Tokenizing and forwarding all comments in a
# single call keeps the activations of the whole dataset in memory at once and
# can exhaust the system RAM on Colab. Because padding='max_length' gives every
# comment the same fixed length, batching does not change how a comment is padded.
batch_size = 256
predicted_sentiments = []
scores = []

with torch.no_grad():
    for batch_start in range(0, len(inference_comments), batch_size):
        batch_comments = inference_comments[batch_start:batch_start + batch_size]
        inputs = tokenizer(batch_comments, return_tensors="pt", padding='max_length', truncation=True, max_length=64)
        outputs = model(**inputs)

        # Apply softmax to get probabilities
        probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)

        # Predicted class = most probable class; score = its probability
        batch_classes = probabilities.argmax(dim=1).tolist()
        batch_scores = probabilities.max(dim=1).values.tolist()

        predicted_sentiments.extend(sentiment_classes[class_index] for class_index in batch_classes)
        scores.extend(batch_scores)

# Create dataframe with comments, the predicted sentiment and the scores
inference_df = pd.DataFrame({'Comment': inference_comments, 'Sentiment': predicted_sentiments, 'Score': scores})

# Only keep scores above 0.90
inference_df = inference_df[inference_df['Score'] > 0.90]

# Send to csv
inference_df.to_csv('german_results.csv', index=True)

# Download it to your local machine
files.download('german_results.csv')




<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [8]:
# SPANISH
#
# Classifies one chunk of the comments with the fine-tuned 2-class BERT model,
# keeps only the predictions with a score above 0.90 and downloads them as
# 'spanish_results_7.csv'.

model = BertForSequenceClassification.from_pretrained(model_path)
tokenizer = BertTokenizer.from_pretrained(model_path)

# Make sure the comments are strings
inference_comments = [str(comment) for comment in inference_comments_df['Comment'].tolist()]

# The dataset is processed chunk by chunk, one result file per chunk; this run
# covers everything from comment 70000 onwards. Change `chunk_start` (and the
# output file name below) to process another chunk.
# The index written to the CSV is relative to the start of the chunk.
chunk_start = 70000
inference_comments = inference_comments[chunk_start:]

sentiment_classes = ['negative', 'positive']

# Run the model in mini-batches: forwarding a whole chunk in a single call
# fills the system RAM and crashes Google Colab. Because padding='max_length'
# gives every comment the same fixed length, batching does not change how a
# comment is padded.
batch_size = 256
predicted_sentiments = []
scores = []

with torch.no_grad():
    for batch_start in range(0, len(inference_comments), batch_size):
        batch_comments = inference_comments[batch_start:batch_start + batch_size]
        inputs = tokenizer(batch_comments, return_tensors="pt", padding='max_length', truncation=True, max_length=128)
        outputs = model(**inputs)

        probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)

        # Predicted class = most probable class; score = its probability
        batch_classes = probabilities.argmax(dim=1).tolist()
        batch_scores = probabilities.max(dim=1).values.tolist()

        predicted_sentiments.extend(sentiment_classes[class_index] for class_index in batch_classes)
        scores.extend(batch_scores)

# Create dataframe with comments, the predicted sentiment and the scores
inference_df = pd.DataFrame({'Comment': inference_comments, 'Sentiment': predicted_sentiments, 'Score': scores})

# Only keep scores above 0.90
inference_df = inference_df[inference_df['Score'] > 0.90]

# Send to csv
inference_df.to_csv('spanish_results_7.csv', index=True)

# Download it to your local machine
files.download('spanish_results_7.csv')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [10]:
# FRENCH
#
# Classifies every comment with the fine-tuned star-rating model, maps the star
# labels to negative / neutral / positive, keeps the predictions whose score is
# at least 0.90 and downloads them as 'french_results.csv'.

model = AutoModelForSequenceClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)
analyzer = pipeline(task='text-classification', model=model, tokenizer=tokenizer)

# Load the data for inference and make sure the comments are strings
inference_comments = [str(comment) for comment in inference_comments_df['Comment'].tolist()]

# Convert the star labels returned by the model to sentiments
star_to_sentiment = {
    '1 star': 'negative',
    '2 stars': 'negative',
    '3 stars': 'neutral',
    '4 stars': 'positive',
    '5 stars': 'positive',
}

comments = []
labels = []
scores = []

for text in inference_comments:
    # For a single string the pipeline returns a one-element list holding the
    # top prediction as {'label': ..., 'score': ...}; that is the default, so
    # the deprecated `return_all_scores=False` argument is not needed.
    # truncation=True cuts over-long comments to the model's maximum input
    # length instead of failing on them.
    prediction = analyzer(text, truncation=True)[0]

    # Exclude the comments with a score below 0.90
    if prediction['score'] < 0.90:
        continue

    comments.append(text)
    # Labels that are not in the table are kept unchanged
    labels.append(star_to_sentiment.get(prediction['label'], prediction['label']))
    scores.append(prediction['score'])

# Create dataframe with comments, the predicted sentiment and the scores
inference_df = pd.DataFrame({'Comment': comments, 'Sentiment': labels, 'Score': scores})

# Send to csv
inference_df.to_csv('french_results.csv', index=True)
# Download it to your local machine
files.download('french_results.csv')






<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [13]:
# ITALIAN
#
# Classifies every comment with the fine-tuned model, lower-cases the
# NEGATIVE / POSITIVE labels, keeps the predictions whose score is at least
# 0.90 and downloads them as 'italian_results.csv'.

model = AutoModelForSequenceClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)
analyzer = pipeline(task='text-classification', model=model, tokenizer=tokenizer)

# Load the data for inference and make sure the comments are strings
inference_comments = [str(comment) for comment in inference_comments_df['Comment'].tolist()]

# Convert the labels returned by the model: NEGATIVE -> negative, POSITIVE -> positive
label_to_sentiment = {
    'NEGATIVE': 'negative',
    'POSITIVE': 'positive',
}

comments = []
labels = []
scores = []

for text in inference_comments:
    # For a single string the pipeline returns a one-element list holding the
    # top prediction as {'label': ..., 'score': ...}; that is the default, so
    # the deprecated `return_all_scores=False` argument is not needed.
    # truncation=True cuts over-long comments to the model's maximum input
    # length instead of failing on them.
    prediction = analyzer(text, truncation=True)[0]

    # Exclude the comments with a score below 0.90
    if prediction['score'] < 0.90:
        continue

    comments.append(text)
    # Labels that are not in the table are kept unchanged
    labels.append(label_to_sentiment.get(prediction['label'], prediction['label']))
    scores.append(prediction['score'])

# Create dataframe with comments, the predicted sentiment and the scores
inference_df = pd.DataFrame({'Comment': comments, 'Sentiment': labels, 'Score': scores})

# Send to csv
inference_df.to_csv('italian_results.csv', index=True)
# Download it to your local machine
files.download('italian_results.csv')




Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>