# Prediction

In [9]:
# Detect the runtime: the notebook is treated as running in Google Colab
# when the `google.colab` package can be imported.
try:
    import google.colab  # noqa: F401 -- imported only to probe availability
except ImportError:
    # Catch only the "package is missing" case. A bare `except:` would also
    # swallow KeyboardInterrupt/SystemExit and hide unrelated failures.
    googleColab = False
else:
    googleColab = True

print(f"In google colab: {googleColab}")

In google colab: False


In [10]:
import numpy as np
import pandas as pd
import subprocess, time
from datasets import Dataset, DatasetDict
import os

In [11]:
import torch

# Select the compute device: the first CUDA GPU when torch reports one,
# otherwise the CPU.
if torch.cuda.is_available():
    # Bind the device before running any diagnostics so that a failing
    # diagnostic can never leave `device` pointing at the CPU.
    device = torch.device("cuda:0")
    try:
        # The `nvidia-smi` report is informational only. The executable may be
        # missing from PATH even though CUDA is usable, which makes
        # subprocess.run raise FileNotFoundError (an OSError).
        running = subprocess.run(
            ["nvidia-smi"], stdout=subprocess.PIPE
        ).stdout.decode("utf-8", errors="replace")
    except OSError as err:
        running = f"nvidia-smi could not be executed: {err}"
    print(running)
else:
    device = torch.device("cpu")
    print("False")

False


In [12]:
# Project root: the current working directory on Colab, its parent directory
# everywhere else.
rootPath = os.getcwd() if googleColab else os.path.dirname(os.getcwd())

# The model and data folders are direct children of the project root.
modelsPath, dataPath = (os.path.join(rootPath, folder) for folder in ("models", "data"))

print(f"Data path: {dataPath}\n\nModels path: {modelsPath}")

Data path: c:\Users\CND1183F38\Documents\Github\ML-FastAPI-Docker\data

Models path: c:\Users\CND1183F38\Documents\Github\ML-FastAPI-Docker\models


In [13]:
# File name of the CSV that is read into `df_test` further down.
nameFile = "Language Detection.csv"
# Full path of that file inside the data folder (`dataPath`).
datasetPath = os.path.join(dataPath, nameFile)

In [14]:
import pandas as pd

# `header=0` together with `names=` discards the header row of the file and
# replaces it with these column names.
colnames = ["text", "label"]

# Read the CSV and keep a fixed random sample of 30 rows; `random_state=1`
# makes the sample identical on every run.
df_test = (
    pd.read_csv(datasetPath, header=0, names=colnames)
    .sample(n=30, random_state=1)
)
df_test

Unnamed: 0,text,label
7413,Data una codifica della (nota) conoscenza di b...,Italian
3931,alors la prochaine fois que vous entendrez par...,French
4195,bravo je l'apprécie vraiment.,French
2048,"[9][10][11][12] மேலும், இது அலெக்சா இணையத்தளத்...",Tamil
6945,prøv at kopiere min udtale nøjagtigt inklusive...,Danish
2580,"Edições em diferentes idiomas, cada uma sob co...",Portugeese
7770,quando ha starnutito oh oh e l'ultima fermata ...,Italian
2680,"Usuários não registrados são, em algum sentido...",Portugeese
5382,"No puedo creerlo, tienes que estar bromeando p...",Spanish
7311,Facendo riferimento alla Legge di Linus sullo ...,Italian


In [15]:
from tqdm.notebook import tqdm
import os

# Name of the model folder to load.
name_model = "finetuning-binary-language-classifier"
# Location of that folder inside the models directory (`modelsPath`).
output_dir = os.path.join(modelsPath, name_model)

In [16]:
from transformers import BertTokenizer
from transformers import BertForSequenceClassification

# Load the tokenizer and the sequence-classification model from the same
# local directory (`output_dir`).
tokenizer = BertTokenizer.from_pretrained(output_dir)
model = BertForSequenceClassification.from_pretrained(output_dir)

In [17]:
# Report the tokenizer's maximum sequence length and the number of labels of
# the loaded model, one per line.
print(
    f"Model max length: {tokenizer.model_max_length}",
    f"Model num labels: {model.num_labels}",
    sep="\n",
)

Model max length: 512
Model num labels: 2


In [18]:
from transformers import pipeline
# Build a text-classification pipeline around the loaded model and tokenizer,
# running on the device selected earlier.
pipe = pipeline(
    task="text-classification",
    model=model,
    tokenizer=tokenizer,
    device=device,
)

In [19]:
# Texts to classify, as a plain list of strings in the row order of `df_test`.
# The previous `Dataset.from_dict(df_test)["text"]` passed a DataFrame where a
# dict is expected and then indexed the column straight away, so no Dataset
# object ever reached the pipeline.
# NOTE(review): newer `datasets` releases appear to return a lazy column object
# from `dataset["text"]` instead of a list -- confirm. Building the list from
# pandas avoids depending on that.
dataForPrediction = df_test["text"].tolist()

In [20]:
# Keyword arguments forwarded to the pipeline call below. Despite the name,
# the dict holds more than tokenizer options: `top_k` and `batch_size` are
# passed along with `truncation` and `max_length`.
tokenizer_kwargs = dict(
    truncation=True,
    max_length=tokenizer.model_max_length,  # may be lowered if needed
    top_k=1,
    batch_size=256,
)

# Run the pipeline over every text and collect the outputs in input order,
# with a progress bar sized to the number of texts.
prediction = list(
    tqdm(
        pipe(dataForPrediction, **tokenizer_kwargs),
        total=len(dataForPrediction),
    )
)

  0%|          | 0/30 [00:00<?, ?it/s]

In [21]:
# Display names for the two ground-truth classes, keyed by integer class id.
mapTarget = dict([(0, "Non italiano"), (1, "Italiano")])


def mapTargetFun(row):
    """Return the display name that ``mapTarget`` holds for class id ``row``."""
    return mapTarget[row]


# Ground-truth column: 1 where the label is exactly "Italian", 0 for every
# other label, then translated to its display name.
df_test["target"] = (df_test["label"] == "Italian").astype(int).apply(mapTargetFun)

In [22]:
# Display names keyed by the label strings found in the pipeline outputs.
# This rebinds `mapTarget`, which previously used integer keys.
mapTarget = dict(LABEL_0="Non italiano", LABEL_1="Italiano")

# Each entry of `prediction` is indexed at [0] to get the best-scoring result.
# Store its display name and its score next to the ground truth.
df_test["prediction"] = [mapTarget[hits[0]["label"]] for hits in prediction]
df_test["score"] = [hits[0]["score"] for hits in prediction]
df_test

Unnamed: 0,text,label,target,prediction,score
7413,Data una codifica della (nota) conoscenza di b...,Italian,Italiano,Italiano,0.999783
3931,alors la prochaine fois que vous entendrez par...,French,Non italiano,Non italiano,0.999943
4195,bravo je l'apprécie vraiment.,French,Non italiano,Non italiano,0.999929
2048,"[9][10][11][12] மேலும், இது அலெக்சா இணையத்தளத்...",Tamil,Non italiano,Non italiano,0.999939
6945,prøv at kopiere min udtale nøjagtigt inklusive...,Danish,Non italiano,Non italiano,0.999943
2580,"Edições em diferentes idiomas, cada uma sob co...",Portugeese,Non italiano,Non italiano,0.999944
7770,quando ha starnutito oh oh e l'ultima fermata ...,Italian,Italiano,Italiano,0.999818
2680,"Usuários não registrados são, em algum sentido...",Portugeese,Non italiano,Non italiano,0.999942
5382,"No puedo creerlo, tienes que estar bromeando p...",Spanish,Non italiano,Non italiano,0.999946
7311,Facendo riferimento alla Legge di Linus sullo ...,Italian,Italiano,Italiano,0.999785
