In [1]:
import pandas as pd
import torch
import re
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import pipeline
import numpy as np

model_ckpt = "papluca/xlm-roberta-base-language-detection"
lang_tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
lang_model = AutoModelForSequenceClassification.from_pretrained(model_ckpt)


sent_tokenizer = BertTokenizer.from_pretrained("kk08/CryptoBERT")
sent_model = BertForSequenceClassification.from_pretrained("kk08/CryptoBERT")
sent_classifier = pipeline("sentiment-analysis", model=sent_model, tokenizer=sent_tokenizer, batch_size=16)



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/502 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/726 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

Device set to use cpu


In [2]:
text = "Bitcoin (BTC) touches $29k, Ethereum (ETH) Set To Explode, RenQ Finance (RENQ) Crosses Massive Milestone"
result = sent_classifier([text, text])

In [3]:
result

[{'label': 'LABEL_1', 'score': 0.9678454399108887},
 {'label': 'LABEL_1', 'score': 0.9678454399108887}]

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
import os
print(os.listdir("/content/drive/My Drive/DVA-spr2025"))

['tweets.csv', 'dva_small_dataset.ipynb', 'out_25k.csv', 'out_26k.csv', 'out_27k.csv']


In [6]:
# #!/bin/bash
# curl -L -o ~/Downloads/bitcoin-tweets-20160101-to-20190329.zip\
#   https://www.kaggle.com/api/v1/datasets/download/alaix14/bitcoin-tweets-20160101-to-20190329

In [7]:
def clean_text(text):
    # Remove URLs
    text = re.sub(r'http\S+|www\S+', '', text)
    # Remove RT prefix
    pattern = r'^RT\s*@[\w]+:'
    # Remove the pattern and strip any extra spaces
    text = re.sub(pattern, '', text, flags=re.IGNORECASE).strip()
    # Normalize spaces
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def pred_lang(text_ls):
    inputs = lang_tokenizer(text_ls, padding=True, truncation=True, return_tensors="pt")

    with torch.no_grad():
        logits = lang_model(**inputs).logits

    preds = torch.softmax(logits, dim=-1)
    vals, idxs = torch.max(preds, dim=1)
    return [{id2lang[k.item()]: v.item()} for k, v in zip(idxs, vals)]


# data = pd.read_csv('/content/drive/My Drive/DVA-spr2025/tweets.csv', sep=';', nrows=10_000)
# data['clean_text'] = data['text'].map(clean_text)
# data = data[data['clean_text'].str.split(' ').str.len() > 1]
# data.reset_index(inplace=True, drop=True)

# # Map raw predictions to languages
id2lang = lang_model.config.id2label
# lang_pred = pred_lang(data['clean_text'].to_list())
# lang_pred_df = pd.DataFrame([(lang, val) for d in lang_pred for lang, val in d.items()],
#                   columns=['lang', 'lang_score'])

# data = pd.concat([data, lang_pred_df], axis=1)
# data = data[data['lang']=='en']
# data.reset_index(inplace=True, drop=True)

# hashtag_pattern = r'#([A-Za-z0-9_]+)'
# data['topic'] = data['clean_text'].str.extractall(hashtag_pattern)[0].groupby(level=0).apply(list)
# data['topic'] = data['topic'].map(lambda x: '' if type(x)==float else ', '.join(x))

# sent_pred = sent_classifier([x if type(x)!=float else '' for x in data['clean_text'].to_list()])
# sent_pred_df = pd.DataFrame(sent_pred)
# sent_pred_df.columns=['sentiment_label', 'sentiment_score']
# data = pd.concat([data, sent_pred_df], axis=1)
# data['sentiment_label'] = data['sentiment_label'].map(lambda x: 'negative' if x=='LABEL_0' else 'positive')


# data.reset_index(inplace=True, drop=True)

In [8]:
def process_chunk(chunk):
    chunk['clean_text'] = chunk['text'].map(clean_text)
    chunk = chunk[chunk['clean_text'].str.split(' ').str.len() > 1]
    chunk.reset_index(inplace=True, drop=True)

    # Map raw predictions to languages
    lang_pred = pred_lang(chunk['clean_text'].to_list())
    lang_pred_df = pd.DataFrame([(lang, val) for d in lang_pred for lang, val in d.items()],
                                columns=['lang', 'lang_score'])

    chunk = pd.concat([chunk, lang_pred_df], axis=1)
    chunk = chunk[chunk['lang'] == 'en']
    chunk.reset_index(inplace=True, drop=True)

    hashtag_pattern = r'#([A-Za-z0-9_]+)'
    chunk['topic'] = chunk['clean_text'].str.extractall(hashtag_pattern)[0].groupby(level=0).apply(list)
    chunk['topic'] = chunk['topic'].map(lambda x: '' if type(x) == float else ', '.join(x))

    sent_pred = sent_classifier([x if type(x) != float else '' for x in chunk['clean_text'].to_list()])
    sent_pred_df = pd.DataFrame(sent_pred)
    sent_pred_df.columns = ['sentiment_label', 'sentiment_score']
    chunk = pd.concat([chunk, sent_pred_df], axis=1)
    chunk['sentiment_label'] = chunk['sentiment_label'].map(lambda x: 'negative' if x == 'LABEL_0' else 'positive')

    chunk.reset_index(inplace=True, drop=True)

    return chunk

In [9]:
# Read the CSV in chunks
chunks = pd.read_csv('/content/drive/My Drive/DVA-spr2025/tweets.csv', sep=';',
                     chunksize=2_000, nrows=4_000)

# Process each chunk and store results
processed_chunks = [process_chunk(chunk) for chunk in chunks]

# Concatenate all processed chunks into a single DataFrame
data = pd.concat(processed_chunks, ignore_index=True)

In [10]:
data

Unnamed: 0,id,user,fullname,url,timestamp,replies,likes,retweets,text,clean_text,lang,lang_score,topic,sentiment_label,sentiment_score
0,1132977023893139456,3eyedbran,Bran - 3 Eyed Raven,,2019-05-27 11:49:06+00,0,2,1,Another Test tweet that wasn't caught in the s...,Another Test tweet that wasn't caught in the s...,en,0.897406,,negative,0.782859
1,1132977089089556481,DetroitCrypto,J. Scardina,,2019-05-27 11:49:22+00,0,0,0,Current Crypto Prices! \n\nBTC: $8721.99 USD\n...,Current Crypto Prices! BTC: $8721.99 USD ETH: ...,en,0.672166,,positive,0.940463
2,1132977101638897665,evilrobotted,evilrobotted,,2019-05-27 11:49:25+00,0,0,0,@nwoodfine We have been building on the real #...,@nwoodfine We have been building on the real #...,en,0.845931,bitcoin,positive,0.962464
3,1132977132714561536,MLWright15,ML Wright,,2019-05-27 11:49:32+00,0,0,0,"CHANGE IS COMING...GET READY!!! Boom, Another ...","CHANGE IS COMING...GET READY!!! Boom, Another ...",en,0.801848,,positive,0.964282
4,1132977076921933825,ltonews,LTONEWS,,2019-05-27 11:49:19+00,0,14,2,One of the useful articles of Stefan; here is ...,One of the useful articles of Stefan; here is ...,en,0.825371,"ltonetwork, Eth, xrpcommmunity, crypto, xlm, x...",positive,0.931335
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2582,1126824221894529024,CryptoTraderPro,Crypto Trader Pro,,2019-05-10 10:20:04+00,0,0,0,Ⓜ via → https://t.co/4sRUrToww3 VIDEO ANALYSIS...,Ⓜ via → VIDEO ANALYSIS: SHOULD YOU BUY INTO BI...,en,0.361812,,positive,0.955238
2583,1126824223585050624,diecast_talk,diecast talk,,2019-05-10 10:20:04+00,0,0,0,To those buttcoiners who posted this meme to t...,To those buttcoiners who posted this meme to t...,en,0.799272,,negative,0.850114
2584,1126824224860065793,rossdonna1500,DLR,,2019-05-10 10:20:05+00,0,0,0,Binance pledges to ‘significantly’ increase se...,Binance pledges to ‘significantly’ increase se...,en,0.725170,,positive,0.964759
2585,1126824223379546112,bitcoinagile,BitcoinAgile,,2019-05-10 10:20:04+00,0,0,0,#bitcoin Target Road-map for COINBASE:BTCUSD b...,#bitcoin Target Road-map for COINBASE:BTCUSD b...,en,0.659320,"bitcoin, BTCUSD",positive,0.951669


In [None]:
for i
# Read the CSV in chunks
chunks = pd.read_csv('/content/drive/My Drive/DVA-spr2025/tweets.csv', sep=';',
                     chunksize=2_000, nrows=100_000)

# Process each chunk and store results
processed_chunks = [process_chunk(chunk) for chunk in chunks]

# Concatenate all processed chunks into a single DataFrame
data = pd.concat(processed_chunks, ignore_index=True)
data.to_csv(f'/content/drive/My Drive/DVA-spr2025/out_{}k.csv')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['topic'] = chunk['clean_text'].str.extractall(hashtag_pattern)[0].groupby(level=0).apply(list)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['topic'] = chunk['topic'].map(lambda x: '' if type(x) == float else ', '.join(x))


In [None]:
import pandas as pd
from tqdm import tqdm


# Define the chunk size and total rows to process
chunk_size = 1000
total_rows = 100_000
# Step 1: Read the header from the first row of the CSV
header = pd.read_csv('/content/drive/My Drive/DVA-spr2025/tweets.csv',
                     sep=';',
                     nrows=0)  # Only read the header
column_names = header.columns.tolist()

# List to store processed chunks
processed_chunks = []

# Calculate number of iterations for tqdm
n_iterations = (total_rows - 1000) // chunk_size

# Loop through the ranges with tqdm
for start in tqdm(range(1000, total_rows, chunk_size),
                 total=n_iterations,
                 desc="Processing chunks"):

# # Loop through the ranges
# for start in range(1000, total_rows, chunk_size):
    # Read the specific range of rows
    chunk = pd.read_csv('/content/drive/My Drive/DVA-spr2025/tweets.csv',
                       sep=';',
                       skiprows=start,  # Start at this row
                       nrows=chunk_size,  # Read this many rows
                       names=column_names)

    # Process the chunk and append to list
    processed_chunk = process_chunk(chunk)
    processed_chunks.append(processed_chunk)

    # Concatenate all processed chunks into a single DataFrame
    data = pd.concat(processed_chunks, ignore_index=True)
    data.to_csv(f'/content/drive/My Drive/DVA-spr2025/out_{start//1000}k.csv')



Processing chunks:   6%|▌         | 6/99 [27:24<7:14:30, 280.33s/it]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Processing chunks:  27%|██▋       | 27/99 [2:02:47<5:10:19, 258.61s/it]

In [15]:
processed_chunks = [pd.read_csv(f'/content/drive/My Drive/DVA-spr2025/out_27k.csv', encoding='utf-8-sig')]

In [None]:
import pandas as pd
from tqdm import tqdm


# Define the chunk size and total rows to process
chunk_size = 1000
total_rows = 100_000
# Step 1: Read the header from the first row of the CSV
header = pd.read_csv('/content/drive/My Drive/DVA-spr2025/tweets.csv',
                     sep=';',
                     nrows=0)  # Only read the header
column_names = header.columns.tolist()

# List to store processed chunks
# processed_chunks = []

# Calculate number of iterations for tqdm
n_iterations = (total_rows - 1000) // chunk_size

# Loop through the ranges with tqdm
for start in tqdm(range(27_000, total_rows, chunk_size),
                 total=n_iterations,
                 desc="Processing chunks"):

# # Loop through the ranges
# for start in range(1000, total_rows, chunk_size):
    # Read the specific range of rows
    chunk = pd.read_csv('/content/drive/My Drive/DVA-spr2025/tweets.csv',
                       sep=';',
                       skiprows=start,  # Start at this row
                       nrows=chunk_size,  # Read this many rows
                       names=column_names)

    # Process the chunk and append to list
    processed_chunk = process_chunk(chunk)
    processed_chunks.append(processed_chunk)

    # Concatenate all processed chunks into a single DataFrame
    data = pd.concat(processed_chunks, ignore_index=True)
    data.to_csv(f'/content/drive/My Drive/DVA-spr2025/out_{start//1000}k.csv', encoding='utf-8-sig')



Processing chunks:   7%|▋         | 7/99 [1:32:54<24:11:29, 946.62s/it]

In [11]:
chunk

Unnamed: 0,1132979855174164480,rat_race,jumbo,Unnamed: 3,2019-05-27 12:00:21+00,0,0.1,0.2,【5月中に書けば来月ビットコインがもらえます】Tadacoinの紹介記事をブログで書いたら500円分のビットコインプレゼント! https://t.co/hrNLJXb3RN #Tadacoin #タダコイン #Bitcoin #Faucet #お小遣い #副業 #懸賞 #ポイントサイト #ポイ活 @tadacoinさんから
0,1132979856507981825,ueno_sakura,ももか@仮想通貨アドバイザー,,2019-05-27 12:00:22+00,0,0,0,ビットコインから始まる仮想通貨の流れは誰にも止められません。詳細はこちらをアクセス→→ h...
1,1132979857489584128,LoremCrypto,LoremCrypto,,2019-05-27 12:00:22+00,0,0,0,Litecoin ICO zero-knowledge proof miner full n...
2,1132979860018782214,BitBase_es,BitBase ₿,,2019-05-27 12:00:23+00,0,0,0,Un informe reveló que el Tribunal Popular de H...
3,1132979862807953408,CoinTradingBot,Coin Trading Analytics,,2019-05-27 12:00:23+00,0,0,0,"Top 100 avg 1h return: -0.2±1.1%; 28 up, 72 do..."
4,1132979863755747333,litecoinstrings,Litecoin Strings,,2019-05-27 12:00:23+00,0,0,0,\BJ/BTC.COM/LTC #litecoin https://t.co/D3hUdQTh4X
...,...,...,...,...,...,...,...,...,...
995,1132982282635976704,tr_tradingview,TradingView Türkiye,,2019-05-27 12:10:00+00,0,0,0,#BTCUSD - Bitcoin - TradingView - https://t.co...
996,1132982283085012993,diegogurpegui,Diego H. Gurpegui,,2019-05-27 12:10:00+00,0,0,0,Deciding the open world-changing technology to...
997,1126819144165462016,djibrilww,$Djibril,,2019-05-10 09:59:53+00,0,0,0,https://t.co/OanG0TkGdR
998,1132982281910534144,airdro7,エアドロップ（airdrop)@仮想通貨,,2019-05-27 12:10:00+00,0,0,0,[BTC Surged by 1.07% Within 5 Mins]\n\nCoinNes...


In [1]:
data

NameError: name 'data' is not defined

In [11]:
data.to_csv("temp_4k.csv")

In [None]:
data.head()

Unnamed: 0,index,id,user,fullname,url,timestamp,replies,likes,retweets,text,clean_text,lang,lang_score,topic,sentiment_label,sentiment_score
0,0,1132977055300300800,KamdemAbdiel,Abdiel kamdem,,2019-05-27 11:49:14+00,0,0,0,È appena uscito un nuovo video! LES CRYPTOMONN...,È appena uscito un nuovo video! LES CRYPTOMONN...,it,0.996178,,LABEL_1,0.933729
1,1,1132977073402736640,bitcointe,Bitcointe,,2019-05-27 11:49:18+00,0,0,0,Cardano: Digitize Currencies; EOS https://t.co...,Cardano: Digitize Currencies; EOS 6500% ROI; A...,it,0.98346,"FolloForFolloBack, follo4folloback, followforf...",LABEL_1,0.950306
2,2,1132977023893139456,3eyedbran,Bran - 3 Eyed Raven,,2019-05-27 11:49:06+00,0,2,1,Another Test tweet that wasn't caught in the s...,Another Test tweet that wasn't caught in the s...,en,0.897405,,LABEL_0,0.78286
3,3,1132977089089556481,DetroitCrypto,J. Scardina,,2019-05-27 11:49:22+00,0,0,0,Current Crypto Prices! \n\nBTC: $8721.99 USD\n...,Current Crypto Prices! BTC: $8721.99 USD ETH: ...,en,0.672167,,LABEL_1,0.940463
4,4,1132977092340191232,mmursaleen72,Muhammad Mursaleen,,2019-05-27 11:49:23+00,0,0,0,Spiv (Nosar Baz): BITCOIN Is An Asset &amp; NO...,Spiv (Nosar Baz): BITCOIN Is An Asset &amp; NO...,it,0.804191,,LABEL_1,0.932896


In [None]:
np.isnan(data['topic'][1])

TypeError: ufunc 'isnan' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''

In [None]:
data['topic'].isnull().index

Index([ 0,  1,  2,  3,  4,  5,  6,  7,  9, 10, 11, 12, 13, 14, 16, 17, 18, 19,
       20, 21, 22, 23, 24, 25, 26, 27, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38,
       39, 40, 41, 42, 43, 45, 46, 47, 49,  8, 15, 28, 44],
      dtype='int64')

In [None]:
data.loc[data['topic'].isnull().index, 'topic'] = [[] for _ in range(data['topic'].isnull().sum())]

ValueError: Must have equal len keys and value when setting with an ndarray

In [None]:
pip show pandas

Name: pandas
Version: 2.2.3
Summary: Powerful data structures for data analysis, time series, and statistics
Home-page: https://pandas.pydata.org
Author: 
Author-email: The Pandas Development Team <pandas-dev@python.org>
License: BSD 3-Clause License
        
        Copyright (c) 2008-2011, AQR Capital Management, LLC, Lambda Foundry, Inc. and PyData Development Team
        All rights reserved.
        
        Copyright (c) 2011-2023, Open source contributors.
        
        Redistribution and use in source and binary forms, with or without
        modification, are permitted provided that the following conditions are met:
        
        * Redistributions of source code must retain the above copyright notice, this
          list of conditions and the following disclaimer.
        
        * Redistributions in binary form must reproduce the above copyright notice,
          this list of conditions and the following disclaimer in the documentation
          an

In [None]:
[[]*(data['topic'].isnull().sum())], data['topic'].isnull().sum()

([[]], 28)

In [None]:
data['text'].to_list()

['È appena uscito un nuovo video! LES CRYPTOMONNAIES QUI PULVÉRISENT BITCOIN EN 2019 https://t.co/yCsQMvRnyS',
 'Cardano: Digitize Currencies; EOS https://t.co/1kTKqKEBlS 6500% ROI; AT&amp;T Bitcoin Bill Pay https://t.co/eQCwOXKHK0   |  Cardano (ADA) 🌏📢😎🤑💵 | #FolloForFolloBack #follo4folloback #followforfollow #bitcointe #cryptocurrency',
 "Another Test tweet that wasn't caught in the stream ! bitcoin",
 'Current Crypto Prices! \n\nBTC: $8721.99 USD\nETH: $266.62 USD\nLTC: $114.03 USD\nBCH: $432.02 USD\nXLM: $0.133 USD\nDOGE: $ 0.00314 USD\nNEO: $12.04 USD\nXRP: $0.4092 USD\nCANN: $0.001482 USD\nEMC2: $0.1198 USD\nXMR: $94.65 USD\nBTG: $24.31 USD',
 'Spiv (Nosar Baz): BITCOIN Is An Asset &amp; NOT A Currency.\n\nhttps://t.co/FJru1ooxSM\n\nhttps://t.co/fYBX4H6r6r\n\nhttps://t.co/xZSjmZ0h3K\n\nhttps://t.co/v7GRFIQG7w\n\nhttps://t.co/Oq5hQt5hNn',
 '#btc inceldiği yerden kopsun bakalım 17:00 ye kadar bir hareket bekliyorum, yukarı yönlü olur umarın sanırım inşallah yani 😁 https://t.co/pIMy

In [None]:
data['clean_text'].tolist()[:10]

['È appena uscito un nuovo video! LES CRYPTOMONNAIES QUI PULVÉRISENT BITCOIN EN 2019',
 'Cardano: Digitize Currencies; EOS 6500% ROI; AT&amp;T Bitcoin Bill Pay | Cardano (ADA) 🌏📢😎🤑💵 | #FolloForFolloBack #follo4folloback #followforfollow #bitcointe #cryptocurrency',
 "Another Test tweet that wasn't caught in the stream ! bitcoin",
 'Current Crypto Prices! BTC: $8721.99 USD ETH: $266.62 USD LTC: $114.03 USD BCH: $432.02 USD XLM: $0.133 USD DOGE: $ 0.00314 USD NEO: $12.04 USD XRP: $0.4092 USD CANN: $0.001482 USD EMC2: $0.1198 USD XMR: $94.65 USD BTG: $24.31 USD',
 'Spiv (Nosar Baz): BITCOIN Is An Asset &amp; NOT A Currency.',
 '#btc inceldiği yerden kopsun bakalım 17:00 ye kadar bir hareket bekliyorum, yukarı yönlü olur umarın sanırım inşallah yani 😁',
 '@nwoodfine We have been building on the real #bitcoin SV. What have you been building on broken BTC?',
 '@pedronauck como investidor, vc é um ótimo dev. Sorte q eu comprei os BTC, subiu a poha toda :o',
 'ブラジルはまぁ置いといてもドイツは存在感出してくるのかな。ロシアも

[{'it': 0.9961777925491333},
 {'it': 0.9834603667259216},
 {'en': 0.8974047303199768},
 {'en': 0.6721665263175964},
 {'it': 0.8041912913322449},
 {'tr': 0.9954675436019897},
 {'en': 0.8459301590919495},
 {'pt': 0.9956269264221191},
 {'ja': 0.9944883584976196},
 {'en': 0.8018485903739929},
 {'en': 0.8253710865974426},
 {'en': 0.8949164748191833},
 {'en': 0.8436959981918335},
 {'en': 0.633591890335083},
 {'en': 0.6794195771217346},
 {'en': 0.8107951283454895},
 {'ja': 0.9931315779685974},
 {'en': 0.7713034749031067},
 {'en': 0.8553143739700317},
 {'en': 0.8827056884765625},
 {'hi': 0.967934787273407},
 {'en': 0.8403323888778687},
 {'sw': 0.49156707525253296},
 {'en': 0.8197625279426575},
 {'pt': 0.9958803653717041},
 {'en': 0.7726256847381592},
 {'pt': 0.9955266118049622},
 {'en': 0.7936965823173523},
 {'ur': 0.9002540707588196},
 {'en': 0.766974687576294},
 {'en': 0.9251198768615723},
 {'en': 0.9150031805038452},
 {'en': 0.7838495969772339},
 {'en': 0.9475178718566895},
 {'en': 0.957517

In [None]:
pred = [{id2lang[k]: v} for id_, val in zip(idxs, vals)]

KeyError: tensor(5)

In [None]:
idxs

tensor([ 5,  5, 13, 13,  9,  7, 13,  6,  0, 13, 13, 13, 13, 13, 13, 13,  0, 13,
        13, 13,  9, 13, 11, 13,  6, 13,  6, 13, 11, 13, 13, 13, 13, 13, 13,  1,
        13, 13, 13, 18, 13, 13, 13,  8, 18, 13,  8, 13,  0, 13,  5,  4, 11, 13,
        13, 13, 13,  1, 13, 13,  0,  6, 13,  9, 13, 13,  4, 13, 13,  0,  1, 13,
         0, 13, 13, 11,  0, 13,  7, 13,  9, 14,  9,  4,  6,  7, 13,  9, 13, 13,
        11, 13, 13, 13, 13, 13, 13, 13,  7, 13])

In [None]:
dir(idxs[0])

['H',
 'T',
 '__abs__',
 '__add__',
 '__and__',
 '__array__',
 '__array_priority__',
 '__array_wrap__',
 '__bool__',
 '__class__',
 '__complex__',
 '__contains__',
 '__deepcopy__',
 '__delattr__',
 '__delitem__',
 '__dict__',
 '__dir__',
 '__div__',
 '__dlpack__',
 '__dlpack_device__',
 '__doc__',
 '__eq__',
 '__float__',
 '__floordiv__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__iadd__',
 '__iand__',
 '__idiv__',
 '__ifloordiv__',
 '__ilshift__',
 '__imod__',
 '__imul__',
 '__index__',
 '__init__',
 '__init_subclass__',
 '__int__',
 '__invert__',
 '__ior__',
 '__ipow__',
 '__irshift__',
 '__isub__',
 '__iter__',
 '__itruediv__',
 '__ixor__',
 '__le__',
 '__len__',
 '__long__',
 '__lshift__',
 '__lt__',
 '__matmul__',
 '__mod__',
 '__module__',
 '__mul__',
 '__ne__',
 '__neg__',
 '__new__',
 '__nonzero__',
 '__or__',
 '__pos__',
 '__pow__',
 '__radd__',
 '__rand__',
 '__rdiv__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__reversed_