In [1]:
import pandas as pd
# Read parquet file
df = pd.read_parquet('TrainingData/treino12.parquet')


In [20]:
import matplotlib.pyplot as plt

In [26]:
df.shape

(202203, 3)

In [31]:
df[150000:150050]

Unnamed: 0,prompt,response,source
150000,Make a detailed description of a sarcophagus i...,The sarcophagus in the temple is a grand and i...,laion/unified_chip2
150001,<p>I have a two Pandas dataframes and want to ...,You can simply select the desired columns from...,pacovaldez/stackoverflow-questions
150002,What steps do I need to take to set up a small...,"As an AI language model, I don't have real-wor...",laion/unified_chip2
150003,<p>I'm working on a project that requires http...,Here's an example of how you can use the https...,pacovaldez/stackoverflow-questions
150004,<p>I have a interesting JSON file that is a bi...,Your struct looks correct for the JSON file yo...,pacovaldez/stackoverflow-questions
150005,Read this and respond based on my instructions...,"The package that offers Fox Sports HD is ""Sky ...",unified_unifiedskg_instructions
150006,Produce an article summary of the following ne...,Washington Capitals forward Marcus Johansson h...,unified_multi_sum
150007,Give me an article about John Peter Farnham AO...,"John Peter Farnham AO, better known as John Fa...",unified_abstract_infill_output_0-100_000.jsonl
150008,"<p>I've never ran into this problem, so I real...","As the error message suggests, the problem is ...",pacovaldez/stackoverflow-questions
150009,Hornindalsvatnet is Norway's and Europe's deep...,Hornindalsvatnet is Norway's and Europe's deep...,unified_abstract_infill_output-100-000-x.jsonl


In [7]:
from langdetect import detect
from collections import Counter
import os
import multiprocessing
from concurrent.futures import ThreadPoolExecutor, as_completed
import langcodes

num_cores = os.cpu_count()
num_threads = min(num_cores, multiprocessing.cpu_count())


# define a function to detect the language of a string
def detect_language(text):
    try:
        return detect(text)
    except:
        return None
    
def get_language_name(code):
    lang_obj = langcodes.Language(code)
    return lang_obj.display_name('en')
    
def calculate(lower_limit, upper_limit, df):
    # create a list of all the texts in the dataframe
    texts = list(df[lower_limit:upper_limit]['prompt'].values)

    # create a thread pool executor
    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        # submit the language detection tasks to the executor
        futures = [executor.submit(detect_language, text) for text in texts]

        # create a counter to store the language counts
        counts = Counter()

        # iterate over the completed futures and update the counter
        for future in as_completed(futures):
            language = future.result()
            if language:
                counts[language] += 1

    # print the language distribution
    for language, count in counts.items():
        print(f'{get_language_name(language)}: {count}')

In [19]:
calculate(0, 10000)

English: 9880
Catalan: 33
French: 33
Dutch: 34
Danish: 11
Spanish: 4
Portuguese: 2
Tagalog: 2
Swedish: 1


In [6]:
calculate(10000, 20000)

Afrikaans: 21
Catalan: 36
Danish: 5
English: 7558
Dutch: 11
Croatian: 2
French: 18
Portuguese: 1
Italian: 2
Spanish: 1
Estonian: 1
Hungarian: 1
German: 1


In [3]:
calculate(20000,100000)

Afrikaans: 19
French: 117
English: 79532
German: 18
Catalan: 157
Danish: 16
Dutch: 49
Portuguese: 7
Swedish: 3
Italian: 23
Indonesian: 17
Spanish: 25
Somali: 4
Welsh: 2
Swahili: 2
Romanian: 3
Lithuanian: 1
Polish: 3
Norwegian: 1
Russian: 1


In [4]:
calculate(-100000,-20000)

English: 79581
French: 79
Vietnamese: 5
Italian: 35
Dutch: 48
Catalan: 71
Spanish: 28
Slovenian: 4
German: 29
Polish: 3
Indonesian: 20
Danish: 15
Afrikaans: 8
Croatian: 3
Welsh: 11
Estonian: 3
Tagalog: 8
Portuguese: 18
Hungarian: 1
Norwegian: 13
Romanian: 8
Somali: 3
Turkish: 1
Swedish: 5


In [5]:
calculate(-20000, -10000)

English: 9954
Indonesian: 5
French: 10
Romanian: 1
Dutch: 5
Spanish: 4
Italian: 6
German: 5
Catalan: 5
Estonian: 1
Welsh: 1
Swedish: 1
Norwegian: 1
Afrikaans: 1


In [6]:
calculate(-10000, -1)

English: 9945
Indonesian: 3
French: 10
Dutch: 5
Catalan: 9
German: 7
Spanish: 5
Danish: 2
Norwegian: 2
Portuguese: 4
Afrikaans: 4
Romanian: 1
Somali: 1
Italian: 1


### now seeing df2 (the second training data)

In [9]:
df2 = pd.read_parquet('TrainingData/treino2.parquet')

In [10]:
# Post refresh (calculate now receives df and df2 is declared)
calculate(0, df2.shape[0], df2)

English: 200858
Italian: 89
Somali: 28
Spanish: 61
German: 206
Norwegian: 29
Vietnamese: 6
Polish: 21
French: 126
Portuguese: 41
Swedish: 36
Dutch: 234
Catalan: 106
Indonesian: 86
Danish: 48
Slovak: 9
Romanian: 24
Welsh: 23
Estonian: 17
Tagalog: 34
Croatian: 12
Afrikaans: 54
Hungarian: 3
Slovenian: 11
Finnish: 14
Swahili: 17
Albanian: 1
Czech: 3
Turkish: 4
Bulgarian: 1
Lithuanian: 1


In [12]:
df2.shape

(202203, 3)