In [1]:
import gzip
import csv
import re
from fast_langdetect import detect, detect_multilingual


In [35]:
# Single-language detection: `detect` returns a dict holding the top language and its score.
print(detect("Hello, world!"))
# Output: {'lang': 'en', 'score': 0.12450417876243591}

# `use_strict_mode` determines whether the model loading process should enforce strict conditions before using fallback options.
# If `use_strict_mode` is set to True, only the selected model is loaded, never the fallback model.
print(detect("Hello, world!", low_memory=False, use_strict_mode=True))
# Output: {'lang': 'en', 'score': 0.25001001358032227}

# How to deal with multiline text
multiline_text = """
Hello, world!
This is a multiline text.
But we need remove `\n` characters or it will raise an ValueError.
"""
multiline_text = multiline_text.replace("\n", "")  # NOTE: it is important to remove "\n" characters before calling detect
print(detect(multiline_text))
# Output: {'lang': 'en', 'score': 0.8509423136711121}

print(detect("Привет, мир!"))  # index the result with ["lang"] to get only the language code
# Output: {'lang': 'ru', 'score': 0.4549740254878998}

# Multi-language detection
print(detect_multilingual("Hello, world!你好世界!Привет, мир!"))
# Output: [{'lang': 'ja', 'score': 0.32009604573249817}, {'lang': 'uk', 'score': 0.27781224250793457}, {'lang': 'zh', 'score': 0.17542070150375366}, {'lang': 'sr', 'score': 0.08751443773508072}, {'lang': 'bg', 'score': 0.05222449079155922}]

# Multi-language detection with low-memory mode enabled (low_memory=True)
print(detect_multilingual("Puszka 0,33l dzięki Christoph . I really like this beer,Because with a shorter text it's not so accurate", low_memory=True))
# Output: [{'lang': 'en', 'score': 0.7895520925521851}, {'lang': 'pl', 'score': 0.11386539041996002}, {'lang': 'es', 'score': 0.013312462717294693}, {'lang': 'ru', 'score': 0.012153928168118}, {'lang': 'de', 'score': 0.010696609504520893}]

{'lang': 'en', 'score': 0.12450417876243591}
{'lang': 'en', 'score': 0.25001001358032227}
{'lang': 'en', 'score': 0.8509423136711121}
{'lang': 'ru', 'score': 0.4549740254878998}
[{'lang': 'ja', 'score': 0.32009604573249817}, {'lang': 'uk', 'score': 0.27781224250793457}, {'lang': 'zh', 'score': 0.17542070150375366}, {'lang': 'sr', 'score': 0.08751443773508072}, {'lang': 'bg', 'score': 0.05222449079155922}]
[{'lang': 'en', 'score': 0.7895520925521851}, {'lang': 'pl', 'score': 0.11386539041996002}, {'lang': 'es', 'score': 0.013312462717294693}, {'lang': 'ru', 'score': 0.012153928168118}, {'lang': 'de', 'score': 0.010696609504520893}]


In [34]:
# Short-input check: the recorded output lists five candidates, and even the top one ('en') scores only about 0.12.
print(detect_multilingual("Hello, world!"))

[{'lang': 'en', 'score': 0.12450417876243591}, {'lang': 'ca', 'score': 0.08594832569360733}, {'lang': 'de', 'score': 0.0802881047129631}, {'lang': 'oc', 'score': 0.054167523980140686}, {'lang': 'fr', 'score': 0.05338495597243309}]


In [18]:
def lang_tagger(text, min_score=0.2, detector=None):
    """Return the languages detected in ``text`` as one comma-separated tag.

    Newline characters are stripped before detection (this notebook's demo
    cell notes that the detector raises ValueError on them). Every candidate
    whose score is strictly greater than ``min_score`` is kept, in the order
    the detector returned it, and the language codes are joined with ", ".

    Args:
        text: The text to tag.
        min_score: Exclusive lower bound on a candidate's score. The default
            of 0.2 is the threshold the notebook originally hard-coded.
        detector: Callable taking the cleaned text and returning an iterable
            of ``{'lang': ..., 'score': ...}`` dicts. Defaults to
            ``detect_multilingual``; pass another callable to swap the model
            or to test without loading one.

    Returns:
        A string such as ``"en"`` or ``"en, fr"``; the empty string when no
        candidate passes the threshold.
    """
    if detector is None:
        # Resolved at call time so the notebook-level import is used by default.
        detector = detect_multilingual

    flattened = text.replace("\n", "")
    return ", ".join(
        candidate['lang']
        for candidate in detector(flattened)
        if candidate['score'] > min_score
    )

# Step 1: Read and process the file line by line from the .gz file
def parse_and_write_to_csv(input_gz_file, output_csv_file, fields=None):
    """Stream a gzipped ``field: value`` review dump into a CSV file.

    The input is read line by line. A line of the form ``name: value`` adds
    to the record being built; any other line (for instance a blank
    separator) closes the record and writes it as one CSV row. Only the
    fields named in ``fields`` are copied. A ``text`` field that is not
    selected is not stored: it is passed to ``lang_tagger`` and the result is
    saved in the ``lang_tag`` column.

    Args:
        input_gz_file: Path of the gzip-compressed, UTF-8 encoded input.
        output_csv_file: Path of the CSV file to create (overwritten).
        fields: Names of the fields to keep. When omitted, the notebook-level
            ``keys`` list is used, which is what this function always read
            before the parameter existed.

    Notes:
        * The header is written together with the first record, so an input
          without any record produces an empty file.
        * Columns follow the key order of the first record; selected fields
          it lacks, and ``lang_tag``, are appended so that later records with
          a different set of fields never fail. Missing values are written
          as empty cells.
        * A final record that is not followed by a separator line is written
          as well, even when it is the only record in the file.
    """
    fields = list(keys if fields is None else fields)
    wanted = set(fields)
    # Regular expression for matching 'field: value' format
    field_regex = re.compile(r'(\w+):\s*(.*)')

    with gzip.open(input_gz_file, 'rt', encoding='utf-8') as f, \
            open(output_csv_file, 'w', newline='', encoding='utf-8') as csv_file:
        writer = None
        current_entry = {}

        def write_entry(entry):
            """Write one record, creating the writer and header on first use."""
            nonlocal writer
            if writer is None:
                # dict.fromkeys de-duplicates while preserving order.
                headers = list(dict.fromkeys([*entry, *fields, 'lang_tag']))
                writer = csv.DictWriter(csv_file, fieldnames=headers)
                writer.writeheader()
            writer.writerow(entry)

        for line in f:
            match = field_regex.match(line.strip())

            if match:
                key, value = match.groups()
                if key in wanted:
                    current_entry[key] = value
                elif key == 'text':
                    current_entry['lang_tag'] = lang_tagger(value)
            elif current_entry:
                # A non-field line ends the record being built.
                write_entry(current_entry)
                current_entry = {}

        # Write the last entry (if any); the input may not end with a separator.
        if current_entry:
            write_entry(current_entry)

# Entry point used by the notebook: convert one gzipped review dump into a CSV
def convert_large_txt_gz_to_csv(input_gz_file, output_csv_file):
    """Convert ``input_gz_file`` into ``output_csv_file``.

    Thin wrapper that forwards both paths to ``parse_and_write_to_csv``.
    """
    parse_and_write_to_csv(
        input_gz_file=input_gz_file,
        output_csv_file=output_csv_file,
    )

# Review fields copied into the CSV (read as a global by parse_and_write_to_csv)
keys = ['beer_id', 'date', 'user_id']

# One directory per data source; each is expected to hold its own reviews.txt.gz
dirs = ["RateBeer/", "BeerAdvocate/"]

input_gz_file = 'reviews.txt.gz'
output_csv_file = 'reviews_tagged.csv'

# The directory names end with "/", so plain string interpolation builds the paths
for d in dirs:
    convert_large_txt_gz_to_csv(f"{d}{input_gz_file}", f"{d}{output_csv_file}")


In [19]:
import pandas as pd

# Load the tagged reviews: dirs[0] is RateBeer, dirs[1] is BeerAdvocate
d_rb = pd.read_csv(f"{dirs[0]}reviews_tagged.csv")
d_ba = pd.read_csv(f"{dirs[1]}reviews_tagged.csv")


In [26]:
# Share of BeerAdvocate reviews tagged exactly "en", among reviews that have a tag
d_ba["lang_tag"].value_counts(normalize=True)["en"]

0.9997474467601165

In [27]:
# Share of RateBeer reviews tagged exactly "en", among reviews that have a tag
d_rb["lang_tag"].value_counts(normalize=True)["en"]

0.9284400784463177

In [28]:
# Frequency of every distinct tag in the BeerAdvocate reviews; a multi-language
# tag such as "en, fr" is counted as its own value, separate from "en" and "fr".
d_ba["lang_tag"].value_counts()

lang_tag
en        2588899
fr            340
es             66
en, fr         55
pt             40
fi             27
en, de         16
de             11
fr, en         10
de, en          7
fi, en          7
it              7
ru              6
cs              6
nl              5
ja              5
en, es          4
zh              4
ca              3
en, no          3
en, pt          3
uk              2
en, fi          2
pl              2
es, en          2
sv              2
en, id          1
en, os          1
ja, en          1
uk, ja          1
en, ru          1
de, ja          1
en, sh          1
en, sv          1
ja, zh          1
pt, en          1
en, ca          1
et              1
tr              1
hr, sh          1
da              1
en, pl          1
en, it          1
da, en          1
no              1
Name: count, dtype: int64

In [32]:
# RateBeer tags occurring more than 1000 times. The counts are computed once and
# filtered through a callable, instead of running value_counts() twice on the column.
d_rb["lang_tag"].value_counts().loc[lambda counts: counts > 1000]

lang_tag
en        6606969
pl         104950
fr          99688
de          73328
nl          36357
it          32113
sv          30586
es          28988
no          12751
da          12446
pt           9533
no, da       8123
en, fr       6884
hu           6797
da, no       5713
fi           5086
ru           3030
fr, en       2824
cs           2678
no, nn       2650
sk           1672
sv, da       1639
pt, es       1177
ca           1027
Name: count, dtype: int64

In [22]:
# Preview the first BeerAdvocate rows to check the columns (beer_id, date, user_id, lang_tag)
d_ba.head()

Unnamed: 0,beer_id,date,user_id,lang_tag
0,142544,1440064800,nmann08.184925,en
1,19590,1235127600,stjamesgate.163714,en
2,19590,1142247600,mdagnew.19527,en
3,19590,1101898800,helloloser12345.10867,en
4,19590,1093860000,cypressbob.3708,en


In [23]:
# Preview the first RateBeer rows to check the columns (beer_id, date, user_id, lang_tag)
d_rb.head()

Unnamed: 0,beer_id,date,user_id,lang_tag
0,410549,1461664800,175852,pl
1,105273,1487329200,442761,es
2,105273,1466762400,288889,pl
3,105273,1451646000,250510,es
4,105273,1445594400,122778,en
