In [1]:
# We import all necessary libraries.
from collections import Counter
from pathlib import Path
import ujson
import glob


In [2]:
# Folder holding the input tweet files (one tweet JSON document per line).
folder = r'C:\UOC\TFM\Procesado_tweets\01_Clasificacion_por_fecha'

# Language code of every tweet found, in reading order (duplicates are kept
# on purpose so that frequencies can be computed afterwards).
list_languages = []

# Iterate over all JSON files located in the defined folder.
for file in Path(folder).glob('*.json'):
    # The context manager guarantees that each input file is closed as soon
    # as it has been read, instead of leaking one open handle per file.
    with open(file, 'r', encoding="utf8") as infile:
        for line in infile:
            # Skip empty lines.
            if not line.strip():
                continue
            # Only lines starting with one of these two keys are treated as tweets.
            if line.startswith(('{"created_at":', '{"contributors":')):
                # Get the language in which the tweet was published.
                tweet_language = ujson.loads(line)['lang']
                # Add the language to the list of languages.
                list_languages.append(tweet_language)



In [3]:
# Tally how many times each language code occurs among the collected tweets
# and show the resulting Counter.
language_frequencies = Counter(list_languages)
print(language_frequencies)

Counter({'es': 1409373, 'und': 117540, 'ca': 72180, 'pt': 58098, 'en': 26968, 'it': 20946, 'tr': 15848, 'fr': 7377, 'de': 2952, 'eu': 2565, 'ja': 1332, 'et': 377, 'in': 366, 'tl': 303, 'pl': 261, 'ru': 224, 'ht': 191, 'ro': 136, 'ko': 130, 'fa': 127, 'cy': 116, 'el': 104, 'lt': 97, 'fi': 83, 'is': 80, 'hu': 75, 'cs': 67, 'da': 67, 'bn': 66, 'no': 65, 'ar': 63, 'nl': 61, 'sv': 43, 'lv': 40, 'zh': 21, 'ur': 14, 'vi': 12, 'sl': 10, 'hi': 9, 'ckb': 7, 'th': 3, 'sr': 1, 'ka': 1, 'iw': 1})


https://support.gnip.com/apis/powertrack2.0/rules.html#Operators
http://www.lingoes.net/en/translator/langcode.htm

es: español
und: indefinido 
ca: catalán
pt: portugués
en: inglés
it: italiano
tr: turco
fr: francés
de: alemán
eu: vasco
ja: japonés
et: estonio
in: indonesio
tl: tagalo
pl: polaco
ru: ruso 
ht: criollo haitiano
ro: rumano
ko: coreano
fa: persa
cy: galés
el: griego
lt: lituano
fi: finés
is: islandés
cs: checo
hu: húngaro
da: danés
bn: bengalí
no: noruego
ar: árabe
nl: holandés
sv: sueco
lv: letón
zh: chino
ur: urdu
vi: vietnamita
hi: hindi
sl: esloveno
ckb: kurdo central (sorani)
th: tailandés
sr: serbio
ka: georgiano
iw: hebreo

In [5]:
# Destination files for the language-filtered tweets, one file per day.
file_6M = r'C:\UOC\TFM\Procesado_tweets\02_Idiomas_filtrados\id_filt_6M.json'
file_7M = r'C:\UOC\TFM\Procesado_tweets\02_Idiomas_filtrados\id_filt_7M.json'
file_8M = r'C:\UOC\TFM\Procesado_tweets\02_Idiomas_filtrados\id_filt_8M.json'
file_9M = r'C:\UOC\TFM\Procesado_tweets\02_Idiomas_filtrados\id_filt_9M.json'
file_10M = r'C:\UOC\TFM\Procesado_tweets\02_Idiomas_filtrados\id_filt_10M.json'

# Per-day tallies of how many tweets are written to each output file.
count_6M = count_7M = count_8M = count_9M = count_10M = 0

# Per-day lists with the language code of every tweet that is kept.
# Each name is bound to its own, independent list.
lang_6M, lang_7M, lang_8M, lang_9M, lang_10M = [], [], [], [], []

# Language codes accepted for the study; tweets in any other language are discarded.
accepted_lang = [
    'es',
    'ca',
    'eu',
    'und',
]

# Function that writes a tweet in a file and appends its language to the languages list by day.
def write_and_append(file_by_date, list_lang, tweet_line=None, lang_code=None):
    """Append one tweet to a per-day output file and record its language.

    Parameters
    ----------
    file_by_date : str or path-like
        Output file for the day the tweet belongs to. It is opened in append
        mode, so existing content is preserved and the file is created when
        it does not exist yet.
    list_lang : list
        Per-day list that receives the language code of the tweet.
    tweet_line : str, optional
        Raw text of the tweet (one line, written as is). When omitted, the
        module-level variable ``line`` set by the calling loop is used.
    lang_code : str, optional
        Language code of the tweet. When omitted, the module-level variable
        ``tweet_language`` set by the calling loop is used.

    Raises
    ------
    NameError
        If an optional argument is omitted and the corresponding module-level
        variable has not been defined.
    """
    # Fall back to the loop globals so that existing two-argument calls keep working.
    if tweet_line is None:
        tweet_line = line
    if lang_code is None:
        lang_code = tweet_language
    # Write explicitly as UTF-8 (the encoding used to read the input files) rather
    # than relying on the platform default, which may not be able to encode every
    # character found in a tweet.
    with open(file_by_date, "a", encoding="utf8") as outfile:
        outfile.write(tweet_line)
    list_lang.append(lang_code)


# Iterate through all JSON files located in the defined folder, keep the tweets
# written in an accepted language and copy each one to the output file of its day.
for file in Path(folder).glob('*.json'):
    # The context manager guarantees that each input file is closed as soon
    # as it has been read, instead of leaking one open handle per file.
    with open(file, 'r', encoding="utf8") as infile:
        # NOTE: the names `line` and `tweet_language` must be kept as they are,
        # because write_and_append() reads them as module-level variables when
        # it is called with only two arguments.
        for line in infile:
            # Skip empty lines.
            if not line.strip():
                continue
            # Only lines starting with one of these two keys are treated as tweets.
            if line.startswith(('{"created_at":', '{"contributors":')):
                # Get the language in which the tweet was published.
                tweet_language = ujson.loads(line)['lang']
                # Tweets in a language that is not accepted are discarded.
                if tweet_language not in accepted_lang:
                    continue
                # The day is taken from the input file name; the first marker
                # found, in this order, decides the destination.
                if '6M' in file.name:
                    write_and_append(file_6M, lang_6M)
                    count_6M += 1
                elif '7M' in file.name:
                    write_and_append(file_7M, lang_7M)
                    count_7M += 1
                elif '8M' in file.name:
                    write_and_append(file_8M, lang_8M)
                    count_8M += 1
                elif '9M' in file.name:
                    write_and_append(file_9M, lang_9M)
                    count_9M += 1
                else:
                    # Any file name without a 6M-9M marker is counted as 10M.
                    write_and_append(file_10M, lang_10M)
                    count_10M += 1
                                   
                    

In [7]:
# Group the per-day results into the three periods reported below:
# before 8M (6M + 7M), 8M itself, and after 8M (9M + 10M).
total_before = count_6M + count_7M
total_after = count_9M + count_10M
langs_before = Counter(lang_6M) + Counter(lang_7M)
langs_on_8M = Counter(lang_8M)
langs_after = Counter(lang_9M) + Counter(lang_10M)

# Number of tweets per period after filtering by language.
print("Número de tweets antes del 8M: {}".format(total_before))
print("Número de tweets el 8M: {}".format(count_8M))
print("Número de tweets después del 8M: {}\n".format(total_after))

# Language breakdown per period after filtering by language.
print("Número de tweets por idioma antes del 8M: {}".format(langs_before))
print("Número de tweets por idioma el 8M: {}".format(langs_on_8M))
print("Número de tweets por idioma después del 8M: {}".format(langs_after))


Número de tweets antes del 8M: 167738
Número de tweets el 8M: 951821
Número de tweets después del 8M: 482099

Número de tweets por idioma antes del 8M: Counter({'es': 144432, 'ca': 17041, 'und': 5837, 'eu': 428})
Número de tweets por idioma el 8M: Counter({'es': 830342, 'und': 75059, 'ca': 44628, 'eu': 1792})
Número de tweets por idioma después del 8M: Counter({'es': 434599, 'und': 36644, 'ca': 10511, 'eu': 345})
