In [2]:
# Installing the langdetect library

!pip install langdetect langid

Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
     ------------------------------------- 981.5/981.5 kB 12.5 MB/s eta 0:00:00
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting langid
  Downloading langid-1.1.6.tar.gz (1.9 MB)
     ---------------------------------------- 1.9/1.9 MB 17.6 MB/s eta 0:00:00
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: langdetect, langid
  Building wheel for langdetect (setup.py): started
  Building wheel for langdetect (setup.py): finished with status 'done'
  Created wheel for langdetect: filename=langdetect-1.0.9-py3-none-any.whl size=993225 sha256=881395b80f4b5d8cfe5d925679037cdbbf662c6a44e20c1c836df911c98f42e9
  Stored in directory: c:\users\todor\appdata\local\pip\cache\wheels\d1\c1\d9\7e068de779d863bc8f8fc9467d85e25cfe47fa5051fff1a1bb
  Building wheel for langid (setup.p

In [57]:
# Imports

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
import re
import spacy

from langdetect import detect, detect_langs
from googletrans import Translator

In [29]:
# Downloads

!python -m spacy download en_core_web_sm 
!python -m spacy download de_core_news_sm 

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
     --------------------------------------- 12.8/12.8 MB 17.2 MB/s eta 0:00:00
[38;5;2m[+] Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
Collecting de-core-news-sm==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-3.7.0/de_core_news_sm-3.7.0-py3-none-any.whl (14.6 MB)
     --------------------------------------- 14.6/14.6 MB 15.6 MB/s eta 0:00:00
Installing collected packages: de-core-news-sm
Successfully installed de-core-news-sm-3.7.0
[38;5;2m[+] Download and installation successful[0m
You can now load the package via spacy.load('de_core_news_sm')


# Agenda

1. Language detection
2. Language detection with probabilities
3. Tokenization based on detected language
4. Basic translator
5. Translator with user input

#### 1. Language detection

In [10]:
# Detecting the language with defined text

text = "Guten Tag, wie geht es Ihnen?"
detected_language = detect(text)
print(f"The detected language is: {detected_language}")

The detected language is: de


In [18]:
# Detecting the language with user input

while True:
    user_text = input("Enter your text: ")
    
    if user_text == "Stop":
        break
    else:
        detected_language = detect(user_text)
        print(f"The detected language is: {detected_language}")
    

Enter your text: Guten Tag
The detected language is: de
Enter your text: whats up
The detected language is: en
Enter your text: como esta
The detected language is: pt
Enter your text: me llamo Niko
The detected language is: sw
Enter your text: Stop


#### 2. Language detection with probabilities

In [21]:
# Detecting the language probability

text = "Como esta?"
probable_languages = detect_langs(text)
print(probable_languages)

[es:0.9999920361646811]


In [22]:
# Detecting the language with user input

while True:
    user_text = input("Please enter your text: ")
    
    if user_text == "Stop":
        break
    else:
        probable_language = detect_langs(user_text)
        print(f"Most probably you used the {probable_language} language.")

Please enter your text: hello, como este? Me llamo Niko
Most probably you used the [es:0.9999943866096032] language.
Please enter your text: Heeey I am Niko und wie heißt du?
Most probably you used the [de:0.8571428772867566, so:0.1428559988650058] language.
Please enter your text: My name is Niko und wie heißt du?
Most probably you used the [de:0.9999972184979435] language.
Please enter your text: me llamo Niko und du, wie ist dein Name?
Most probably you used the [de:0.9999956634454071] language.
Please enter your text: heey how are you today? Ich bin Niko und mir geht es gut
Most probably you used the [en:0.5714290730882619, so:0.4285697119619247] language.
Please enter your text: Stop


#### 3. Tokenization based on detected language

##### English

In [26]:
# Loading the EN model

nlp_en = spacy.load("en_core_web_sm")

In [31]:
# English text
text_en = "Audi AG is a German automotive manufacturer of luxury vehicles headquartered in Ingolstadt, Bavaria, Germany. A subsidiary of the Volkswagen Group, Audi produces vehicles in nine production facilities worldwide.The origins of the company are complex, dating back to the early 20th century and the initial enterprises (Horch and the Audiwerke) founded by engineer August Horch. Two other manufacturers (DKW and Wanderer) also contributed to the foundation of Auto Union in 1932. The modern Audi era began in the 1960s, when Auto Union was acquired by Volkswagen from Daimler-Benz. After relaunching the Audi brand with the 1965 introduction of the Audi F103 series, Volkswagen merged Auto Union with NSU Motorenwerke in 1969, thus creating the present-day form of the company.The company name is based on the Latin translation of the surname of the founder, August Horch. Horch, meaning 'listen', becomes audi in Latin. The four rings of the Audi logo each represent one of four car companies that banded together to create Audi's predecessor company, Auto Union. Audi's slogan is Vorsprung durch Technik, which is translated as 'Progress through Technology'.Audi, along with German brands BMW and Mercedes-Benz, is among the best-selling luxury automobile brands in the world."

In [35]:
# Lowercasing
text_en = text_en.lower()

In [36]:
# Removing the special characters
text_en = re.sub(r"[^a-zA-Z\s]", "", text_en)

In [37]:
# Tokenization

doc_en = nlp_en(text_en)
tokens_en = [token.text for token in doc_en]
print("English Tokens:", tokens_en)

English Tokens: ['audi', 'ag', 'is', 'a', 'german', 'automotive', 'manufacturer', 'of', 'luxury', 'vehicles', 'headquartered', 'in', 'ingolstadt', 'bavaria', 'germany', 'a', 'subsidiary', 'of', 'the', 'volkswagen', 'group', 'audi', 'produces', 'vehicles', 'in', 'nine', 'production', 'facilities', 'worldwidethe', 'origins', 'of', 'the', 'company', 'are', 'complex', 'dating', 'back', 'to', 'the', 'early', 'th', 'century', 'and', 'the', 'initial', 'enterprises', 'horch', 'and', 'the', 'audiwerke', 'founded', 'by', 'engineer', 'august', 'horch', 'two', 'other', 'manufacturers', 'dkw', 'and', 'wanderer', 'also', 'contributed', 'to', 'the', 'foundation', 'of', 'auto', 'union', 'in', ' ', 'the', 'modern', 'audi', 'era', 'began', 'in', 'the', 's', 'when', 'auto', 'union', 'was', 'acquired', 'by', 'volkswagen', 'from', 'daimlerbenz', 'after', 'relaunching', 'the', 'audi', 'brand', 'with', 'the', ' ', 'introduction', 'of', 'the', 'audi', 'f', 'series', 'volkswagen', 'merged', 'auto', 'union', 'w

##### German

In [49]:
# Loading the DE model

nlp_de = spacy.load("de_core_news_sm")

In [50]:
# German text
text_de = "Die Audi AG mit Sitz in Ingolstadt in Bayern ist ein deutscher Automobilhersteller, der seit den 1960er Jahren dem Volkswagen-Konzern angehört und seit den 2000er Jahren zu den Premiumherstellern gezählt wird. Der Markenclaim des Unternehmens lautet „Vorsprung durch Technik“.Die Firma entstand, da August Horch nach Zerwürfnissen mit dem Finanzvorstand der A. Horch & Cie. Motorwagenwerke Zwickau, im damaligen Königreich Sachsen gelegen, das Unternehmen verlassen hatte. Seine Firma konnte er nicht August Horch Automobilwerke GmbH nennen, denn die Rechte an der Marke „Horch“ gehörten der A. Horch & Cie. Motorwagenwerke Zwickau. Die Lösung fand er im Vorschlag des Zwickauer Gymnasiasten Heinrich Fikentscher (Sohn des mit August Horch befreundeten Franz Fikentscher), der Horch ins Latein übersetzte. Audi ist der Imperativ Singular von audire (zu Deutsch hören, zuhören) und bedeutet „Höre!“ oder eben „Horch!“. Am 25. April 1910 wurde die Audi Automobilwerke GmbH Zwickau in das Handelsregister der Stadt Zwickau eingetragen.Im Jahr 1928 übernahm der Kleinwagen- und Motorradproduzent Zschopauer Motorenwerke J. S. Rasmussen AG, bekannt durch seine Marke DKW, die Audiwerke AG Zwickau. Beide Unternehmen gingen in der Mitte 1932 gegründeten Auto Union AG, Chemnitz auf, zu der noch die Zwickauer Horchwerke AG und das Fahrzeugwerk Siegmar der Wanderer-Werke in Schönau bei Chemnitz gehörten. Symbolisch zum Ausdruck kam der Zusammenschluss der vier Pkw-Marken Audi, DKW, Horch und Wanderer im Auto-Union-Firmenzeichen mit den verschlungenen Ringen, dem heutigen Audi-Logo. In den 1930er Jahren gehörten die DKW F 1 bis F 8 sowie die beiden Audi-Modelle UW und 225 zu den Pionieren des Frontantriebs in Deutschland.Nach dem Zweiten Weltkrieg wurde 1949 eine neue Auto Union GmbH in Ingolstadt (Bayern) gegründet. Dort liefen, neben DKW-Motorrädern (RT 125 W), zunächst Schnellaster vom Band und ab 1950 baute das neue Werk Düsseldorf-Derendorf die F-89-Pkw, beides Modelle mit Frontantrieb. 1958 erwarb Daimler-Benz eine Mehrheit am Stammkapital der Auto Union und in der Folge fertigte das Düsseldorfer Werk leichte Mercedes-Transporter. Ab 1964 übernahm schrittweise der Volkswagen-Konzern die Auto Union. Sie hatte bis dahin nur Pkw mit Zweitaktmotoren unter der Marke DKW gebaut, bis 1965 das Modell Audi mit Viertaktmotor unter der Traditionsmarke aus der Vorkriegszeit auf den Markt kam. Im Zuge der Fusion mit den NSU Motorenwerken entstand im Jahr 1969 die Audi NSU Auto Union AG. Dem Fusionspartner entsprechend war deren Sitz bis 1985 Neckarsulm, bevor infolge des Auslaufens der Marke NSU die Firma auf Audi AG verkürzt wurde und der Sitz wieder zurück nach Ingolstadt wechselte.Zur Audi AG gehören seit 1998 der Sportwagenhersteller Lamborghini und seit 2012 der Motorradhersteller Ducati."

In [51]:
# Lowercasing
text_de = text_de.lower()

In [53]:
# Removing the special characters
text_de = re.sub(r"[^a-zA-Z\s]", "r", text_de)

In [55]:
doc_de = nlp_de(text_de)
tokens_de = [token.text for token in doc_de]
print("German tokens: ",tokens_de)

German tokens:  ['die', 'audi', 'ag', 'mit', 'sitz', 'in', 'ingolstadt', 'in', 'bayern', 'ist', 'ein', 'deutscher', 'automobilherstellerr', 'der', 'seit', 'den', 'rrrrer', 'jahren', 'dem', 'volkswagenrkonzern', 'angehrrt', 'und', 'seit', 'den', 'rrrrer', 'jahren', 'zu', 'den', 'premiumherstellern', 'gezrhlt', 'wirdr', 'der', 'markenclaim', 'des', 'unternehmens', 'lautet', 'rvorsprung', 'durch', 'technikrrdie', 'firma', 'entstandr', 'da', 'august', 'horch', 'nach', 'zerwrrfnissen', 'mit', 'dem', 'finanzvorstand', 'der', 'ar', 'horch', 'r', 'cier', 'motorwagenwerke', 'zwickaur', 'im', 'damaligen', 'krnigreich', 'sachsen', 'gelegenr', 'das', 'unternehmen', 'verlassen', 'hatter', 'seine', 'firma', 'konnte', 'er', 'nicht', 'august', 'horch', 'automobilwerke', 'gmbh', 'nennenr', 'denn', 'die', 'rechte', 'an', 'der', 'marke', 'rhorchr', 'gehrrten', 'der', 'ar', 'horch', 'r', 'cier', 'motorwagenwerke', 'zwickaur', 'die', 'lrsung', 'fand', 'er', 'im', 'vorschlag', 'des', 'zwickauer', 'gymnasias

#### 4. Basic translator

In [56]:
!pip install googletrans==4.0.0-rc1


Collecting googletrans==4.0.0-rc1
  Downloading googletrans-4.0.0rc1.tar.gz (20 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting httpx==0.13.3
  Downloading httpx-0.13.3-py3-none-any.whl (55 kB)
     ---------------------------------------- 55.1/55.1 kB 2.8 MB/s eta 0:00:00
Collecting chardet==3.*
  Downloading chardet-3.0.4-py2.py3-none-any.whl (133 kB)
     ---------------------------------------- 133.4/133.4 kB ? eta 0:00:00
Collecting idna==2.*
  Downloading idna-2.10-py2.py3-none-any.whl (58 kB)
     ---------------------------------------- 58.8/58.8 kB ? eta 0:00:00
Collecting hstspreload
  Downloading hstspreload-2024.9.1-py3-none-any.whl (1.2 MB)
     ---------------------------------------- 1.2/1.2 MB 12.6 MB/s eta 0:00:00
Collecting httpcore==0.9.*
  Downloading httpcore-0.9.1-py3-none-any.whl (42 kB)
     ---------------------------------------- 42.6/42.6 kB ? eta 0:00:00
Collecting rfc3986<2,>=1.3
  Downloa

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
spyder 5.2.2 requires pyqt5<5.13, which is not installed.
spyder 5.2.2 requires pyqtwebengine<5.13, which is not installed.
anaconda-project 0.11.1 requires ruamel-yaml, which is not installed.
conda-repo-cli 1.0.20 requires clyent==1.2.1, but you have clyent 1.2.2 which is incompatible.
conda-repo-cli 1.0.20 requires nbformat==5.4.0, but you have nbformat 5.5.0 which is incompatible.


In [58]:
# Initialize the translator

translator = Translator()

In [59]:
# Input text

input_text = "Hello, my name ist Nikolay"

In [60]:
# Detecting the language

detected_lang = translator.detect(input_text)

In [63]:
# Translating the text

translated_text = translator.translate(input_text, dest = "es")

In [71]:
# Printing the result

print(translated_text.text)

My name is Nikolay


#### 5. Translator with user input

In [72]:
while True:
    input_text = input("Please enter your text: ")
    if input_text == "Stop":
        break
    else:
        dest_lang = input("Please enter the desired language: ")

        detected_lang = translator.detect(input_text)
        translated_text = translator.translate(input_text, dest = dest_lang)

        print(translated_text.text)

Please enter your text: Mein Name ist Nikolay und ich komme aus Bulgarien
Please enter the desired language: es
Mi nombre es Nikolay y vengo de Bulgaria
Please enter your text: Stop
