### Importing Libraries

In [1]:
import re
from pathlib import Path
from urllib.parse import quote

import nltk
import pandas as pd
import requests
from bs4 import BeautifulSoup
from nltk.tokenize import sent_tokenize
# sent_tokenize needs the Punkt sentence models. NLTK >= 3.9 loads them from
# the "punkt_tab" resource instead of the pickled "punkt" package, so fetch
# both to keep the notebook runnable on old and new NLTK releases.
for resource in ("punkt", "punkt_tab"):
    nltk.download(resource)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\georgs56\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
# Wikipedia language-edition codes to scrape (used as the subdomain prefix
# and as the label stored with every sentence)
language_codes = [
    "en", "ml", "hi", "ta", "pt",
    "fr", "nl", "es", "el", "ru",
    "da", "it", "tr", "sv", "ar",
]

# Scraped paragraphs per language code; each code gets its own empty list
data_dict = dict((code, list()) for code in language_codes)

In [3]:
def get_wikipedia_title(topic, lang_code):
    """Look up the title of an English Wikipedia article in another language.

    Asks the English Wikipedia API for the interlanguage link
    (``prop=langlinks``) of the page named ``topic`` that points to the
    ``lang_code`` edition.

    Args:
        topic: Article title on en.wikipedia.org, e.g. "Physics".
        lang_code: Language code of the target edition, e.g. "el".

    Returns:
        The article title in the target edition, or None when the page does
        not exist, has no link for ``lang_code``, or the API response does not
        contain the expected "query"/"pages" structure.

    Raises:
        requests.RequestException: On connection failure, timeout, or an HTTP
            error status.
        ValueError: If the response body is not valid JSON.
    """
    # Wikipedia API endpoint
    url = "https://en.wikipedia.org/w/api.php"

    params = {
        "action": "query",
        "format": "json",
        "titles": topic,
        "redirects": 1,        # follow redirects so alias titles still resolve
        "prop": "langlinks",
        "lllang": lang_code,   # let the API return only the requested language
        "lllimit": "500",
    }

    # A timeout keeps a stalled connection from hanging the notebook forever
    response = requests.get(url, params=params, timeout=10)
    response.raise_for_status()
    data = response.json()

    # Error payloads carry no "query"/"pages" keys; treat them as "not found"
    pages = data.get("query", {}).get("pages", {})
    for page_data in pages.values():
        # Missing pages have no "langlinks" entry at all
        for link in page_data.get("langlinks", []):
            if link.get("lang") == lang_code:
                return link.get("*")  # "*" holds the translated title

    return None  # No page or no translation found

# Quick check: look up the Greek ("el") title of the English "Nature" article
topic = "Nature"
translated_title = get_wikipedia_title(topic, "el")
print(f"Translated title: {topic}-> {translated_title}")


Translated title: Nature-> Φύση


In [None]:
def text_scrap(topic):
    """Scrape the Wikipedia article on ``topic`` in every configured language.

    For each code in the module-level ``language_codes`` the article title is
    resolved (``topic`` itself for "en", otherwise via
    ``get_wikipedia_title``), the page is downloaded, and the stripped text of
    every non-empty ``<p>`` element is appended to ``data_dict[lang]``.

    Languages without a matching article and pages that cannot be downloaded
    (network error, timeout, HTTP error status) are reported on stdout and
    skipped. Other errors, such as a missing HTML parser, are not caught.

    Note: results are appended to the global ``data_dict``, so calling this
    twice for the same topic stores its paragraphs twice.

    Args:
        topic: English Wikipedia article title, e.g. "Physics".

    Returns:
        None. The scraped text is stored in ``data_dict``.
    """
    for lang in language_codes:
        try:
            # English needs no lookup; the topic is already the page title
            translated_title = topic if lang == 'en' else get_wikipedia_title(topic, lang)
        except requests.RequestException as e:
            print(f"Error looking up '{topic}' in {lang}: {e}")
            continue

        if not translated_title:
            print(f"Could not find a Wikipedia page for '{topic}' in {lang}. Skipping...")
            continue

        print(translated_title)

        # Human-readable URL for the log output
        url = f"https://{lang}.wikipedia.org/wiki/{translated_title}"
        # Percent-encode the title for the real request so characters such as
        # '?', '#' or '%' are not interpreted as URL syntax; Wikipedia uses
        # underscores in place of spaces in page paths
        encoded_title = quote(translated_title.replace(" ", "_"))
        request_url = f"https://{lang}.wikipedia.org/wiki/{encoded_title}"
        print(f"Scraping: {url}")
        print('')

        try:
            response = requests.get(request_url, timeout=10)
            # Do not harvest paragraphs from 404/5xx error pages
            response.raise_for_status()
        except requests.RequestException as e:
            print(f"Error fetching {url}: {e}")
            continue

        soup = BeautifulSoup(response.text, "lxml")
        for paragraph in soup.find_all("p"):
            text = paragraph.text.strip()
            if text:
                data_dict[lang].append(text)

# Scrape each topic's article in all configured languages; the results
# accumulate in data_dict
topics = [
    'Physics',
    'Mathematics',
    'Biology',
    'Chemistry',
    'Geology',
]
for topic in topics:
    text_scrap(topic)



Physics
Scraping: https://en.wikipedia.org/wiki/Physics

ഭൗതികശാസ്ത്രം
Scraping: https://ml.wikipedia.org/wiki/ഭൗതികശാസ്ത്രം

भौतिक शास्त्र
Scraping: https://hi.wikipedia.org/wiki/भौतिक शास्त्र

இயற்பியல்
Scraping: https://ta.wikipedia.org/wiki/இயற்பியல்

Física
Scraping: https://pt.wikipedia.org/wiki/Física

Physique
Scraping: https://fr.wikipedia.org/wiki/Physique

Natuurkunde
Scraping: https://nl.wikipedia.org/wiki/Natuurkunde

Física
Scraping: https://es.wikipedia.org/wiki/Física

Φυσική
Scraping: https://el.wikipedia.org/wiki/Φυσική

Физика
Scraping: https://ru.wikipedia.org/wiki/Физика

Fysik
Scraping: https://da.wikipedia.org/wiki/Fysik

Fisica
Scraping: https://it.wikipedia.org/wiki/Fisica

Fizik
Scraping: https://tr.wikipedia.org/wiki/Fizik

Fysik
Scraping: https://sv.wikipedia.org/wiki/Fysik

فيزياء
Scraping: https://ar.wikipedia.org/wiki/فيزياء

Mathematics
Scraping: https://en.wikipedia.org/wiki/Mathematics

ഗണിതം
Scraping: https://ml.wikipedia.org/wiki/ഗണിതം

गणित
Scraping

In [9]:
def split_into_sentences(text):
    """Break ``text`` into sentences with numeric citation markers removed.

    The text is tokenized with ``sent_tokenize``; every bracketed number such
    as ``[2]`` is then deleted from each sentence, surrounding whitespace is
    trimmed, and sentences left empty by that cleanup are dropped.

    Returns:
        A list of the cleaned, non-empty sentence strings in original order.
    """
    result = []
    for raw_sentence in sent_tokenize(text):
        # Drop reference markers like [2] before trimming
        stripped = re.sub(r"\[\d+\]", "", raw_sentence).strip()
        if stripped:
            result.append(stripped)
    return result

# Flatten the scraped paragraphs into one record per sentence, each tagged
# with the language code of the page it came from
data = [
    {"Sentence": sentence, "Language": lang}
    for lang, texts in data_dict.items()
    for text in texts
    for sentence in split_into_sentences(text)
]

# Build the DataFrame from the records and preview the first rows
df = pd.DataFrame(data)
print(df.head())

                                            Sentence Language
0  Physics is the scientific study of matter, its...       en
1  Physics is one of the most fundamental scienti...       en
2  A scientist who specializes in the field of ph...       en
3  Physics is one of the oldest academic discipli...       en
4  Over much of the past two millennia, physics, ...       en


In [10]:
# (rows, columns) of the dataset; each row is one sentence
df.shape

(13318, 2)

In [11]:
# Rich preview of the first rows of the sentence/language table
df.head()

Unnamed: 0,Sentence,Language
0,"Physics is the scientific study of matter, its...",en
1,Physics is one of the most fundamental scienti...,en
2,A scientist who specializes in the field of ph...,en
3,Physics is one of the oldest academic discipli...,en
4,"Over much of the past two millennia, physics, ...",en


In [13]:
# Shuffle all rows reproducibly (fixed seed) and renumber the index from 0;
# the rows were appended language by language, so this mixes the languages
df = (
    df.sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

In [14]:
# Save the prepared dataset as CSV (without the index column). Create the
# output folder first so the cell also works when it does not exist yet.
output_path = Path("../scrapped_data/language_detection_dataset.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)
