In [17]:
import re
import nltk
import zeyrek
import pandas as pd
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer

In [2]:
# Fetch the NLTK resources that later cells rely on:
# 'stopwords' backs stopwords.words('turkish') in the stop-word removal step,
# 'punkt' is the tokenizer data package (presumably what word_tokenize needs).
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mbber\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mbber\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### Step 0: Setup and Load Data

In [4]:
# Read the training sentences from the CSV file into a DataFrame
file_path = 'train.csv'
data = pd.read_csv(file_path)

# Preview the top rows as a quick sanity check of the load
preview = data.head()
print(preview)


   ID                                           Sentence
0   0  sınıf , havuz ve açık deniz çalışmalarıyla , t...
1   1  bu standart , sualtında kendini rahat hisseden...
2   2  yapılan araştırmalar , öğrencilerin mevcut dal...
3   3  pdıc öğrencilerinde , psikolojik eğitim ve yet...
4   4  pdıc eğitiminin sağladığı güven ve rahatlık , ...


### Step 1: Text Normalization

In [5]:
def normalize_text(text):
    """Lowercase a Turkish sentence, collapse whitespace and standardise quotes.

    Args:
        text: Raw sentence string.

    Returns:
        The normalised string: Turkish-aware lowercase, every run of
        whitespace replaced by one space, and curly double quotes replaced
        by the plain ASCII double quote.
    """
    # str.lower() is locale-independent: it turns 'I' into 'i' and
    # '\u0130' (dotted capital I) into 'i' + U+0307 (combining dot).
    # Turkish casing requires 'I' -> '\u0131' (dotless i) and '\u0130' -> 'i',
    # so map those two letters explicitly before the generic lowercasing.
    text = text.replace('\u0130', 'i').replace('I', '\u0131').lower()
    # Collapse every run of whitespace into a single space
    text = re.sub(r'\s+', ' ', text)
    # Standardize punctuation: convert smart double quotes to standard quotes
    text = re.sub(r'[\u201C\u201D]', '"', text)
    return text

In [7]:
# Normalise every raw sentence and keep the result in its own column
data['normalized_text'] = data['Sentence'].map(normalize_text)

### Step 2: Removal of HTML Tags and Symbols

In [8]:
def remove_html_tags(text):
    """Return the text content of ``text`` after parsing it with BeautifulSoup's "html.parser"."""
    return BeautifulSoup(text, "html.parser").get_text()

In [9]:
# Strip markup from each normalised sentence
data['clean_text'] = data['normalized_text'].map(remove_html_tags)

  soup = BeautifulSoup(text, "html.parser")


### Step 3: Tokenization

In [10]:
def tokenize_text(text):
    """Split ``text`` into tokens with NLTK's ``word_tokenize`` using the 'turkish' language setting."""
    return word_tokenize(text, language='turkish')

In [11]:
# Tokenize each cleaned sentence into a list of tokens
data['tokens'] = data['clean_text'].map(tokenize_text)

### Step 4: Numerical, Date, and Time Handling

In [12]:
def standardize_dates(text):
    """Replace every slash-separated 'dd/mm/yyyy' date in ``text`` with the token 'DATE'."""
    # 1-2 digit day, 1-2 digit month, 4-digit year, bounded by word boundaries
    date_pattern = re.compile(r'\b\d{1,2}/\d{1,2}/\d{4}\b')
    return date_pattern.sub('DATE', text)

In [13]:
# Replace dates in each cleaned sentence with the DATE placeholder
data['standardized_text'] = data['clean_text'].map(standardize_dates)

### Step 5: Stop Word Removal

In [14]:
# NLTK's Turkish stop-word list, held in a set for fast membership checks
stop_words = set(stopwords.words('turkish'))

def remove_stop_words(tokens):
    """Return the tokens that are not in ``stop_words``, keeping their original order."""
    kept = []
    for token in tokens:
        if token in stop_words:
            continue
        kept.append(token)
    return kept

data['filtered_tokens'] = data['tokens'].map(remove_stop_words)

### Step 6: Lemmatization

In [18]:
def lemmatize_tokens(tokens):
    """Map each token to its first lemma, falling back to the token itself.

    Relies on the module-level ``lemmatizer`` object, whose
    ``lemmatize(word)`` result is indexed as ``result[0][1][0]``
    (first analysis -> lemma list -> first lemma).

    Args:
        tokens: Iterable of token strings.

    Returns:
        A list with one entry per input token: the first lemma when the
        lemmatizer yields one, otherwise the unchanged token.
    """
    lemmatized_tokens = []
    for word in tokens:
        # Analyse each token exactly once; the call is the expensive part
        analyses = lemmatizer.lemmatize(word)
        # Guard both an empty analysis list and an empty lemma list so a
        # single odd token cannot abort the whole column with IndexError
        lemmas = analyses[0][1] if analyses else None
        lemmatized_tokens.append(lemmas[0] if lemmas else word)
    return lemmatized_tokens

In [19]:
# Build the Zeyrek morphological analyzer that lemmatize_tokens looks up by name
lemmatizer = zeyrek.MorphAnalyzer()

# Lemmatize each row's stop-word-filtered token list
data['lemmatized_tokens'] = data['filtered_tokens'].map(lemmatize_tokens)

ValueError: The language 'turkish' is not supported.