In [1]:
import re
from pathlib import Path

import nltk
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [2]:
# Fetch the NLTK resources the later cells rely on: the stop-word lists and
# the Punkt models used by word_tokenize.
# 'punkt_tab' is requested in addition to 'punkt' because recent NLTK
# releases load the tokenizer from that resource; asking for both keeps the
# notebook runnable on older and newer installs.
for resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(resource)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rruba\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rruba\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### Step 0: Setup and Load Data

In [17]:
# Read the CSV into a DataFrame; the path is relative to the notebook's
# working directory.
file_path = 'train_fixed.csv'
data = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first five rows to check which columns were loaded
print(data.head(n=5))


                                         source_text
0  sinif , havuz ve acik deniz calismalariyla , t...
1  bu standart , sualtinda kendini rahat hisseden...
2  yapilan arastirmalar , ogrencilerin mevcut dal...
3  pdic ogrencilerinde , psikolojik egitim ve yet...
4  pdic egitiminin sagladigi guven ve rahatlik , ...


### Step 1: Text Normalization

In [5]:
def normalize_text(text):
    """Lowercase a sentence, collapse whitespace and unify curly double quotes.

    Parameters
    ----------
    text : str or missing value
        Raw sentence. A missing value (``None`` / ``NaN``, which is how pandas
        represents an empty CSV cell) is treated as an empty sentence; any
        other non-string value is converted with ``str()``.

    Returns
    -------
    str
        The lowercased text with every whitespace run replaced by a single
        space and the curly quotes U+201C / U+201D replaced by ``"``.
    """
    if not isinstance(text, str):
        # NaN is a float and has no .lower(); return an empty sentence so the
        # row keeps its position in the frame instead of raising.
        if pd.isna(text):
            return ''
        text = str(text)
    # str.lower() turns 'İ' (U+0130) into 'i' followed by a combining dot
    # (U+0307), which silently breaks token equality; map it to a plain 'i'.
    text = text.replace('\u0130', 'i')
    # Convert text to lowercase
    text = text.lower()
    # Collapse every run of whitespace (spaces, tabs, newlines) to one space
    text = re.sub(r'\s+', ' ', text)
    # Standardize punctuation: curly double quotes -> straight double quote
    text = re.sub(r'[\u201C\u201D]', '"', text)
    return text

In [6]:
# The loaded CSV exposes its sentences in the 'source_text' column (as shown
# by the data.head() preview); it has no 'Sentence' column to read from.
data['normalized_text'] = data['source_text'].apply(normalize_text)

### Step 2: Removal of HTML Tags and Symbols

In [7]:
def clean_text(text):
    """Strip HTML markup and a fixed set of punctuation symbols from a string.

    The text is parsed with BeautifulSoup's "html.parser" and reduced to its
    text content. The characters ``? ) ( : % » ° | , . !`` are then deleted
    and ``/ - ;`` are each replaced by a single space.

    NOTE(review): ':' is deleted and '/' becomes a space here, while the
    notebook's Step 3 patterns search this function's output for 'hh:mm'
    times and 'd/m/y' dates. Confirm the intended order of the two steps.
    """
    markup_free = BeautifulSoup(text, "html.parser").get_text()

    # First pass removes symbols outright, second pass turns separators
    # into spaces so the words on either side stay apart.
    without_symbols = re.sub(r'[\?\)\(\:\%\»\°\|\,\.\!]', '', markup_free)
    return re.sub(r'[\/\-\;]', ' ', without_symbols)

In [18]:
# Run clean_text over every normalized sentence and keep the result as a new column
data['clean_text'] = data['normalized_text'].map(clean_text)

  soup = BeautifulSoup(text, "html.parser").get_text()


### Step 3: Numerical, Date, and Time Handling

In [19]:
def standardize_text(text):
    """Replace times, dates and remaining digit runs with placeholder tokens.

    Three substitutions are applied in sequence:
    ``d:dd`` / ``dd:dd`` -> ``TIME``, ``d/d/yy`` .. ``dd/dd/yyyy`` -> ``DATE``,
    and any remaining run of digits -> ``NUMBER``.

    Returns the rewritten string.
    """
    # Order matters: the catch-all digit rule goes last so it cannot consume
    # the digits of a time or a date before those rules have run.
    placeholder_rules = (
        (r'\d{1,2}:\d{2}', "TIME"),
        (r'\d{1,2}/\d{1,2}/\d{2,4}', "DATE"),
        (r'\d+', "NUMBER"),
    )
    for pattern, placeholder in placeholder_rules:
        text = re.sub(pattern, placeholder, text)
    return text

In [20]:
# Swap times, dates and numbers in the cleaned sentences for placeholder tokens
data['standardized_text'] = data['clean_text'].map(standardize_text)

### Step 4: Tokenization

In [21]:
def tokenize_text(text):
    """Split a string into word tokens.

    Delegates to NLTK's ``word_tokenize`` with ``language='turkish'`` and
    returns its result unchanged.
    """
    return word_tokenize(text, language='turkish')

In [22]:
# Word-level tokens for every standardized sentence
data['tokens'] = data['standardized_text'].map(tokenize_text)

### Step 5: Stop Word Removal

In [23]:
# NLTK's Turkish stop-word list, held in a set for constant-time membership tests.
# NOTE(review): the source_text preview looks ASCII-folded ('sinif', 'acik');
# any list entries spelled with Turkish diacritics would not match such
# tokens. Confirm the list fits this data.
stop_words = {word for word in stopwords.words('turkish')}

def remove_stop_words(tokens):
    """Return the tokens that are not in ``stop_words``, in their original order.

    The comparison is an exact, case-sensitive string match.
    """
    kept = []
    for token in tokens:
        if token in stop_words:
            continue
        kept.append(token)
    return kept

# Drop stop words from every token list
data['filtered_tokens'] = data['tokens'].map(remove_stop_words)

In [24]:
def char_tokenize_text(text):
    """Split a string into a list of its individual characters.

    Every character is kept, including spaces; an empty string gives an
    empty list.
    """
    return [character for character in text]

# Character-level tokens, built from the standardized text (not the word tokens)
data['char_tokens'] = data['standardized_text'].map(char_tokenize_text)

In [25]:
# Persist the frame, including every intermediate column, as CSV.
# The target folder is created first: to_csv does not create missing
# directories and would otherwise fail on a fresh checkout.
output_path = Path("./data/cleaned.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
data.to_csv(output_path)