In [1]:
import pandas as pd
import numpy as np
import os
import re

def load_and_clean_poems(data_path):
    """Read the poem CSV and normalise the text in its ``cleaned_poem`` column.

    Args:
        data_path: Path of a CSV file that contains a ``cleaned_poem`` column.

    Returns:
        The loaded DataFrame with ``cleaned_poem`` replaced by the cleaned text.
        Missing cells become empty strings.
    """
    ds = pd.read_csv(data_path)

    # Inspecting the data showed leftover HTML debris such as 'pp' and 'br';
    # strip it out here.
    def clean_poem(poem):
        # A missing cell is NaN; str(NaN) would inject the literal word "nan"
        # into the corpus, so map it to an empty string instead.
        if pd.isna(poem):
            return ""
        poem = str(poem)
        # Remove 'pp' together with any word characters that follow it.
        poem = re.sub(r"pp[\w]*", "", poem)
        # Turn <br>, <br/> and bare 'br' into a line break ...
        poem = re.sub(r"<br>|<br/>|br", "\n", poem)
        # ... which, like every other whitespace run, is then collapsed to a
        # single space.
        poem = re.sub(r"\s+", " ", poem).strip()
        return poem

    ds['cleaned_poem'] = ds['cleaned_poem'].apply(clean_poem)
    return ds


# Absolute path of the training CSV on the author's machine; adjust it for
# your own environment before running.
data_path = "/Users/mertgenc/DataFile/RNN-LSTM-GRU/turkish_poem/train_dataset.csv"
ds_cleaned = load_and_clean_poems(data_path)

# Print the first 5 cleaned poems
print(ds_cleaned['cleaned_poem'].head())
#print(ds_cleaned)


0    derhal fark edilen melek o yüce erk daha gizle...
1    tedbirini terkeyle takdir hüdanındır sen yoksu...
2    karanlık basmadan ovalarıma kainatın duru ille...
3    ince narin kanatlı uçurtmadır kalbin duru yağm...
4    her gün bir yerden göçmek ne iyi her gün bir y...
Name: cleaned_poem, dtype: object


In [2]:
# Concatenate every cleaned poem into one corpus string, using " / " as the
# boundary between poems.
full_text = " / ".join(ds_cleaned['cleaned_poem'])
# Preview the first 1000 characters of the corpus.
print(full_text[:1000])

derhal fark edilen melek o yüce erk daha gizlendiği yere girdiği gibi tertemiz alev alev ve dikilerek yalvardı bırakıp da her bir talebi şaşkını bir tacir olarak kalmasına izin verilsin diye eskiden neydiyse öyle okuyamazdı o böylesi sözün fazla gelirdi bir âlime bile melek ona yazılı sayfasını buyurganca gösteriyor gösteriyordu tekrar ediyordu ısrarla oku o melek önüne eğdi kafasını okuyandı artık o andan itibaren ve bilendi ve uyandı ve hüküm veren osman tuğlu / tedbirini terkeyle takdir hüdanındır sen yoksun o benlikler hep vehm ü gümanındır birden bire bul aşkı bu tuhfe bulanındır devran olalı devran erbabı safanındır keder neyler gam halkı cihanındır koyma kadehi elden söz piri muganındır seyrettim uşşaka mataf olmuş teklif ü tekellüften sükkanı maaf olmuş bir neşe gelüp meclis bi havf u hilaf olmuş gam sohbeti yad olmaz meşrebleri saf olmuş keder neyler gam halkı cihanındır koyma kadehi elden söz piri muganındır dil sen o dildara layık mı değilsin ya davayı mahabete sadık mı deği

In [3]:
# Index the characters so the text can be handed to the model as integers:
# `chars` is the sorted vocabulary, and the two dicts map in each direction.
chars = sorted(set(full_text))
idx_to_char = dict(enumerate(chars))
char_to_idx = {symbol: position for position, symbol in idx_to_char.items()}

In [4]:
# Split the corpus into X / y pairs so the model can learn to predict the 41st
# character from the 40 characters before it: each window of `seq_lenght`
# characters is an input and the character right after it is the target.
seq_lenght = 40

# Encode the corpus once up front, so each character is looked up a single
# time instead of once for every overlapping window that contains it.
encoded_text = [char_to_idx[c] for c in full_text]

sequences = []
next_chars = []

for i in range(len(encoded_text) - seq_lenght):
    sequences.append(encoded_text[i:i + seq_lenght])
    next_chars.append(encoded_text[i + seq_lenght])
    

In [5]:
# Show the first encoded window (a list of character indices).
print(sequences[0])

[16, 17, 30, 20, 13, 24, 0, 18, 13, 30, 23, 0, 17, 16, 21, 24, 17, 26, 0, 25, 17, 24, 17, 23, 0, 27, 0, 37, 55, 15, 17, 0, 17, 30, 23, 0, 16, 13, 20, 13]


In [6]:
# Show the target character index paired with the window at position 7.
print(next_chars[7])

16


In [14]:
import numpy as np
from tensorflow.keras.utils import to_categorical

# Stack the encoded windows into an array, then add a trailing axis of size 1
# and cast to float32 so each time step carries a single numeric feature.
X = np.array(sequences)
X = X.reshape((X.shape[0], X.shape[1], 1)).astype('float32')
# One-hot encode the target indices over the whole vocabulary.
y = to_categorical(next_chars, num_classes=len(chars))

In [15]:
# Check the array dimensions: (number of windows, window length, 1).
print(X.shape)

(2899719, 40, 1)


## Model 

In [16]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# Number of distinct characters, i.e. the width of the output layer
num_chars = len(chars)


# Two stacked LSTM layers, each followed by dropout, ending in a softmax
# over the vocabulary.
model = Sequential([
    LSTM(256, input_shape=(X.shape[1], X.shape[2]), return_sequences=True),
    Dropout(0.2),
    LSTM(256),
    Dropout(0.2),
    Dense(num_chars, activation='softmax'),
])

# Compile
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Callbacks: both watch the training loss (no validation data is passed to fit).
checkpoint_path = "bestsiir_model.h5"
early_stop = EarlyStopping(monitor='loss', patience=5, restore_best_weights=True)
checkpoint = ModelCheckpoint(
    filepath=checkpoint_path, monitor='loss', save_best_only=True, mode='min', verbose=1
)

# Train the model
model.fit(X, y, batch_size=128, epochs=30, callbacks=[early_stop, checkpoint])


  super().__init__(**kwargs)


Epoch 1/30
[1m  830/22655[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m52:32[0m 144ms/step - accuracy: 0.1831 - loss: 2.8798

KeyboardInterrupt: 

In [18]:
import pandas as pd
import numpy as np
import re
import pickle
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Embedding, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
import tensorflow as tf

class TurkishPoemGenerator:
    """Character-level poem generator built on a stacked-LSTM Keras model.

    Typical flow: ``load_and_clean_poems`` -> ``prepare_data`` ->
    ``build_model`` -> ``train_model`` -> ``generate_text``. The vocabulary
    attributes are filled in by ``prepare_data`` (or ``load_model_and_chars``)
    and ``model`` by ``build_model`` (or ``load_model_and_chars``).
    """

    def __init__(self, seq_length=60):
        """Store the context window length; vocabulary and model start unset."""
        self.seq_length = seq_length
        self.chars = None        # sorted list of distinct corpus characters
        self.char_to_idx = None  # character -> integer index
        self.idx_to_char = None  # integer index -> character
        self.model = None        # Keras model once built or loaded

    def load_and_clean_poems(self, data_path):
        """Load the poem CSV and clean its ``cleaned_poem`` column.

        Args:
            data_path: Path of a CSV file containing a ``cleaned_poem`` column.

        Returns:
            DataFrame with cleaned text; rows whose cleaned text is empty are
            dropped.
        """
        ds = pd.read_csv(data_path)

        # Inspecting the data showed debris such as 'pp', 'br', etc.; remove it.
        def clean_poem(poem):
            if pd.isna(poem):
                return ""

            poem = str(poem)
            # Remove HTML tags
            poem = re.sub(r'<[^>]+>', '', poem)
            # Remove specific artifacts: 'pp...' is deleted, 'br...' becomes a
            # line break.
            # NOTE(review): neither pattern is anchored to a word boundary, so
            # they also match inside ordinary words - confirm this is intended.
            poem = re.sub(r'pp\w*', '', poem)
            poem = re.sub(r'br\w*', '\n', poem)
            # Collapse runs of spaces/tabs and of newlines, keeping line breaks
            poem = re.sub(r'[ \t]+', ' ', poem)
            poem = re.sub(r'\n+', '\n', poem)
            # Keep only word characters, whitespace and the listed punctuation
            poem = re.sub(r'[^\w\s\nüğıöşçÜĞIİÖŞÇ.,!?;:()-]', '', poem)

            return poem.strip()

        ds['cleaned_poem'] = ds['cleaned_poem'].apply(clean_poem)
        # Drop poems that ended up empty after cleaning.
        ds = ds[ds['cleaned_poem'].str.len() > 0]

        return ds

    def prepare_data(self, ds_cleaned, validation_split=0.2):
        """Build the vocabulary and the (window, next character) training pairs.

        Args:
            ds_cleaned: DataFrame with a ``cleaned_poem`` column.
            validation_split: Fraction of the pairs held out for validation.

        Returns:
            ``(X_train, X_val, y_train, y_val)`` where X holds integer-encoded
            windows of ``seq_length`` characters and y the one-hot encoded
            character that follows each window.
        """
        # Join poems with an explicit separator that marks poem boundaries
        full_text = "\n\n###POEM_SEPARATOR###\n\n".join(ds_cleaned['cleaned_poem'])

        # Index the characters so the text can be fed to the model as integers.
        self.chars = sorted(list(set(full_text)))
        self.char_to_idx = {char: idx for idx, char in enumerate(self.chars)}
        self.idx_to_char = {idx: char for char, idx in self.char_to_idx.items()}

        # Each window of seq_length characters is an input; the character
        # right after it is the target the model learns to predict.
        sequences = []
        next_chars = []

        # Advance the window 3 characters at a time to thin out the pairs.
        step = 3
        for i in range(0, len(full_text) - self.seq_length, step):
            seq = full_text[i:i + self.seq_length]
            next_char = full_text[i + self.seq_length]
            sequences.append([self.char_to_idx[c] for c in seq])
            next_chars.append(self.char_to_idx[next_char])

        X = np.array(sequences)
        y = to_categorical(next_chars, num_classes=len(self.chars))

        # NOTE(review): neighbouring windows overlap, so this shuffled split
        # puts near-duplicate windows in both sets - validation loss is
        # probably optimistic; confirm whether a contiguous split is wanted.
        X_train, X_val, y_train, y_val = train_test_split(
            X, y, test_size=validation_split, random_state=42
        )

        print(f"Training sequences: {len(X_train)}")
        print(f"Validation sequences: {len(X_val)}")
        print(f"Vocabulary size: {len(self.chars)}")

        return X_train, X_val, y_train, y_val

    def build_model(self):
        """Create and compile the network, store it on ``self.model`` and return it.

        Requires the vocabulary (``self.chars``) to be set already.
        """
        model = Sequential([
            # Turn character indices into dense vectors the network can learn from.
            Embedding(len(self.chars), 128, input_length=self.seq_length),

            LSTM(256, return_sequences=True, dropout=0.2, recurrent_dropout=0.2),
            BatchNormalization(),  # intended to speed up and stabilise training

            LSTM(128, dropout=0.2, recurrent_dropout=0.2),
            BatchNormalization(),

            # Dense layers
            Dense(256, activation='relu'),
            Dropout(0.3),
            Dense(128, activation='relu'),
            Dropout(0.3),
            Dense(len(self.chars), activation='softmax')
        ])

        # Adam with an explicit starting learning rate
        optimizer = Adam(learning_rate=0.001)
        model.compile(
            loss='categorical_crossentropy',
            optimizer=optimizer,
            metrics=['accuracy']
        )

        self.model = model
        return model

    def train_model(self, X_train, X_val, y_train, y_val, epochs=50, batch_size=256):
        """Fit ``self.model`` and return the Keras training history.

        All three callbacks monitor ``val_loss``: early stopping (patience 8),
        a best-only checkpoint written to ``best_turkish_poem_model.h5``, and
        learning-rate halving on plateau (patience 4, floor 0.0001).
        """
        callbacks = [
            EarlyStopping(
                monitor='val_loss',
                patience=8,
                restore_best_weights=True,
                verbose=1
            ),
            ModelCheckpoint(
                'best_turkish_poem_model.h5',
                monitor='val_loss',
                save_best_only=True,
                mode='min',
                verbose=1
            ),
            ReduceLROnPlateau(
                monitor='val_loss',
                factor=0.5,
                patience=4,
                min_lr=0.0001,
                verbose=1
            )
        ]

        history = self.model.fit(
            X_train, y_train,
            batch_size=batch_size,
            epochs=epochs,
            validation_data=(X_val, y_val),
            callbacks=callbacks,
            verbose=1
        )

        return history

    def generate_text(self, seed_text, length=500, temperature=1.0):
        """Sample ``length`` characters that continue ``seed_text``.

        Args:
            seed_text: Prompt to continue. It is returned unchanged at the
                start of the result.
            length: Number of characters to generate.
            temperature: Sampling temperature; must be positive. Lower values
                make the sampling more conservative.

        Returns:
            ``seed_text`` followed by the generated characters.

        Raises:
            ValueError: If no model is available or ``temperature`` is not
                positive.
        """
        if self.model is None:
            raise ValueError("Model not trained yet!")
        if temperature <= 0:
            raise ValueError("temperature must be positive")

        generated = seed_text

        for _ in range(length):
            # The model context is the most recent seq_length characters.
            # A short seed is padded on the LEFT so the context still ends
            # with the seed itself, and the padding never reaches the output.
            seq = generated[-self.seq_length:].rjust(self.seq_length)
            # Characters missing from the vocabulary fall back to index 0.
            seq_indices = [self.char_to_idx.get(c, 0) for c in seq]

            # Predict the distribution of the next character
            x = np.array([seq_indices])
            predictions = self.model.predict(x, verbose=0)[0]

            # Apply temperature and renormalise
            predictions = np.log(predictions + 1e-8) / temperature
            exp_predictions = np.exp(predictions)
            predictions = exp_predictions / np.sum(exp_predictions)

            # Sample from the distribution
            next_idx = np.random.choice(len(self.chars), p=predictions)
            next_char = self.idx_to_char[next_idx]

            generated += next_char

        return generated

    def save_model_and_chars(self, model_path='turkish_poem_model.h5', chars_path='chars.pkl'):
        """Save the model and the character mappings.

        Raises:
            ValueError: If there is no model to save.
        """
        if self.model is None:
            raise ValueError("Model not trained yet!")
        self.model.save(model_path)
        with open(chars_path, 'wb') as f:
            pickle.dump((self.chars, self.char_to_idx, self.idx_to_char), f)
        print(f"Model saved to {model_path}")
        print(f"Character mappings saved to {chars_path}")

    def load_model_and_chars(self, model_path='turkish_poem_model.h5', chars_path='chars.pkl'):
        """Load a saved model and the character mappings written alongside it."""
        from tensorflow.keras.models import load_model

        self.model = load_model(model_path)
        # SECURITY: pickle.load can execute arbitrary code; only open mapping
        # files that you created yourself.
        with open(chars_path, 'rb') as f:
            self.chars, self.char_to_idx, self.idx_to_char = pickle.load(f)
        print("Model and character mappings loaded successfully!")

# Usage example
def main():
    """Run the whole pipeline once: load, prepare, train, save, then sample a poem."""
    poem_generator = TurkishPoemGenerator(seq_length=60)

    csv_path = "/Users/mertgenc/DataFile/RNN-LSTM-GRU/turkish_poem/test_dataset.csv"
    cleaned_frame = poem_generator.load_and_clean_poems(csv_path)

    train_inputs, val_inputs, train_targets, val_targets = poem_generator.prepare_data(cleaned_frame)

    # Build the network and show its layer summary before training starts.
    network = poem_generator.build_model()
    network.summary()

    history = poem_generator.train_model(
        train_inputs, val_inputs, train_targets, val_targets, epochs=25
    )

    poem_generator.save_model_and_chars()

    # Sample a short poem from a fixed prompt as a quick sanity check.
    prompt = "güzel bir gün"
    sample = poem_generator.generate_text(prompt, length=300, temperature=0.8)
    print("\nGenerated Poem:")
    print(sample)


if __name__ == "__main__":
    main()

Training sequences: 235344
Validation sequences: 58836
Vocabulary size: 64




Epoch 1/25
[1m331/920[0m [32m━━━━━━━[0m[37m━━━━━━━━━━━━━[0m [1m5:03[0m 516ms/step - accuracy: 0.1535 - loss: 3.1751

KeyboardInterrupt: 

In [None]:
import pandas as pd
import numpy as np
import re
import pickle
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Embedding, GRU
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
import tensorflow as tf

# Enable mixed precision (the author targets an A100 GPU): make
# 'mixed_float16' the global Keras dtype policy.
from tensorflow.keras.mixed_precision import Policy
policy = Policy('mixed_float16')
tf.keras.mixed_precision.set_global_policy(policy)

class FastTurkishPoemGenerator:
    """Lightweight character-level poem generator (single GRU layer).

    Trades corpus size and model capacity for training speed. Typical flow:
    ``load_and_clean_poems`` -> ``prepare_data_fast`` -> ``build_fast_model``
    -> ``train_fast`` -> ``generate_text``.
    """

    def __init__(self, seq_length=40):  # Reduced from 60
        """Store the context window length; vocabulary and model start unset."""
        self.seq_length = seq_length
        self.chars = None        # sorted list of distinct corpus characters
        self.char_to_idx = None  # character -> integer index
        self.idx_to_char = None  # integer index -> character
        self.model = None        # Keras model once built or loaded

    def load_and_clean_poems(self, data_path, max_poems=50000):  # Limit dataset size
        """Load the poem CSV, keep at most ``max_poems`` rows and clean the text.

        Args:
            data_path: Path of a CSV file containing a ``cleaned_poem`` column.
            max_poems: Maximum number of leading rows to keep.

        Returns:
            DataFrame with cleaned text; rows whose cleaned text is empty are
            dropped.
        """
        ds = pd.read_csv(data_path)

        # Take only the first max_poems rows for faster training
        if len(ds) > max_poems:
            ds = ds.head(max_poems)
            print(f"Limited dataset to {max_poems} poems for faster training")

        def clean_poem(poem):
            if pd.isna(poem):
                return ""

            poem = str(poem)
            # Strip HTML tags
            poem = re.sub(r'<[^>]+>', '', poem)
            # Replace the 'pp...' / 'br...' artifacts with a space.
            # NOTE(review): the pattern is not anchored to a word boundary, so
            # it also matches inside ordinary words - confirm this is intended.
            poem = re.sub(r'pp\w*|br\w*', ' ', poem)
            # Keep only word characters, whitespace and the listed punctuation
            poem = re.sub(r'[^\w\s\nüğıöşçÜĞIİÖŞÇ.,!?;:()-]', '', poem)
            # Collapse every whitespace run (line breaks included) to one space
            poem = re.sub(r'\s+', ' ', poem)

            return poem.strip()

        ds['cleaned_poem'] = ds['cleaned_poem'].apply(clean_poem)
        ds = ds[ds['cleaned_poem'].str.len() > 0]

        return ds

    def prepare_data_fast(self, ds_cleaned, validation_split=0.2):
        """Build the vocabulary and a thinned-out set of training pairs.

        Args:
            ds_cleaned: DataFrame with a ``cleaned_poem`` column.
            validation_split: Fraction of the pairs held out for validation.

        Returns:
            ``(X_train, X_val, y_train, y_val)`` where X holds int32-encoded
            windows of ``seq_length`` characters and y the one-hot encoded
            character that follows each window.
        """
        # Join poems with a simple separator
        full_text = " | ".join(ds_cleaned['cleaned_poem'])

        # Limit text length for faster processing
        if len(full_text) > 1000000:  # 1M chars max
            full_text = full_text[:1000000]
            print("Limited text to 1M characters for faster training")

        # Create character mappings
        self.chars = sorted(list(set(full_text)))
        self.char_to_idx = {char: idx for idx, char in enumerate(self.chars)}
        self.idx_to_char = {idx: char for char, idx in self.char_to_idx.items()}

        # Create sequences with a large step size to reduce data
        sequences = []
        next_chars = []

        step = 10  # Much larger step for fewer sequences
        for i in range(0, len(full_text) - self.seq_length, step):
            seq = full_text[i:i + self.seq_length]
            next_char = full_text[i + self.seq_length]
            sequences.append([self.char_to_idx[c] for c in seq])
            next_chars.append(self.char_to_idx[next_char])

        # Safety cap. With the 1M-character limit and step 10 above, at most
        # about 100K windows are produced, so this only triggers if those
        # limits are loosened.
        if len(sequences) > 200000:
            sequences = sequences[:200000]
            next_chars = next_chars[:200000]
            print("Limited to 200K sequences for faster training")

        # Convert to numpy arrays
        X = np.array(sequences, dtype=np.int32)
        y = to_categorical(next_chars, num_classes=len(self.chars))

        # Split data
        # NOTE(review): neighbouring windows overlap, so a shuffled split puts
        # near-duplicate windows in both sets - confirm this is acceptable.
        X_train, X_val, y_train, y_val = train_test_split(
            X, y, test_size=validation_split, random_state=42
        )

        print(f"Training sequences: {len(X_train)}")
        print(f"Validation sequences: {len(X_val)}")
        print(f"Vocabulary size: {len(self.chars)}")

        return X_train, X_val, y_train, y_val

    def build_fast_model(self):
        """Create and compile the small GRU network, store and return it.

        Requires the vocabulary (``self.chars``) to be set already.
        """
        model = Sequential([
            # Smaller embedding
            Embedding(len(self.chars), 64, input_length=self.seq_length),

            # Single GRU layer.
            # NOTE(review): Keras documents that its fast cuDNN kernel needs
            # recurrent_dropout == 0 - confirm this setting is worth the cost.
            GRU(128, dropout=0.2, recurrent_dropout=0.1),

            # Smaller dense layers
            Dense(128, activation='relu'),
            Dropout(0.3),
            # The output layer is pinned to float32 explicitly.
            Dense(len(self.chars), activation='softmax', dtype='float32')
        ])

        # Higher learning rate than the Adam default
        optimizer = Adam(learning_rate=0.003)
        model.compile(
            loss='categorical_crossentropy',
            optimizer=optimizer,
            metrics=['accuracy']
        )

        self.model = model
        return model

    def train_fast(self, X_train, X_val, y_train, y_val, epochs=20, batch_size=512):
        """Fit ``self.model`` with a large batch size and return the history.

        All three callbacks monitor ``val_loss``: early stopping (patience 3),
        a best-only checkpoint written to ``fast_turkish_poem_model.h5``, and
        learning-rate halving on plateau (patience 2, floor 0.0001).
        """
        callbacks = [
            EarlyStopping(
                monitor='val_loss',
                patience=3,  # Reduced patience
                restore_best_weights=True,
                verbose=1
            ),
            ModelCheckpoint(
                'fast_turkish_poem_model.h5',
                monitor='val_loss',
                save_best_only=True,
                mode='min',
                verbose=1
            ),
            ReduceLROnPlateau(
                monitor='val_loss',
                factor=0.5,
                patience=2,  # Reduced patience
                min_lr=0.0001,
                verbose=1
            )
        ]

        history = self.model.fit(
            X_train, y_train,
            batch_size=batch_size,
            epochs=epochs,
            validation_data=(X_val, y_val),
            callbacks=callbacks,
            verbose=1
        )

        return history

    def generate_text(self, seed_text, length=300, temperature=0.8):
        """Sample ``length`` characters that continue ``seed_text``.

        Args:
            seed_text: Prompt to continue. It is returned unchanged at the
                start of the result.
            length: Number of characters to generate.
            temperature: Sampling temperature; must be positive. Lower values
                make the sampling more conservative.

        Returns:
            ``seed_text`` followed by the generated characters.

        Raises:
            ValueError: If no model is available or ``temperature`` is not
                positive.
        """
        if self.model is None:
            raise ValueError("Model not trained yet!")
        if temperature <= 0:
            raise ValueError("temperature must be positive")

        generated = seed_text

        for _ in range(length):
            # The model context is the most recent seq_length characters.
            # A short seed is padded on the LEFT so the context still ends
            # with the seed itself, and the padding never reaches the output.
            seq = generated[-self.seq_length:].rjust(self.seq_length)
            # Characters missing from the vocabulary fall back to index 0.
            seq_indices = [self.char_to_idx.get(c, 0) for c in seq]

            x = np.array([seq_indices])
            predictions = self.model.predict(x, verbose=0)[0]

            # Apply temperature and renormalise
            predictions = np.log(predictions + 1e-8) / temperature
            exp_predictions = np.exp(predictions)
            predictions = exp_predictions / np.sum(exp_predictions)

            next_idx = np.random.choice(len(self.chars), p=predictions)
            next_char = self.idx_to_char[next_idx]

            generated += next_char

        return generated

    def save_model_and_chars(self, model_path='fast_turkish_poem_model.h5', chars_path='chars.pkl'):
        """Save the model and the character mappings.

        Raises:
            ValueError: If there is no model to save.
        """
        if self.model is None:
            raise ValueError("Model not trained yet!")
        self.model.save(model_path)
        with open(chars_path, 'wb') as f:
            pickle.dump((self.chars, self.char_to_idx, self.idx_to_char), f)
        print(f"Model saved to {model_path}")
        print(f"Character mappings saved to {chars_path}")

    def load_model_and_chars(self, model_path='fast_turkish_poem_model.h5', chars_path='chars.pkl'):
        """Load a saved model and the character mappings written alongside it."""
        from tensorflow.keras.models import load_model

        self.model = load_model(model_path)
        # SECURITY: pickle.load can execute arbitrary code; only open mapping
        # files that you created yourself.
        with open(chars_path, 'rb') as f:
            self.chars, self.char_to_idx, self.idx_to_char = pickle.load(f)
        print("Model and character mappings loaded successfully!")

# Optimized usage for A100
def main():
    """Run the fast pipeline once: load, prepare, train, save, then sample a poem."""
    # Allow TensorFloat-32 math on GPUs that support it (the author targets
    # an A100).
    tf.config.experimental.enable_tensor_float_32_execution(True)

    # Initialize generator with faster settings
    generator = FastTurkishPoemGenerator(seq_length=40)

    # Load and clean data with limits
    data_path = "train_dataset.csv"  # Update with your path
    ds_cleaned = generator.load_and_clean_poems(data_path, max_poems=30000)

    # Prepare data with aggressive sampling
    X_train, X_val, y_train, y_val = generator.prepare_data_fast(ds_cleaned)

    # Build lightweight model
    model = generator.build_fast_model()
    model.summary()

    print("\nStarting training with A100 optimizations...")

    # Train with large batch size
    history = generator.train_fast(X_train, X_val, y_train, y_val,
                                  epochs=15, batch_size=1024)  # Large batch for A100

    # Save model
    generator.save_model_and_chars()

    # Generate sample text
    seed_text = "güzel bir gün"
    generated_poem = generator.generate_text(seed_text, length=200, temperature=0.8)
    print("\nGenerated Poem:")
    print(generated_poem)

if __name__ == "__main__":
    main()