In [18]:
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
import random

In [19]:
def encode_campaigns(sequences, campaigns):
    """One-hot encode each campaign sequence against a fixed vocabulary.

    Parameters
    ----------
    sequences : iterable of sequences of str
        Campaign sequences to encode. Every item must appear in ``campaigns``.
    campaigns : list of str
        The full campaign vocabulary, passed to ``OneHotEncoder`` as the
        explicit category list so every encoded matrix has
        ``len(campaigns)`` columns.

    Returns
    -------
    numpy.ndarray
        * If all sequences have the same length: a 3-D array of shape
          ``(n_sequences, sequence_length, len(campaigns))``.
        * If lengths differ: a 1-D ``object`` array whose i-th element is the
          2-D one-hot matrix of the i-th sequence (a rectangular array cannot
          hold ragged data).

    Raises
    ------
    ValueError
        Raised by ``OneHotEncoder`` when a sequence contains a value that is
        not in ``campaigns``.
    """
    enc = OneHotEncoder(categories=[campaigns], sparse_output=False)
    # The categories are fixed up front, so a single fit is enough; refitting
    # inside the loop only repeated identical work.
    enc.fit(np.array(campaigns).reshape(-1, 1))

    encoded_sequences = []
    for sequence in sequences:
        if len(sequence) == 0:
            # An empty sequence has no rows to transform; represent it as an
            # empty matrix with the right number of columns.
            encoded_sequences.append(np.zeros((0, len(campaigns))))
            continue
        encoded_sequences.append(
            enc.transform(np.array(sequence).reshape(-1, 1))
        )

    # Equal-length input stacks cleanly into one 3-D array.
    if len({encoded.shape for encoded in encoded_sequences}) <= 1:
        return np.array(encoded_sequences)

    # Ragged input: np.array() would raise on an inhomogeneous shape, so fill
    # an object array element by element instead.
    ragged = np.empty(len(encoded_sequences), dtype=object)
    for position, encoded in enumerate(encoded_sequences):
        ragged[position] = encoded
    return ragged

In [20]:
def generate_random_sequences(campaigns, num_sequences=1000000, min_length=5,
                              max_length=15, seed=None):
    """Generate random campaign sequences of random length.

    Parameters
    ----------
    campaigns : sequence
        Population to sample from (with replacement).
    num_sequences : int, default 1000000
        How many sequences to generate.
    min_length, max_length : int, default 5 and 15
        Inclusive bounds for the length of each generated sequence.
    seed : int or None, default None
        When given, sampling uses a private ``random.Random(seed)`` so the
        result is reproducible and the global RNG state is left untouched.
        When ``None``, the module-level ``random`` generator is used, exactly
        as before.

    Returns
    -------
    list of list
        ``num_sequences`` lists, each holding between ``min_length`` and
        ``max_length`` items drawn from ``campaigns``.

    Raises
    ------
    ValueError
        If ``min_length`` is negative or greater than ``max_length``.
    """
    if min_length < 0 or min_length > max_length:
        raise ValueError(
            f"invalid length bounds: min_length={min_length}, "
            f"max_length={max_length}"
        )

    # Both the module and a Random instance expose randint()/choices().
    rng = random if seed is None else random.Random(seed)

    # The length is drawn before the items, matching the original draw order
    # so a globally seeded run yields the same sequences as before.
    return [
        rng.choices(campaigns, k=rng.randint(min_length, max_length))
        for _ in range(num_sequences)
    ]

In [21]:
# Vocabulary of campaign identifiers used to build the synthetic sequences.
campaigns = [
    'seker_sakiz', 'cikolata_biskuvi', 'cips', 'gevrek', 'bebek',
    'sampuan_dusjeli', 'sabun', 'kisisel_bakim', 'camasir', 'bulasik',
    'ev_temizligi', 'makarna_pirinc_bakliyat', 'hazirgida_baharat',
    'sigara', 'pasta', 'peynir_tereyagi', 'dondurulmus', 'yumurta',
    'salam_sosis_sucuk', 'kahve', 'cay', 'alet', 'sos', 'ekmek',
    'sivi_yag', 'meyve_sebze', 'maden_suyu', 'icecek', 'kolonya',
    'konserve_salca', 'pecete', 'mangal', 'poset', 'recel_bal',
    'porselen', 'dondurma', 'kedi_kopek', 'kuruyemis', 'plastik',
    'su', 'sut', 'ayran_yogurt', 'pil'
]

# Seed the global RNG so that re-running this cell (or Restart & Run All)
# regenerates the same synthetic sequences instead of a fresh random sample.
random.seed(42)
campaign_sequences = generate_random_sequences(campaigns)

In [23]:
# Pad sequences to the maximum length so every row has the same number of
# positions; the empty string marks "no campaign at this position".
pad_token = ''
max_length = max(len(seq) for seq in campaign_sequences)
padded_sequences = [
    seq + [pad_token] * (max_length - len(seq)) for seq in campaign_sequences
]

# Encode campaigns using LabelEncoder.
# The encoder is fitted ONCE on the whole vocabulary. Fitting it separately on
# every row (as before) gave each row its own private numbering, so the same
# integer meant different campaigns in different rows and the features fed to
# k-means were not comparable.
vocabulary = sorted(
    {token for seq in campaign_sequences for token in seq} | {pad_token}
)
label_encoder = LabelEncoder()
label_encoder.fit(vocabulary)
# A plain dict lookup avoids materialising a huge string array just to call
# label_encoder.transform(); ids match label_encoder.classes_ positions.
token_to_id = {str(token): idx for idx, token in enumerate(label_encoder.classes_)}
encoded_sequences = np.array(
    [[token_to_id[token] for token in seq] for seq in padded_sequences]
)

# Apply one-hot encoding (one group of indicator columns per position).
# Sparse output keeps memory proportional to the number of positions rather
# than positions x vocabulary size; KMeans accepts sparse input directly.
# Call .toarray() on the result if a dense matrix is really needed.
enc = OneHotEncoder(categories='auto', sparse_output=True)
encoded_sequences_one_hot = enc.fit_transform(encoded_sequences)

# Apply k-means clustering
k = 12  # Number of clusters
# Fixed random_state makes the cluster assignment reproducible across runs.
kmeans = KMeans(n_clusters=k, random_state=42)
kmeans.fit(encoded_sequences_one_hot)

cluster_labels = kmeans.labels_

# Assign clusters to campaign sequences (original, unpadded sequences).
clustered_sequences = {}
for seq, label in zip(campaign_sequences, cluster_labels):
    # int() turns the NumPy integer label into a plain Python int key.
    clustered_sequences.setdefault(int(label), []).append(seq)

# Write one section per cluster, in ascending cluster-id order.
output_clusters_file = 'clusters_output.txt'
with open(output_clusters_file, 'w', encoding='utf-8') as f:
    for cluster in sorted(clustered_sequences):
        f.write(f"Cluster {cluster}:\n")
        f.writelines(
            ','.join(seq) + '\n' for seq in clustered_sequences[cluster]
        )
        f.write('\n')

print(f"Cluster information saved to {output_clusters_file}.")

Cluster information saved to clusters_output.txt.
