In [10]:
import numpy as np
import pandas as pd
import pickle

from sklearn.model_selection import train_test_split

import torch

import json
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re


np.random.seed(42)

torch.manual_seed(0)
torch.cuda.manual_seed(0)

In [11]:
DATASET_COLUMNS = ["target", "text"]

data = pd.read_csv("./data/output.csv", sep=',')
data = data[:20000]
data.columns = DATASET_COLUMNS
data = data[data['text'].notnull()]
data.head(10)

Unnamed: 0,target,text
0,1,did you know they make rocky road snicker had ...
1,1,hope mine normal
2,0,yay for you si stuck in traffic on my way home...
3,1,watched dane play 2nd ball game later walked u...
4,0,ugh every time think almost to 100 realize not...
5,1,they re playing manchester on wednesday have t...
6,1,your day are about to be filled with fun
7,1,it is always interesting to step off the inter...
8,0,oh no that too bad
9,0,off to school again sadly and disapointed with...


In [12]:
def tokenize_texts(texts_list):
    all_words = set()
    for text in texts_list:
        words = str(text).split()
        all_words.update(words)

    word_to_index = {word: i for i, word in enumerate(all_words)}
    
    # Save the dictionary as a pickle file
    with open('word_to_index.pkl', 'wb') as f:
        pickle.dump(word_to_index, f)
        
    # Save the dictionary as a JSON file
    with open('word_to_index.json', 'w') as f:
        json.dump(word_to_index, f)
        
    # Create index_to_word dictionary and save as a JSON file
    index_to_word = {i: word for word, i in word_to_index.items()}
    with open('index_to_word.json', 'w') as f:
        json.dump(index_to_word, f)

    tokenized_texts = []
    for text in texts_list:
        words = text.split()
        tokenized_texts.append([word_to_index[word] for word in words])

    return tokenized_texts, word_to_index

def pad_tokenized_texts(tokenized_texts, max_length=None):
    if not max_length:
        max_length = max([len(text) for text in tokenized_texts])

    padded_texts = []
    for text in tokenized_texts:
        if len(text) < max_length:
            text += [0] * (max_length - len(text))
        padded_texts.append(text)

    return padded_texts

def pad_tokenized_text(tokenized_text, max_length=None):
    if len(tokenized_text) < max_length:
        tokenized_text += [0] * (max_length - len(tokenized_text))

    return tokenized_text

def get_tokenized_sentence(sentence, word_to_index):
    words = sentence.split()
    tokenized_sentence = [word_to_index.get(word, 0) for word in words]
    return tokenized_sentence

def preprocess_sentence(sentence):
    stemmer = PorterStemmer()
    
    # Twitter mentions, URLs, and non-alphabetic characters removal
    sentence = re.sub("@", "", sentence)
    sentence = re.sub(r"http\S+", "", sentence)
    sentence = re.sub("[^a-zA-Z]", " ", sentence)
    
    # Convert to lowercase and remove stopwords
    sentence = sentence.lower()
    sentence = ' '.join([word for word in sentence.split() if word not in stopwords.words('english')])
    
    # Remove words with less than 4 characters
    sentence = ' '.join([w for w in sentence.split() if len(w) > 3])
    
    # Stemming
    sentence = ' '.join([stemmer.stem(w) for w in sentence.split()])
    
    return sentence

texts = data['text']
labels = data['target']

tokenized_texts, word_to_index = tokenize_texts(texts)
padded_texts = pad_tokenized_texts(tokenized_texts)

In [13]:
vocab_size = len(word_to_index) + 1
max_length = len(padded_texts[0])

In [44]:
labels.reset_index(drop=True, inplace=True)

tensor_padded_texts = torch.tensor(padded_texts)

from sklearn.preprocessing import StandardScaler, MinMaxScaler

scaler = MinMaxScaler((0, 1))
scaled_data = scaler.fit_transform(tensor_padded_texts.numpy())
tensor_padded_texts = torch.tensor(scaled_data).float()

labels_tensor = torch.tensor(labels.to_numpy()).long()  # or `.long()` if your labels are integers
#reshaped_labels = labels_tensor.view(-1, 1).float()

In [45]:
import torch
import torch.nn as nn
import torch.optim as optim

class AdvancedModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(AdvancedModel, self).__init__()
        self.layer1 = nn.Linear(embedding_dim, 50)
        self.layer5 = nn.Linear(50, 2)  # 2 output units for 2 classes
        self.layers = [self.layer1, self.layer5]
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.2)  # Dropout layer
        # Sigmoid is removed because CrossEntropyLoss includes softmax

    def forward(self, x):
        x = self.relu(self.layer1(x))
        x = self.dropout(x)
        #x = self.relu(self.layer2(x))
        x = self.layer5(x)  # Removed sigmoid
        return x

input_dim = max_length
learning_rate = 0.01
epochs = 5000

# Create the model
model = AdvancedModel(vocab_size, input_dim)

X_train, X_test, y_train, y_test = train_test_split(tensor_padded_texts, labels_tensor, test_size=0.1, random_state=42)

# Loss and optimizer
criterion = nn.CrossEntropyLoss()  # Changed to CrossEntropyLoss
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
outputs = None
for epoch in range(epochs):
    model.train()
    
    optimizer.zero_grad()
    outputs = model(X_train)
    
    loss = criterion(outputs, y_train)
    loss.backward()
    optimizer.step()
    
    print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item()}")

Epoch 1/5000, Loss: 505.350341796875
Epoch 2/5000, Loss: 421.5753173828125
Epoch 3/5000, Loss: 1211.7071533203125
Epoch 4/5000, Loss: 780.143798828125
Epoch 5/5000, Loss: 235.32252502441406
Epoch 6/5000, Loss: 534.686279296875
Epoch 7/5000, Loss: 364.2042541503906
Epoch 8/5000, Loss: 185.93739318847656
Epoch 9/5000, Loss: 343.117919921875
Epoch 10/5000, Loss: 286.5565490722656
Epoch 11/5000, Loss: 133.51821899414062
Epoch 12/5000, Loss: 235.65545654296875
Epoch 13/5000, Loss: 256.62896728515625
Epoch 14/5000, Loss: 144.0341339111328
Epoch 15/5000, Loss: 149.591552734375
Epoch 16/5000, Loss: 210.68360900878906
Epoch 17/5000, Loss: 159.5963592529297
Epoch 18/5000, Loss: 96.62486267089844
Epoch 19/5000, Loss: 151.65318298339844
Epoch 20/5000, Loss: 152.3468475341797
Epoch 21/5000, Loss: 93.84152221679688
Epoch 22/5000, Loss: 104.12415313720703
Epoch 23/5000, Loss: 130.7544403076172
Epoch 24/5000, Loss: 103.36970520019531
Epoch 25/5000, Loss: 75.30619812011719
Epoch 26/5000, Loss: 102.6037

In [46]:
with torch.no_grad():
    model.eval()
    outputs = model(X_test)
    _, predicted_class = torch.max(outputs, 1)  # Argmax along dimension 1
    correct_count = (predicted_class == y_test).sum().item()
    accuracy = correct_count / len(y_test) * 100  # Calculate the accuracy percentage

print(f"Test complete. Accuracy: {accuracy}%")

Test complete. Accuracy: 52.16035634743875%


In [47]:
bad_sentence = "dead dead dead fuck fuck die die "
good_sentence = "lets plant tree everywhere with love"
good_sentence2 = "i want to hugging all of you with love."
sentence = preprocess_sentence(bad_sentence)
print(sentence)
with open('word_to_index.pkl', 'rb') as f:
    loaded_word_to_index = pickle.load(f)

tokenized_sentence = pad_tokenized_text(get_tokenized_sentence(sentence, loaded_word_to_index), max_length)

[12212,
 10769,
 4295,
 7457,
 9254,
 13843,
 8162,
 3754,
 8047,
 17951,
 183,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0]

In [25]:
single_sentence = np.array(tokenized_sentence).reshape(1, -1)
scaled_single_sentence = scaler.transform(single_sentence)
scaled_single_sentence = torch.tensor(scaled_single_sentence).float()

In [27]:
single_sentence = torch.tensor(scaled_single_sentence).float().unsqueeze(0)
model.eval()
# Tahmin yap
with torch.no_grad():
    outputs = model(single_sentence)
    
# Tahminin sonucunu ikili sınıflandırma için bir eşik değeriyle (threshold) karşılaştır
predicted_label = (outputs >= 0.5).int().item()

print(f"Predicted Label: {predicted_label}")

tensor([[[-0.2335, -0.0061]]])
Predicted Label: -0.0061292946338653564


  single_sentence = torch.tensor(scaled_single_sentence).float().unsqueeze(0)


In [48]:
base_scaling_factor = 10**4
model_name='model_1'

model_data = {}

model_json = {}
layers = model.layers
num_layers = len(layers)
for i in range(num_layers):
    weight_key = f"layers.{i}.weight"
    bias_key = f"layers.{i}.bias"
    
    layer_weights = layers[i].weight
    layer_bias = layers[i].bias
    
    layer_weights = layer_weights.detach().numpy().flatten()
    layer_bias = layer_bias.detach().numpy().flatten()
    
    print(len(layer_weights))
    print(len(layer_bias))
    
    model_json[f"w{i+1}"] = (layer_weights * base_scaling_factor).round().astype(int).tolist()
    model_json[f"b{i+1}"] = (layer_bias * base_scaling_factor ** (i + 1)).round().astype(int).tolist()
# Save to JSON
with open(f"{model_name}_parameters.json", "w") as f:
    json.dump(model_json, f, indent=4)

print(f"Saved model parameters at {model_name}_parameters.json")

Saved model parameters at model_1_parameters.json


In [49]:
n_samples = 10

samples_json = {}

for (i, sample) in enumerate(zip(tensor_padded_texts[:n_samples], labels_tensor[:n_samples])):
    samples_json[f"in{i+1}"] = (sample[0]*(base_scaling_factor)).round().to(torch.int64).tolist()
    samples_json[f"out{i+1}"] = sample[1].tolist()
    
# Save to JSON
with open(f"{model_name}_samples.json", "w") as f:
    print(samples_json)
    json.dump(samples_json, f, indent=4)

print(f"Saved test samples at {model_name}_samples.json")

{'in1': [7260, 13843, 15780, 2383, 9319, 16903, 366, 9885, 16081, 5714, 2217, 1772, 16381, 10288, 13843, 11367, 10836, 10868, 1176, 15402, 3913, 2057, 6376, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'out1': 1, 'in2': [9628, 762, 16271, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'out2': 1, 'in3': [16054, 16043, 13843, 12669, 12551, 15402, 9655, 18217, 12160, 12135, 7105, 14656, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'out3': 0, 'in4': [1808, 4748, 2549, 18318, 11094, 11898, 8744, 18083, 10396, 4295, 13833, 6777, 4153, 8039, 16043, 12394, 9636, 1772, 10306, 1582, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'out4': 1, 'in5': [14656, 8651, 14364, 8162, 11275, 4295, 6758, 8785, 16079, 16393, 13089, 7585, 7902, 3913, 14657, 3517, 15483, 11212, 3964, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'out5': 0, 'in6': [2383, 17674, 4006, 499, 18217, 13384, 8146, 4295, 7549, 7849, 4295, 3913, 6958, 2383, 11995, 523, 7457, 15175, 14364, 10306, 14625, 0, 0, 0, 0