In [118]:
import io
import torch
import numpy as np
import pandas as pd
import torch.nn as nn
import torch.optim as optim
import sentencepiece as spm

from gensim.models import Word2Vec
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

### Preprocess the Data

In [119]:
def dataframe_sentence_generator(df):
    for sentence in df:
        yield sentence

# Replace 'your_file.csv' with the path to your CSV file
file_path = 'Dataset1000WithScore.txt'
df = pd.read_csv(file_path, sep=';')

# Split the data into training and test sets
train_texts, test_texts, train_score, test_score = train_test_split(
    df['title'], 
    df['score'], 
    test_size=0.2, 
    random_state=42
)

### Train a SentencePiece Model

In [120]:
spm.SentencePieceTrainer.train(
    sentence_iterator=dataframe_sentence_generator(train_texts), 
    model_prefix='spm_Alex_week2', 
    vocab_size=2400
)

sp = spm.SentencePieceProcessor()
sp.load('spm_Alex_week2.model')

sentencepiece_trainer.cc(78) LOG(INFO) Starts training with : 
trainer_spec {
  input_format: 
  model_prefix: spm_Alex_week2
  model_type: UNIGRAM
  vocab_size: 2400
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  seed_sentencepieces_file: 
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 0
  bos_id: 1
  eos_id: 2
  pad_id: -1
  unk_piece: <unk>
  bos_piece: <s>
  eos_piece: </s>
  pad_piece: <pad>
  unk_surface:  ⁇ 
  enable_differential_privacy: 0
  differential_privacy_

True

### Generate  Tokens

In [121]:
tokenized_titles = [sp.encode_as_pieces(title) for title in train_texts]

### Use Word2Vec to Generate Embeddings

In [122]:
vector_size = 100
w2v_model = Word2Vec(
    sentences=tokenized_titles, 
    vector_size=vector_size, 
    window=5, 
    min_count=1, 
    workers=4
)

In [129]:
def title_to_embedding(sp, title, vector_size):
    tokens = sp.encode_as_pieces(title)
    embeddings = [w2v_model.wv[token] for token in tokens if token in w2v_model.wv]

    if embeddings:
        return np.mean(embeddings, axis=0)
    else:
        return np.zeros(vector_size)

### Create a PyTorch Dataset

In [156]:
embeddings = np.empty((len(train_texts), vector_size), dtype=np.float64)

for idx, title in enumerate(train_texts):
    embeddings[idx] = title_to_embedding(sp, title, vector_size)
    
print(type(embeddings))

<class 'numpy.ndarray'>


In [160]:

input_embeddings_tensor = torch.from_numpy(embeddings)
print(input_embeddings_tensor)
print(embeddings)

tensor([[-0.0090,  0.0219,  0.0059,  ..., -0.0119, -0.0043,  0.0002],
        [-0.0099,  0.0221, -0.0010,  ..., -0.0153, -0.0025, -0.0065],
        [-0.0105,  0.0204,  0.0022,  ..., -0.0149,  0.0009, -0.0042],
        ...,
        [-0.0084,  0.0145,  0.0022,  ..., -0.0075, -0.0030, -0.0046],
        [-0.0114,  0.0178,  0.0005,  ..., -0.0124, -0.0002, -0.0019],
        [-0.0136,  0.0270,  0.0036,  ..., -0.0168, -0.0012,  0.0006]],
       dtype=torch.float64)
[[-0.00897185  0.02188748  0.00587505 ... -0.01191749 -0.00431119
   0.00019266]
 [-0.00989507  0.02214914 -0.00102099 ... -0.01534402 -0.00249354
  -0.00648603]
 [-0.0105211   0.0203801   0.0021798  ... -0.0148694   0.00089716
  -0.00415677]
 ...
 [-0.00839692  0.01446246  0.00217176 ... -0.00753136 -0.00299297
  -0.00461253]
 [-0.01144713  0.01782822  0.00053025 ... -0.0123812  -0.00016564
  -0.00191025]
 [-0.01355701  0.02695094  0.00358522 ... -0.01676181 -0.00117642
   0.000595  ]]


In [None]:

class NewsDataset(Dataset):
    def __init__(self, titles, score, sp, vector_size):
        self.titles = titles
        self.score = score
        self.vector_size = vector_size

    def __len__(self):
        return len(self.titles)

    def __getitem__(self, idx):
        title_embedding = title_to_embedding(sp, self.titles.iloc[idx], self.vector_size)
        score = self.score.iloc[idx]
        return torch.tensor(title_embedding, dtype=torch.float), torch.tensor(score, dtype=torch.float)

In [135]:
TrainDataset = NewsDataset(train_texts, train_score, sp, vector_size)
TrainLoader = DataLoader(TrainDataset, batch_size=16, shuffle=True)

### Define and Train the Neural Network

In [131]:
class VotesPredictor(nn.Module):
    def __init__(self):
        super(VotesPredictor, self).__init__()
        self.fc = nn.Sequential(
            nn.Linear(100, 50),
            nn.ReLU(),
            nn.Linear(50, 1)
        )
    
    def forward(self, x):
        return self.fc(x)

In [137]:
Model = VotesPredictor()
criterion = nn.MSELoss()
optimizer = optim.Adam(Model.parameters(), lr=0.01)

In [138]:
# Training loop
for epoch in range(100):  # Number of epochs
    for title_embedding, score in TrainLoader:
        optimizer.zero_grad()
        outputs = Model(title_embedding)
        loss = criterion(outputs.squeeze(), score)
        loss.backward()
        optimizer.step()
    print(f'Epoch {epoch+1}, Loss: {loss.item()}')

Epoch 1, Loss: 18.947511672973633
Epoch 2, Loss: 13.208354949951172
Epoch 3, Loss: 33.680416107177734
Epoch 4, Loss: 1834.595947265625
Epoch 5, Loss: 333.9202575683594
Epoch 6, Loss: 173.79379272460938
Epoch 7, Loss: 17.589494705200195
Epoch 8, Loss: 15.28741455078125
Epoch 9, Loss: 643.418701171875
Epoch 10, Loss: 113.61872863769531
Epoch 11, Loss: 215.33375549316406
Epoch 12, Loss: 22.551467895507812
Epoch 13, Loss: 181.5155029296875
Epoch 14, Loss: 114.51161193847656
Epoch 15, Loss: 110.3350830078125
Epoch 16, Loss: 32.165184020996094
Epoch 17, Loss: 14.471490859985352
Epoch 18, Loss: 31.956567764282227
Epoch 19, Loss: 283.3117980957031
Epoch 20, Loss: 55.41526412963867
Epoch 21, Loss: 571.264404296875
Epoch 22, Loss: 119.49076080322266
Epoch 23, Loss: 525.5765991210938
Epoch 24, Loss: 63.93572235107422
Epoch 25, Loss: 55.49064254760742
Epoch 26, Loss: 165.920654296875
Epoch 27, Loss: 18.61789321899414
Epoch 28, Loss: 20.971588134765625
Epoch 29, Loss: 666.3204345703125
Epoch 30, Lo

In [139]:
# Define the model
Model = nn.Sequential(
    nn.Linear(vector_size, 50),
    nn.ReLU(),
    nn.Linear(50, 1)
)

# Set the criterion
criterion = nn.MSELoss()

# Choose the optimizer
optimizer = optim.Adam(Model.parameters(), lr=0.001)


inputs = torch.randn(100, 100)
targets = torch.randn(100, 1)

In [None]:
epochs = 1000
for epoch in range(1, epochs + 1):
    # Zero the gradients
    optimizer.zero_grad()
    
    # Forward pass
    outputs = Model(inputs)
    
    # Calculate loss
    loss = criterion(outputs, targets)
    
    # Backward pass and optimize
    loss.backward()
    optimizer.step()
    
    # Print loss every 100 epochs
    if epoch % 100 == 0:
        print(f'Epoch [{epoch}/{epochs}], Loss: {loss.item()}')