In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
from collections import Counter
import random
import pandas as pd
from tqdm import tqdm 
from gensim.models import Word2Vec

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class SkipGramDataset(Dataset):
    """Skip-gram (center, context) index pairs built from tokenised sentences.

    Args:
        sentences: Iterable of token lists. It is iterated twice (once to count
            tokens, once to build pairs), so it must be re-iterable.
        window_size: Maximum distance, in tokens on each side, between a center
            word and a context word within the same sentence.
        min_count: Tokens that occur fewer than this many times across all
            sentences are left out of the vocabulary.

    Attributes:
        vocab: Kept tokens, in the iteration order of the token counter.
        word2idx: Token -> integer index into ``vocab``.
        idx2word: Integer index -> token.
        data: List of ``(center_idx, context_idx)`` tuples.
    """

    def __init__(self, sentences, window_size=10, min_count=3):
        self.sentences = sentences
        self.window_size = window_size
        self.vocab = self._build_vocab(min_count)
        self.word2idx = {word: idx for idx, word in enumerate(self.vocab)}
        self.idx2word = {idx: word for word, idx in self.word2idx.items()}
        self.data = self._create_data()

    def _build_vocab(self, min_count):
        """Return every token whose corpus frequency is at least ``min_count``."""
        word_counts = Counter(word for sentence in self.sentences for word in sentence)
        return [word for word, count in word_counts.items() if count >= min_count]

    def _create_data(self):
        """Enumerate all in-vocabulary (center, context) pairs inside the window."""
        data = []
        lookup = self.word2idx.get
        for sentence in self.sentences:
            # Encode the sentence once; out-of-vocabulary tokens become None.
            # Index 0 is a valid id, so membership is tested with `is None`.
            ids = [lookup(word) for word in sentence]
            length = len(ids)
            for i, center in enumerate(ids):
                if center is None:
                    continue
                start = max(0, i - self.window_size)
                stop = min(length, i + self.window_size + 1)
                for j in range(start, stop):
                    context = ids[j]
                    if j != i and context is not None:
                        data.append((center, context))
        return data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return the ``idx``-th pair as two 0-d int64 CPU tensors.

        Tensors are deliberately left on the CPU: moving each sample to the
        accelerator here costs one tiny transfer per item and prevents the
        dataset from being used with DataLoader worker processes. The training
        loop moves whole batches to the target device instead.
        """
        center, context = self.data[idx]
        return (
            torch.tensor(center, dtype=torch.long),
            torch.tensor(context, dtype=torch.long),
        )


In [2]:
def load_and_preprocess_data(file_path, sequence_length=20):
    """Read a whitespace-separated text file and chunk it into word sequences.

    Args:
        file_path: Path of a UTF-8 text file.
        sequence_length: Number of words per sequence. The final sequence may
            be shorter when the word count is not a multiple of this value.

    Returns:
        A list of word lists. Empty when the file contains no words.

    Raises:
        ValueError: If ``sequence_length`` is not positive.
    """
    if sequence_length <= 0:
        raise ValueError("sequence_length must be a positive integer")

    with open(file_path, 'r', encoding='utf-8') as f:
        words = f.read().split()

    print(f"Total number of words: {len(words)}")

    sequences = [
        words[start:start + sequence_length]
        for start in range(0, len(words), sequence_length)
    ]

    print(f"Number of sequences: {len(sequences)}")
    # An empty file yields no sequences; indexing [0] would raise IndexError.
    if sequences:
        print(f"First sequence: {sequences[0]}")

    return sequences

In [3]:
# Read data/text8 and chunk it into word sequences.
sentences = load_and_preprocess_data('data/text8')

Total number of words: 17005207
First 10 words: ['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse', 'first', 'used', 'against']
Number of sequences: 850261
First sequence: ['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse', 'first', 'used', 'against', 'early', 'working', 'class', 'radicals', 'including', 'the', 'diggers', 'of', 'the', 'english']


In [4]:
# Keep only the first 25,000 sequences for the cells that follow.
sentences = sentences[:25000]

In [5]:
# Build the vocabulary and (center, context) pairs using SkipGramDataset's
# default window_size and min_count.
dataset = SkipGramDataset(sentences)

In [6]:
# Standalone loader, used by the next cell to report the batch count.
# NOTE(review): train_model constructs its own DataLoader, so this one does
# not appear to feed training - confirm it is still needed.
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

In [7]:
# Number of training pairs, then number of batches, one per line.
print(len(dataset), len(dataloader), sep="\n")

466068
14565


In [None]:
# Load the user and item tables from CSV.
# NOTE(review): neither `users` nor `items` is referenced elsewhere in the
# visible notebook and this cell has no execution count - confirm it is needed.
users = pd.read_csv('data/users.csv')
items = pd.read_csv('data/items.csv')

In [9]:
class SkipGramModel(nn.Module):
    """Embedding lookup followed by a linear projection onto the vocabulary.

    Args:
        vocab_size: Number of rows in the embedding table, and number of
            scores produced per input index.
        embedding_dim: Width of each embedding vector.
    """

    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        # The attribute names become the state_dict key prefixes
        # ('embeddings.*', 'linear.*'); keep them stable for saved checkpoints.
        self.embeddings = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        self.linear = nn.Linear(in_features=embedding_dim, out_features=vocab_size)

    def forward(self, inputs):
        """Return one unnormalised score per vocabulary entry for each index in ``inputs``."""
        return self.linear(self.embeddings(inputs))
        
def train_model(dataset, embedding_dim=512, batch_size=32, num_epochs=25, learning_rate=0.005):
    """Fit a SkipGramModel on ``dataset`` using Adam and cross-entropy loss.

    Args:
        dataset: Dataset yielding (input index, target index) pairs and
            exposing a ``vocab`` sequence whose length sizes the model.
        embedding_dim: Width of the embedding vectors.
        batch_size: Number of pairs per optimisation step.
        num_epochs: Number of full passes over ``dataset``.
        learning_rate: Step size passed to Adam.

    Returns:
        The trained model, placed on the module-level ``device``.
    """
    net = SkipGramModel(len(dataset.vocab), embedding_dim).to(device)
    opt = optim.Adam(net.parameters(), lr=learning_rate)
    loss_fn = nn.CrossEntropyLoss()

    loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    num_batches = len(loader)

    for epoch in range(1, num_epochs + 1):
        running_loss = 0
        for centers, contexts in tqdm(loader, total=num_batches):
            centers = centers.to(device)
            contexts = contexts.to(device)

            opt.zero_grad()
            # Raw scores: CrossEntropyLoss applies log-softmax internally.
            logits = net(centers)
            batch_loss = loss_fn(logits, contexts)
            batch_loss.backward()
            opt.step()
            running_loss += batch_loss.item()

        # Mean of the per-batch mean losses for this epoch.
        print(f"Epoch {epoch}/{num_epochs}, Loss: {running_loss/num_batches:.4f}")

    return net

In [10]:
# Train with train_model's defaults
# (embedding_dim=512, batch_size=32, num_epochs=25, learning_rate=0.005).
model = train_model(dataset)

100%|█████████████████████████████████████████████████████████████████████████████| 14565/14565 [06:13<00:00, 39.04it/s]


Epoch 1/25, Loss: 5.8998


100%|████████████████████████████████████████████████████████████████████████████| 14565/14565 [02:00<00:00, 120.54it/s]


Epoch 2/25, Loss: 5.6330


100%|█████████████████████████████████████████████████████████████████████████████| 14565/14565 [04:35<00:00, 52.80it/s]


Epoch 3/25, Loss: 5.5561


100%|█████████████████████████████████████████████████████████████████████████████| 14565/14565 [06:11<00:00, 39.26it/s]


Epoch 4/25, Loss: 5.5050


100%|█████████████████████████████████████████████████████████████████████████████| 14565/14565 [06:10<00:00, 39.26it/s]


Epoch 5/25, Loss: 5.4675


100%|█████████████████████████████████████████████████████████████████████████████| 14565/14565 [05:37<00:00, 43.16it/s]


Epoch 6/25, Loss: 5.4387


100%|█████████████████████████████████████████████████████████████████████████████| 14565/14565 [05:58<00:00, 40.62it/s]


Epoch 7/25, Loss: 5.4150


100%|█████████████████████████████████████████████████████████████████████████████| 14565/14565 [06:04<00:00, 39.94it/s]


Epoch 8/25, Loss: 5.3959


100%|█████████████████████████████████████████████████████████████████████████████| 14565/14565 [05:25<00:00, 44.78it/s]


Epoch 9/25, Loss: 5.3795


100%|█████████████████████████████████████████████████████████████████████████████| 14565/14565 [06:02<00:00, 40.14it/s]


Epoch 10/25, Loss: 5.3648


100%|█████████████████████████████████████████████████████████████████████████████| 14565/14565 [06:00<00:00, 40.43it/s]


Epoch 11/25, Loss: 5.3526


100%|█████████████████████████████████████████████████████████████████████████████| 14565/14565 [05:26<00:00, 44.55it/s]


Epoch 12/25, Loss: 5.3414


100%|█████████████████████████████████████████████████████████████████████████████| 14565/14565 [05:50<00:00, 41.61it/s]


Epoch 13/25, Loss: 5.3317


100%|█████████████████████████████████████████████████████████████████████████████| 14565/14565 [06:02<00:00, 40.22it/s]


Epoch 14/25, Loss: 5.3231


100%|█████████████████████████████████████████████████████████████████████████████| 14565/14565 [05:04<00:00, 47.83it/s]


Epoch 15/25, Loss: 5.3160


100%|█████████████████████████████████████████████████████████████████████████████| 14565/14565 [05:54<00:00, 41.03it/s]


Epoch 16/25, Loss: 5.3082


100%|█████████████████████████████████████████████████████████████████████████████| 14565/14565 [05:58<00:00, 40.58it/s]


Epoch 17/25, Loss: 5.3021


100%|████████████████████████████████████████████████████████████████████████████| 14565/14565 [01:20<00:00, 180.17it/s]


Epoch 18/25, Loss: 5.2968


100%|█████████████████████████████████████████████████████████████████████████████| 14565/14565 [04:18<00:00, 56.43it/s]


Epoch 19/25, Loss: 5.2914


100%|█████████████████████████████████████████████████████████████████████████████| 14565/14565 [05:43<00:00, 42.40it/s]


Epoch 20/25, Loss: 5.2869


100%|█████████████████████████████████████████████████████████████████████████████| 14565/14565 [05:49<00:00, 41.63it/s]


Epoch 21/25, Loss: 5.2826


100%|█████████████████████████████████████████████████████████████████████████████| 14565/14565 [05:47<00:00, 41.86it/s]


Epoch 22/25, Loss: 5.2787


100%|█████████████████████████████████████████████████████████████████████████████| 14565/14565 [05:55<00:00, 41.02it/s]


Epoch 23/25, Loss: 5.2748


100%|█████████████████████████████████████████████████████████████████████████████| 14565/14565 [05:53<00:00, 41.24it/s]


Epoch 24/25, Loss: 5.2720


100%|█████████████████████████████████████████████████████████████████████████████| 14565/14565 [05:41<00:00, 42.67it/s]

Epoch 25/25, Loss: 5.2691





In [60]:
# Snapshot the embedding table as a NumPy array: detach from autograd, copy to
# host memory, then convert. Row i corresponds to vocabulary index i.
word_embeddings = model.embeddings.weight.detach().cpu().numpy()

def most_similar(word, top_n=5, embeddings=None, word2idx=None, idx2word=None):
    """Return the ``top_n`` vocabulary words closest to ``word`` by cosine similarity.

    Args:
        word: Query token.
        top_n: Maximum number of neighbours to return.
        embeddings: 2-D array with one embedding per row. Defaults to the
            notebook-level ``word_embeddings``.
        word2idx: Mapping token -> row index. Defaults to ``dataset.word2idx``.
        idx2word: Mapping row index -> token. Defaults to ``dataset.idx2word``.

    Returns:
        A list of ``(token, similarity)`` tuples, most similar first. The query
        word itself is never included. The list is empty when ``word`` is not
        in the vocabulary or ``top_n`` is not positive, and may be shorter than
        ``top_n`` when fewer candidates have a defined similarity.
    """
    # Fall back to the notebook globals at call time so existing calls such as
    # most_similar("author") keep working unchanged.
    if embeddings is None:
        embeddings = word_embeddings
    if word2idx is None:
        word2idx = dataset.word2idx
    if idx2word is None:
        idx2word = dataset.idx2word

    if word not in word2idx or top_n <= 0:
        return []

    word_idx = word2idx[word]
    word_vec = embeddings[word_idx]

    dots = np.dot(embeddings, word_vec)
    norms = np.linalg.norm(embeddings, axis=1) * np.linalg.norm(word_vec)

    # Cosine similarity is undefined for zero-length vectors. Those rows stay
    # at -inf instead of becoming NaN, which argsort would place at the top.
    similarities = np.full(dots.shape, -np.inf, dtype=np.result_type(dots.dtype, np.float32))
    np.divide(dots, norms, out=similarities, where=norms > 0)

    # Exclude the query explicitly rather than assuming it is ranked first.
    similarities[word_idx] = -np.inf

    ranked = np.argsort(similarities)[::-1][:top_n]
    return [
        (idx2word[idx], similarities[idx])
        for idx in ranked
        if np.isfinite(similarities[idx])
    ]


In [61]:
# Spot check: nearest neighbours of "author" among the learned embeddings.
print(most_similar("author"))

[('famous', 0.49498144), ('difference', 0.46292886), ('woman', 0.4597501), ('influential', 0.45435798), ('kingdom', 0.4328415)]


In [50]:
# Persist only the model parameters, wrapped in a dict under 'model_state_dict'.
torch.save({'model_state_dict': model.state_dict()}, 'checkpoints/initial.pkg')


In [52]:
# The saved tensors live on cuda:0; map_location remaps them onto whichever
# device this session selected, so the load also works on a CPU-only machine.
# weights_only=True restricts unpickling to tensors and plain containers.
checkpoint = torch.load('checkpoints/initial.pkg', map_location=device, weights_only=True)

In [58]:
# Display the stored parameter tensors; the keys mirror SkipGramModel's
# attribute names ('embeddings.*', 'linear.*').
checkpoint['model_state_dict']

OrderedDict([('embeddings.weight',
              tensor([[-0.0334,  0.7723, -0.1541,  ..., -0.8312, -0.8679, -0.1490],
                      [ 0.1979,  0.0738, -0.0707,  ..., -0.6376,  0.0895, -0.0600],
                      [-0.1665, -0.0984,  0.0231,  ...,  0.0484,  0.0600, -0.0972],
                      ...,
                      [-0.0420,  1.2516,  0.3799,  ...,  1.1112,  0.0416,  1.6234],
                      [-0.4737,  0.3162,  0.9902,  ...,  0.9877, -0.2667,  0.8550],
                      [-0.9057,  0.2380,  0.2295,  ...,  0.0347, -0.9295, -0.0289]],
                     device='cuda:0')),
             ('linear.weight',
              tensor([[-0.8265, -0.0731, -0.6963,  ..., -0.0869, -0.9737,  1.0510],
                      [-0.7041, -0.2857, -0.5386,  ..., -0.0798, -0.6669,  1.2450],
                      [-0.6976, -0.2072, -0.5487,  ..., -0.0528, -0.7097,  1.0454],
                      ...,
                      [-0.7761, -0.2400, -0.5212,  ...,  0.3490, -0.9173,  1.4619],

# Analysis

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Expects a DataFrame named 'df' with a 'time' column to already be in scope.
# If not, you would load it like this:
# df = pd.read_csv('hacker_news_dataset.csv')

# Convert 'time' column to datetime if it's not already
df['time'] = pd.to_datetime(df['time'])

# English month names regardless of the process locale, so they always match
# month_order below (strftime('%B') follows the active locale instead).
df['month'] = df['time'].dt.month_name()

# Calendar order for the x-axis
month_order = ['January', 'February', 'March', 'April', 'May', 'June',
               'July', 'August', 'September', 'October', 'November', 'December']

# Count submissions per month in calendar order. Months with no rows get an
# explicit 0, and fill_value keeps the counts as integers.
monthly_submissions = df['month'].value_counts().reindex(month_order, fill_value=0)

# Create a bar plot
plt.figure(figsize=(12, 6))
monthly_submissions.plot(kind='bar')
plt.title('Number of Hacker News Submissions per Month')
plt.xlabel('Month')
plt.ylabel('Number of Submissions')
plt.xticks(rotation=45)
plt.tight_layout()

plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Expects a DataFrame `df` with a 'time' column to already exist, e.g.
# df = pd.read_csv('hacker_news_dataset.csv')

# Make sure 'time' is datetime-typed, then derive the calendar year.
df['time'] = pd.to_datetime(df['time'])
df['year'] = df['time'].dt.year

# Submissions per year, ordered chronologically.
yearly_submissions = df['year'].value_counts().sort_index()

# Bar chart with horizontal year labels.
fig, ax = plt.subplots(figsize=(12, 6))
yearly_submissions.plot.bar(ax=ax, rot=0)
ax.set_title('Number of Hacker News Submissions per Year')
ax.set_xlabel('Year')
ax.set_ylabel('Number of Submissions')
fig.tight_layout()

plt.show()

# Also show the underlying numbers.
print(yearly_submissions)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Expects a DataFrame `df` with a 'time' column to already exist, e.g.
# df = pd.read_csv('hacker_news_dataset.csv')

# Make sure 'time' is datetime-typed, then derive calendar year and month.
df['time'] = pd.to_datetime(df['time'])
df['year'] = df['time'].dt.year
df['month'] = df['time'].dt.month

# One row per (year, month) with its submission count, in chronological order,
# plus a running position used as the x coordinate.
monthly_submissions = (
    df.groupby(['year', 'month'])
    .size()
    .reset_index(name='count')
    .sort_values(['year', 'month'])
)
monthly_submissions['index'] = range(len(monthly_submissions))

fig, ax = plt.subplots(figsize=(15, 8))

# One colour per year.
years = monthly_submissions['year'].unique()
colors = sns.color_palette("husl", n_colors=len(years))

# Each year is drawn as its own line segment.
for year, color in zip(years, colors):
    in_year = monthly_submissions['year'] == year
    year_data = monthly_submissions[in_year]
    ax.plot(year_data['index'], year_data['count'], label=str(year), color=color)

ax.set_title('Number of Hacker News Submissions per Month (Colored by Year)')
ax.set_xlabel('Months (Sequential)')
ax.set_ylabel('Number of Submissions')

# Tick only at each January, labelled with its year.
january = monthly_submissions[monthly_submissions['month'] == 1]
xticks = january['index']
xtick_labels = january['year']
ax.set_xticks(xticks)
ax.set_xticklabels(xtick_labels, rotation=45)

ax.legend(title='Year', bbox_to_anchor=(1.05, 1), loc='upper left')
ax.grid(True, linestyle='--', alpha=0.7)

fig.tight_layout()
plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Expects a DataFrame `df` with a 'time' column to already exist, e.g.
# df = pd.read_csv('hacker_news_dataset.csv')

# Make sure 'time' is datetime-typed, then derive calendar year and month.
df['time'] = pd.to_datetime(df['time'])
df['year'] = df['time'].dt.year
df['month'] = df['time'].dt.month

# One row per (year, month) with its submission count, in chronological order,
# plus a running position used as each bar's x coordinate.
monthly_submissions = (
    df.groupby(['year', 'month'])
    .size()
    .reset_index(name='count')
    .sort_values(['year', 'month'])
)
monthly_submissions['index'] = range(len(monthly_submissions))

fig, ax = plt.subplots(figsize=(20, 10))

# One colour per year.
years = monthly_submissions['year'].unique()
colors = sns.color_palette("husl", n_colors=len(years))

# Unit-width, edge-aligned bars so consecutive months touch.
for year, color in zip(years, colors):
    in_year = monthly_submissions['year'] == year
    year_data = monthly_submissions[in_year]
    ax.bar(year_data['index'], year_data['count'], color=color, width=1, align='edge')

ax.set_title('Number of Hacker News Submissions per Month (Colored by Year)', fontsize=16)
ax.set_xlabel('Months (Sequential)', fontsize=12)
ax.set_ylabel('Number of Submissions', fontsize=12)

# Tick only at each January, labelled with its year.
january = monthly_submissions[monthly_submissions['month'] == 1]
xticks = january['index']
xtick_labels = january['year']
ax.set_xticks(xticks)
ax.set_xticklabels(xtick_labels, rotation=45, ha='right')

# The bars carry no labels, so build proxy rectangles for a per-year legend.
handles = [plt.Rectangle((0, 0), 1, 1, color=swatch) for swatch in colors]
ax.legend(handles, years, title='Year', bbox_to_anchor=(1.05, 1), loc='upper left')

ax.grid(True, axis='y', linestyle='--', alpha=0.7)
fig.tight_layout()
plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Expects a DataFrame `df` with a 'time' column to already exist, e.g.
# df = pd.read_csv('hacker_news_dataset.csv')

# Make sure 'time' is datetime-typed, then derive calendar year and month.
df['time'] = pd.to_datetime(df['time'])
df['year'] = df['time'].dt.year
df['month'] = df['time'].dt.month

# Submission count for every (year, month) combination present in the data.
monthly_submissions = df.groupby(['year', 'month']).size().reset_index(name='count')

# Reshape to months as rows and years as columns, so each column is one line.
pivot_data = monthly_submissions.pivot(index='month', columns='year', values='count')

fig, ax = plt.subplots(figsize=(15, 8))

# One line per year, plotted against month number.
for year, counts in pivot_data.items():
    ax.plot(pivot_data.index, counts, label=str(year), marker='o', markersize=4)

ax.set_title('Number of Hacker News Submissions by Month and Year', fontsize=16)
ax.set_xlabel('Month', fontsize=12)
ax.set_ylabel('Number of Submissions', fontsize=12)

# Label month numbers 1-12 with abbreviated names.
month_names = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
ax.set_xticks(range(1, 13))
ax.set_xticklabels(month_names)

ax.grid(True, linestyle='--', alpha=0.7)

# Legend sits outside the axes on the right.
ax.legend(title='Year', bbox_to_anchor=(1.05, 1), loc='upper left')

# Adjust layout so labels and the outside legend are not clipped.
fig.tight_layout()

plt.show()