# Data Processing and EDA

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Load the raw LeetCode question dataset; the path is relative to the kernel's working directory.
data = pd.read_csv('leetcode_dataset.csv')
# Bare last expression so the notebook renders the frame as the cell output.
data

In [None]:
# Column names, non-null counts and dtypes, to spot missing values before cleaning.
data.info()

In [None]:
# Summary statistics of the numeric columns.
data.describe()

In [None]:
import seaborn as sns

# One row of five panels summarising the raw dataset.
fig, axs = plt.subplots(1, 5, figsize=(18, 5))
fig.suptitle('LeetCode Questions EDA Overview', fontsize=18)

# Plot 1 - number of questions per difficulty level
sns.countplot(x='difficulty', data=data, ax=axs[0])
axs[0].set_title('Question Difficulty Distribution')

# Plot 2 - histogram of the acceptance rate with a KDE overlay
sns.histplot(data['acceptance_rate'], bins=30, kde=True, ax=axs[1])
axs[1].set_title('Acceptance Rate Distribution')

# Plots 3 and 4 - a numeric column broken down by difficulty level
box_panels = [
    (axs[2], 'rating', "Ratings by Difficulty", "Rating"),
    (axs[3], 'acceptance_rate', "Acceptance Rate by Difficulty", "Acceptance Rate"),
]
for panel_ax, value_column, panel_title, value_label in box_panels:
    sns.boxplot(data=data, x='difficulty', y=value_column, ax=panel_ax)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel("Difficulty")
    panel_ax.set_ylabel(value_label)

# Plot 5 - relation between frequency and acceptance rate.
# `data` is passed by keyword like in the other calls: older seaborn releases
# only accept it as a keyword argument.
sns.scatterplot(data=data, x='frequency', y='acceptance_rate', ax=axs[4])
axs[4].set_title("Frequency vs Acceptance Rate")
axs[4].set_xlabel("Frequency of Question Attempts")
axs[4].set_ylabel("Acceptance Rate")

# Leave room at the top for the figure-level title.
plt.tight_layout(rect=[0, 0.03, 1, 0.95])
plt.show()




Plot Analysis

Plot 1 - Question Difficulty Distribution: Bar Plot, X-axis: Difficulty Level, Y-axis: Count of Questions. Analysis: Majority of the questions are medium difficulty, fewer questions are categorized as hard and slightly more are categorized as easy.

Plot 2 - Acceptance Rate Distribution: Histogram with KDE Outline, X-axis: Acceptance Rate (%), Y-axis: Count of questions in each bin. Analysis: The distribution of acceptance rates is roughly bell-shaped, with a peak at around 50%. Very few questions have extremely low or extremely high acceptance rates.

Plot 3 - Ratings By Difficulty: Boxplot, X-axis: Difficulty Level, Y-axis: Rating. Analysis: All difficulty levels have high median ratings. Easy problems have a higher variance and harder problems have consistently higher ratings.

Plot 4 - Acceptance Rate by Difficulty: Boxplot, X-axis: Difficulty Level, Y-axis: Acceptance Rate. Analysis: Easy level problems have a higher acceptance rate, hard problems have a lower acceptance rate, and medium problems have a medium acceptance rate.

Plot 5 - Frequency vs. Acceptance Rate: Scatter Plot, X-axis: Frequency of Attempts, Y-axis: Acceptance Rate. Analysis: There is no strong linear relationship between the two variables.

In [None]:
# Drop the columns that are not used by any later step, then discard rows with missing values.
data = data.drop(
    columns=['id', 'solution_link', 'likes', 'dislikes', 'is_premium', 'similar_questions']
).dropna()

# Convert the comma-separated related_topics and companies strings to lists.
data['related_topics'] = data['related_topics'].str.split(',')
data['companies'] = data['companies'].str.split(',')

# Multipliers for the M/K suffixes used in the accepted and submissions columns.
conversion = {'m': 1000000, 'k': 1000}


def _parse_count(value):
    """Convert a count such as '1.2M', '350K' or '987' to an int.

    The suffix is matched case-insensitively. Values without a suffix are
    parsed as plain numbers so the column does not end up mixing strings and
    ints. A value that cannot be parsed is returned unchanged.
    """
    text = str(value).strip()
    multiplier = conversion.get(text[-1].lower())
    number_text = text if multiplier is None else text[:-1]
    try:
        # round() instead of a bare int(): a float product can land just
        # below the exact result and int() would truncate it by one.
        return int(round(float(number_text) * (multiplier or 1)))
    except ValueError:
        return value


for count_column in ('accepted', 'submissions'):
    data[count_column] = data[count_column].apply(_parse_count)

# Encode the difficulty labels as ordinals (Easy=1, Medium=2, Hard=3) so they can be
# combined with acceptance_rate and rating in the feature-engineering step.
difficulty_dict = {'Easy': 1, 'Medium': 2, 'Hard': 3}
data['difficulty'] = data['difficulty'].map(difficulty_dict)

In [None]:
# Feature engineering: blend the built-in difficulty with acceptance rate and rating
# into one score on a 0-10 scale.
#   70% weight -> original difficulty (1-3, rescaled to at most 7)
#   25% weight -> inverted acceptance rate (a lower acceptance rate means a harder problem)
#    5% weight -> rating
# NOTE(review): assumes acceptance_rate and rating are percentages in 0-100 — confirm against the dataset.
# The acceptance term uses (100 - rate); subtracting the rate from 1 made the term negative.
data['scaled_difficulty'] = (
    data['difficulty'] / 3 * 7
    + (100 - data['acceptance_rate']) / 100 * 2.5
    + data['rating'] / 100 * 0.5
)

In [None]:
# Display the processed frame as the cell output.
data

# Normalization

In [None]:
# Uses the data df from the data processing portion. Sets up a set of normalized dfs to be loaded into SQL.


def _slugify(name):
    """Turn a display name into the stripped, lowercase, underscore-separated key used as a table id."""
    return name.strip().lower().replace(' ', '_')


def _normalize_list_column(frame, list_col, id_col, name_col):
    """Split a list-valued column into a lookup table and a problem link table.

    Args:
        frame: DataFrame with a 'problem_id' column and the list-valued `list_col`.
        list_col: name of the column holding a list of names per problem.
        id_col: column name to use for the slug id in both outputs.
        name_col: column name to use for the display name in the lookup table.

    Returns:
        (lookup_df, links_df): `lookup_df` has one row per distinct id with its
        stripped display name, sorted by id; `links_df` has one
        (problem_id, id) row per list element.
    """
    exploded = frame[['problem_id', list_col]].explode(list_col)
    names = exploded[list_col].str.strip()
    links = pd.DataFrame({
        'problem_id': exploded['problem_id'].to_numpy(),
        id_col: names.map(_slugify).to_numpy(),
    })
    # Deduplicate on the id, not on the name: two spellings that slugify to the
    # same key would otherwise collide on the table's primary key. Sorting makes
    # the row order reproducible, which iterating over a set of strings is not.
    lookup = (
        pd.DataFrame({id_col: links[id_col].to_numpy(), name_col: names.to_numpy()})
        .drop_duplicates(subset=id_col)
        .sort_values(id_col)
        .reset_index(drop=True)
    )
    return lookup, links


# CREATING TABLES
# set ids: 1-based and contiguous after the row drops done during cleaning
data = data.reset_index(drop=True)
data['problem_id'] = data.index + 1

# Problems table is just filtering
problems_df = data[['problem_id', 'title', 'description', 'difficulty', 'scaled_difficulty', 'url']]

# Companies: id lookup table plus the problem-to-company link table
company_ids_df, companies_df = _normalize_list_column(data, 'companies', 'company_id', 'company_name')

# Topics: same logic as the company tables
topic_ids_df, topics_df = _normalize_list_column(data, 'related_topics', 'topic_id', 'topic_name')

# Sets of the display names kept in the lookup tables.
all_companies = set(company_ids_df['company_name'])
all_topics = set(topic_ids_df['topic_name'])


In [None]:
# Prints every normalized DataFrame under its table name as a quick schema check.
print("CURRENT SCHEMA \n")

schema_tables = [
    ("problems", problems_df),
    ("company_ids", company_ids_df),
    ("companies", companies_df),
    ("topic_ids", topic_ids_df),
    ("topics", topics_df),
]
for position, (table_name, table_df) in enumerate(schema_tables):
    # Blank separator between consecutive tables (none after the last one).
    if position:
        print("\n")
    print(table_name)
    print(table_df)


In [None]:
# SETUP FOR SQLITE
import sqlite3

# Tables are dropped in this order: link tables before the tables they reference.
DROP_ORDER = ('topics', 'topic_ids', 'companies', 'company_ids', 'problems')

# Schema, recreated on every run so it always matches the DataFrames above.
CREATE_STATEMENTS = (
    # problems table
    '''
CREATE TABLE IF NOT EXISTS problems (
    problem_id INTEGER PRIMARY KEY,
    title TEXT,
    description TEXT,
    difficulty INTEGER,
    scaled_difficulty REAL,
    url TEXT
)
''',
    # company_ids table
    '''
CREATE TABLE IF NOT EXISTS company_ids (
    company_id TEXT PRIMARY KEY,
    company_name TEXT
)
''',
    # companies table
    '''
CREATE TABLE IF NOT EXISTS companies (
    problem_id INTEGER,
    company_id TEXT,
    FOREIGN KEY (problem_id) REFERENCES problems (problem_id),
    FOREIGN KEY (company_id) REFERENCES company_ids (company_id)
)
''',
    # topic_ids table
    '''
CREATE TABLE IF NOT EXISTS topic_ids (
    topic_id TEXT PRIMARY KEY,
    topic_name TEXT
)
''',
    # topics table
    '''
CREATE TABLE IF NOT EXISTS topics (
    problem_id INTEGER,
    topic_id TEXT,
    FOREIGN KEY (problem_id) REFERENCES problems (problem_id),
    FOREIGN KEY (topic_id) REFERENCES topic_ids (topic_id)
)
''',
)

# (table, source DataFrame, columns) in insertion order: referenced tables first.
INSERT_PLAN = (
    ('problems', problems_df,
     ('problem_id', 'title', 'description', 'difficulty', 'scaled_difficulty', 'url')),
    ('company_ids', company_ids_df, ('company_id', 'company_name')),
    ('companies', companies_df, ('problem_id', 'company_id')),
    ('topic_ids', topic_ids_df, ('topic_id', 'topic_name')),
    ('topics', topics_df, ('problem_id', 'topic_id')),
)


def _insert_frame(cursor, table, frame, columns):
    """Bulk-insert `columns` of `frame` into `table` with INSERT OR IGNORE.

    `table` and `columns` come from the constants above, never from user
    input, so formatting them into the statement is safe; the row values are
    bound as parameters.
    """
    placeholders = ', '.join('?' for _ in columns)
    statement = f"INSERT OR IGNORE INTO {table} ({', '.join(columns)}) VALUES ({placeholders})"
    # itertuples yields plain tuples of Python scalars, one per row.
    cursor.executemany(statement, frame[list(columns)].itertuples(index=False, name=None))


# Connects to leetcode.db database file
conn = sqlite3.connect('leetcode.db')
try:
    cursor = conn.cursor()

    # drops all previous versions of tables. recreates schema for each run to align with any changes
    for table_name in DROP_ORDER:
        cursor.execute(f'DROP TABLE IF EXISTS {table_name}')
    for create_statement in CREATE_STATEMENTS:
        cursor.execute(create_statement)
    conn.commit()

    # INSERTS EACH DF TO TABLE
    for table_name, source_df, column_names in INSERT_PLAN:
        _insert_frame(cursor, table_name, source_df, column_names)
    conn.commit()
finally:
    # Always release the database file, even when a statement above fails.
    conn.close()



# retrieve problem url from id
def get_problem_url(problem_id):
    """Look up the URL stored for a problem in the leetcode.db SQLite file.

    Args:
        problem_id: value matched against the problem_id column of the
            problems table.

    Returns:
        The url column of the matching row, or None when no row matches.

    Raises:
        sqlite3.Error: if the query fails (for example when the problems
            table does not exist). The connection is closed in that case too.
    """
    conn = sqlite3.connect('leetcode.db')
    try:
        cursor = conn.cursor()
        cursor.execute('SELECT url FROM problems WHERE problem_id = ?', (problem_id,))
        result = cursor.fetchone()
    finally:
        conn.close()

    return result[0] if result else None

# Encoding

In [None]:
!pip install -Uqqq sentence-transformers

In [None]:
from sentence_transformers import SentenceTransformer

In [None]:
# One text field per problem: title and description joined by a space, with missing values treated as ''.
data['text'] = data['title'].fillna('') + ' ' + data['description'].fillna('')

In [None]:
# Sentence-embedding model used for every text encoded in this notebook.
embedder = SentenceTransformer("all-MiniLM-L6-v2")

# Embedding of "title description" for each problem.
title_description_embeddings = embedder.encode(data['text'].fillna('').tolist(), show_progress_bar=True)

# One space-separated string of topics followed by companies per problem.
# The two lists are concatenated before joining so the last topic and the
# first company stay separate words instead of being fused together.
topic_company_texts = [
    " ".join(topics + companies)
    for topics, companies in zip(data['related_topics'], data['companies'])
]
topics_embeddings = embedder.encode(topic_company_texts, show_progress_bar=True)

In [None]:
import numpy as np

# Report the length of the first vector of each embedding matrix.
for size_label, embedding_matrix in (
    ("Title/desc Embeddings Size", title_description_embeddings),
    ("Topic Embeddings Size", topics_embeddings),
):
    print(size_label, len(embedding_matrix[0]))


In [None]:
# Model input: topic/company embedding with the scaled difficulty appended as one extra column.
diff = data['scaled_difficulty'].values[:,None]
X = np.hstack([topics_embeddings, diff])
# Insert a length-1 middle axis -> (samples, 1, features).
X = X.reshape(X.shape[0], 1, X.shape[1])

# Model target: title/description embedding plus one zero column, mirroring the
# extra column added to X. The row count is taken from the embeddings
# instead of being hard-coded, so the cell keeps working if the dataset changes.
padding = np.zeros((len(title_description_embeddings), 1))
y = np.hstack([title_description_embeddings, padding])
y = y.reshape(y.shape[0], 1, y.shape[1])
print(X.shape, y.shape)

## Model Building

Random Forests Regression

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)
print(X_train.shape, y_train.shape)


In [None]:
# Shapes of the training inputs and targets.
print(X_train.shape, y_train.shape)

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics.pairwise import euclidean_distances, cosine_similarity

# The regressor expects 2-D arrays, so the length-1 middle axis is flattened away.
# Sizes are derived from the arrays rather than hard-coded, so the cell survives
# a change in dataset size, split ratio or embedding width.
X_train_2d = X_train.reshape(len(X_train), -1)
y_train_2d = y_train.reshape(len(y_train), -1)
X_test_2d = X_test.reshape(len(X_test), -1)
y_test_2d = y_test.reshape(len(y_test), -1)

regr = RandomForestRegressor(max_depth=5, random_state=0)
regr.fit(X_train_2d, y_train_2d)

pred = regr.predict(X_test_2d)

mse = mean_squared_error(y_test_2d, pred)

r2 = r2_score(y_test_2d, pred)
print(f"Mean Squared Error: {mse}, R-squared: {r2}")

In [None]:
# Example query: space-separated topics and a company name.
user_input_topics = "Binary Search Tree Array Google"
usr_topic_embed = embedder.encode(user_input_topics)
# Requested difficulty, appended to the embedding as the last feature.
user_difficulty = 9
user_input = np.hstack([usr_topic_embed, user_difficulty])
# Predict with the random forest; reshape(1, -1) turns the vector into a single-row batch.
pred_emb = regr.predict(user_input.reshape(1,-1))

In [None]:
# Compare the predicted embedding with every target row. The targets are
# flattened to 2-D using their own length instead of hard-coded sizes.
y_flat = y.reshape(len(y), -1)
find_cos = cosine_similarity(pred_emb.reshape(1,-1), y_flat)
find_euc = euclidean_distances(pred_emb.reshape(1,-1), y_flat)

# Text of the problem with the highest cosine similarity to the prediction.
print(data['text'].iloc[np.argmax(find_cos)])

In [None]:
# Euclidean distance measures dissimilarity, so the closest problem is the one
# with the smallest distance (argmin); argmax would select the farthest one.
problem_id = int(data["problem_id"].iloc[np.argmin(find_euc)])
print("Extracted problem 2: ", get_problem_url(problem_id))

In [None]:
# Show the row(s) whose title matches exactly.
data[data['title'] == "Count Good Nodes in Binary Tree"]

RNN Encoder to Decoder Architecture

In [None]:
#preprocess and tokenize words for RNN model

import tensorflow as tf
import keras
from keras import models, layers, preprocessing, optimizers
from tensorflow.keras.preprocessing.text import Tokenizer
#preprocessing.text.Tokenizer, proprocessing.sequence.pad_sequences



In [None]:

# Feature widths are read from the training arrays instead of being hard-coded.
enc_features = X_train.shape[2]
dec_features = y_train.shape[2]

# Encoder: LSTM over the single-step input; its final states seed the decoder.
enc_inps = keras.layers.Input(shape = (1, enc_features), name = 'encoder_input')
encoder = keras.layers.LSTM(128, return_sequences = True, return_state = True)
emb_outs, state_h, state_c = encoder(enc_inps)

# Decoder: LSTM initialised with the encoder states, followed by a dense projection.
dec_inps = keras.layers.Input(shape = (1, dec_features), name = 'decoder_input')
decoder = keras.layers.LSTM(128, return_sequences= True, return_state= True)
dec_out,_,_ = decoder(dec_inps, initial_state = [state_h,state_c])
# Linear output: the targets are real-valued embedding vectors trained with MSE.
# A softmax would force every output to be positive and sum to 1, which such
# targets cannot satisfy.
decoder_dense = keras.layers.Dense(dec_features, activation = 'linear')
dec_out = decoder_dense(dec_out)
model = keras.Model(inputs = [enc_inps, dec_inps], outputs = dec_out)
model.compile("rmsprop", "mean_squared_error", metrics=["mean_squared_error"])

# Decoder input: a zero "start" step followed by the targets shifted right by one step.
start = np.zeros((y_train.shape[0], 1, y_train.shape[2]))

dec_in = np.concatenate([ start, y_train[:,:-1,:]], axis = 1)
history = model.fit([X_train, dec_in], y_train, epochs=10, validation_split = 0.2)
model.summary()

# Predict for the first test sample and print the most similar problem text.
start = np.zeros([1, 1, dec_features])
pred_emb = model.predict([X_test[:1],start])
find_cos = cosine_similarity(pred_emb.reshape(1,-1), y.reshape(len(y), -1))
print("Extracted problem: ", data["text"].iloc[np.argmax(find_cos)])

In [None]:
# All-zero decoder input with exactly the shape of the test targets.
start = np.zeros(y_test.shape)
model.evaluate([X_test, start], y_test)

In [None]:
import matplotlib.pyplot as plt

# Training and validation loss per epoch, taken from the Keras fit history.
for history_key in ('loss', 'val_loss'):
    plt.plot(history.history[history_key])
plt.title('RNN model loss')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend(['train', 'validation'])

## Bert Model


# Bert4Rec (Transformer Rec)


In [None]:
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader

# Hard-coded interaction histories: user key -> ordered list of integer problem indices.
# NOTE(review): presumably synthetic sample data, and the indices are positions into the
# embedding matrix / rows of `data` (later cells index both with them) — confirm.
user_sequences = {
    'user_0': [1370, 672, 210, 694, 50, 1227, 705],
    'user_1': [637, 379, 382, 937, 1230, 1546, 1152, 1492, 1431],
    'user_2': [844, 1552, 300, 1374, 236],
    'user_3': [780, 93, 340, 105, 59, 1324, 1178, 1416],
    'user_4': [937, 686, 1200, 442, 1279, 1062],
    'user_5': [1068, 993, 171, 132, 440, 732, 640],
    'user_6': [1087, 155, 105, 310, 640, 131, 522],
    'user_7': [284, 338, 57, 494, 51, 527],
    'user_8': [1031, 1491, 1522, 165, 1169, 445, 340, 1278, 538, 328],
    'user_9': [1197, 738, 1192, 411, 514, 935]
}




In [None]:
class LeetCodeDataset(Dataset):
    """Next-item samples built from per-user problem sequences.

    Every proper prefix of a user's sequence becomes one sample whose target
    is the item that follows the prefix. Items are integer indices that are
    looked up in `embeddings` when a sample is fetched.
    """

    def __init__(self, user_sequences, embeddings, max_seq_len=50):
        """Enumerate (prefix, next item) pairs for every user.

        Args:
            user_sequences: mapping whose values are ordered lists of item indices.
            embeddings: array indexed by item index (a list of indices is also
                used as an index, so it must support that).
            max_seq_len: prefixes longer than this keep only their most recent items.
        """
        self.embeddings = embeddings
        self.max_seq_len = max_seq_len
        self.sequences = []
        self.targets = []

        for history in user_sequences.values():
            for cut, next_item in enumerate(history[1:], start=1):
                prefix = history[:cut]
                if len(prefix) > max_seq_len:
                    prefix = prefix[-max_seq_len:]
                self.sequences.append(prefix)
                self.targets.append(next_item)

    def __len__(self):
        """Number of (prefix, next item) samples."""
        return len(self.sequences)

    def __getitem__(self, idx):
        """Return (padded prefix embeddings, target embedding) as float tensors."""
        prefix = self.sequences[idx]

        # Left-pad to max_seq_len with index 0.
        # NOTE(review): 0 is also a valid item index, so padded positions carry
        # the embedding stored at row 0 rather than a neutral vector.
        padded = [0] * (self.max_seq_len - len(prefix)) + prefix

        inputs = torch.tensor(self.embeddings[padded], dtype=torch.float)
        target = torch.tensor(self.embeddings[self.targets[idx]], dtype=torch.float)
        return inputs, target


In [None]:
import torch.nn as nn

class TransformerRecommender(nn.Module):
    """Transformer encoder that maps a sequence of item embeddings to one predicted embedding.

    Input is a batch-first tensor of shape (batch, seq_len, embedding_dim); the
    output has shape (batch, embedding_dim) and is read from the last sequence
    position.
    """

    def __init__(self, embedding_dim, nhead=2, num_layers=2, dropout=0.1, max_seq_len=50):
        """Build the encoder stack.

        Args:
            embedding_dim: width of the item embeddings; must be divisible by `nhead`.
            nhead: number of attention heads per encoder layer.
            num_layers: number of stacked encoder layers.
            dropout: dropout rate inside the encoder layers.
            max_seq_len: longest sequence the learned positional encoding covers.
        """
        super().__init__()
        self.embedding_dim = embedding_dim
        # Learned positional encoding, initialised to zeros.
        self.positional_encoding = nn.Parameter(torch.zeros(1, max_seq_len, embedding_dim))
        # batch_first=True: the inputs are (batch, seq, dim). With the default
        # (seq, batch, dim) layout the attention would mix different samples of
        # a batch instead of the positions of one sequence.
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=embedding_dim, nhead=nhead, dropout=dropout, batch_first=True)
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.output_layer = nn.Linear(embedding_dim, embedding_dim)

    def forward(self, x):
        """Return the predicted embedding for each sequence in the batch."""
        x = x + self.positional_encoding[:, :x.size(1), :]
        x = self.transformer(x)
        # Use the representation at the last position as the sequence summary.
        x = x[:, -1, :]
        x = self.output_layer(x)
        return x


In [None]:
# Parameters
embedding_dim = title_description_embeddings.shape[1]
batch_size = 32
epochs = 10
learning_rate = 1e-3

# Dataset and DataLoader: next-item samples built from the user sequences
dataset = LeetCodeDataset(user_sequences, title_description_embeddings)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Model, Loss, Optimizer
model1 = TransformerRecommender(embedding_dim)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model1.parameters(), lr=learning_rate)

# Training Loop: one optimizer step per batch, mean batch loss reported per epoch
model1.train()
for epoch in range(epochs):
    batch_losses = []
    for seq_embeddings, target_embedding in dataloader:
        optimizer.zero_grad()
        loss = criterion(model1(seq_embeddings), target_embedding)
        loss.backward()
        optimizer.step()
        batch_losses.append(loss.item())
    total_loss = sum(batch_losses)
    print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss/len(dataloader):.4f}")


In [None]:
def recommend_next_problem(model, user_sequence, embeddings, top_k=5, max_seq_len=50):
    """Recommend the items whose embeddings are closest to the model's prediction.

    Args:
        model: module called with a (1, max_seq_len, dim) float tensor and
            returning a (1, dim) tensor. It is switched to eval mode.
        user_sequence: list of item indices, oldest first.
        embeddings: array indexed by item index; also used as the candidate pool.
        top_k: number of recommendations to return.
        max_seq_len: input length; only the most recent items are kept and
            shorter sequences are left-padded with index 0.

    Returns:
        List of up to `top_k` item indices, most similar first (cosine similarity).
    """
    # Use the model that was passed in, not a notebook-level global.
    model.eval()
    with torch.no_grad():
        seq_indices = list(user_sequence[-max_seq_len:])
        pad_len = max_seq_len - len(seq_indices)
        seq_indices = [0]*pad_len + seq_indices
        seq_embeddings = torch.tensor(embeddings[seq_indices], dtype=torch.float).unsqueeze(0)
        output_embedding = model(seq_embeddings).squeeze(0)

        # Compute cosine similarity between the prediction and every item embedding
        all_embeddings = torch.tensor(embeddings, dtype=torch.float)
        similarities = torch.nn.functional.cosine_similarity(output_embedding.unsqueeze(0), all_embeddings)
        top_indices = similarities.argsort(descending=True)[:top_k].tolist()
        return top_indices


In [None]:
user_id = 'user_1'
user_sequence = user_sequences[user_id]
recommended_indices = recommend_next_problem(model1, user_sequence, title_description_embeddings)

# Print the title stored at each recommended row position, in the returned order.
for recommended_title in data['title'].iloc[recommended_indices]:
    print(recommended_title)


# Data Visualizations

In [None]:
model1.eval()

# Run the model over every batch without gradients and keep the predictions
# next to their targets, batch by batch.
pred_batches, true_batches = [], []
with torch.no_grad():
    for xb, yb in dataloader:
        pred_batches.append(model1(xb))
        true_batches.append(yb.squeeze(1))

# Stack the per-batch tensors into one tensor each.
pred_embeddings = torch.cat(pred_batches, dim=0)
true_embeddings = torch.cat(true_batches, dim=0)


In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt

# Pairwise similarity matrix; its diagonal pairs each prediction with its own target.
cos_sims = cosine_similarity(pred_embeddings.numpy(), true_embeddings.numpy())

bert_fig, bert_ax = plt.subplots()
bert_ax.hist(cos_sims.diagonal(), bins=30, color='skyblue')
bert_ax.set_title("Cosine Similarity: BERT-style Model Predictions")
bert_ax.set_xlabel("Cosine Similarity")
bert_ax.set_ylabel("Frequency")
bert_ax.grid(True)
plt.show()


In [None]:
# Run the RNN on the held-out split with an all-zero decoder input shaped like y_test.
rnn_pred = model.predict([X_test, np.zeros_like(y_test)])

# Keep the last time step of the predictions and of the targets as torch tensors.
rnn_pred_embeddings = torch.tensor(rnn_pred[:, -1, :])
rnn_true_embeddings = torch.tensor(y_test[:, -1, :])


In [None]:
# Pairwise similarity matrix; its diagonal pairs each RNN prediction with its own target.
cos_sims_rnn = cosine_similarity(rnn_pred_embeddings.numpy(), rnn_true_embeddings.numpy())

rnn_fig, rnn_ax = plt.subplots()
rnn_ax.hist(cos_sims_rnn.diagonal(), bins=30, color='lightgreen')
rnn_ax.set_title("Cosine Similarity: RNN Model Predictions")
rnn_ax.set_xlabel("Cosine Similarity")
rnn_ax.set_ylabel("Frequency")
rnn_ax.grid(True)
plt.show()


# Database Connection

In [None]:
## SETUP FOR MYSQL ##
# note: the MySQL connection isn't working on colab; the data is stored in the SQLite database (leetcode.db) instead.
# The whole snippet is wrapped in a string literal, so none of the commands below are executed.
# NOTE(review): the disabled snippet hard-codes the root password; read it from an
# environment variable before re-enabling this cell.
'''
# ensure connection is downloaded
!pip install mysql-connector-python

# download MySQL
!apt-get -y install mysql-server

# starts server
!service mysql start

# starts connection & SQL cursor
import mysql.connector
conn = mysql.connector.connect(user='root', password='root', host='localhost')
cursor = conn.cursor()
'''