In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.ticker as mtick
import matplotlib.pyplot as plt
import gensim

In [3]:
!pip install gensim
!pip install python-Levenshtein

Collecting python-Levenshtein
  Downloading python_Levenshtein-0.26.1-py3-none-any.whl.metadata (3.7 kB)
Collecting Levenshtein==0.26.1 (from python-Levenshtein)
  Downloading levenshtein-0.26.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.2 kB)
Collecting rapidfuzz<4.0.0,>=3.9.0 (from Levenshtein==0.26.1->python-Levenshtein)
  Downloading rapidfuzz-3.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Downloading python_Levenshtein-0.26.1-py3-none-any.whl (9.4 kB)
Downloading levenshtein-0.26.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (162 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m162.6/162.6 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading rapidfuzz-3.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m43.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages:

In [4]:
df = pd.read_csv("db_mod.csv")
df.head()

Unnamed: 0,uid,tags,statement,rating
0,1895G,"dp,flows,greedy",You are given a string consisting of character...,3100
1,1895F,"combinatorics,dp",Let's call an array a of n non-negative intege...,2600
2,1895E,"dp,games,graphs,greedy,sortings",Monocarp and Bicarp are playing a card game. E...,2300
3,1895D,bitmasks,"You are given n-1 integers a_1, a_2, \dots, a_...",1900
4,1895C,dp,A ticket is a non-empty string of digits from ...,1400


In [5]:
df.shape

(5504, 4)

In [6]:
df.describe()

Unnamed: 0,rating
count,5504.0
mean,1940.905342
std,696.152181
min,-1.0
25%,1400.0
50%,1900.0
75%,2400.0
max,3500.0


In [7]:
problem_statement = df.statement.apply(gensim.utils.simple_preprocess)
problem_statement

Unnamed: 0,statement
0,"[you, are, given, string, consisting, of, char..."
1,"[let, call, an, array, of, non, negative, inte..."
2,"[monocarp, and, bicarp, are, playing, card, ga..."
3,"[you, are, given, integers, a_, a_, dots, a_, ..."
4,"[ticket, is, non, empty, string, of, digits, f..."
...,...
5499,"[permutation, is, an, ordered, group, of, numb..."
5500,"[simon, has, rectangular, table, consisting, o..."
5501,"[dima, is, good, person, in, fact, he, great, ..."
5502,"[dima, liked, the, present, he, got, from, inn..."


In [8]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [9]:
def remove_stopwords(text):
    words = text.split()  # Split the text into words
    filtered_words = [word for word in words if word.lower() not in stop_words]  # Keep only words not in stop_words
    return " ".join(filtered_words)  # Join the filtered words back into a string
df['statement'] = df['statement'].apply(remove_stopwords)

In [10]:
lemmatizer = WordNetLemmatizer()
def lemmatize_text(text):
    words = text.split()  # Split the text into words
    lemmas = [lemmatizer.lemmatize(word) for word in words]  # Lemmatize each word
    return " ".join(lemmas)  # Join the lemmas back into a string
df['statement'] = df['statement'].apply(lemmatize_text)

In [11]:
import re
def remove_math_symbols(text):
    # Define a pattern to match common mathematical symbols
    pattern = r"[^\w\s]"  # Matches any character that is not a word character or whitespace

    # Remove the matched symbols using re.sub()
    cleaned_text = re.sub(pattern, "", text)
    return cleaned_text
df['statement'] = df['statement'].apply(remove_math_symbols)

In [12]:
def remove_two_letter_words(text):
    words = text.split()  # Split the text into words
    filtered_words = [word for word in words if len(word) > 2]  # Keep only words with length greater than 2
    return " ".join(filtered_words)  # Join the filtered words back into a string
df['statement'] = df['statement'].apply(remove_two_letter_words)

In [13]:
def remove_words_with_numbers(text):
    # Define a pattern to match words containing numbers
    pattern = r"\b\w*\d\w*\b"  # Matches whole words containing at least one digit

    # Remove the matched words using re.sub()
    cleaned_text = re.sub(pattern, "", text)
    return cleaned_text
df['statement'] = df['statement'].apply(remove_words_with_numbers)

In [14]:
model = gensim.models.Word2Vec( window=10, min_count=5, workers=5, vector_size = 300)

In [15]:
model.build_vocab(problem_statement, progress_per= 800)

In [16]:
model.train(problem_statement, total_examples=model.corpus_count, epochs=7)

(4312507, 6117097)

In [None]:
model.wv.most_similar("good")

[('beautiful', 0.6833444833755493),
 ('balanced', 0.5910377502441406),
 ('bad', 0.5893204212188721),
 ('sequences', 0.5718134045600891),
 ('called', 0.5658767819404602),
 ('true', 0.5601967573165894),
 ('valid', 0.5519869327545166),
 ('correct', 0.5510422587394714),
 ('subsequences', 0.5391006469726562),
 ('subsequence', 0.52839595079422)]

In [17]:
def rev_embed(x):  # Pass the model as an argument
    x = x.split(' ')
    embedd = []
    for i in x:
        if i in model.wv:  # Check if word is in vocabulary
            embedd.append(model.wv[i]) # size  of model.wv[i] is 100
    if embedd:  # If any embeddings were found
        x = np.array(embedd)
        x = np.mean(x, axis=0)
        return x
    else:
        return np.zeros(model.vector_size)  # Return a zero vector for missing words


In [18]:
feat_vect = []
for i in df['statement']:
  feat_vect.append(rev_embed(i))
  print(i)

Output hidden; open in https://colab.research.google.com to view.

In [19]:
vect_hash = {}
for i in range(len(df)):
  name = str(df["rating"][i])
  vect_hash[name] = rev_embed(df["statement"][i]).tolist()

vect_hash

{'3100': [0.1585128754377365,
  0.14032451808452606,
  -0.1734509915113449,
  -0.3968011438846588,
  -0.0169450081884861,
  -0.23704376816749573,
  0.12078175693750381,
  0.25179004669189453,
  0.07743540406227112,
  -0.15597045421600342,
  0.052537959069013596,
  0.2667723596096039,
  0.1745116412639618,
  0.002408987144008279,
  0.14878341555595398,
  -0.30132588744163513,
  0.13447582721710205,
  -0.014141879975795746,
  0.1352696716785431,
  -0.1395251750946045,
  -0.04981226474046707,
  -0.0543973371386528,
  0.06795613467693329,
  -0.1928119957447052,
  0.39544162154197693,
  -0.2427513152360916,
  -0.2811991274356842,
  -0.21866625547409058,
  0.014149615541100502,
  -0.09969103336334229,
  0.11237131804227829,
  0.38434088230133057,
  -0.04845326766371727,
  -0.000893542543053627,
  0.226582333445549,
  -0.020805370062589645,
  -0.47230464220046997,
  -0.016786320134997368,
  0.09753303229808807,
  0.0975692942738533,
  0.31650567054748535,
  0.009387638419866562,
  -0.00842800

In [20]:
feat_vect=np.array(feat_vect)
feat_vect

array([[-0.31440443, -0.06013072, -0.6783132 , ...,  0.07726856,
         0.38760427, -0.4387104 ],
       [ 0.27689713, -0.41337878,  0.07209109, ...,  0.48478144,
        -0.32738787, -0.14839399],
       [ 0.22232974, -0.28275865, -0.34103206, ..., -0.3095712 ,
        -0.15001878, -0.7720636 ],
       ...,
       [-0.18175803,  0.1727442 , -0.37346742, ..., -0.06061522,
         0.1277346 ,  0.10240923],
       [ 0.08163052,  0.14837454, -0.03192734, ..., -0.03097657,
         0.16985105,  0.02714377],
       [ 0.3335044 , -0.05246697, -0.09033038, ...,  0.03993677,
        -0.11134081, -0.19171537]], dtype=float32)

In [22]:
feat_vect.shape


(5504, 300)

In [23]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.ticker as mtick
import matplotlib.pyplot as plt
import gensim
import nltk
from sklearn.model_selection import train_test_split
from gensim.models.callbacks import CallbackAny2Vec