In [9]:
!pip install nltk



In [12]:
!pip install spacy



In [101]:
import nltk
import spacy
import pandas as pd
import numpy as np
import tensorflow as tf
import random

from nltk.stem import WordNetLemmatizer
from collections import defaultdict
from tensorflow.keras.preprocessing.text import Tokenizer

In [21]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [95]:
MAX_NUMBER_OF_WORDS_PER_SENTENCE = 45
MAX_NUMBER_OF_SENTENCES = 10
MAX_NUMBER_OF_LETTERS = 550
VOCAB_SIZE = 800

In [3]:
dataset_csv = "data.csv"

In [8]:
data_frame = pd.read_csv(dataset_csv)
data_frame

Unnamed: 0,Problem,Difficulty,Tags,Worst Complexity,Link
0,Given an array of integers nums and an integer...,Easy,Brute force;Hashmap,N^2,https://leetcode.com/problems/two-sum/
1,You are given two non-empty linked lists repre...,Medium,Linked List;Implementation,N,https://leetcode.com/problems/add-two-numbers/
2,"Given a string s, find the length of the longe...",Medium,Sliding Window;Hashmap,N,https://leetcode.com/problems/longest-substrin...
3,Given two sorted arrays nums1 and nums2 of siz...,Hard,Binary Search;Divide and conquer,log(N),https://leetcode.com/problems/median-of-two-so...
4,"Given a string s, return the longest \npalindr...",Medium,Dynamic Programming,N^2,https://leetcode.com/problems/longest-palindro...
...,...,...,...,...,...
68,"Given an m x n integers matrix, return the len...",Hard,Dynamic Programming,N^2,https://leetcode.com/problems/longest-increasi...
69,Write a function that reverses a string. The i...,Easy,Implementation,N,https://leetcode.com/problems/reverse-string/
70,"Given an integer array nums and an integer k, ...",Medium,Hashmap,N,https://leetcode.com/problems/top-k-frequent-e...
71,"Given a string s, find the first non-repeating...",Easy,Hashmap,N,https://leetcode.com/problems/first-unique-cha...


In [62]:
def clean_text(text):
  text = text.strip()
  text = text.lower()

  lemmatizer = WordNetLemmatizer()
  words = text.split()
  words = [lemmatizer.lemmatize(word) for word in words]
  text = ' '.join(words)

  return text

nlp = spacy.load("en_core_web_sm")
def split_text_to_sentences(text):
    doc = nlp(text)

    sentences = [sentence.text for sentence in doc.sents]

    return sentences



In [None]:
def clean_dataframe(data_frame):
    data_frame["Problem"] = data_frame["Problem"].apply(lambda problem: clean_text(problem))
    return data_frame

In [42]:
def split_dataframe_by_tag(data_frame):
    data_frame = data_frame.assign(Tag=data_frame['Tags'].str.split(';')).explode('Tag')
    return data_frame

In [43]:
processed_dataframe = clean_dataframe(data_frame)


In [44]:
processed_dataframe = split_dataframe_by_tag(processed_dataframe)

In [45]:
processed_dataframe

Unnamed: 0,Problem,Difficulty,Tags,Worst Complexity,Link,Tag
0,given an array of integer nums and an integer ...,Easy,Brute force;Hashmap,N^2,https://leetcode.com/problems/two-sum/,Brute force
0,given an array of integer nums and an integer ...,Easy,Brute force;Hashmap,N^2,https://leetcode.com/problems/two-sum/,Hashmap
1,you are given two non-empty linked list repres...,Medium,Linked List;Implementation,N,https://leetcode.com/problems/add-two-numbers/,Linked List
1,you are given two non-empty linked list repres...,Medium,Linked List;Implementation,N,https://leetcode.com/problems/add-two-numbers/,Implementation
2,"given a string s, find the length of the longe...",Medium,Sliding Window;Hashmap,N,https://leetcode.com/problems/longest-substrin...,Sliding Window
...,...,...,...,...,...,...
69,write a function that revers a string. the inp...,Easy,Implementation,N,https://leetcode.com/problems/reverse-string/,Implementation
70,"given an integer array nums and an integer k, ...",Medium,Hashmap,N,https://leetcode.com/problems/top-k-frequent-e...,Hashmap
71,"given a string s, find the first non-repeating...",Easy,Hashmap,N,https://leetcode.com/problems/first-unique-cha...,Hashmap
72,"given four integer array nums1, nums2, nums3, ...",Medium,Hashmap;Binary Search,N^2,https://leetcode.com/problems/4sum-ii/descript...,Hashmap


In [87]:
def analyze_dataframe(data_frame):
  max_number_of_letters = 0
  max_number_of_words_per_sentence = 0
  max_number_of_sentences = 0
  vocabs = dict()
  problems_count_by_tag = defaultdict(int)
  problems_count_by_difficulty = defaultdict(int)
  problems_count_by_time_complexity = defaultdict(int)

  for row in processed_dataframe.iterrows():
      row_content = row[1]
      problem, tag, time_complexity, difficulty = row_content["Problem"], row_content["Tag"], row_content["Worst Complexity"], row_content["Difficulty"]

      max_number_of_letters = max(max_number_of_letters, len(problem))

      problem_sentences = split_text_to_sentences(problem)
      number_of_sentences = len(problem_sentences)
      max_number_of_sentences = max(max_number_of_sentences, number_of_sentences)

      for sentence in problem_sentences:
        words = sentence.split(" ")

        for word in words:
          vocabs[word] = 1

        number_of_words = len(words)
        max_number_of_words_per_sentence = max(max_number_of_words_per_sentence, number_of_words)

        problems_count_by_tag[tag] += 1
        problems_count_by_difficulty[difficulty] += 1
        problems_count_by_time_complexity[time_complexity] += 1

  return {
      "max_number_of_words_per_sentence": max_number_of_words_per_sentence,
      "max_number_of_sentences": max_number_of_sentences,
      "max_number_of_letters": max_number_of_letters,
      "vocabs": vocabs,
      "problems_count_by_tag": problems_count_by_tag,
      "problems_count_by_difficulty": problems_count_by_difficulty,
      "problems_count_by_time_complexity": problems_count_by_time_complexity
  }





In [88]:
analyze_result = analyze_dataframe(processed_dataframe)

In [90]:
analyze_result["max_number_of_letters"]

549

In [96]:
def get_tokenizer(texts):
    tokenizer = Tokenizer(num_words = VOCAB_SIZE)
    tokenizer.fit_on_texts(texts)
    return tokenizer

In [97]:
problems = processed_dataframe["Problem"]
tokenizer = get_tokenizer(problems)

In [None]:
tokenizer

In [103]:
def permute_sentences(text):
    sentences = split_text_to_sentences(text)
    random.shuffle(sentences)  # Shuffle the sentences randomly
    permuted_text = '. '.join(sentences)  # Join the sentences back into a text string
    return permuted_text


In [113]:
# Permute the sentences in the problem to get a little more data
def permute_problem_sentences_in_dataframe(data_frame):
    new_rows = []
    for row in data_frame.iterrows():
        row_content = row[1]
        problem, tag, time_complexity, difficulty, link = row_content["Problem"], row_content["Tag"], row_content["Worst Complexity"], \
            row_content["Difficulty"], row_content["Link"]
        permuted_problem = permute_sentences(problem)
        new_row = {
            "Problem": permuted_problem,
            "Tag": tag,
            "Worst Complexity": time_complexity,
            "Difficulty": difficulty,
            "Link": link
        }
        new_rows.append(new_row)

    new_data_frame = data_frame.append(new_rows)
    return new_data_frame


In [114]:
new_data_frame = permute_problem_sentences_in_dataframe(processed_dataframe)

  new_data_frame = data_frame.append(new_rows)


In [115]:
new_data_frame

Unnamed: 0,Problem,Difficulty,Tags,Worst Complexity,Link,Tag
0,given an array of integer nums and an integer ...,Easy,Brute force;Hashmap,N^2,https://leetcode.com/problems/two-sum/,Brute force
0,given an array of integer nums and an integer ...,Easy,Brute force;Hashmap,N^2,https://leetcode.com/problems/two-sum/,Hashmap
1,you are given two non-empty linked list repres...,Medium,Linked List;Implementation,N,https://leetcode.com/problems/add-two-numbers/,Linked List
1,you are given two non-empty linked list repres...,Medium,Linked List;Implementation,N,https://leetcode.com/problems/add-two-numbers/,Implementation
2,"given a string s, find the length of the longe...",Medium,Sliding Window;Hashmap,N,https://leetcode.com/problems/longest-substrin...,Sliding Window
...,...,...,...,...,...,...
88,the input string is given a an array of charac...,Easy,,N,https://leetcode.com/problems/reverse-string/,Implementation
89,"given an integer array nums and an integer k, ...",Medium,,N,https://leetcode.com/problems/top-k-frequent-e...,Hashmap
90,"if it doe not exist, return -1.. given a strin...",Easy,,N,https://leetcode.com/problems/first-unique-cha...,Hashmap
91,"given four integer array nums1, nums2, nums3, ...",Medium,,N^2,https://leetcode.com/problems/4sum-ii/descript...,Hashmap
