## Finding Examples in Code_120k in C

In [2]:
import pyarrow.parquet as pq
import pyarrow as pa

code_table = pq.read_table("train-00000-of-00001-d9b93805488c263e.parquet")

In [7]:
sum = 0
for i, category in enumerate(code_table):
    for example in category:
        if i==0:
            sum+=1
print(sum)

121959


## Find Potential C Code Programs Using Regex

In [3]:
import re
from pyarrow import Table

pattern = r"\s+[cC]([\s.?!,;:])(?!\+)"

sum = 0

matching_examples = []

c_examples = None

for i, instruction in enumerate(code_table.column('instruction')):
    example = instruction.as_py()
    match = re.search(pattern, example)
    if match:
        #print(instruction)
        sum += 1
        new_batch = code_table.slice(i, 1)
        #print(new_batch)
        matching_examples.append(new_batch)


print(len(matching_examples))
#print(len(matching_examples[0]))
#for value in matching_examples[1540]:
    #print(value)
print("Number of examples with C:", sum)

c_examples = pa.concat_tables(matching_examples)
print(len(c_examples))

pq.write_table(c_examples, 'c_examples_excluding_c_space_plus.parquet')
print("Examples saved")

1539
Number of examples with C: 1539
1539
Examples saved


#### Matches Examples that Have C ++ (with a space between C and the plus signs)

In [3]:
cpp_pattern = r"[cC]\ [+]"
for example in c_examples.column('instruction'):
    instruction = example.as_py()
    match = re.search(cpp_pattern, instruction)
    if match:
        print(example)

NameError: name 'c_examples' is not defined

## Loading C Examples

In [23]:
import pyarrow.parquet as pq

c_code = pq.read_table("c_examples.parquet")

In [24]:
len(c_code)

1541

In [None]:
for i in range(len(c_code.column('instruction'))):
    if i==3: 
        break
    slice = c_code.slice(i, 1)
    print(slice)

In [None]:
code_table.schema

## Find examples of Python in Code_120k

In [30]:
import re
from pyarrow import Table

pattern = r"Python|python"

sum = 0

matching_examples = []

python_examples = None

for i, instruction in enumerate(code_table.column('instruction')):
    example = instruction.as_py()
    match = re.search(pattern, example)
    if match:
        #print(instruction)
        sum += 1
        new_batch = code_table.slice(i, 1)
        #print(new_batch)
        matching_examples.append(new_batch)


print(len(matching_examples))
#print(len(matching_examples[0]))
#for value in matching_examples[1540]:
    #print(value)
print("Number of examples with Python:", sum)

python_examples = pa.concat_tables(matching_examples)
print(len(python_examples))

pq.write_table(python_examples, 'python_examples.parquet')
print("Examples saved")

17997
Number of examples with Python: 17997
17997
Examples saved


## Find Valid Examples of C Code in code_120k

In [3]:
import pyarrow.parquet as pq
import os

def write_code_to_file(code, filename, extension=".c"):
  """
  Writes the provided code to a file with the given filename and extension.

  Args:
    code: The Python string containing the code to be written.
    filename: The name (without extension) of the file to create.
    extension: The file extension (default is ".c").
  """
  with open(os.path.join("c_code_files", filename + extension), "w") as f:
    f.write(code)

def process_pyarrow_table(table):
  """
  Processes a PyArrow table containing C code examples.

  Args:
    table: A PyArrow table with a column containing code examples.
  """
  code_column = table.column('output') 

  # Create a directory to store the C files
  os.makedirs("c_code_files", exist_ok=True)

  for i, code in enumerate(code_column):
    filename = f"code_{i+1}"
    write_code_to_file(str(code), filename)

# Load the PyArrow table (replace 'your_table.parquet' with your actual file)
table = pq.read_table("c_examples.parquet")

# Process the table
process_pyarrow_table(table)

print("C code written to individual files in the 'c_code' directory.")


C code written to individual files in the 'c_code' directory.


In [2]:
import pyarrow
import pyarrow.parquet as pq
import pandas as pd
import os
import subprocess

def write_code_to_file(code, filename, extension=".c"):
  """
  Writes the provided code to a file with the given filename and extension.

  Args:
    code: The Python string containing the code to be written.
    filename: The name (without extension) of the file to create.
    extension: The file extension (default is ".c").
  """
  with open(os.path.join("c_code_files", filename + extension), "w") as f:
    f.write(code)

def compile_and_test(code, filename):
  """
  Attempts to compile the code using GCC and returns True if successful.

  Args:
    code: The Python string containing the C code.
    filename: The filename (without extension) used for compilation.
  """
  try:
    # Write the code to a temporary file
    with open("temp.c", "w") as f:
      f.write(code)
    # Compile the code using GCC
    subprocess.run(["gcc", "temp.c", "-o", filename], check=True)
    # Compilation successful, remove temporary file
    os.remove("temp.c")
    return True
  except subprocess.CalledProcessError:
    # Compilation failed, remove temporary file and return False
    os.remove("temp.c")
    return False

def create_new_table(table):
  """
  Creates a new PyArrow table containing only successfully compiled code examples
  and their corresponding data from all columns in the original table.

  Args:
    table: A PyArrow table with a column containing code examples.
  """
  #code_column_name = table.column_names[2]  # Get the code column name
  new_data = []

  # Create a directory to store the C files (optional)
  os.makedirs("c_code_files", exist_ok=True)

  for i, row in enumerate(table.to_pandas().itertuples()):
    # Extract all data from the current row
    data = list(row[1:])
    code = data[2]  # Assuming code is the first element after column names
    filename = f"code_{i+1}"
    write_code_to_file(code, filename)
    if compile_and_test(code, 'c_code_files/' + filename + '.c'):
      # Add all data back to the new_data list
      new_data.append(data)

  # Create a new table from the filtered data with the same column names
  new_table = pyarrow.Table.from_pandas(pd.DataFrame(new_data[1:], columns=table.column_names))

  # Write the new table to a new Parquet file
  pq.write_table(new_table, "compiled_code.parquet")

  print("C code written to individual files in the 'c_code' directory.")
  print("Successfully compiled code examples and their corresponding data written to 'compiled_code.parquet'.")

# Load the PyArrow table (replace 'your_table.parquet' with your actual file)
table = pq.read_table('c_examples_excluding_c_space_plus.parquet')

# Create the new table
create_new_table(table)

/bin/ld: /tmp/cc1oEkG4.o: in function `main':
temp.c:(.text+0x48): undefined reference to `log10'
/bin/ld: temp.c:(.text+0xb5): undefined reference to `pow'
/bin/ld: temp.c:(.text+0xc4): undefined reference to `round'
collect2: error: ld returned 1 exit status
temp.c:1:3: error: invalid preprocessing directive #Function
    1 | # Function to find GCD
      |   ^~~~~~~~
temp.c:2:1: error: unknown type name ‘def’
    2 | def find_gcd(a, b, c):
      | ^~~
temp.c: In function ‘find_gcd’:
temp.c:2:22: error: expected declaration specifiers before ‘:’ token
    2 | def find_gcd(a, b, c):
      |                      ^
temp.c:9:3: error: invalid preprocessing directive #Driver
    9 | # Driver code
      |   ^~~~~~
    2 | def find_gcd(a, b, c):
      |     ^~~~~~~~
temp.c: In function ‘multiply’:
   26 |         return C;
      |                ^
/bin/ld: /usr/lib/gcc/x86_64-linux-gnu/11/../../../x86_64-linux-gnu/Scrt1.o: in function `_start':
(.text+0x1b): undefined reference to `main'
col

C code written to individual files in the 'c_code' directory.
Successfully compiled code examples and their corresponding data written to 'compiled_code.parquet'.


/bin/ld: /usr/lib/gcc/x86_64-linux-gnu/11/../../../x86_64-linux-gnu/Scrt1.o: in function `_start':
(.text+0x1b): undefined reference to `main'
collect2: error: ld returned 1 exit status
temp.c: In function ‘main’:
   11 |     gets(str);
      |     ^~~~
      |     fgets
/bin/ld: /tmp/cc3SykfM.o: in function `main':


## Delete All Files Starting with 'code_'

In [5]:
import os

def delete_code_files():
  """Deletes all files in the current directory that start with 'code_'.

  Warns the user before deletion.
  """
  files_to_delete = []
  for filename in os.listdir():
    if filename.startswith("code_"):
      files_to_delete.append(filename)

  if files_to_delete:
    print(f"The following files will be deleted: {', '.join(files_to_delete)}")
    confirmation = input("Are you sure you want to delete these files (y/N)? ").lower()
    if confirmation == "y":
      for filename in files_to_delete:
        os.remove(filename)
      print("Files deleted successfully.")
    else:
      print("Deletion cancelled.")
  else:
    print("No files starting with 'code_' found in the current directory.")

if __name__ == "__main__":
  delete_code_files()


The following files will be deleted: code_2122.cpp, code_1091.cpp, code_3086.cpp, code_504.cpp, code_2118.cpp, code_2638.cpp, code_2860.cpp, code_300.cpp, code_1485.cpp, code_1976.cpp, code_303.cpp, code_2720.cpp, code_1257.cpp, code_3241.cpp, code_3334.cpp, code_1128.cpp, code_2404.cpp, code_1278.cpp, code_2682.cpp, code_1442.cpp, code_298.cpp, code_111.cpp, code_399.cpp, code_1731.cpp, code_210.cpp, code_1439.cpp, code_648.cpp, code_2172.cpp, code_405.cpp, code_3019.cpp, code_1826.cpp, code_2805.cpp, code_3254.cpp, code_1835.cpp, code_1807.cpp, code_845.cpp, code_1654.cpp, code_3007.cpp, code_1273.cpp, code_1635.cpp, code_2880.cpp, code_666.cpp, code_1147.cpp, code_1707.cpp, code_2023.cpp, code_1857.cpp, code_810.cpp, code_2543.cpp, code_2002.cpp, code_3088.cpp, code_3478.cpp, code_2648.cpp, code_33.cpp, code_2286.cpp, code_3452.cpp, code_2349.cpp, code_3385.cpp, code_2733.cpp, code_1176.cpp, code_1900.cpp, code_1031.cpp, code_773.cpp, code_1694.cpp, code_2332.cpp, code_1970.cpp, cod

## Finding CPP Examples in code_120k

#### Find Potential Examples Using Regex

In [5]:
import re
from pyarrow import Table

pattern = r"(?:\b(?:c|C)\+\+|\bcpp\b|\bCpp\b)"

sum = 0

matching_examples = []

c_examples = None

for i, instruction in enumerate(code_table.column('instruction')):
    example = instruction.as_py()
    match = re.search(pattern, example)
    if match:
        #print(instruction)
        sum += 1
        new_batch = code_table.slice(i, 1)
        #print(new_batch)
        matching_examples.append(new_batch)


print(len(matching_examples))
#print(len(matching_examples[0]))
#for value in matching_examples[1540]:
    #print(value)
print("Number of examples with C++:", sum)

c_examples = pa.concat_tables(matching_examples)
print(len(c_examples))

pq.write_table(c_examples, 'cpp_examples.parquet')
print("Examples saved")

3512
Number of examples with C++: 3512
3512
Examples saved


#### Loop Through All Exaxmples. If They Compile, Add Them To The Dataset ('compiled_cpp_code.parquet')

In [6]:
import pyarrow
import pyarrow.parquet as pq
import pandas as pd
import os
import subprocess

def write_code_to_file(code, filename, extension=".cpp"):
  """
  Writes the provided code to a file with the given filename and extension.

  Args:
    code: The Python string containing the code to be written.
    filename: The name (without extension) of the file to create.
    extension: The file extension (default is ".cpp").
  """
  with open(os.path.join("cpp_code_files", filename + extension), "w") as f:
    f.write(code)

def compile_and_test(code, filename):
  """
  Attempts to compile the code using GCC and returns True if successful.

  Args:
    code: The Python string containing the C code.
    filename: The filename (without extension) used for compilation.
  """
  try:
    # Write the code to a temporary file
    with open("temp.cpp", "w") as f:
      f.write(code)
    # Compile the code using GCC
    subprocess.run(["g++", "temp.cpp", "-o", filename], check=True)
    # Compilation successful, remove temporary file
    os.remove("temp.cpp")
    return True
  except subprocess.CalledProcessError:
    # Compilation failed, remove temporary file and return False
    os.remove("temp.cpp")
    return False

def create_new_table(table):
  """
  Creates a new PyArrow table containing only successfully compiled code examples
  and their corresponding data from all columns in the original table.

  Args:
    table: A PyArrow table with a column containing code examples.
  """
  #code_column_name = table.column_names[2]  # Get the code column name
  new_data = []

  # Create a directory to store the C files (optional)
  os.makedirs("cpp_code_files", exist_ok=True)

  for i, row in enumerate(table.to_pandas().itertuples()):
    # Extract all data from the current row
    data = list(row[1:])
    code = data[2]  # Assuming code is the first element after column names
    filename = f"code_{i+1}"
    write_code_to_file(code, filename)
    if compile_and_test(code, 'cpp_code_files/' + filename + '.cpp'):
      # Add all data back to the new_data list
      new_data.append(data)

  # Create a new table from the filtered data with the same column names
  new_table = pyarrow.Table.from_pandas(pd.DataFrame(new_data[1:], columns=table.column_names))

  # Write the new table to a new Parquet file
  pq.write_table(new_table, "compiled_cpp_code.parquet")

  print("CPP code written to individual files in the 'cpp_code_files' directory.")
  print("Successfully compiled code examples and their corresponding data written to 'compiled_cpp_code.parquet'.")

# Load the PyArrow table (replace 'your_table.parquet' with your actual file)
table = pq.read_table("cpp_examples.parquet")

# Create the new table
create_new_table(table)

/bin/ld: /usr/lib/gcc/x86_64-linux-gnu/11/../../../x86_64-linux-gnu/Scrt1.o: in function `_start':
(.text+0x1b): undefined reference to `main'
collect2: error: ld returned 1 exit status
/bin/ld: /usr/lib/gcc/x86_64-linux-gnu/11/../../../x86_64-linux-gnu/Scrt1.o: in function `_start':
(.text+0x1b): undefined reference to `main'
collect2: error: ld returned 1 exit status
/bin/ld: /usr/lib/gcc/x86_64-linux-gnu/11/../../../x86_64-linux-gnu/Scrt1.o: in function `_start':
(.text+0x1b): undefined reference to `main'
collect2: error: ld returned 1 exit status
temp.cpp:16:1: error: expected unqualified-id before ‘for’
   16 | for (int i = 0; i < 5; i++) {
      | ^~~
temp.cpp:16:17: error: ‘i’ does not name a type
   16 | for (int i = 0; i < 5; i++) {
      |                 ^
temp.cpp:16:24: error: ‘i’ does not name a type
   16 | for (int i = 0; i < 5; i++) {
      |                        ^
temp.cpp:1:6: error: variable or field ‘bubbleSort’ declared void
    1 | void bubbleSort(vector<int> 

C code written to individual files in the 'c_code' directory.
Successfully compiled code examples and their corresponding data written to 'compiled_code.parquet'.


collect2: error: ld returned 1 exit status


In [7]:
table = pq.read_table('compiled_cpp_code.parquet')

In [8]:
len(table)

2010