In [2]:
import json
import pandas as pd

In [5]:
# Load the raw JSONL dump (one JSON object per line) into a DataFrame so it
# can be inspected and filtered in the following cells.
PATH = '../../dump-original.jsonl'

data = []

# Open the file explicitly as UTF-8: without `encoding`, open() uses the
# platform's locale encoding, which can fail on or mis-decode non-ASCII text.
with open(PATH, 'r', encoding='utf-8') as f:
    for line_num, line in enumerate(f, start=1):  # 1-based, for readable messages
        try:
            json_obj = json.loads(line.strip())
            data.append(json_obj)
        except json.JSONDecodeError as e:
            # A malformed line is reported and skipped instead of aborting the load.
            print(f"Skipping line {line_num}: {e}")

# Build the DataFrame once from the list of parsed records.
df = pd.DataFrame(data)
# Note: this prints the full (rows, columns) shape tuple, not just the row count.
print(f"Number of Dataframe rows : {df.shape}")
df.head()

Number of Dataframe rows : (1262910, 3)


Unnamed: 0,submission_id,language,source
0,12746065,GNU C++11,/*\n******************************************...
1,12746876,GNU C++,#include<iostream>\n\nusing namespace std;\n\n...
2,12747297,MS C++,#include <iostream>\n#include <sstream>\n#incl...
3,12747301,GNU C++11,#include<stdio.h>\n#include<algorithm>\n#inclu...
4,12747302,GNU C++11,#include <bits/stdc++.h>\nusing namespace std;...


In [6]:
# Count how many submissions there are for each language
df.loc[:, 'language'].value_counts()

language
GNU C++14                451022
GNU C++11                329281
GNU C++                  242294
Java 8                    63483
MS C++                    34856
Python 3                  30827
GNU C++17                 27384
GNU C                     26880
Python 2                   9346
FPC                        8156
GNU C11                    5925
Java 7                     5564
GNU C++17 Diagnostics      4420
MS C#                      4195
PyPy 3                     2947
PyPy 2                     2298
Mono C#                    2081
PascalABC.NET              2075
JavaScript                 1670
Delphi                     1177
Go                         1124
Haskell                     983
Scala                       975
Ruby                        972
PHP                         798
Kotlin                      577
Rust                        461
D                           429
Perl                        348
Clang++17 Diagnostics       208
Ocaml                       154

In [7]:
# Python 3 is the language we chose to work with: keep only those submissions
df = df.loc[df['language'].eq('Python 3')]
print(f"Number of Dataframe rows : {df.shape}")
df.loc[:, 'language'].value_counts()

Number of Dataframe rows : (30827, 3)


language
Python 3    30827
Name: count, dtype: int64

In [8]:
# Drop the 'submission_id' column and preview the remaining columns
df = df.drop(columns=['submission_id'])
df.head(5)

Unnamed: 0,language,source
52,Python 3,"n, m = map(int, input().split())\na = []\nfor ..."
123,Python 3,"n, m = map(int, input().split())\na = []\nfor ..."
144,Python 3,#In the name of Allah\n\nfrom sys import stdin...
530,Python 3,"s = input()\nl, *v = (int(x) for x in input()...."
734,Python 3,"s = input()\nl, *v = (int(x) for x in input()...."


In [9]:
# Resetting the DataFrame index so rows are numbered consecutively from 0
df = df.reset_index(
    drop=True,  # discard the old labels instead of keeping them as a column
)
df.head(5)

Unnamed: 0,language,source
0,Python 3,"n, m = map(int, input().split())\na = []\nfor ..."
1,Python 3,"n, m = map(int, input().split())\na = []\nfor ..."
2,Python 3,#In the name of Allah\n\nfrom sys import stdin...
3,Python 3,"s = input()\nl, *v = (int(x) for x in input()...."
4,Python 3,"s = input()\nl, *v = (int(x) for x in input()...."


In [10]:
# Keep only the rows whose 'source' has between MIN_LINES and MAX_LINES lines
# (both bounds inclusive), then renumber the rows from 0.
MIN_LINES = 10
MAX_LINES = 60

# A text containing k newline characters splits into k + 1 lines, so this is
# the same count as len(source.split('\n')) without a per-row Python lambda.
# Missing values count as out of range and are filtered out.
line_counts = df['source'].str.count('\n') + 1
df = df[line_counts.between(MIN_LINES, MAX_LINES)].reset_index(drop=True)
df.shape

(23187, 2)

In [11]:
# Checking for missing values: print the null count of each column, one per line
print(df['language'].isna().sum(), df['source'].isna().sum(), sep='\n')

0
0


In [68]:
# Write the filtered DataFrame to a CSV file, without the index column
df.to_csv(path_or_buf='python_codes.csv', index=False)