In [None]:
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
# Apply seaborn's "darkgrid" style to the figures drawn after this point.
sns.set_style("darkgrid")

# Basic statistics of the preprocessed Python150k dataset

In [None]:
!unzip parsed.zip
!ls parsed

In [None]:
prefix = "parsed"


def read_lines(path):
    """Return every line of the text file at `path` as a list of strings.

    Line terminators are kept (same result as ``file.readlines()``); the file
    is closed as soon as it has been read.
    """
    with open(path, "rt") as handle:
        return handle.readlines()


# Each list below holds one entry per line of the corresponding file.
comments = read_lines(f"{prefix}/python150k_comments.txt")
print("Len(comments):", len(comments))
print("comments[10]:", comments[10])

docstrings = read_lines(f"{prefix}/python150k_docstrings.txt")
print("Len(docstrings):", len(docstrings))
print("docstrings[10]:", docstrings[10])

functions = read_lines(f"{prefix}/python150k_sequence.txt")
print("Len(functions):", len(functions))
print("functions[10]:", functions[10])

ast_functions = read_lines(f"{prefix}/python150k_ast.txt")
print("Len(ast_functions):", len(ast_functions))
print("ast_functions[10]:", ast_functions[10])

Train/Test split:

In [None]:
from sklearn.model_selection import train_test_split

# One fixed seed for every split: the result is reproducible across runs, and
# lists of equal length are shuffled with the same permutation, so element i of
# each train (or test) list refers to the same sample.
# NOTE(review): this alignment only holds if the four files are line-aligned
# (same number of lines, same order) - confirm against the preprocessing step.
SPLIT_SEED = 42

train_comments, test_comments = train_test_split(
    comments, test_size=0.2, random_state=SPLIT_SEED
)
train_functions, test_functions = train_test_split(
    functions, test_size=0.2, random_state=SPLIT_SEED
)
train_ast_functions, test_ast_functions = train_test_split(
    ast_functions, test_size=0.2, random_state=SPLIT_SEED
)
train_docstrings, test_docstrings = train_test_split(
    docstrings, test_size=0.2, random_state=SPLIT_SEED
)

In [None]:
# Report the size of every train/test partition, one line per partition.
for caption, partition in (
    ("# functions in train:", train_functions),
    ("# functions in test:", test_functions),
    ("# comments in train:", train_comments),
    ("# comments in test:", test_comments),
    ("# docstrings in train:", train_docstrings),
    ("# docstrings in test:", test_docstrings),
    ("# ast functions in train:", train_ast_functions),
    ("# ast functions in test:", test_ast_functions),
):
    print(caption, len(partition))

**Note:** <br>
Processed files: **6854** <br>
Canceled files:  **1459** <br>
Total files: **8313**

# Distributions:

**Plain functions (sequence representation):**

In [None]:
# Count functions length histogram
# Length (in characters) of every function line, keeping only the lengths
# below 1e4 so that a few extreme outliers do not dominate the histogram.
train_func_lengths = [size for size in map(len, train_functions) if size < 1e4]
test_func_lengths = [size for size in map(len, test_functions) if size < 1e4]

In [None]:
# Overlay the train and test length distributions on a single Axes.
fig, ax = plt.subplots(figsize=(15, 8))
ax.set_xlabel("Length of the function", fontsize=20)
ax.set_ylabel("Density", fontsize=20)

kde_style = {"linewidth": 3}
sns.distplot(train_func_lengths, color='#007b7f', label='Functions from train set',
             kde_kws=kde_style, ax=ax)
# Both calls draw on the same Axes, so `ax2` is simply another name for `ax`.
ax2 = sns.distplot(test_func_lengths, color='g', label='Functions from test set',
                   kde_kws=kde_style, ax=ax)

# Black frame around the plotting area and thicker outlines for the histogram bars.
ax.patch.set_edgecolor('black')
ax.patch.set_linewidth(2)
plt.setp(ax.patches, linewidth=3)

ax.set_title("Functions lengths distributions", fontsize=30)
ax.set_xlim(-10, 2000)
ax.legend()
plt.show()

**AST-processed functions:**

In [None]:
# Count functions length histogram
# Same computation as for the plain functions, applied to the AST-processed
# ones: character length per line, keeping only the lengths below 1e4.
train_func_lengths = [size for size in map(len, train_ast_functions) if size < 1e4]
test_func_lengths = [size for size in map(len, test_ast_functions) if size < 1e4]

In [None]:
# Overlay the train and test length distributions on a single Axes.
fig, ax = plt.subplots(figsize=(15, 8))
ax.set_xlabel("Length of the function", fontsize=20)
ax.set_ylabel("Density", fontsize=20)

kde_style = {"linewidth": 3}
sns.distplot(train_func_lengths, color='#007b7f', label='Functions from train set',
             kde_kws=kde_style, ax=ax)
# Both calls draw on the same Axes, so `ax2` is simply another name for `ax`.
ax2 = sns.distplot(test_func_lengths, color='g', label='Functions from test set',
                   kde_kws=kde_style, ax=ax)

# Black frame around the plotting area and thicker outlines for the histogram bars.
ax.patch.set_edgecolor('black')
ax.patch.set_linewidth(2)
plt.setp(ax.patches, linewidth=3)

ax.set_title("AST processed functions lengths distributions", fontsize=30)
ax.set_xlim(-10, 2000)
ax.legend()
plt.show()

# Comment (summary) lengths:

In [None]:
# Character count of every comment.
# NOTE(review): this is the raw len() of each entry; if the entries still carry
# their trailing newline, it is counted as one symbol - confirm if that matters.
train_lengths_comments = [len(comment) for comment in train_comments]
test_lengths_comments = [len(comment) for comment in test_comments]

print("Mean length of train comments in SYMBOLS:", np.mean(train_lengths_comments))
print("Mean length of test comments in SYMBOLS:", np.mean(test_lengths_comments))

In [None]:
# Word count of every comment (whitespace-separated tokens). These lists
# replace the character counts previously stored under the same names.
train_lengths_comments = [len(comment.split()) for comment in train_comments]
test_lengths_comments = [len(comment.split()) for comment in test_comments]


print("Mean length of train comments in WORDS:", np.mean(train_lengths_comments))
print("Mean length of test comments in WORDS:", np.mean(test_lengths_comments))

In [None]:
# Distribution of comment lengths for the train and test sets, drawn on one Axes.
# The axis label and legend entries now describe comments: the previous text was
# copied from the function plots and mislabelled this figure.
fig, ax = plt.subplots(figsize=(15, 8))
ax.set_xlabel("Length of the comment (words)", fontsize=20)
ax.set_ylabel("Density", fontsize=20)

kde_style = {"linewidth": 3}
sns.distplot(train_lengths_comments, color='#007b7f', label='Comments from train set',
             kde_kws=kde_style, ax=ax)
# Both calls draw on the same Axes, so `ax2` is simply another name for `ax`.
ax2 = sns.distplot(test_lengths_comments, color='g', label='Comments from test set',
                   kde_kws=kde_style, ax=ax)

# Black frame around the plotting area and thicker outlines for the histogram bars.
ax.patch.set_edgecolor('black')
ax.patch.set_linewidth(2)
plt.setp(ax.patches, linewidth=3)

ax.set_title("Comments lengths distributions", fontsize=30)
ax.legend()
plt.show()

# Word distributions for comments and functions:

In [None]:
import nltk
from nltk.corpus import stopwords

# Make sure the NLTK stop-word corpus is available locally.
nltk.download('stopwords')
# English stop words plus two extra tokens.
# NOTE(review): 'ha' and 'wa' look like stemming/lemmatisation leftovers of
# "has" / "was" - confirm against the preprocessing pipeline.
stop_words = {*stopwords.words('english'), 'ha', 'wa'}

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
%matplotlib inline

count_vectorizer = CountVectorizer(stop_words=stop_words, min_df=10)
comments_data = count_vectorizer.fit_transform(comments)

functions_vectorizer = CountVectorizer(stop_words=stop_words, min_df=10)
functions_data = functions_vectorizer.fit_transform(functions)

def plot_most_common_words(count_data, count_vectorizer, n_words):
    """Draw a bar chart of the `n_words` most frequent vocabulary terms.

    Parameters
    ----------
    count_data
        Document-term count matrix (rows are documents, columns are vocabulary
        terms), e.g. the result of ``count_vectorizer.fit_transform(...)``.
    count_vectorizer
        Fitted vectorizer exposing ``get_feature_names_out()`` or the older
        ``get_feature_names()``; the column order must match `count_data`.
    n_words : int
        Number of top terms to display.

    Returns
    -------
    None. The figure is rendered with ``plt.show()``.
    """
    # Newer scikit-learn only provides get_feature_names_out(); older releases
    # only provide get_feature_names(). Support both.
    get_names = getattr(count_vectorizer, "get_feature_names_out", None)
    if get_names is None:
        get_names = count_vectorizer.get_feature_names
    words = list(get_names())

    # Per-term totals as a single column sum instead of a Python loop over rows.
    total_counts = np.asarray(count_data.sum(axis=0), dtype=float).ravel()

    # Stable sort by descending count; ties keep vocabulary order.
    top_pairs = sorted(zip(words, total_counts), key=lambda pair: pair[1], reverse=True)[:n_words]
    words = [pair[0] for pair in top_pairs]
    counts = [pair[1] for pair in top_pairs]
    x_pos = np.arange(len(words))

    plt.figure(2, figsize=(15, 15/1.6180))
    plt.subplot(title='Most common words')
    sns.set_context("notebook", font_scale=1.25, rc={"lines.linewidth": 2.5})
    # x / y are passed by keyword: newer seaborn no longer accepts them positionally.
    sns.barplot(x=x_pos, y=counts, palette='husl')
    plt.xticks(x_pos, words, rotation=90, fontsize=15)
    plt.xlabel('words')
    plt.ylabel('counts')
    plt.show()

In [None]:
# Top 20 terms across all comments.
plot_most_common_words(count_data=comments_data, count_vectorizer=count_vectorizer, n_words=20)

In [None]:
# Top 20 terms across all functions.
plot_most_common_words(count_data=functions_data, count_vectorizer=functions_vectorizer, n_words=20)

# Word cloud for functions:

In [None]:
from wordcloud import WordCloud

# Concatenate every function into one comma-separated string and build the
# word cloud from that single text.
all_texts = ','.join(functions)
wordcloud = WordCloud(
    width=800,
    height=400,
    max_words=5000,
    background_color="white",
    contour_width=3,
    contour_color='steelblue',
)

wordcloud.generate(all_texts)
# Last expression of the cell: the rendered image is shown as the cell output.
wordcloud.to_image()