Skip to content

Commit a648162

Browse files
represent code to vector space
0 parents  commit a648162

File tree

7 files changed

+5177
-0
lines changed

7 files changed

+5177
-0
lines changed

.vs/code2vec_experiments/v16/.suo

31 KB
Binary file not shown.

Figure_1.png

36.1 KB
Loading

Figure_2.png

16.4 KB
Loading

code2vec_experiments.py

Lines changed: 192 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,192 @@
1+
import pandas as pd
2+
import numpy as np
3+
import matplotlib.pyplot as plt
4+
import re
5+
from keras.preprocessing.text import Tokenizer
6+
from keras.preprocessing.sequence import pad_sequences
7+
from keras.models import Sequential
8+
from keras.layers import Dense
9+
from keras.layers import Flatten
10+
from keras.layers.embeddings import Embedding
11+
from keras import backend as K
12+
from sklearn.model_selection import train_test_split
13+
from sklearn.decomposition import PCA
14+
from matplotlib import pyplot
15+
16+
# Global matplotlib text style applied to every figure drawn by this script.
font = dict(weight='bold', size=20)
plt.rc('font', **font)

# Load the submissions; the file is read as line-delimited JSON (lines=True).
dataframe = pd.read_json('data/source_code.json', lines=True)

# Sanity check: first five rows, then the row count with thousands separators.
print(dataframe.head(5))
row_count = len(dataframe)
print('{:,}'.format(row_count))
22+
23+
#print(dataframe.correct.value_counts())
24+
25+
def remove_comments(text):
    """Strip ``//`` line comments from a source-code string.

    The previous pattern matched "zero or more literal dots followed by a
    newline", so it deleted every line break (gluing the last token of one
    line to the first token of the next) and never removed a comment.

    Args:
        text: Source code as a single string.

    Returns:
        ``text`` with everything from ``//`` up to, but not including, the
        end of the line removed. Line breaks are kept so that tokens on
        neighbouring lines stay separated for the tokenizer.

    Note:
        This is a plain regex pass, not a parser: a ``//`` inside a string
        literal (for example a URL) is treated as a comment start too.
        NOTE(review): assumes the submissions use C-style ``//`` comments;
        confirm against the data set.
    """
    # [^\n]* stops at the line break, so the newline itself survives and a
    # comment on the very last line (no trailing newline) is removed as well.
    return re.sub(r'//[^\n]*', '', text)
27+
def get_docs_and_labels(df):
    """Collect the cleaned source texts and integer labels from ``df``.

    Args:
        df: DataFrame providing a 'source' column (code text, passed through
            ``remove_comments``) and a 'submission_id' column (cast to
            ``int`` to form the label).

    Returns:
        A ``(docs, labels)`` pair of equally long lists, ordered like
        ``df.index``.
    """
    # NOTE(review): the label is the submission id, while the model in this
    # file is compiled as a binary classifier; confirm this is the intended
    # target column.
    row_ids = df.index
    _docs = [remove_comments(df.at[row_id, 'source']) for row_id in row_ids]
    _labels = [int(df.at[row_id, 'submission_id']) for row_id in row_ids]
    return _docs, _labels
38+
39+
# Build the corpus and its labels once, then report how many documents exist.
docs, labels = get_docs_and_labels(dataframe)
doc_total = len(docs)
print('{:,}'.format(doc_total))
41+
42+
43+
# Vocabulary size limit: passed to the Tokenizer as num_words and used as the
# input dimension of the Embedding layer.
NUM_WORDS = 2000
44+
45+
def get_tokenizer():
    """Create the (not yet fitted) word-level Keras ``Tokenizer``.

    Settings: vocabulary limited via ``NUM_WORDS``, only tab and newline
    characters filtered, text lower-cased, tokens split on a single space.

    Returns:
        A new ``Tokenizer`` instance.
    """
    settings = {
        'num_words': NUM_WORDS,
        'filters': '\t\n',
        'lower': True,
        'split': ' ',
        'char_level': False,
    }
    return Tokenizer(**settings)
51+
52+
# Fit the tokenizer on the whole corpus.
word_t = get_tokenizer()
word_t.fit_on_texts(docs)

# Spot-check the fitted vocabulary using the token 'if'.
print(word_t.word_counts['if'])  # total occurrences of 'if' across the submissions
print('Number docs: {:,}'.format(word_t.document_count))  # total number of submissions
print(word_t.word_index['if'])  # integer id assigned to 'if'
print(word_t.word_docs['if'])  # number of documents containing 'if'

# Encode every document as a list of word ids.
sequences = word_t.texts_to_sequences(docs)
print(sequences[0])

# Sequence-length statistics. These were previously a bare tuple expression,
# which computes the values and throws them away when run as a script.
len_seqs = [len(s) for s in sequences]
print('Sequence length mean/std/max:',
      np.mean(len_seqs), np.std(len_seqs), np.max(len_seqs))

# Fixed input length fed to the model; sequences are padded/cut to this size.
MAX_LENGTH = 50

# Reverse lookup (id -> word) for decoding sequences and labelling plots.
id_to_word = {v: k for k, v in word_t.word_index.items()}
print(id_to_word[1])
print([id_to_word[index] for index in sequences[0]])

# Zero-pad at the end ('post') so every row has exactly MAX_LENGTH entries.
padded_docs = pad_sequences(sequences, maxlen=MAX_LENGTH, padding='post')
print(padded_docs[0])
74+
75+
def f1(y_true, y_pred):
    """Batch-wise F1 score expressed with Keras backend ops.

    Precision and recall are both computed from tensors that are clipped to
    [0, 1] and rounded, and are then combined as their harmonic mean.
    ``K.epsilon()`` is added to every denominator to avoid division by zero.

    Args:
        y_true: Ground-truth tensor.
        y_pred: Prediction tensor.

    Returns:
        A scalar tensor holding the F1 value for the batch.
    """

    def _rounded_sum(tensor):
        # Count of entries that round to 1 after clipping into [0, 1].
        return K.sum(K.round(K.clip(tensor, 0, 1)))

    hits = _rounded_sum(y_true * y_pred)  # true positives

    # Precision: how many selected items are relevant.
    precision_value = hits / (_rounded_sum(y_pred) + K.epsilon())
    # Recall: how many relevant items are selected.
    recall_value = hits / (_rounded_sum(y_true) + K.epsilon())

    blended = (precision_value * recall_value) / (
        precision_value + recall_value + K.epsilon()
    )
    return 2 * blended
106+
107+
def get_model():
    """Build and compile the embedding classifier.

    Architecture: ``Embedding`` (``NUM_WORDS`` ids -> 100 dimensions,
    ``MAX_LENGTH`` tokens per input) -> ``Flatten`` -> ``Dense(1)`` with a
    sigmoid activation. Compiled with the Adam optimizer, binary
    cross-entropy loss, and the 'acc' and ``f1`` metrics.

    Returns:
        The compiled ``Sequential`` model.
    """
    # define the model
    model = Sequential()
    model.add(Embedding(NUM_WORDS, 100, input_length=MAX_LENGTH))
    model.add(Flatten())
    model.add(Dense(1, activation='sigmoid'))

    # compile the model
    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['acc', f1])

    # summarize the model; summary() prints on its own and returns None, so
    # wrapping it in print() only added a stray "None" line to the output.
    model.summary()
    return model
121+
122+
model = get_model()

# Hold out 20% of the data for the final evaluation (seeded split); fit()
# additionally sets aside 20% of the training part for validation.
X_train, X_test, y_train, y_test = train_test_split(
    padded_docs, labels, test_size=0.2, random_state=0
)
model.fit(X_train, y_train, epochs=10, validation_split=0.2)

# evaluate() yields the loss followed by the compiled metrics ('acc', f1).
word_loss, word_accuracy, word_f1 = model.evaluate(X_test, y_test, verbose=1)
print('Accuracy: %f, F1: %f' % (word_accuracy * 100, word_f1 * 100))

# Keep the scores keyed by embedding type.
word_score = dict(accuracy=word_accuracy, F1=word_f1)
embeddings_scores = dict(Word=word_score)
137+
def get_embeddings(model):
    """Return the first weight array of the model's first layer.

    The first layer is treated as the embedding layer; the shape of its
    weight array is printed before the array is returned.

    Args:
        model: Object exposing ``layers``, whose first element provides
            ``get_weights()``.

    Returns:
        The first element of ``model.layers[0].get_weights()``.
    """
    weight_matrix = model.layers[0].get_weights()[0]
    print('Embedding Layer shape:', weight_matrix.shape)
    return weight_matrix
145+
# Weight matrix taken from the first layer of the fitted model.
embeddings = get_embeddings(model)
146+
147+
148+
def get_pca(embeddings):
    """Project ``embeddings`` onto two principal components.

    Prints the explained-variance ratio of each component together with
    their total.

    Args:
        embeddings: Array accepted by ``PCA.fit_transform``.

    Returns:
        The transformed array returned by ``PCA(n_components=2).fit_transform``.
    """
    reducer = PCA(n_components=2)
    projected = reducer.fit_transform(embeddings)
    ratios = reducer.explained_variance_ratio_
    print('PCA explained variance ratio:', ratios, 'Total:', sum(ratios))
    return projected
155+
# NOTE: despite its name, `pca` holds the projected coordinates returned by
# get_pca, not a PCA estimator object.
pca = get_pca(embeddings)
156+
157+
def get_top_words(tokenizer, N=50):
    """Return the ``N`` most frequent words known to ``tokenizer``.

    Words are ranked by their value in ``tokenizer.word_counts`` in
    descending order; words with equal counts keep the mapping's iteration
    order because the sort is stable.

    Args:
        tokenizer: Object exposing a ``word_counts`` mapping (word -> count).
        N: Maximum number of words to return.

    Returns:
        A list of at most ``N`` words, most frequent first.
    """
    counts = tokenizer.word_counts
    ranked = sorted(counts, key=counts.get, reverse=True)
    return ranked[:N]
160+
161+
# Default selection (N=50 most frequent words), used to label the first plot.
top_words = get_top_words(word_t)
162+
163+
def plot_embeddings(low_dim_embs, id_to_word, top_words, figsize=(8, 8)):
    """Scatter-plot and label the embeddings of the selected words.

    Opens a new figure with axis ticks hidden. Row ``i`` of ``low_dim_embs``
    is drawn only when ``i`` is a key of ``id_to_word`` and the mapped word
    is contained in ``top_words``.

    Args:
        low_dim_embs: 2-D array indexed as ``[row, :]``; each row unpacks
            into an ``(x, y)`` pair.
        id_to_word: Mapping from row index to word.
        top_words: Collection of words that should be plotted.
        figsize: Figure size forwarded to ``plt.figure``.
    """
    plt.figure(figsize=figsize, dpi=100)
    axes = plt.axes()
    # Hide tick marks on both axes.
    for axis in (axes.yaxis, axes.xaxis):
        axis.set_major_locator(plt.NullLocator())

    for row in range(len(low_dim_embs)):
        # Rows without a vocabulary entry are skipped.
        if row not in id_to_word:
            continue

        x, y = low_dim_embs[row, :]
        word = id_to_word[row]
        if word not in top_words:
            continue

        plt.scatter(x, y, color='b')
        plt.annotate(
            word,
            xy=(x, y),
            xytext=(5, 2),
            textcoords='offset points',
            ha='right',
            va='bottom',
            fontsize=14,
        )
188+
189+
# First figure: the default top-word selection on a large canvas.
plot_embeddings(pca, id_to_word, top_words, figsize=(18, 18))
pyplot.show()

# Second figure: only the 20 most frequent words, default figure size.
top_20_words = get_top_words(word_t, 20)
plot_embeddings(pca, id_to_word, top_20_words)
pyplot.show()

code2vec_experiments.pyproj

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
<Project DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003" ToolsVersion="4.0">
2+
<PropertyGroup>
3+
<Configuration Condition=" '$(Configuration)' == '' ">Debug</Configuration>
4+
<SchemaVersion>2.0</SchemaVersion>
5+
<ProjectGuid>eb5bbce0-818d-4a9c-8af1-d22b91611c3d</ProjectGuid>
6+
<ProjectHome>.</ProjectHome>
7+
<StartupFile>code2vec_experiments.py</StartupFile>
8+
<SearchPath>
9+
</SearchPath>
10+
<WorkingDirectory>.</WorkingDirectory>
11+
<OutputPath>.</OutputPath>
12+
<Name>code2vec_experiments</Name>
13+
<RootNamespace>code2vec_experiments</RootNamespace>
14+
</PropertyGroup>
15+
<PropertyGroup Condition=" '$(Configuration)' == 'Debug' ">
16+
<DebugSymbols>true</DebugSymbols>
17+
<EnableUnmanagedDebugging>false</EnableUnmanagedDebugging>
18+
</PropertyGroup>
19+
<PropertyGroup Condition=" '$(Configuration)' == 'Release' ">
20+
<DebugSymbols>true</DebugSymbols>
21+
<EnableUnmanagedDebugging>false</EnableUnmanagedDebugging>
22+
</PropertyGroup>
23+
<ItemGroup>
24+
<Compile Include="code2vec_experiments.py" />
25+
</ItemGroup>
26+
<ItemGroup>
27+
<Folder Include="data\" />
28+
</ItemGroup>
29+
<ItemGroup>
30+
<Content Include="data\source_code.json" />
31+
</ItemGroup>
32+
<Import Project="$(MSBuildExtensionsPath32)\Microsoft\VisualStudio\v$(VisualStudioVersion)\Python Tools\Microsoft.PythonTools.targets" />
33+
<!-- Uncomment the CoreCompile target to enable the Build command in
34+
Visual Studio and specify your pre- and post-build commands in
35+
the BeforeBuild and AfterBuild targets below. -->
36+
<!--<Target Name="CoreCompile" />-->
37+
<Target Name="BeforeBuild">
38+
</Target>
39+
<Target Name="AfterBuild">
40+
</Target>
41+
</Project>

code2vec_experiments.sln

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
2+
Microsoft Visual Studio Solution File, Format Version 12.00
3+
# Visual Studio Version 16
4+
VisualStudioVersion = 16.0.29411.108
5+
MinimumVisualStudioVersion = 10.0.40219.1
6+
Project("{888888A0-9F3D-457C-B088-3A5042F75D52}") = "code2vec_experiments", "code2vec_experiments.pyproj", "{EB5BBCE0-818D-4A9C-8AF1-D22B91611C3D}"
7+
EndProject
8+
Global
9+
GlobalSection(SolutionConfigurationPlatforms) = preSolution
10+
Debug|Any CPU = Debug|Any CPU
11+
Release|Any CPU = Release|Any CPU
12+
EndGlobalSection
13+
GlobalSection(ProjectConfigurationPlatforms) = postSolution
14+
{EB5BBCE0-818D-4A9C-8AF1-D22B91611C3D}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
15+
{EB5BBCE0-818D-4A9C-8AF1-D22B91611C3D}.Release|Any CPU.ActiveCfg = Release|Any CPU
16+
EndGlobalSection
17+
GlobalSection(SolutionProperties) = preSolution
18+
HideSolutionNode = FALSE
19+
EndGlobalSection
20+
GlobalSection(ExtensibilityGlobals) = postSolution
21+
SolutionGuid = {9275494D-EA79-463A-AE90-E96CCBB75B8F}
22+
EndGlobalSection
23+
EndGlobal

0 commit comments

Comments
 (0)