import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers.embeddings import Embedding
from keras import backend as K
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA

font = {'weight': 'bold', 'size': 20}
plt.rc('font', **font)

dataframe = pd.read_json('data/source_code.json', lines=True)
print(dataframe.head(5))
print('{:,}'.format(len(dataframe)))

# print(dataframe.correct.value_counts())

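# Each record is expected to provide at least a 'source' field (the raw submission text)
# and a 'submission_id' field, which get_docs_and_labels below turns into documents and labels.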
def remove_comments(text):
    # Drop line comments up to the newline (assuming '#'-style comments here).
    return re.sub(re.compile('#.*?\n'), '', text)


def get_docs_and_labels(df):
    _docs = []
    _labels = []
    for index in df.index:
        code = remove_comments(df.at[index, 'source'])
        _docs.append(code)
        label = int(df.at[index, 'submission_id'])
        _labels.append(label)
    return _docs, _labels

docs, labels = get_docs_and_labels(dataframe)
print('{:,}'.format(len(docs)))

NUM_WORDS = 2000

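# Tokenizer settings: num_words caps the vocabulary used for sequence conversion at the
# most frequent tokens, filters strips only tabs and newlines (so operators and punctuation
# stay part of the tokens), lower=True lowercases, and tokens are split on single spaces.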
def get_tokenizer():
    return Tokenizer(num_words=NUM_WORDS,
                     filters='\t\n',
                     lower=True,
                     split=' ',
                     char_level=False)


word_t = get_tokenizer()
word_t.fit_on_texts(docs)
print(word_t.word_counts['if'])                            # occurrences of 'if' across all submissions
print('Number docs: {:,}'.format(word_t.document_count))   # total number of submissions
print(word_t.word_index['if'])                             # integer id assigned to 'if'
print(word_t.word_docs['if'])                              # number of submissions containing 'if'


sequences = word_t.texts_to_sequences(docs)
print(sequences[0])
len_seqs = [len(s) for s in sequences]
print('Sequence length mean / std / max:', np.mean(len_seqs), np.std(len_seqs), np.max(len_seqs))
MAX_LENGTH = 50
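# Quick check on the cut-off (assumption: most submissions fit under MAX_LENGTH):
# fraction of sequences that pad_sequences will truncate.
print('Truncated: {:.1%}'.format(np.mean([length > MAX_LENGTH for length in len_seqs])))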


id_to_word = {v: k for k, v in word_t.word_index.items()}
print(id_to_word[1])
print([id_to_word[index] for index in sequences[0]])

# Pad every sequence to MAX_LENGTH tokens; padding='post' appends zeros after the code tokens.
padded_docs = pad_sequences(sequences, maxlen=MAX_LENGTH, padding='post')
print(padded_docs[0])

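# Batch-wise F1 metric; Keras 2 no longer ships precision/recall/F1 as built-in metrics,
# so a custom version is passed to model.compile below.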
def f1(y_true, y_pred):

    def recall(y_true, y_pred):
        """Recall metric.

        Only computes a batch-wise average of recall.

        Computes the recall, a metric for multi-label classification of
        how many relevant items are selected.
        """
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
        recall = true_positives / (possible_positives + K.epsilon())
        return recall

    def precision(y_true, y_pred):
        """Precision metric.

        Only computes a batch-wise average of precision.

        Computes the precision, a metric for multi-label classification of
        how many selected items are relevant.
        """
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
        precision = true_positives / (predicted_positives + K.epsilon())
        return precision

    precision = precision(y_true, y_pred)
    recall = recall(y_true, y_pred)
    return 2 * ((precision * recall) / (precision + recall + K.epsilon()))

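# Baseline classifier: each token id is mapped to a 100-dimensional vector, the
# 50 x 100 embedding output is flattened, and a single sigmoid unit produces the
# binary prediction.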
def get_model():
    # define the model
    model = Sequential()
    model.add(Embedding(NUM_WORDS, 100, input_length=MAX_LENGTH))
    model.add(Flatten())
    model.add(Dense(1, activation='sigmoid'))
    # compile the model
    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['acc', f1])
    # summarize the model
    print(model.summary())
    return model


model = get_model()

X_train, X_test, y_train, y_test = train_test_split(padded_docs, labels, test_size=0.2, random_state=0)
model.fit(X_train,
          y_train,
          epochs=10,
          validation_split=0.2)
# evaluate returns the metrics in the order they were compiled: loss, accuracy, f1
word_loss, word_accuracy, word_f1 = model.evaluate(X_test, y_test, verbose=1)
print('Accuracy: %f, F1: %f' % (word_accuracy * 100, word_f1 * 100))

word_score = {
    'accuracy': word_accuracy,
    'F1': word_f1,
}
embeddings_scores = {'Word': word_score}


def get_embeddings(model):
    # weights of the Embedding layer: one 100-dimensional row per token id
    embedding_layer = model.layers[0]
    embeddings = embedding_layer.get_weights()[0]
    print('Embedding Layer shape:', embeddings.shape)
    return embeddings


embeddings = get_embeddings(model)
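# Spot-check one learned vector (assumes 'if' ranks within the top NUM_WORDS tokens,
# which is expected given its frequency in source code); print the first few components.
print(embeddings[word_t.word_index['if']][:5])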


def get_pca(embeddings):
    # project the 100-dimensional embeddings onto their first two principal components
    pca = PCA(n_components=2)
    principal_components = pca.fit_transform(embeddings)
    print('PCA explained variance ratio:', pca.explained_variance_ratio_,
          'Total:', sum(pca.explained_variance_ratio_))
    return principal_components


pca = get_pca(embeddings)


def get_top_words(tokenizer, N=50):
    # the N most frequent tokens, by raw occurrence count
    return [word for word, occurrences in
            sorted(tokenizer.word_counts.items(), key=lambda t: t[1], reverse=True)[:N]]


top_words = get_top_words(word_t)
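# Quick look at what dominates the vocabulary.
print(top_words[:10])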


def plot_embeddings(low_dim_embs, id_to_word, top_words, figsize=(8, 8)):
    plt.figure(figsize=figsize, dpi=100)
    ax = plt.axes()
    ax.yaxis.set_major_locator(plt.NullLocator())
    ax.xaxis.set_major_locator(plt.NullLocator())
    i = 0
    while i < len(low_dim_embs):
        # index 0 is reserved for padding, so only ids present in word_index are plotted
        if i in id_to_word:
            x, y = low_dim_embs[i, :]
            word = id_to_word[i]
            if word in top_words:
                plt.scatter(x, y, color='b')
                plt.annotate(word,
                             xy=(x, y),
                             xytext=(5, 2),
                             textcoords='offset points',
                             ha='right',
                             va='bottom',
                             fontsize=14)
        i += 1

plot_embeddings(pca, id_to_word, top_words, figsize=(18, 18))
plt.show()
plot_embeddings(pca, id_to_word, get_top_words(word_t, 20))
plt.show()