In [109]:
import os
import pandas as pd
import numpy as np
import random

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression

## Text Dataset
Terry Pratchett + other books

In [93]:
text_examples = set()

pratchett_plus_path = "/home/alex/Proga/proj/pratchett"

def collect_text(path):
    for file in os.listdir(path):
        cur_path = os.path.join(path, file)
        if os.path.isdir(cur_path):
            collect_code(cur_path)
        elif os.path.isfile(cur_path):
            if file.endswith(".txt") or file.endswith(".fb2"):
                comment = 0
                for line in open(cur_path, 'r'):
                    ln = line.strip()
                    if not len(ln):
                        continue
                    i = 0
                    while i < len(ln):
                        chunk_size = random.randint(10, 50)
                        text_examples.add(ln[i : min(len(ln), i + chunk_size)])                        
                        i += chunk_size

collect_text(pratchett_plus_path)    

In [94]:
pd_text = pd.DataFrame(data={
    'marker': np.ones(len(text_examples), int), 'line': list(text_examples),
})
pd_text.shape

(496422, 2)

## Code Dataset
https://github.com/pytorch/pytorch

In [86]:
code_examples = set()

pytorch_path = "/home/alex/Proga/proj/pytorch"

def collect_code(path):
    for file in os.listdir(path):
        cur_path = os.path.join(path, file)
        if os.path.isdir(cur_path):
            collect_code(cur_path)
        elif os.path.isfile(cur_path):
            if file.endswith(".cpp"):
                comment = 0
                for line in open(cur_path, 'r'):
                    ln = line.strip()
                    if not len(ln):
                        continue
                    if ln.find("/*") != -1:
                        comment = 1
                    if ln.find("*/") != -1:
                        comment = 0
                    if comment == 1 or ln.find("//") != -1:
                        continue
                    code_examples.add(ln)
                
collect_code(pytorch_path)    

In [87]:
pd_code = pd.DataFrame(data={
    'marker': np.zeros(len(code_examples), int), 'line': list(code_examples),
})
pd_code.shape

(274334, 2)

## Combined Dataset

In [95]:
data = pd.concat([pd_code, pd_text], ignore_index=True)
data = data.sample(frac=1).reset_index(drop=True)
data.to_csv('data.csv', index=False)
data.shape

(770756, 2)

In [96]:
data.head(10)

Unnamed: 0,marker,line
0,0,return max_ws_size;
1,0,void IRPrinter::visit(VarPtr v) {
2,0,480709
3,1,"A wise man feareth, and departeth from evil: but"
4,1,"before him, and causeth the earth and th"
5,1,"he did so, and giving it a"
6,0,"dropout_p_(dropout_p),"
7,0,using AllocationInfoPtr = AllocationUseDefInfo*;
8,1,ng.And this evening was a
9,1,lm


## Make vectors from strings

In [101]:
cv = CountVectorizer()
cnts = cv.fit_transform(data['line'])

In [103]:
X_tr, X_te, Y_tr, Y_te = train_test_split(cnts, data['marker'], test_size=0.2, random_state=533)

In [112]:
model_NB = MultinomialNB().fit(X_tr, Y_tr)
model_LR = LogisticRegression(max_iter=1000, multi_class='ovr').fit(X_tr, Y_tr)

In [111]:
model_NB.score(X_te, Y_te)

0.9649826145622502

In [113]:
model_LR.score(X_te, Y_te)

0.9807916861279776