In [1]:
import pandas as pd
import os
# from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# Load data

In [2]:
df = pd.DataFrame(columns=["text", "project_name"])

In [3]:
def load_data(directory):
#     directory = "github_data/train_set/"

    dfs = []
    for filename in os.listdir(directory):
        if filename.endswith(".txt"):
            f = open(directory + filename, "r", encoding='utf-8')
            Lines = f.readlines()
            f.close()
            temp = pd.DataFrame()
            temp["text"] = Lines
            temp["project_name"] = filename
            temp = temp.drop(temp[temp["text"] == "\n"].index).reset_index(drop=True)
            dfs.append(temp)
    
    data = pd.DataFrame(columns=["text", "project_name"])

    for p in dfs:
        data = pd.concat([data, p], axis=0)
    data = data.reset_index(inplace=False)[["text", "project_name"]]
    data['text'] = data[data.text.str.endswith("\n")]['text'].str[:-2]
    
    return data

In [12]:
def clean_set_indicator(data):
    for s in ["train", "validation", "test"]:
        data.project_name = data.project_name.str.replace(f"_{s}","")
    return data

## TF IDF

In [13]:
data = load_data("github_data/train_set/")
data = clean_set_indicator(data)
corpus = data.groupby("project_name")['text'].apply(lambda x: ' '.join(x)).reset_index()

In [14]:
corpus

Unnamed: 0,project_name,text
0,PaddleHub.txt,import cv import numpy as n from PIL import Im...
1,PySolFC.txt,"def createGame(self, max_rounds=-1, num_de..."
2,building_tool.txt,"import bp from .core import register_core, unr..."
3,espnet.txt,"xs = xs[:, : max(ilens) ys = ys..."
4,horovod.txt,@_cach def nccl_built(verbose=False) for e...
5,jina.txt,"from ...excepts import BadClient, GRPCServerEr..."
6,pytorch_geometric.txt,"def __init__(self, in_channels, out_channe..."


In [6]:
my_corpus = corpus['text'].tolist()

In [16]:
validation_set = load_data("github_data/validation_set/")

validation_set

Unnamed: 0,text,project_name
0,@classmetho,building_tool.txt
1,"def build(cls, context, prop)",building_tool.txt
2,verify_facemaps_for_object(context.object,building_tool.txt
3,me = get_edit_mesh(,building_tool.txt
4,bm = bmesh.from_edit_mesh(me,building_tool.txt
...,...,...
50755,<https://arxiv.org/abs/2003.03123>`_ paper,pytorch_geometric.txt
50756,DimeNet transforms messages based on the a...,pytorch_geometric.txt
50757,rotation-equivariant fashion,pytorch_geometric.txt
50758,.. note:,pytorch_geometric.txt


In [None]:
x_validation = validation_set["text"][:-1]
y_validation = validation_set["project_name"]

In [None]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(my_corpus)


In [None]:
X.toarray()

In [None]:
vectorizer.get_feature_names()

In [None]:
y_train = corpus["project_name"]
y_train

In [None]:
a = X.toarray()
a[0]

In [None]:
x_testcv = vectorizer.transform(x_validation)
x_testcv.toarray()

In [None]:
from sklearn.naive_bayes import MultinomialNB
mnb = MultinomialNB()
mnb.fit(X, y_train)

In [None]:
pred = mnb.predict(x_testcv[0])
pred

In [None]:
y_validation

In [None]:
# data.drop_duplicates()

In [None]:
data[data.duplicated(keep=False)]

In [None]:
data[without['text'] == data['text']]

# Features

In [None]:
data[data['text'].str.contains(r'class .* : ')]

In [None]:
# understand distribution of names between project and understand 
# we want to be able to ask a test sample if it calls a function or class defined only in some or one project.
lst = []
class_df = data[data['text'].str.contains(r'class .*')]

In [32]:
class_df['text'] = class_df['text'].str.strip("class")
class_df['text'] = class_df['text'].str.replace(r"\(.*","")
class_df = class_df.drop_duplicates()

In [22]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(my_corpus)


In [23]:
X.toarray()

array([[0.00057629, 0.0010354 , 0.0036239 , ..., 0.00015592, 0.00046775,
        0.00015592],
       [0.        , 0.        , 0.00203431, ..., 0.        , 0.        ,
        0.        ],
       [0.00185876, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.00166839, 0.00168612, 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [24]:
vectorizer.get_feature_names()

['00',
 '000',
 '000000',
 '0000001e',
 '00001',
 '000086',
 '00008b',
 '0000cc',
 '0000ff',
 '0001',
 '00025',
 '0005',
 '001',
 '002',
 '003',
 '0036',
 '0038',
 '004',
 '00481',
 '005',
 '0059',
 '006',
 '00653',
 '007',
 '007843',
 '007f00',
 '008',
 '008080',
 '008200',
 '008285',
 '008286',
 '0082df',
 '009',
 '00982',
 '00aaff20',
 '00cc00',
 '00dc28',
 '00ff00',
 '00ffff',
 '01',
 '010',
 '0123456789abcdef0123',
 '01287',
 '0130',
 '0146',
 '0151134457776365',
 '02',
 '02111',
 '02413',
 '02850',
 '02901',
 '02d',
 '03',
 '03123',
 '03125',
 '0314',
 '03167',
 '0338',
 '03536',
 '03762',
 '03894',
 '03d',
 '03nxc2m',
 '04',
 '04368',
 '04407',
 '0478',
 '0486',
 '05',
 '05178',
 '05493',
 '05530',
 '0585',
 '0588',
 '05997',
 '05d',
 '0625',
 '06354',
 '06391',
 '065535',
 '06736',
 '072169',
 '07308',
 '07503',
 '0765',
 '0782f5',
 '0797',
 '07979',
 '08',
 '08022',
 '08082',
 '08246b',
 '08402',
 '08566',
 '08804',
 '08895',
 '09263',
 '0948',
 '0999',
 '0a5f89',
 '0aaaaaaadb

In [25]:
y_train = corpus["project_name"]
y_train

0            PaddleHub.txt
1              PySolFC.txt
2        building_tool.txt
3               espnet.txt
4              horovod.txt
5                 jina.txt
6    pytorch_geometric.txt
Name: project_name, dtype: object

In [26]:
a = X.toarray()
a[0]

array([0.00057629, 0.0010354 , 0.0036239 , ..., 0.00015592, 0.00046775,
       0.00015592])

In [35]:
x_testcv = vectorizer.transform(x_validation)
x_testcv.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [36]:
from sklearn.naive_bayes import MultinomialNB
mnb = MultinomialNB()
mnb.fit(X, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [37]:
pred = mnb.predict(x_testcv[0])
pred

array(['building_tool.txt'], dtype='<U21')

In [None]:
y_validation

In [None]:
# data.drop_duplicates()

In [None]:
data[data.duplicated(keep=False)]

In [None]:
data[without['text'] == data['text']]

# Features

In [None]:
data[data['text'].str.contains(r'class .* : ')]

In [None]:
# understand distribution of names between project and understand 
# we want to be able to ask a test sample if it calls a function or class defined only in some or one project.
lst = []
class_df = data[data['text'].str.contains(r'class .*')]

In [None]:
class_df['text'] = class_df['text'].str.strip("class")
class_df['text'] = class_df['text'].str.replace(r"\(.*","")
class_df = class_df.drop_duplicates()

In [None]:
class_df

In [None]:
class_df.groupby('project_name').nunique()

In [None]:
class_df.groupby('text').nunique()

## project name

In [None]:
#is the name of the project embedded in text
data[data["text"].str.contains("Jina")]

## camel case and style features

In [None]:
# length of variable names, use of unique features (lambda, arrows and stuff), flags, type of filkes being use
# letgh of row of code (from start to end), number of inline notes (#)