In [1]:
# data manipulation
import pandas as pd
import numpy as np
import itertools

# viz and stats
import matplotlib.pyplot as plt
import seaborn as sns

# nlp
import re
import unicodedata
import nltk
import nltk.sentiment
from wordcloud import WordCloud
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

#split scale and model
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn import naive_bayes as nb

# local imports
import acquire as a
import prepare as p

In [2]:
# Acquire the READMEs and run them through the project's prepare step.
df = a.get_readmes()
df = p.prep_readmes(df)
# Keep an independent copy: a plain assignment would only create a second
# name for the same DataFrame, so in-place edits to `df` would also change
# the "backup".
df_backup = df.copy()

# Modeling

In [3]:
# Restore the prepared frame from the backup; copy so the backup itself can
# never be modified through `df`.
df = df_backup.copy()

In [4]:
df = df.assign(length = df['lemmatized'].str.len())
df

Unnamed: 0,repo,language,readme_contents,clean,lemmatized,top3other,length
0,Hardtack/HTProgressHUD,Objective-C,# HTProgressHUD\n\n[![No Maintenance Intended]...,htprogresshud maintenance intended http unmain...,htprogresshud maintenance intended http unmain...,Objective-C,3416
1,raycmorgan/Mu,JavaScript,"# Mu - a fast, streaming Node.js Mustache engi...",mu fast streaming node js mustache engine warn...,mu fast streaming node j mustache engine warni...,JavaScript,2168
2,rgeo/rgeo,Ruby,## RGeo\n\n[![Gem Version](https://badge.fury....,rgeo gem version https badge fury io rb rgeo s...,rgeo gem version http badge fury io rb rgeo sv...,other,4963
3,johnlui/AutoLayout,Swift,# Auto Layout 秘境\n\n### 1.《Auto Layout 使用心得》系列...,auto layout auto layout fir im keynote auto la...,auto layout auto layout fir im keynote auto la...,other,711
4,evnaz/ENSwiftSideMenu,Swift,# ENSwiftSideMenu\n\nA lightweight flyover sid...,enswiftsidemenu lightweight flyover side menu ...,enswiftsidemenu lightweight flyover side menu ...,other,1844
...,...,...,...,...,...,...,...
539,RealHacker/leetcode-solutions,Python,# leetcode-solutions\nRepo for all leetcode pr...,leetcode solutions repo leetcode problems solv...,leetcode solution repo leetcode problem solved...,other,128
540,Patrick-Kladek/CocoaDebugKit,Objective-C,CocoaDebugKit\n============\n[![Twitter: @Patr...,cocoadebugkit twitter patrickkladek https img ...,cocoadebugkit twitter patrickkladek http img s...,Objective-C,2259
541,gophercon/2016-talks,JavaScript,Slides for GopherCon 2016 regular program and ...,slides gophercon regular program lightning tal...,slide gophercon regular program lightning talk...,JavaScript,116
542,dinocore1/DevsmartLib-Android,Java,\n\nThe MIT License\n\nCopyright (c) 2011 Paul...,mit license copyright c paul soucy pauldev sma...,mit license copyright c paul soucy pauldev sma...,Java,958


In [5]:
train, val, test = p.split_data(df,'top3other')
train.shape,val.shape,test.shape

((326, 7), (109, 7), (109, 7))

In [6]:
# build a bag-of-words vectorizer with default settings
cv = CountVectorizer()

# learn the vocabulary from the training READMEs and, in the same call,
# turn them into a document-term count matrix
bag_of_words_cv = cv.fit_transform(train.lemmatized)

In [7]:
bag_of_words_cv.todense()

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]])

In [8]:
cv.get_feature_names_out()

array(['aa', 'aaadffead', 'aac', ..., 'zzhc', 'zzrqqoselistplfecedeee',
       'zzz'], dtype=object)

In [9]:
bow = pd.DataFrame(bag_of_words_cv.todense(),columns=cv.get_feature_names_out())
bow

Unnamed: 0,aa,aaadffead,aac,aacde,aade,aaduino,aae,aaeaacbcbbac,aallfredo,aanu,...,zshrc,zsja,zsoltk,zt,zykin,zz,zzarcon,zzhc,zzrqqoselistplfecedeee,zzz
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
321,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
322,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
323,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
324,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
# Term frequency: each document's counts divided by that document's total count.
# DataFrame.div with axis=0 divides every row by its own row sum in one
# vectorized step instead of calling a Python lambda once per row.
bow_tf = bow.div(bow.sum(axis=1), axis=0)
bow_tf

Unnamed: 0,aa,aaadffead,aac,aacde,aade,aaduino,aae,aaeaacbcbbac,aallfredo,aanu,...,zshrc,zsja,zsoltk,zt,zykin,zz,zzarcon,zzhc,zzrqqoselistplfecedeee,zzz
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
321,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
322,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
323,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
324,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
# Augmented term frequency: term frequency rescaled so that the most frequent
# term of each document gets the value 1.
# The per-document term frequency is computed once and reused for the row maximum.
tf_per_doc = bow.div(bow.sum(axis=1), axis=0)
bow_atf = tf_per_doc.div(tf_per_doc.max(axis=1), axis=0)
bow_atf

Unnamed: 0,aa,aaadffead,aac,aacde,aade,aaduino,aae,aaeaacbcbbac,aallfredo,aanu,...,zshrc,zsja,zsoltk,zt,zykin,zz,zzarcon,zzhc,zzrqqoselistplfecedeee,zzz
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
321,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
322,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
323,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
324,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
#make it
tfidf = TfidfVectorizer()

#fit it/use it
bag_of_words_tfidf = tfidf.fit_transform(train.lemmatized)

In [13]:
bag_of_words_tfidf.todense()

matrix([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]])

In [14]:
bow_tfidf = pd.DataFrame(bag_of_words_tfidf.todense(),columns=tfidf.get_feature_names_out())
bow_tfidf

Unnamed: 0,aa,aaadffead,aac,aacde,aade,aaduino,aae,aaeaacbcbbac,aallfredo,aanu,...,zshrc,zsja,zsoltk,zt,zykin,zz,zzarcon,zzhc,zzrqqoselistplfecedeee,zzz
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
321,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
322,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
323,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
324,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
X_train = train[['lemmatized','length']]
y_train = train.top3other
X_val = val[['lemmatized','length']]
y_val = val.top3other
X_test = test[['lemmatized','length']]
y_test = test.top3other

In [16]:
X_train.head()

Unnamed: 0,lemmatized,length
407,swampdragon looking someone take longer time s...,3294
414,p alignleft hrefhttps www npmjs com package do...,5496
109,asmediafocusmanager asmediafocusmanager give a...,6448
238,littlebox super simple use cs icon littlebox h...,1193
318,img srchttps github com downloads sstephenson ...,678


In [17]:
y_train.head()

407     JavaScript
414          other
109    Objective-C
238          other
318          other
Name: top3other, dtype: object

In [18]:
# baseline: share of the most common class in train
# value_counts() sorts by count (descending), so the first row is the majority
# class. The result is indexed by class label, so take the first row by
# position with .iloc rather than with [0].
y_train.value_counts().iloc[0]/y_train.value_counts().sum()

0.4233128834355828

In [19]:
# same baseline via normalize=True; .iloc[0] is the positional first (largest) share
y_train.value_counts(normalize=True).iloc[0]

0.4233128834355828

In [20]:
#make my bag of words cv
cv = CountVectorizer()
Xtr_bow_cv = cv.fit_transform(X_train.lemmatized)
Xv_bow_cv = cv.transform(X_val.lemmatized)
Xt_bow_cv = cv.transform(X_test.lemmatized)

#make my bag of words tfidf
tfidf = TfidfVectorizer()
Xtr_bow_tfidf = tfidf.fit_transform(X_train.lemmatized)
Xv_bow_tfidf = tfidf.transform(X_val.lemmatized)
Xt_bow_tfidf = tfidf.transform(X_test.lemmatized)

In [21]:
type(X_train.length)

pandas.core.series.Series

In [22]:
# type(Xtr_bow_cv)
# Xtr_bow_cv
Xtr_cv = pd.DataFrame(Xtr_bow_cv.todense(),columns=cv.get_feature_names_out(),index=X_train.index)
Xv_cv = pd.DataFrame(Xv_bow_cv.todense(),columns=cv.get_feature_names_out(),index=X_val.index)
Xt_cv = pd.DataFrame(Xt_bow_cv.todense(),columns=cv.get_feature_names_out(),index=X_test.index)
Xtr_cv

Unnamed: 0,aa,aaadffead,aac,aacde,aade,aaduino,aae,aaeaacbcbbac,aallfredo,aanu,...,zshrc,zsja,zsoltk,zt,zykin,zz,zzarcon,zzhc,zzrqqoselistplfecedeee,zzz
407,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
414,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
109,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
238,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
318,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
236,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
372,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
265,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
245,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [23]:
Xtr_tfidf = pd.DataFrame(Xtr_bow_tfidf.todense(),columns=tfidf.get_feature_names_out(),index=X_train.index)
Xv_tfidf = pd.DataFrame(Xv_bow_tfidf.todense(),columns=tfidf.get_feature_names_out(),index=X_val.index)
Xt_tfidf = pd.DataFrame(Xt_bow_tfidf.todense(),columns=tfidf.get_feature_names_out(),index=X_test.index)
Xtr_tfidf

Unnamed: 0,aa,aaadffead,aac,aacde,aade,aaduino,aae,aaeaacbcbbac,aallfredo,aanu,...,zshrc,zsja,zsoltk,zt,zykin,zz,zzarcon,zzhc,zzrqqoselistplfecedeee,zzz
407,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
414,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
109,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
238,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
318,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
236,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
372,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
265,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
245,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [24]:
# Xtr_cvl = pd.concat([Xtr_cv,X_train.length],axis=1)
# Xv_cvl = pd.concat([Xv_cv,X_val.length],axis=1)
# Xt_cvl = pd.concat([Xt_cv,X_test.length],axis=1)
# Xtr_cvl

In [25]:
# Xtr_tfidf_l = pd.concat([Xtr_tfidf,X_train.length],axis=1)
# Xv_tfidf_l = pd.concat([Xv_tfidf,X_val.length],axis=1)
# Xt_tfidf_l = pd.concat([Xt_tfidf,X_test.length],axis=1)
# Xtr_tfidf_l

In [26]:
# def mm(train,validate,test,scale=None):
#     if scale is None:
#         scale = train.columns.to_list()
#     mm_scale = MinMaxScaler()
#     Xtr,Xv,Xt = train[scale],validate[scale],test[scale]
#     Xtr = pd.DataFrame(mm_scale.fit_transform(train[scale]),train[scale].index,train[scale].columns)
#     Xv = pd.DataFrame(mm_scale.transform(validate[scale]),validate[scale].index,validate[scale].columns)
#     Xt = pd.DataFrame(mm_scale.transform(test[scale]),test[scale].index,test[scale].columns)
#     for col in scale:
#         Xtr = Xtr.rename(columns={col: f'{col}_s'})
#         Xv = Xv.rename(columns={col: f'{col}_s'})
#         Xt = Xt.rename(columns={col: f'{col}_s'})
#     return Xtr, Xv, Xt

In [27]:
# Xtr_cvl_s,Xv_cvl_s,Xt_cvl_s = mm(Xtr_cvl,Xv_cvl,Xt_cvl)
# Xtr_cvl_s

In [28]:
# Xtr_tfidf_l_s,Xv_tfidf_l_s,Xt_tfidf_l_s = mm(Xtr_tfidf_l,Xv_tfidf_l,Xt_tfidf_l)
# Xtr_tfidf_l_s

In [29]:
# DecisionTreeClassifier
# RandomForestClassifier
# KNeighborsClassifier
# LogisticRegression
# classification_report, confusion_matrix, ConfusionMatrixDisplay
# nb.ComplementNB
# nb.MultinomialNB

In [30]:
def class_models(Xtr,ytr,Xv,yv):
    """Fit a small grid of classifiers and tabulate train/validation accuracy.

    Parameters
    ----------
    Xtr, ytr
        Training features and labels; every model is fitted on these.
    Xv, yv
        Validation features and labels; only used for scoring.

    Returns
    -------
    pandas.DataFrame
        One row per entry with the columns ``model``, ``params``, ``tr_acc``
        and ``v_acc``. The first row (``model == 'bl'``) is the baseline: its
        ``tr_acc`` is the share of the most frequent class in ``ytr`` and its
        ``v_acc`` is the placeholder string ``'?'``. The remaining rows are,
        in order, LogisticRegression over six values of ``C``, then
        ComplementNB and MultinomialNB over ``alpha`` in ``np.arange(.1,.6,.1)``.

    Notes
    -----
    Earlier sweeps over DecisionTree (max_depth 1-20), RandomForest
    (min_samples_leaf 1-20 x max_depth 1-20) and KNN (n_neighbors 1-20 x
    weights uniform/distance) were tried and judged "not that good", so they
    are no longer run here.
    """
    def _fit_and_score(name, params, estimator):
        # fit on train, then record accuracy on train and validation
        estimator.fit(Xtr, ytr)
        return {
            'model': name,
            'params': params,
            'tr_acc': estimator.score(Xtr, ytr),
            'v_acc': estimator.score(Xv, yv),
        }

    # baseline: proportion of the majority class in train.
    # value_counts() is indexed by class label, so the first (largest) entry is
    # taken by position; [0] would be a label lookup for integer-coded classes.
    pred_mean = ytr.value_counts(normalize=True).iloc[0]
    metrics = [{
        'model':'bl',
        'params':'None',
        'tr_acc':pred_mean,
        'v_acc':'?',
    }]

    # cycle through C for logistic regression (class_weight fixed to 'balanced')
    for c in [.01,.1,1,10,100,1000]:
        lr = LogisticRegression(C=c,class_weight='balanced',random_state=42,max_iter=500)
        metrics.append(_fit_and_score('LogReg', f"C={c},class_weight='balanced',max_iter=500", lr))

    # cycle through alpha for complement naive bayes
    # (loop variable is named `alpha` so it does not shadow the `a` module alias)
    for alpha in np.arange(.1,.6,.1):
        metrics.append(_fit_and_score('CNB', f'alpha={alpha}', nb.ComplementNB(alpha=alpha)))

    # cycle through alpha for multinomial naive bayes
    for alpha in np.arange(.1,.6,.1):
        metrics.append(_fit_and_score('MNB', f'alpha={alpha}', nb.MultinomialNB(alpha=alpha)))

    # build the table once, after every model has been recorded
    return pd.DataFrame(metrics)

In [31]:
# README length did not seem to improve the models,
# so it is not used as a modeling feature from here on.

### Add bigrams and trigrams

In [32]:
X_train.head()

Unnamed: 0,lemmatized,length
407,swampdragon looking someone take longer time s...,3294
414,p alignleft hrefhttps www npmjs com package do...,5496
109,asmediafocusmanager asmediafocusmanager give a...,6448
238,littlebox super simple use cs icon littlebox h...,1193
318,img srchttps github com downloads sstephenson ...,678


In [33]:
# Count-based bag of words over unigrams, bigrams and trigrams (ngram_range=(1,3)).
# The token pattern \b\w+\b matches runs of one or more word characters, so
# one-character tokens are kept as well.
# The vocabulary is learned from train only; validate and test are transformed with it.
cv = CountVectorizer(ngram_range=(1,3),token_pattern=r'(?u)\b\w+\b')
Xtr_bow_cv = cv.fit_transform(X_train.lemmatized)
Xv_bow_cv = cv.transform(X_val.lemmatized)
Xt_bow_cv = cv.transform(X_test.lemmatized)

# Same n-gram range and token pattern, but TF-IDF weighted instead of raw counts.
# Again fitted on train only and then applied to validate and test.
tfidf = TfidfVectorizer(ngram_range=(1,3),token_pattern=r'(?u)\b\w+\b')
Xtr_bow_tfidf = tfidf.fit_transform(X_train.lemmatized)
Xv_bow_tfidf = tfidf.transform(X_val.lemmatized)
Xt_bow_tfidf = tfidf.transform(X_test.lemmatized)

In [34]:
Xtr_cv = pd.DataFrame(Xtr_bow_cv.todense(),columns=cv.get_feature_names_out(),index=X_train.index)
Xv_cv = pd.DataFrame(Xv_bow_cv.todense(),columns=cv.get_feature_names_out(),index=X_val.index)
Xt_cv = pd.DataFrame(Xt_bow_cv.todense(),columns=cv.get_feature_names_out(),index=X_test.index)
Xtr_cv

Unnamed: 0,aa,aa battery,aa battery keystone,aa single,aa single column,aaadffead,aaadffead fetch,aaadffead fetch log,aaadffead tz,aaadffead tz app,...,zzz gremlin,zzz gremlin g,zzz http,zzz http github,zzz python,zzz python msimplehttpserver,zzz start,zzz start web,zzz xposed,zzz xposed license
407,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
414,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
109,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
238,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
318,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
236,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
372,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
265,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
245,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [35]:
Xtr_tfidf = pd.DataFrame(Xtr_bow_tfidf.todense(),columns=tfidf.get_feature_names_out(),index=X_train.index)
Xv_tfidf = pd.DataFrame(Xv_bow_tfidf.todense(),columns=tfidf.get_feature_names_out(),index=X_val.index)
Xt_tfidf = pd.DataFrame(Xt_bow_tfidf.todense(),columns=tfidf.get_feature_names_out(),index=X_test.index)
Xtr_tfidf

Unnamed: 0,aa,aa battery,aa battery keystone,aa single,aa single column,aaadffead,aaadffead fetch,aaadffead fetch log,aaadffead tz,aaadffead tz app,...,zzz gremlin,zzz gremlin g,zzz http,zzz http github,zzz python,zzz python msimplehttpserver,zzz start,zzz start web,zzz xposed,zzz xposed license
407,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
414,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
109,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
238,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
318,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
236,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
372,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
265,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
245,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [36]:
# log reg C=0.1,class_weight='balanced',max_iter=500
# CNB alpha=0.5
# MNB alpha=0.1
# cv_metrics = class_models(Xtr_cv,y_train,Xv_cv,y_val)
# cv_metrics

In [37]:
# log reg C=0.1,class_weight='balanced',max_iter=500
# CNB alpha=0.1
# MNB alpha=0.1
# tfidf_metrics = class_models(Xtr_tfidf,y_train,Xv_tfidf,y_val)
# tfidf_metrics

In [38]:
import lugo_explore as e

In [39]:
javascript, java, objective_c, other, all_words = e.count_unique_words_by_language(train)

JavaScript unique words: 7580
Java unique words: 4232
Objective-C unique words: 4521
Other unique words: 10504
All unique words: 19470


In [None]:
def get_ngrams_by_language(train):
    """Collect the whitespace-separated words of the READMEs, per language label.

    Parameters
    ----------
    train : pandas.DataFrame
        Must provide a ``top3other`` column (language label) and a
        ``lemmatized`` column (text to split).

    Returns
    -------
    tuple of five lists
        ``(javascript, java, objective_c, other, all_words)``: the words of
        the rows labelled 'JavaScript', 'Java', 'Objective-C' and 'other',
        followed by the words of every row. Duplicates are kept and words
        appear in row order.
    """
    def _split_rows(texts):
        # flatten: every row contributes its words, in order
        tokens = []
        for text in texts:
            tokens.extend(text.split())
        return tokens

    labels = ('JavaScript', 'Java', 'Objective-C', 'other')
    javascript, java, objective_c, other = (
        _split_rows(train.loc[train.top3other == label, 'lemmatized'])
        for label in labels
    )
    all_words = _split_rows(train['lemmatized'])
    return javascript, java, objective_c, other, all_words

In [68]:
def analyze_unique_words(*args, n=1):
    """Count the n-grams of each word list and keep those not shared by all lists.

    Parameters
    ----------
    *args : iterables of tokens
        One token sequence per group (for example one per language).
    n : int, keyword-only, default 1
        Size of the n-grams to count. The default of 1 counts single words.

    Returns
    -------
    dict
        Maps the position of each argument to ``{ngram_key: count}``. A key is
        the n-gram's tokens joined by single spaces plus one trailing space
        (callers strip it). An n-gram is removed only when it occurs in
        *every* list; n-grams shared by some but not all lists are kept for
        each list that contains them. Keys appear in order of first occurrence.
        With no arguments an empty dict is returned.

    Raises
    ------
    ValueError
        If ``n`` is smaller than 1.
    """
    # local import keeps the cell runnable on its own
    from collections import Counter

    if n < 1:
        raise ValueError(f'n must be a positive integer, got {n}')
    # set.intersection() needs at least one set to work with
    if not args:
        return {}

    # Count n-grams for each list of words
    ngram_dicts = {}
    for i, words in enumerate(args):
        tokens = [str(word) for word in words]
        # zip over n shifted views of the tokens yields consecutive n-grams
        grams = zip(*(tokens[start:] for start in range(n)))
        ngram_dicts[i] = dict(Counter(f"{' '.join(gram)} " for gram in grams))

    # n-grams that occur in every single list
    common_words = set.intersection(*(set(counts) for counts in ngram_dicts.values()))

    # keep, per list, everything that is not common to all lists
    return {
        i: {key: count for key, count in counts.items() if key not in common_words}
        for i, counts in ngram_dicts.items()
    }

In [74]:
# Run the analysis once and reuse the result for all four word lists.
unique_by_language = analyze_unique_words(javascript, java, objective_c, other)
f = [list(unique_by_language[i].keys()) for i in range(0, 4)]
# Pool the keys of every language, drop the trailing space the keys carry,
# and de-duplicate. The per-language lists in `f` are left unmodified.
feat = list({word.strip() for words in f for word in words})
feat[:4]

['clientapplications',
 'bmmrecordentityprimaryattributekey',
 'copyfish',
 'setadapteranimatoradapter']

In [65]:
# Collect the not-shared-by-all n-grams for n = 1, 2, 3 into one flat list.
# NOTE(review): this needs analyze_unique_words to accept an `n` keyword.
features = []
for n in range(1,4):
    # one analysis run per n-gram size, reused for all four word lists
    unique_by_language = analyze_unique_words(javascript, java, objective_c, other, n=n)
    for i in range(0, 4):
        # keys carry a trailing space; strip it before storing.
        # Only `features` is written here, so the notebook-level `feat` list is not overwritten.
        features.extend(word.strip() for word in unique_by_language[i])
features

['grabox',
 'couchbase',
 'bootstap',
 'addthemepage',
 'aoyxeze',
 'juejin',
 'idealbankelement',
 'lukeed',
 'callcontractfunction',
 'invader',
 'seizing',
 'constructorstring',
 'feedbrochromerss',
 'rightafter',
 'recordof',
 'ccbbddaaddd',
 'holowaychuk',
 'cleaner',
 'mapitemtext',
 'lookupcharset',
 'codetilt',
 'outputn',
 'linkutmmediumlinkutmcampaignshare',
 'eaedabdafadfeb',
 'excluderequirecss',
 'lighter',
 'unregisterelement',
 'codekeepautoplaybindings',
 'resized',
 'probability',
 'decodememorawdata',
 'shaking',
 'deep',
 'axios',
 'slidrone',
 'cloning',
 'scopecoloption',
 'humanistic',
 'logbundle',
 'operating',
 'roundaboutoptions',
 'animatebearingtofocus',
 'transpiler',
 'manifest',
 'netfront',
 'uninstalling',
 'wanstrath',
 'composable',
 'pwc',
 'yuichi',
 'ondomready',
 'socketioflashsocket',
 'determines',
 'deleteapinode',
 'millermedeiros',
 'horrible',
 'subbhikaru',
 'sidwood',
 'gaearon',
 'grouped',
 'landfunctor',
 'tpl',
 'githubb',
 'playground

In [42]:
feat[:4]

['clientapplications',
 'bmmrecordentityprimaryattributekey',
 'copyfish',
 'setadapteranimatoradapter']

In [43]:
# [col for col in Xtr_cv.columns if len(col) == 2]
# [col for col in list(set(feat)) if col not in Xtr_cv.columns]

In [44]:
# LogReg C=0.1,class_weight='balanced',max_iter=500
# CNB alpha=0.5
# MNB alpha=0.4
# cv_metrics_f = class_models(Xtr_cv[feat],y_train,Xv_cv[feat],y_val)
# cv_metrics_f

In [45]:
# LogReg C=1,class_weight='balanced',max_iter=500
# CNB alpha=0.5
# MNB alpha=0.1
# tfidf_metrics_f = class_models(Xtr_tfidf[feat],y_train,Xv_tfidf[feat],y_val)
# tfidf_metrics_f

In [46]:
# cv_metrics = cv_metrics.assign(feat='all',vectorizer='cv')
# tfidf_metrics = tfidf_metrics.assign(feat='all',vectorizer='tfidf')
# cv_metrics_f = cv_metrics_f.assign(feat='unique',vectorizer='cv')
# tfidf_metrics_f = tfidf_metrics_f.assign(feat='unique',vectorizer='tfidf')

In [47]:
# all_metrics = pd.concat([cv_metrics,tfidf_metrics,cv_metrics_f,tfidf_metrics_f]).reset_index().drop(columns='index')
# all_metrics = all_metrics[all_metrics.model!='bl'].reset_index().drop(columns='index')
# all_metrics.v_acc = all_metrics.v_acc.astype(float)
# all_metrics

In [48]:
# cnb iloc 58
# all_metrics.iloc[58]

In [49]:
# mnb iloc 11
# all_metrics.iloc[11]

In [50]:
# log reg iloc 50
# all_metrics.iloc[50]

In [51]:
# test
# model CNB
# params alpha=0.5
# tr_acc 1.0
# v_acc 0.788991
# feat unique
# vectorizer tfidf


In [52]:
import modeling as m

In [53]:
feat = m.get_unique(train)

JavaScript unique words: 7580
Java unique words: 4232
Objective-C unique words: 4521
Other unique words: 10504
All unique words: 19470


In [54]:
m.cnb_model(Xtr_tfidf[feat],y_train,Xv_tfidf[feat],y_val)

Complement Naive Bayes
Train Accuracy:      100.0%
Validation Accuracy: 78.9%


In [55]:
m.test_model(Xtr_tfidf[feat],y_train,Xt_tfidf[feat],y_test)

Complement Naive Bayes
Baseline Accuracy: 42.33%
Test Accuracy:     69.72%


In [56]:
m.mnb_model(Xtr_cv,y_train,Xv_cv,y_val)

Multinomial Naive Bayes
Train Accuracy:      100.0%
Validation Accuracy: 77.06%


In [57]:
m.log_model(Xtr_tfidf[feat],y_train,Xv_tfidf[feat],y_val)

Logistic Regression
Train Accuracy:      97.55%
Validation Accuracy: 77.06%
