In [1]:
# data manipulation
import pandas as pd
import numpy as np
import itertools

# viz and stats
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# nlp
import re
import unicodedata
import nltk
import nltk.sentiment
from wordcloud import WordCloud
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

#split scale and model
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier, plot_tree, export_text
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn import naive_bayes as nb
from sklearn.feature_selection import RFE

# local imports
import acquire as a
import prepare as p

In [2]:
df = a.get_readmes()
df = p.prep_readmes(df)
df_backup = df

# Modeling

In [3]:
df = df_backup

In [4]:
df = df.assign(length = df['lemmatized'].str.len())
df

Unnamed: 0,repo,language,readme_contents,clean,lemmatized,top3other,length
0,Hardtack/HTProgressHUD,Objective-C,# HTProgressHUD\n\n[![No Maintenance Intended]...,htprogresshud maintenance intended http unmain...,htprogresshud maintenance intended http unmain...,Objective-C,3416
1,raycmorgan/Mu,JavaScript,"# Mu - a fast, streaming Node.js Mustache engi...",mu fast streaming node js mustache engine warn...,mu fast streaming node j mustache engine warni...,JavaScript,2168
2,rgeo/rgeo,Ruby,## RGeo\n\n[![Gem Version](https://badge.fury....,rgeo gem version https badge fury io rb rgeo s...,rgeo gem version http badge fury io rb rgeo sv...,other,4963
3,johnlui/AutoLayout,Swift,# Auto Layout 秘境\n\n### 1.《Auto Layout 使用心得》系列...,auto layout auto layout fir im keynote auto la...,auto layout auto layout fir im keynote auto la...,other,711
4,evnaz/ENSwiftSideMenu,Swift,# ENSwiftSideMenu\n\nA lightweight flyover sid...,enswiftsidemenu lightweight flyover side menu ...,enswiftsidemenu lightweight flyover side menu ...,other,1844
...,...,...,...,...,...,...,...
539,RealHacker/leetcode-solutions,Python,# leetcode-solutions\nRepo for all leetcode pr...,leetcode solutions repo leetcode problems solv...,leetcode solution repo leetcode problem solved...,other,128
540,Patrick-Kladek/CocoaDebugKit,Objective-C,CocoaDebugKit\n============\n[![Twitter: @Patr...,cocoadebugkit twitter patrickkladek https img ...,cocoadebugkit twitter patrickkladek http img s...,Objective-C,2259
541,gophercon/2016-talks,JavaScript,Slides for GopherCon 2016 regular program and ...,slides gophercon regular program lightning tal...,slide gophercon regular program lightning talk...,JavaScript,116
542,dinocore1/DevsmartLib-Android,Java,\n\nThe MIT License\n\nCopyright (c) 2011 Paul...,mit license copyright c paul soucy pauldev sma...,mit license copyright c paul soucy pauldev sma...,Java,958


In [5]:
train, val, test = p.split_data(df,'top3other')
train.shape,val.shape,test.shape

((326, 7), (109, 7), (109, 7))

In [6]:
# make the thing
cv = CountVectorizer()

# fit the thing
# use the thing
bag_of_words_cv = cv.fit_transform(train.lemmatized)

In [7]:
bag_of_words_cv.todense()

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]])

In [8]:
cv.get_feature_names_out()

array(['aa', 'aaadffead', 'aac', ..., 'zzhc', 'zzrqqoselistplfecedeee',
       'zzz'], dtype=object)

In [9]:
bow = pd.DataFrame(bag_of_words_cv.todense(),columns=cv.get_feature_names_out())
bow

Unnamed: 0,aa,aaadffead,aac,aacde,aade,aaduino,aae,aaeaacbcbbac,aallfredo,aanu,...,zshrc,zsja,zsoltk,zt,zykin,zz,zzarcon,zzhc,zzrqqoselistplfecedeee,zzz
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
321,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
322,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
323,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
324,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
bow_tf = bow.apply(lambda row: row/row.sum(), axis=1)
bow_tf

Unnamed: 0,aa,aaadffead,aac,aacde,aade,aaduino,aae,aaeaacbcbbac,aallfredo,aanu,...,zshrc,zsja,zsoltk,zt,zykin,zz,zzarcon,zzhc,zzrqqoselistplfecedeee,zzz
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
321,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
322,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
323,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
324,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
bow_atf = bow.apply(lambda row: (row/row.sum())/(row/row.sum()).max(), axis=1)
bow_atf

Unnamed: 0,aa,aaadffead,aac,aacde,aade,aaduino,aae,aaeaacbcbbac,aallfredo,aanu,...,zshrc,zsja,zsoltk,zt,zykin,zz,zzarcon,zzhc,zzrqqoselistplfecedeee,zzz
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
321,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
322,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
323,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
324,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
#make it
tfidf = TfidfVectorizer()

#fit it/use it
bag_of_words_tfidf = tfidf.fit_transform(train.lemmatized)

In [13]:
bag_of_words_tfidf.todense()

matrix([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]])

In [14]:
bow_tfidf = pd.DataFrame(bag_of_words_tfidf.todense(),columns=tfidf.get_feature_names_out())
bow_tfidf

Unnamed: 0,aa,aaadffead,aac,aacde,aade,aaduino,aae,aaeaacbcbbac,aallfredo,aanu,...,zshrc,zsja,zsoltk,zt,zykin,zz,zzarcon,zzhc,zzrqqoselistplfecedeee,zzz
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
321,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
322,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
323,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
324,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
X_train = train[['lemmatized','length']]
y_train = train.top3other
X_val = val[['lemmatized','length']]
y_val = val.top3other
X_test = test[['lemmatized','length']]
y_test = test.top3other

In [16]:
X_train.head()

Unnamed: 0,lemmatized,length
407,swampdragon looking someone take longer time s...,3294
414,p alignleft hrefhttps www npmjs com package do...,5496
109,asmediafocusmanager asmediafocusmanager give a...,6448
238,littlebox super simple use cs icon littlebox h...,1193
318,img srchttps github com downloads sstephenson ...,678


In [17]:
y_train.head()

407     JavaScript
414          other
109    Objective-C
238          other
318          other
Name: top3other, dtype: object

In [18]:
# baseline
y_train.value_counts()[0]/y_train.value_counts().sum()

0.4233128834355828

In [19]:
#make my bag of words cv
cv = CountVectorizer()
Xtr_bow_cv = cv.fit_transform(X_train.lemmatized)
Xv_bow_cv = cv.transform(X_val.lemmatized)
Xt_bow_cv = cv.transform(X_test.lemmatized)

#make my bag of words tfidf
tfidf = TfidfVectorizer()
Xtr_bow_tfidf = tfidf.fit_transform(X_train.lemmatized)
Xv_bow_tfidf = tfidf.transform(X_val.lemmatized)
Xt_bow_tfidf = tfidf.transform(X_test.lemmatized)

In [20]:
type(X_train.length)

pandas.core.series.Series

In [21]:
# type(Xtr_bow_cv)
# Xtr_bow_cv
Xtr_cv = pd.DataFrame(Xtr_bow_cv.todense(),columns=cv.get_feature_names_out(),index=X_train.index)
Xv_cv = pd.DataFrame(Xv_bow_cv.todense(),columns=cv.get_feature_names_out(),index=X_val.index)
Xt_cv = pd.DataFrame(Xt_bow_cv.todense(),columns=cv.get_feature_names_out(),index=X_test.index)
Xtr_cv

Unnamed: 0,aa,aaadffead,aac,aacde,aade,aaduino,aae,aaeaacbcbbac,aallfredo,aanu,...,zshrc,zsja,zsoltk,zt,zykin,zz,zzarcon,zzhc,zzrqqoselistplfecedeee,zzz
407,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
414,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
109,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
238,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
318,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
236,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
372,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
265,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
245,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [22]:
Xtr_tfidf = pd.DataFrame(Xtr_bow_tfidf.todense(),columns=tfidf.get_feature_names_out(),index=X_train.index)
Xv_tfidf = pd.DataFrame(Xv_bow_tfidf.todense(),columns=tfidf.get_feature_names_out(),index=X_val.index)
Xt_tfidf = pd.DataFrame(Xt_bow_tfidf.todense(),columns=tfidf.get_feature_names_out(),index=X_test.index)
Xtr_tfidf

Unnamed: 0,aa,aaadffead,aac,aacde,aade,aaduino,aae,aaeaacbcbbac,aallfredo,aanu,...,zshrc,zsja,zsoltk,zt,zykin,zz,zzarcon,zzhc,zzrqqoselistplfecedeee,zzz
407,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
414,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
109,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
238,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
318,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
236,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
372,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
265,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
245,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [23]:
Xtr_cvl = pd.concat([Xtr_cv,X_train.length],axis=1)
Xv_cvl = pd.concat([Xv_cv,X_val.length],axis=1)
Xt_cvl = pd.concat([Xt_cv,X_test.length],axis=1)
Xtr_cvl

Unnamed: 0,aa,aaadffead,aac,aacde,aade,aaduino,aae,aaeaacbcbbac,aallfredo,aanu,...,zsja,zsoltk,zt,zykin,zz,zzarcon,zzhc,zzrqqoselistplfecedeee,zzz,length
407,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3294
414,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,5496
109,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,6448
238,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1193
318,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,678
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
236,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,15770
372,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2940
265,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1617
245,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,10958


In [24]:
Xtr_tfidf_l = pd.concat([Xtr_tfidf,X_train.length],axis=1)
Xv_tfidf_l = pd.concat([Xv_tfidf,X_val.length],axis=1)
Xt_tfidf_l = pd.concat([Xt_tfidf,X_test.length],axis=1)
Xtr_tfidf_l

Unnamed: 0,aa,aaadffead,aac,aacde,aade,aaduino,aae,aaeaacbcbbac,aallfredo,aanu,...,zsja,zsoltk,zt,zykin,zz,zzarcon,zzhc,zzrqqoselistplfecedeee,zzz,length
407,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3294
414,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5496
109,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6448
238,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1193
318,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,678
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
236,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,15770
372,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2940
265,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1617
245,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10958


In [25]:
def std(train,validate,test,scale=None):
    """
    The function applies the Standard Scaler method to scale the numerical features of the train, validate,
    and test datasets.
    
    :param train: a pandas DataFrame containing the training data
    :param validate: The validation dataset, which is used to evaluate the performance of the model
    during training and to tune hyperparameters
    :param test: The "test" parameter is a dataset that is used to evaluate the performance of a machine
    learning model that has been trained on the "train" dataset and validated on the "validate" dataset.
    The "test" dataset is typically used to simulate real-world scenarios and to ensure that the model
    is able
    :return: three dataframes: Xtr (scaled training data), Xv (scaled validation data), and Xt (scaled
    test data).
    """
    if scale is None:
        scale = train.columns.to_list()
    std_scale = StandardScaler()
    Xtr,Xv,Xt = train[scale],validate[scale],test[scale]
    Xtr = pd.DataFrame(std_scale.fit_transform(train[scale]),train[scale].index,train[scale].columns)
    Xv = pd.DataFrame(std_scale.transform(validate[scale]),validate[scale].index,validate[scale].columns)
    Xt = pd.DataFrame(std_scale.transform(test[scale]),test[scale].index,test[scale].columns)
    for col in scale:
        Xtr = Xtr.rename(columns={col: f'{col}_s'})
        Xv = Xv.rename(columns={col: f'{col}_s'})
        Xt = Xt.rename(columns={col: f'{col}_s'})
    return Xtr, Xv, Xt

In [26]:
Xtr_cvl_s,Xv_cvl_s,Xt_cvl_s = std(Xtr_cvl,Xv_cvl,Xt_cvl)
Xtr_cvl_s

Unnamed: 0,aa_s,aaadffead_s,aac_s,aacde_s,aade_s,aaduino_s,aae_s,aaeaacbcbbac_s,aallfredo_s,aanu_s,...,zsoltk_s,zt_s,zykin_s,zz_s,zzarcon_s,zzhc_s,zzrqqoselistplfecedeee_s,zzz_s,length_s,length_s.1
407,-0.078567,-0.05547,-0.096374,-0.05547,-0.05547,-0.05547,-0.05547,-0.05547,-0.05547,-0.05547,...,-0.05547,-0.078567,-0.05547,-0.05547,-0.05547,-0.05547,-0.05547,-0.089166,-0.101231,-0.156296
414,-0.078567,-0.05547,-0.096374,-0.05547,-0.05547,-0.05547,-0.05547,-0.05547,-0.05547,-0.05547,...,-0.05547,-0.078567,-0.05547,-0.05547,-0.05547,-0.05547,-0.05547,-0.089166,-0.101231,0.231595
109,-0.078567,-0.05547,-0.096374,-0.05547,-0.05547,-0.05547,-0.05547,-0.05547,-0.05547,-0.05547,...,-0.05547,-0.078567,-0.05547,-0.05547,-0.05547,-0.05547,-0.05547,-0.089166,-0.101231,0.399294
238,-0.078567,-0.05547,-0.096374,-0.05547,-0.05547,-0.05547,-0.05547,-0.05547,-0.05547,-0.05547,...,-0.05547,-0.078567,-0.05547,-0.05547,-0.05547,-0.05547,-0.05547,-0.089166,-0.101231,-0.526395
318,-0.078567,-0.05547,-0.096374,-0.05547,-0.05547,-0.05547,-0.05547,-0.05547,-0.05547,-0.05547,...,-0.05547,-0.078567,-0.05547,-0.05547,-0.05547,-0.05547,-0.05547,-0.089166,-0.101231,-0.617114
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
236,-0.078567,-0.05547,-0.096374,-0.05547,-0.05547,-0.05547,-0.05547,-0.05547,-0.05547,-0.05547,...,-0.05547,-0.078567,-0.05547,-0.05547,-0.05547,-0.05547,-0.05547,-0.089166,-0.101231,2.041401
372,-0.078567,-0.05547,-0.096374,-0.05547,-0.05547,-0.05547,-0.05547,-0.05547,-0.05547,-0.05547,...,-0.05547,-0.078567,-0.05547,-0.05547,-0.05547,-0.05547,-0.05547,-0.089166,-0.101231,-0.218654
265,-0.078567,-0.05547,-0.096374,-0.05547,-0.05547,-0.05547,-0.05547,-0.05547,-0.05547,-0.05547,...,-0.05547,-0.078567,-0.05547,-0.05547,-0.05547,-0.05547,-0.05547,-0.089166,-0.101231,-0.451706
245,-0.078567,-0.05547,-0.096374,-0.05547,-0.05547,-0.05547,-0.05547,-0.05547,-0.05547,-0.05547,...,-0.05547,-0.078567,-0.05547,-0.05547,-0.05547,-0.05547,-0.05547,-0.089166,-0.101231,1.193748


In [27]:
Xtr_tfidf_l_s,Xv_tfidf_l_s,Xt_tfidf_l_s = std(Xtr_tfidf_l,Xv_tfidf_l,Xt_tfidf_l)
Xtr_tfidf_l_s

Unnamed: 0,aa_s,aaadffead_s,aac_s,aacde_s,aade_s,aaduino_s,aae_s,aaeaacbcbbac_s,aallfredo_s,aanu_s,...,zsoltk_s,zt_s,zykin_s,zz_s,zzarcon_s,zzhc_s,zzrqqoselistplfecedeee_s,zzz_s,length_s,length_s.1
407,-0.067624,-0.05547,-0.095414,-0.05547,-0.05547,-0.05547,-0.05547,-0.05547,-0.05547,-0.05547,...,-0.05547,-0.077805,-0.05547,-0.05547,-0.05547,-0.05547,-0.05547,-0.071443,-0.153894,-0.156296
414,-0.067624,-0.05547,-0.095414,-0.05547,-0.05547,-0.05547,-0.05547,-0.05547,-0.05547,-0.05547,...,-0.05547,-0.077805,-0.05547,-0.05547,-0.05547,-0.05547,-0.05547,-0.071443,-0.153894,0.231595
109,-0.067624,-0.05547,-0.095414,-0.05547,-0.05547,-0.05547,-0.05547,-0.05547,-0.05547,-0.05547,...,-0.05547,-0.077805,-0.05547,-0.05547,-0.05547,-0.05547,-0.05547,-0.071443,-0.153894,0.399294
238,-0.067624,-0.05547,-0.095414,-0.05547,-0.05547,-0.05547,-0.05547,-0.05547,-0.05547,-0.05547,...,-0.05547,-0.077805,-0.05547,-0.05547,-0.05547,-0.05547,-0.05547,-0.071443,-0.153894,-0.526395
318,-0.067624,-0.05547,-0.095414,-0.05547,-0.05547,-0.05547,-0.05547,-0.05547,-0.05547,-0.05547,...,-0.05547,-0.077805,-0.05547,-0.05547,-0.05547,-0.05547,-0.05547,-0.071443,-0.153894,-0.617114
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
236,-0.067624,-0.05547,-0.095414,-0.05547,-0.05547,-0.05547,-0.05547,-0.05547,-0.05547,-0.05547,...,-0.05547,-0.077805,-0.05547,-0.05547,-0.05547,-0.05547,-0.05547,-0.071443,-0.153894,2.041401
372,-0.067624,-0.05547,-0.095414,-0.05547,-0.05547,-0.05547,-0.05547,-0.05547,-0.05547,-0.05547,...,-0.05547,-0.077805,-0.05547,-0.05547,-0.05547,-0.05547,-0.05547,-0.071443,-0.153894,-0.218654
265,-0.067624,-0.05547,-0.095414,-0.05547,-0.05547,-0.05547,-0.05547,-0.05547,-0.05547,-0.05547,...,-0.05547,-0.077805,-0.05547,-0.05547,-0.05547,-0.05547,-0.05547,-0.071443,-0.153894,-0.451706
245,-0.067624,-0.05547,-0.095414,-0.05547,-0.05547,-0.05547,-0.05547,-0.05547,-0.05547,-0.05547,...,-0.05547,-0.077805,-0.05547,-0.05547,-0.05547,-0.05547,-0.05547,-0.071443,-0.153894,1.193748


In [None]:
# DecisionTreeClassifier
# RandomForestClassifier
# KNeighborsClassifier
# LogisticRegression
# classification_report, confusion_matrix, ConfusionMatrixDisplay
# nb.ComplementNB
# nb.MultinomialNB