In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
import re, nltk, spacy, gensim

# Sklearn
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from pprint import pprint

import nltk
import string
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize
nltk.download('punkt')

from sklearn import cluster, metrics
from sklearn import manifold, decomposition

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split

from gensim.models.phrases import Phrases

from sklearn.svm import SVC
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, classification_report

from sklearn.preprocessing import LabelEncoder, Normalizer
from sklearn.pipeline import Pipeline
from sklearn.multiclass import OneVsRestClassifier


import time

import pickle

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Maeva\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Load the posts subset from a semicolon-delimited CSV (path is relative to the notebook's working directory).
sample_posts = pd.read_csv("data/subsetTop30Tags2.csv", sep=";")
# Preview the first rows to sanity-check how the columns were parsed.
sample_posts.head()

Unnamed: 0.1,Unnamed: 0,Tags,BodyClean,TitleClean,FullPosts,numberOfTags
0,4,python scripting,name main test script would like use module im...,python import initialize argparse name main,python import initialize argparse name main na...,2
1,5,git github,understand able disable pull request question ...,disable pull request github,disable pull request github understand able di...,2
2,11,laravel laravel,cant figure add header response used header gi...,add header response middleware,add header response middleware cant figure add...,2
3,19,python lambda,anyone behavior lambda function import x lambd...,lambda multiple statement python,lambda multiple statement python anyone behavi...,2
4,21,html css,container child one child dynamic width maximu...,moving element push adjacent element collide,moving element push adjacent element collide c...,2


In [3]:
# (rows, columns) of the full loaded frame, before the subsampling done in the next cell.
sample_posts.shape

(29044, 6)

In [4]:
# Work on a 5,000-row random subsample of the loaded posts.
# A fixed random_state makes the draw -- and every result computed from it below --
# reproducible across "Restart Kernel -> Run All".
# NOTE(review): this rebinds `sample_posts`, so the full frame can only be recovered by
# re-running the load cell.
sample_posts = sample_posts.sample(n=5000, random_state=42)
sample_posts

Unnamed: 0.1,Unnamed: 0,Tags,BodyClean,TitleClean,FullPosts,numberOfTags
6269,38694,html css,simple list check box inside div give div fixe...,prevent wrapping element inside div,prevent wrapping element inside div simple lis...,2
22780,142207,javascript vuejs,total really obvious thing missing object load...,vuejs undefined error object value loaded rend...,vuejs undefined error object value loaded rend...,2
22327,139521,reactjs docker,docker build react application deploy set envi...,reading environment variable react set docker,reading environment variable react set docker ...,2
27574,175146,c c,wrapper around vector shape like template n n ...,function constructor accepts array rank,function constructor accepts array rank wrappe...,2
3875,23975,ios swift,would like initialize set value corresponding ...,set protocol swift,set protocol swift would like initialize set v...,2
...,...,...,...,...,...,...
27695,175993,angular typescript,way send pipe component component display data...,send pipe component,send pipe component way send pipe component co...,2
20054,125437,javascript reactjs,hey one question try modal open another compon...,open modal component reactjs,open modal component reactjs hey one question ...,2
16428,101923,python pandas,following panda col name name b col duplicate ...,transpose panda dataframe change column header...,transpose panda dataframe change column header...,2
18438,115547,php laravel,set value cache something like bar push sad va...,pushing value existing cache value laravel,pushing value existing cache value laravel set...,2


## Classifier

In [5]:
def sent_to_words(sentences):
    """Lazily tokenize each document into a list of word tokens.

    Every item of ``sentences`` is coerced to ``str`` and run through
    ``gensim.utils.simple_preprocess``. ``deacc=True`` strips accent marks from
    the tokens; punctuation is already discarded by the tokenizer itself.

    Yields one token list per input item, in input order.
    """
    for document in sentences:
        tokens = gensim.utils.simple_preprocess(str(document), deacc=True)
        yield tokens

# Tokenize the FullPosts column of every sampled post; the generator is materialized
# into a list of token lists so it can be sliced and reused.
data_words = list(sent_to_words(sample_posts.FullPosts))

# Peek at the first three tokenized posts.
print(data_words[:3])

[['prevent', 'wrapping', 'element', 'inside', 'div', 'simple', 'list', 'check', 'box', 'inside', 'div', 'give', 'div', 'fixed', 'width', 'label', 'check', 'box', 'go', 'line', 'retain', 'scroll', 'bar', 'user', 'hidden', 'part', 'want', 'say', 'use', 'seem', 'keep', 'getting', 'list', 'label', 'going', 'line', 'instead', 'hidden', 'thought', 'would', 'fix', 'instead', 'expanded', 'width', 'div', 'show', 'rest', 'label', 'div', 'input', 'input', 'input', 'input', 'div', 'question', 'give', 'div', 'fixed', 'width', 'label', 'check', 'box', 'go', 'line', 'retain', 'scroll', 'bar'], ['vuejs', 'undefined', 'error', 'object', 'value', 'loaded', 'rendered', 'total', 'really', 'obvious', 'thing', 'missing', 'object', 'loaded', 'via', 'call', 'inside', 'mounted', 'method', 'job', 'title', 'value', 'location', 'call', 'good', 'call', 'undefined', 'error', 'value', 'render', 'call', 'get', 'object', 'defined', 'sure', 'really', 'simple', 'cant', 'possibly', 'straight', 'forward', 'additional', 'e

In [6]:
def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV'), nlp_model=None):
    """Lemmatize tokenized documents, keeping only tokens with the selected POS tags.

    Parameters
    ----------
    texts : iterable of iterable of str
        Tokenized documents. Each document's tokens are joined with single
        spaces before being handed to the language model.
    allowed_postags : collection of str
        Coarse part-of-speech tags (compared against ``token.pos_``) to keep.
        The default is an immutable tuple so it cannot be mutated between calls.
    nlp_model : callable, optional
        Callable mapping a string to an iterable of tokens that expose
        ``lemma_`` and ``pos_``. When omitted, the module-level ``nlp`` object
        is used (looked up at call time, so it may be defined after this function).

    Returns
    -------
    list of str
        One space-joined string of lemmas per input document, in input order.
    """
    model = nlp if nlp_model is None else nlp_model

    texts_out = []
    for sent in texts:
        doc = model(" ".join(sent))
        # Drop the '-PRON-' placeholder lemma entirely instead of substituting an
        # empty string, which would leave stray leading/double spaces in the output.
        lemmas = [
            token.lemma_
            for token in doc
            if token.pos_ in allowed_postags and token.lemma_ != '-PRON-'
        ]
        texts_out.append(" ".join(lemmas))
    return texts_out

# Load spaCy's small English pipeline with the parser and NER components disabled;
# only POS tags and lemmas are read from the tokens in `lemmatization`.
# The model must be installed first, e.g. from a terminal: python -m spacy download en_core_web_sm
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Do lemmatization keeping only Noun, Adj, Verb, Adverb
data_lemmatized = lemmatization(data_words, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

# Show the first two lemmatized posts.
print(data_lemmatized[:2])

['prevent wrap element simple list check box give fix check go line retain scroll bar user hide part want say use seem keep get list label go line instead hide thought fix instead expand rest label input input input input div question fix check go line retain scroll bar', 'vuejs undefine error object value load render total really obvious thing miss object load call inside mount method job title value location call good call undefined error value render call get object define sure really simple possibly straight forward additional entire class router new mode history route new job mount function method get success function datum datum error function error function return']


In [7]:
# Bag-of-words vectorizer applied to the lemmatized posts.
vectorizer = CountVectorizer(analyzer='word',
                             min_df=2,                        # ignore terms that appear in fewer than 2 documents
                             stop_words='english',             # remove English stop words
                             lowercase=True,                   # convert all words to lowercase
                             token_pattern='[a-zA-Z0-9]{3,}',  # a token is a run of 3 or more alphanumeric characters
                             max_features=50000             # cap the vocabulary at the 50,000 most frequent terms
                            )

# Learn the vocabulary from the lemmatized posts and build the document-term count matrix.
data_vectorized = vectorizer.fit_transform(data_lemmatized)
# Show the learned vocabulary (numpy abbreviates the array when printed).
print(vectorizer.get_feature_names_out())
# print(vectorizer.transform(data_lemmatized))

['abbreviation' 'ability' 'able' ... 'zip' 'zone' 'zoom']


In [8]:
# Print the sparse document-term matrix as "(row, column)  count" entries.
print(data_vectorized)

  (0, 2382)	1
  (0, 3605)	1
  (0, 989)	1
  (0, 2903)	1
  (0, 1777)	2
  (0, 439)	3
  (0, 329)	1
  (0, 1195)	3
  (0, 1769)	3
  (0, 2690)	2
  (0, 2790)	2
  (0, 252)	2
  (0, 3471)	1
  (0, 1420)	2
  (0, 3548)	1
  (0, 2762)	1
  (0, 3466)	1
  (0, 1712)	2
  (0, 1606)	2
  (0, 3246)	1
  (0, 1086)	1
  (0, 2678)	1
  (0, 1582)	4
  (0, 906)	1
  (0, 2487)	1
  :	:
  (4997, 748)	1
  (4997, 1224)	1
  (4997, 1400)	3
  (4997, 966)	1
  (4997, 3024)	1
  (4997, 1471)	1
  (4997, 3315)	2
  (4998, 252)	2
  (4998, 3488)	6
  (4998, 2849)	1
  (4998, 1700)	1
  (4998, 2462)	3
  (4998, 1718)	1
  (4998, 1083)	1
  (4998, 374)	3
  (4998, 2743)	2
  (4999, 2802)	1
  (4999, 2062)	1
  (4999, 503)	2
  (4999, 1224)	1
  (4999, 2789)	2
  (4999, 768)	1
  (4999, 2461)	1
  (4999, 1518)	1
  (4999, 3348)	1


In [9]:
# Count the non-zero cells directly on the sparse matrix. Densifying it with
# .todense() would allocate rows x columns cells only to count them.
# NOTE(review): `data_dense` is no longer created; no visible cell reads it.
n_docs, n_terms = data_vectorized.shape
nonzero_pct = (data_vectorized.count_nonzero() / (n_docs * n_terms)) * 100

# Compute Sparsicity = Percentage of Non-Zero cells
print("Sparsicity: ", nonzero_pct, "%")

Sparsicity:  0.6492443463872035 %


In [10]:
# Features: the FullPosts text of each sampled post; target: its Tags string.
# NOTE(review): this reads the FullPosts column directly rather than `data_lemmatized`,
# so the lemmatized text built above is not what the classifier is trained on -- confirm this is intended.
X = sample_posts["FullPosts"]
y = sample_posts["Tags"]

In [11]:
# Split from the loaded dataset: 80% train / 20% test, shuffled.
# A fixed random_state keeps the split -- and therefore the fitted model and its
# predictions -- reproducible across re-runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=True, random_state=42
)

In [12]:
# Sanity-check the split sizes: features and labels must have matching lengths per split.
print("X_train shape : {}".format(X_train.shape))
print("X_test shape : {}".format(X_test.shape))
print("y_train shape : {}".format(y_train.shape))
print("y_test shape : {}".format(y_test.shape))

X_train shape : (4000,)
X_test shape : (1000,)
y_train shape : (4000,)
y_test shape : (1000,)


In [13]:
# Fit the vectorizer on the training text only, then apply that same vocabulary to the
# test text so no test-set terms leak into the features.
# NOTE(review): fit_transform replaces the vocabulary learned earlier from `data_lemmatized`.
# NOTE(review): X_train / X_test are rebound from text to count matrices, so re-running
# this cell on its own would feed matrices (not text) to the vectorizer -- re-run the split cell first.
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

In [14]:
def lsa_reduction(X_train, X_test, n_comp=120, random_state=None):
    """Reduce feature matrices with truncated SVD followed by row normalization.

    The SVD + Normalizer pipeline is fitted on ``X_train`` only and then applied
    to both splits, so the test data does not influence the projection.

    Parameters
    ----------
    X_train, X_test : matrices accepted by ``TruncatedSVD``
        Training and test feature matrices with the same number of columns.
    n_comp : int, default 120
        Number of SVD components to keep.
    random_state : int or None, default None
        Seed forwarded to ``TruncatedSVD``. ``None`` keeps the previous
        (unseeded) behaviour; pass an int for reproducible components.

    Returns
    -------
    tuple
        ``(train_reduced, test_reduced)``: the transformed training and test matrices.
    """
    svd = TruncatedSVD(n_components=n_comp, random_state=random_state)
    normalizer = Normalizer()

    lsa_pipe = Pipeline([('svd', svd),
                         ('normalize', normalizer)])
    lsa_pipe.fit(X_train)

    train_reduced = lsa_pipe.transform(X_train)
    test_reduced = lsa_pipe.transform(X_test)
    return train_reduced, test_reduced

In [15]:
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
# Linear-kernel SVM used as the base estimator of the one-vs-rest classifier fitted below.
svc = SVC(kernel='linear')
# NOTE(review): modelLR is not referenced by any later cell visible here.
modelLR = LogisticRegression()

In [16]:
# X_train = vectorizer.fit_transform(X_train)
# y_train = vectorizer.fit_transform(y_train)
# X_test = vectorizer.transform(X_test)
# y_test = vectorizer.transform(y_test)

In [17]:
# One-vs-rest classifier: one linear SVC is fitted per distinct label in y_train.
# NOTE(review): y_train holds whole Tags strings (e.g. 'python django '), so each distinct
# tag combination is treated as its own class. If per-tag (multilabel) prediction is the
# goal, the tags would need to be binarized first (MultiLabelBinarizer is imported above) -- confirm intent.
# model_OVR = OneVsRestClassifier(LinearSVC(random_state=0)).fit(X_train, y_train)
# model_OVR = OneVsRestClassifier(svc).fit(X_train, y_train)
model_OVR = OneVsRestClassifier(svc).fit(X_train, y_train)

In [18]:
# Persist the fitted classifier. The context manager guarantees the file is flushed and
# closed; the bare open(...) previously passed to pickle.dump was never closed.
# NOTE(review): only the classifier is saved here. A consumer also needs the fitted
# `vectorizer` to turn raw text into features -- confirm it is persisted elsewhere.
with open('../ModelsAPI/model_OVR.pkl', 'wb') as model_file:
    pickle.dump(model_OVR, model_file)

In [19]:
# Predict a Tags string for every held-out post.
y_pred = model_OVR.predict(X_test)
# Dump all predictions, then confirm there is one prediction per test row.
# NOTE(review): no metric is computed against y_test in the cells visible here.
print(y_pred)
print(y_pred.shape)

['python tensorflow ' 'git gitclone ' 'angular typescript '
 'visualstudio visualstudio ' 'python numpy ' 'python pandas ' 'html css '
 'python list ' 'python numpy ' 'dart flutter ' 'javascript jquery '
 'angular typescript ' 'python pytest ' 'python django '
 'git gitsubmodules ' 'angular typescript ' 'python python '
 'angular typescript ' 'python date ' 'javascript reactjs '
 'dart flutter ' 'javascript vuejs ' 'c c ' 'html css ' 'python pipe '
 'python matplotlib ' 'python tensorflow ' 'javascript reactjs '
 'python openai ' 'c c ' 'python scikitlearn ' 'css html '
 'javascript angular ' 'javascript angular ' 'java android '
 'javascript vuejs ' 'html css ' 'javascript reactjs '
 'python pythonsphinx ' 'ios xcode ' 'python pandas ' 'python fork '
 'python pandas ' 'docker githubactions ' 'python paramiko '
 'javascript reactjs ' 'python namedtuple ' 'javascript python '
 'python numpy ' 'javascript angular ' 'python pythonx '
 'python unittesting ' 'python urllib ' 'git azuredevop

In [20]:
# Same check as the print above: one prediction per test row.
y_pred.shape

(1000,)

In [21]:
# Ad-hoc smoke test: classify a single hand-written post.
test = ["Django supports Python. If you're under Linux and want to check the Python version you're using, run python -V from the command line If you want to check the Django version, open a Python console and type"]
# NOTE(review): the commented-out steps below would tokenize/lemmatize the text first and
# reference `count_vect`, which is not defined in the cells visible here.
# test_words = list(sent_to_words(test))
# test_words = lemmatization(test_words, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])
# test_words = count_vect.transform(test_words)
# The raw text goes straight through the vectorizer fitted on the training posts.
pred = model_OVR.predict(vectorizer.transform(test))
print(pred)

['python django ']
