In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
import re, nltk, spacy, gensim

# Sklearn
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from pprint import pprint

import nltk
import string
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize
nltk.download('punkt')

from sklearn import cluster, metrics
from sklearn import manifold, decomposition

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split

from gensim.models.phrases import Phrases

from sklearn.svm import SVC
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, classification_report

from sklearn.preprocessing import LabelEncoder, Normalizer
from sklearn.pipeline import Pipeline
from sklearn.multiclass import OneVsRestClassifier


import time

import pickle

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Maeva\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Load the pre-cleaned posts from a semicolon-separated CSV and preview the first rows.
posts_csv_path = "data/subsetTop30Tags2.csv"
sample_posts = pd.read_csv(posts_csv_path, sep=";")
sample_posts.head()

Unnamed: 0.1,Unnamed: 0,Tags,BodyClean,TitleClean,FullPosts,numberOfTags
0,4,python scripting,name main test script would like use module im...,python import initialize argparse name main,python import initialize argparse name main na...,2
1,5,git github,understand able disable pull request question ...,disable pull request github,disable pull request github understand able di...,2
2,11,laravel laravel,cant figure add header response used header gi...,add header response middleware,add header response middleware cant figure add...,2
3,19,python lambda,anyone behavior lambda function import x lambd...,lambda multiple statement python,lambda multiple statement python anyone behavi...,2
4,21,html css,container child one child dynamic width maximu...,moving element push adjacent element collide,moving element push adjacent element collide c...,2


In [3]:
# Dimensions (rows, columns) of the loaded dataset.
sample_posts.shape

(29044, 6)

In [4]:
# Continue with a random subset of 5000 posts (presumably to keep the later
# lemmatization and SVC training fast -- confirm the intended size).
# The fixed random_state makes the subset, and therefore every downstream
# result, reproducible across "Restart Kernel -> Run All".
sample_posts = sample_posts.sample(5000, random_state=42)
sample_posts

Unnamed: 0.1,Unnamed: 0,Tags,BodyClean,TitleClean,FullPosts,numberOfTags
25895,162344,python pip,install spacy library natural language python ...,pip install killed memory get around,pip install killed memory get around install s...,2
21036,131549,dart flutter,flutter simple tab tab row built another metho...,multiple widget used globalkey,multiple widget used globalkey flutter simple ...,2
19439,121703,dart flutter,concerned following message run window thread ...,run window message identical line,run window message identical line concerned fo...,2
18520,116071,jekyll githubpages,struggling hour avail use one page theme cayma...,jekyll theme github page work locally,jekyll theme github page work locally struggli...,2
6939,43024,html css,put demonstrate set background text p element ...,apply background color cs text line break,apply background color cs text line break put ...,2
...,...,...,...,...,...,...
20919,130829,git github,fatal unable access next unknown error x revoc...,revocation function unable check revocation re...,revocation function unable check revocation re...,2
6645,41101,git gitmerge,way git merge two branch without file word dra...,git merge two branch without actually merging ...,git merge two branch without actually merging ...,2
25812,161702,javascript html,found code event used work would like used ins...,event deprecated used instead,event deprecated used instead found code event...,2
8252,50761,python matplotlib,case crash kernel win plot specifically ax res...,matplotlibs pyplotsubplots crash kernel,matplotlibs pyplotsubplots crash kernel case c...,2


## Classifier

In [5]:
def sent_to_words(sentences):
    """Lazily turn each document into a list of tokens.

    Every item of ``sentences`` is coerced to ``str`` and tokenized with
    ``gensim.utils.simple_preprocess(..., deacc=True)``. One token list is
    yielded per input item, in input order.
    """
    token_lists = (
        gensim.utils.simple_preprocess(str(document), deacc=True)
        for document in sentences
    )
    yield from token_lists

# Tokenize the "FullPosts" text of every sampled post.
full_posts = sample_posts["FullPosts"]
data_words = list(sent_to_words(full_posts))

# Preview the first three tokenized posts.
print(data_words[:3])

[['pip', 'install', 'killed', 'memory', 'get', 'around', 'install', 'spacy', 'library', 'natural', 'language', 'python', 'pip', 'install', 'spacy', 'spacy', 'collected', 'package', 'spacy', 'already', 'answer', 'stack', 'overflow', 'due', 'low', 'memory', 'free', 'cant', 'extend', 'quick', 'tail', 'give', 'memory', 'kill', 'process', 'python', 'score', 'sacrifice', 'child', 'already', 'didnt', 'help', 'something', 'missing', 'file', 'manually', 'way', 'install', 'package', 'server', 'edit', 'complete', 'log', 'python', 'pip', 'install', 'spacy', 'converted', 'value', 'one', 'converted', 'value', 'one', 'spacy', 'location', 'search', 'version', 'spacy', 'getting', 'page', 'looking', 'cache', 'permanently', 'response', 'date', 'looking', 'cache', 'current', 'age', 'based', 'date', 'freshness', 'lifetime', 'freshness', 'lifetime', 'request', 'starting', 'new', 'connection', 'get', 'link', 'page', 'found', 'link', 'linked', 'hundred', 'line', 'found', 'link', 'version', 'version', 'version

In [6]:
def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Lemmatize tokenized documents, keeping only selected parts of speech.

    Parameters
    ----------
    texts : iterable of list of str
        Tokenized documents, one list of tokens per document.
    allowed_postags : collection of str
        Coarse POS tags (compared against ``token.pos_``) whose tokens are kept.
        The default is an immutable tuple so it cannot be mutated between calls.

    Returns
    -------
    list of str
        One space-joined string of lemmas per input document, in input order.

    Notes
    -----
    Uses the module-level spaCy pipeline ``nlp``, which must be loaded before
    this function is called.
    """
    # Feed all documents through nlp.pipe so spaCy can batch them instead of
    # being invoked once per document.
    docs = nlp.pipe(" ".join(sent) for sent in texts)

    # Tokens whose lemma is the '-PRON-' placeholder are dropped outright;
    # substituting an empty string would leave stray double spaces in the output.
    return [
        " ".join(
            token.lemma_
            for token in doc
            if token.pos_ in allowed_postags and token.lemma_ != '-PRON-'
        )
        for doc in docs
    ]

# Load spaCy's small English pipeline with the parser and NER components disabled;
# the lemmatization step only reads each token's POS tag and lemma.
# Install the model once from a terminal: python -m spacy download en_core_web_sm
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Lemmatize the tokenized posts, keeping only nouns, adjectives, verbs and adverbs.
content_postags = ['NOUN', 'ADJ', 'VERB', 'ADV']
data_lemmatized = lemmatization(data_words, allowed_postags=content_postags)

# Preview the first two lemmatized posts.
print(data_lemmatized[:2])

['kill memory get install spacy library natural language python install spacy spacy collect package spacy already answer stack overflow due low memory free extend quick tail give memory kill process score sacrifice child already help miss file manually way install package server edit complete spacy convert value convert value spacy location search version spacy getting page look cache permanently response date look cache current age base date freshness lifetime freshness lifetime request start new connection get link page find link link line find link version version version look cache current age base date freshness lifetime response fresh response requirement already satisfy spacy requirement already satisfied spacy requirement already satisfied spacy requirement already satisfied spacy requirement already satisfied request spacy requirement already satisfied spacy requirement already satisfied wasabi spacy requirement already satisfy spacy requirement already satisfied spacy require

In [7]:
# Bag-of-words settings, gathered in one place so they are easy to tune.
vectorizer_params = {
    'analyzer': 'word',
    'min_df': 2,                         # keep terms found in at least 2 documents
    'stop_words': 'english',             # drop English stop words
    'lowercase': True,                   # lowercase the text before tokenizing
    'token_pattern': '[a-zA-Z0-9]{3,}',  # alphanumeric tokens of 3 or more characters
    'max_features': 50000,               # upper bound on the vocabulary size
}
vectorizer = CountVectorizer(**vectorizer_params)

# Learn the vocabulary from the lemmatized posts and build the document-term
# count matrix, then show the learned vocabulary.
data_vectorized = vectorizer.fit_transform(data_lemmatized)
print(vectorizer.get_feature_names_out())

['abandon' 'ability' 'able' ... 'zip' 'zone' 'zoom']


In [8]:
# Print the sparse document-term matrix; each entry reads "(row, column)  count".
print(data_vectorized)

  (0, 1662)	2
  (0, 1856)	3
  (0, 1551)	3
  (0, 2931)	18
  (0, 1714)	1
  (0, 1951)	1
  (0, 1676)	1
  (0, 2434)	1
  (0, 512)	2
  (0, 2130)	3
  (0, 124)	1
  (0, 2981)	1
  (0, 2118)	1
  (0, 1779)	1
  (0, 1229)	1
  (0, 1096)	1
  (0, 2448)	1
  (0, 3147)	1
  (0, 2369)	1
  (0, 2742)	1
  (0, 2703)	1
  (0, 452)	1
  (0, 1385)	1
  (0, 1889)	1
  (0, 1150)	1
  :	:
  (4999, 1498)	2
  (4999, 2113)	1
  (4999, 3215)	3
  (4999, 1333)	1
  (4999, 749)	1
  (4999, 2093)	1
  (4999, 1468)	1
  (4999, 2142)	1
  (4999, 2858)	2
  (4999, 1227)	1
  (4999, 1626)	1
  (4999, 502)	1
  (4999, 2034)	1
  (4999, 187)	1
  (4999, 160)	1
  (4999, 2446)	1
  (4999, 258)	1
  (4999, 819)	1
  (4999, 2463)	1
  (4999, 2674)	2
  (4999, 2675)	1
  (4999, 748)	1
  (4999, 2603)	1
  (4999, 2935)	1
  (4999, 640)	1


In [9]:
# Sparsicity = percentage of non-zero cells in the document-term matrix.
# The count is taken directly from the sparse matrix: converting it to a dense
# (documents x vocabulary) array only to count entries wastes memory and can
# fail outright on larger samples.
n_rows, n_cols = data_vectorized.shape
nonzero_cells = data_vectorized.count_nonzero()
print("Sparsicity: ", (nonzero_cells / (n_rows * n_cols)) * 100, "%")

Sparsicity:  0.646850084222347 %


In [10]:
# Features: the sparse bag-of-words count matrix produced by the vectorizer.
X = data_vectorized
# Target: the raw "Tags" string of each post, used as one class label per post.
# NOTE(review): every distinct tag combination (e.g. "python pip") becomes its own
# class -- confirm this is intended rather than a per-tag multi-label target.
y = sample_posts["Tags"]

In [11]:
# Hold out 20% of the posts for evaluation. The fixed random_state keeps the
# train/test partition identical between runs, so scores are comparable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=True, random_state=42
)

In [12]:
# Report the dimensions of each train/test partition.
for shape_template, split in (
    ("X_train shape : {}", X_train),
    ("X_test shape : {}", X_test),
    ("y_train shape : {}", y_train),
    ("y_test shape : {}", y_test),
):
    print(shape_template.format(split.shape))

X_train shape : (4000, 3562)
X_test shape : (1000, 3562)
y_train shape : (4000,)
y_test shape : (1000,)


In [13]:
# X_train = vectorizer.fit_transform(X_train)
# X_test = vectorizer.transform(X_test)

In [14]:
def lsa_reduction(X_train, X_test, n_comp=120, random_state=None):
    """Reduce train and test features with an LSA pipeline fitted on the training set.

    The pipeline applies ``TruncatedSVD`` followed by ``Normalizer`` (default
    settings). It is fitted on ``X_train`` only and then used to transform both
    sets, so no information from the test set leaks into the projection.

    Parameters
    ----------
    X_train, X_test
        Feature matrices for the training and test sets.
    n_comp : int
        Number of SVD components to keep.
    random_state : int, RandomState instance or None
        Seed forwarded to ``TruncatedSVD``. The default ``None`` keeps the
        previous unseeded behaviour; pass an int for a reproducible projection.

    Returns
    -------
    tuple
        ``(train_reduced, test_reduced)``.
    """
    lsa_pipe = Pipeline([
        ('svd', TruncatedSVD(n_components=n_comp, random_state=random_state)),
        ('normalize', Normalizer()),
    ])
    lsa_pipe.fit(X_train)

    return lsa_pipe.transform(X_train), lsa_pipe.transform(X_test)

In [15]:
# NOTE(review): these imports sit mid-notebook; consider moving them to the
# import cell at the top so dependencies are visible in one place.
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
# Linear-kernel support vector classifier (base estimator for the one-vs-rest model).
svc = SVC(kernel='linear')
# Logistic regression with default settings; not used in the visible cells.
modelLR = LogisticRegression()

In [16]:
# X_train = vectorizer.fit_transform(X_train)
# y_train = vectorizer.fit_transform(y_train)
# X_test = vectorizer.transform(X_test)
# y_test = vectorizer.transform(y_test)

In [17]:
# One-vs-rest classifier built on the linear-kernel SVC and trained on the
# training split.
# Alternative base estimator tried earlier, kept for reference:
#   OneVsRestClassifier(LinearSVC(random_state=0))
ovr_classifier = OneVsRestClassifier(svc)
model_OVR = ovr_classifier.fit(X_train, y_train)

In [18]:
# Persist the trained classifier under ../ModelsAPI. The context manager makes
# sure the file handle is flushed and closed even if pickling raises.
# NOTE(review): predicting on new text also requires the fitted `vectorizer`;
# confirm it is persisted alongside the model.
with open('../ModelsAPI/model_OVR.pkl', 'wb') as model_file:
    pickle.dump(model_OVR, model_file)

In [19]:
# Predict a tag string for every held-out post, then show the predictions
# followed by the shape of the prediction array.
y_pred = model_OVR.predict(X_test)
print(y_pred, y_pred.shape, sep="\n")

['python pythonunittest ' 'python pandas ' 'python tor '
 'javascript reactjs ' 'python regex ' 'python pyinstaller '
 'python pandas ' 'python python ' 'python pip ' 'python opencv '
 'angular typescript ' 'php laravel ' 'python sqlite ' 'java android '
 'powershell gitlabci ' 'git gitcheckout ' 'python python '
 'python getattr ' 'dart flutter ' 'python pandas ' 'angular typescript '
 'python pip ' 'python pythonimaginglibrary ' 'javascript typescript '
 'ios swift ' 'androidstudio flutter ' 'python pandas ' 'ios swift '
 'python python ' 'python matplotlib ' 'laravel laravel '
 'python matplotlib ' 'git atlassiansourcetree ' 'javascript reactjs '
 'python matplotlib ' 'python pip ' 'python pip ' 'typescript angular '
 'python multiprocessing ' 'arrays swift ' 'angular typescript '
 'html css ' 'css cssanimations ' 'python pandas ' 'python pandas '
 'python pandas ' 'javascript nodejs ' 'python tensorflow '
 'python tensorflow ' 'javascript typescript ' 'python yaml '
 'dart flutter 

In [20]:
# Shape of the prediction array.
y_pred.shape

(1000,)

In [21]:
# Ad-hoc check on a hand-written post.
test = ["Django supports Python. If you're under Linux and want to check the Python version you're using, run python -V from the command line If you want to check the Django version, open a Python console and type"]

# Run the text through the same preprocessing as the training corpus
# (tokenization, then lemmatization) before vectorizing. Feeding raw text to the
# vectorizer would make the model see features it was never trained on.
test_words = list(sent_to_words(test))
test_lemmatized = lemmatization(test_words, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

pred = model_OVR.predict(vectorizer.transform(test_lemmatized))
print(pred)

['python django ']
