In [103]:
#Needed Packages 
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn import svm
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn import metrics
from sklearn.model_selection import GridSearchCV


In [104]:
#load the prepreocessed dataframe into jupiter Notebook 
df= pd.read_excel (r'preprocessed_data.xlsx')

In [105]:
#visualize the data head
#data consits of the label = product_category 
#and the the text data, which consits of the title, body and tag 
df.head(100)

Unnamed: 0,sum,product_category
0,Angular loop is not updating <p>Just started o...,0
1,Calculate input of textbox and update another ...,0
2,"Inline ""display: none;"" is being automatically...",0
3,What does 'this.x1 = options.x1 || 0'; mean in...,0
4,Assign a text file to a js variable? <p>Is the...,0
...,...,...
95,HTML Page Being Loaded Into DIV Via Navigation...,0
96,FullCalendar display HTML in event title <p>I ...,0
97,"Make code wait for popup to open, then scrape ...",0
98,Why is this javascript object property undefin...,0


In [106]:
#visualize the data info 
#the data has 3899 entries 
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3900 entries, 0 to 3899
Data columns (total 2 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   sum               3900 non-null   object
 1   product_category  3900 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 61.1+ KB


In [107]:
#execute the test-training split training =8000 entries, test=2000 entries
df = df.sample(frac=1).reset_index(drop=True)

In [108]:
#test-training-split (training 80% test 20 %)
#calculating the split 
train_ =len(df)/100*90
test_ =len(df)/100*10

#assigning the splited dataframe 
train=df.head(int(train_))
test=df.tail(int(test_))

In [109]:
#visualize train and test info 
#train.info()
#test.info()

In [110]:
#create the a list out of the dataframe 
x_traning_target=train["product_category"].values.tolist()
y_training_text=train["sum"].values.tolist()
x_test_target=test['product_category'].values.tolist()
y_test_text=test['sum'].values.tolist()

In [111]:
#TEXT PREPROCESSING 
#tokenizing and filtering of stopword 
#these is executed using the CountVectorizer (as in class)
#the package buids a dictionary of features and trainforms them 
#into a feature vector: 
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(y_training_text)
X_train_counts.shape


(3510, 42840)

In [112]:
#The value of a word vocab is linted towords its frequency 
#in the whole training dataa
count_vect.vocabulary_.get(u'algorithm')

6135

In [113]:
#Term Frequency times Inverse Document Frequency
#Avoiding the potial discrepancies of long documents 
tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
X_train_tf = tf_transformer.transform(X_train_counts)
X_train_tf.shape


tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(3510, 42840)

In [114]:
#continue
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(3510, 42840)

In [115]:
#Naive Bayes 
#Training of the classifier
clf = MultinomialNB().fit(X_train_tfidf, x_traning_target)

In [116]:
#Define a list of the product categories
target_names_=['javascript','java','python','c#','others']


In [117]:
#testing the Naive Bayes Classifizication
docs_new = ['Hier kann eine Probefrage rein!']

X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

#execute the prediction
predicted = clf.predict(X_new_tfidf)

#Output of the prediction
for doc, category in zip(docs_new, predicted):
    print('%r => %s' % (doc, target_names_[predicted[0]]))

'Hier kann eine Probefrage rein!' => others


In [118]:
#create a pipeline 
text_clf = Pipeline([('vect', CountVectorizer()),('tfidf', TfidfTransformer()),('clf', MultinomialNB()),])
text_clf.fit(y_training_text,x_traning_target)

Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('clf', MultinomialNB())])

In [119]:
#testing the classifier 
docs_test =y_test_text
predicted = text_clf.predict(docs_test)
np.mean(predicted ==x_test_target)

0.41025641025641024

In [120]:
#improve the classivier by Stochastic Gradient Descent 
text_clf = Pipeline([('vect', CountVectorizer()),('tfidf', TfidfTransformer()),
                     ('clf', SGDClassifier(loss= 'squared_loss', penalty='elasticnet',
 alpha=1e-3, random_state=42, max_iter=30, tol=None)),])

In [121]:
#train the model 
text_clf.fit(y_training_text,x_traning_target)

Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('clf',
                 SGDClassifier(alpha=0.001, loss='squared_loss', max_iter=30,
                               penalty='elasticnet', random_state=42,
                               tol=None))])

In [122]:
#testing the model and returing a prediction mean 
predicted = text_clf.predict(y_test_text)
np.mean(predicted == x_test_target)

0.7846153846153846

In [123]:
#creating a report 
print(metrics.classification_report(x_test_target, predicted, target_names=target_names_))

              precision    recall  f1-score   support

  javascript       0.89      0.77      0.83        62
        java       0.92      0.78      0.84        58
      python       0.94      0.88      0.91        56
          c#       0.81      0.42      0.56        59
      others       0.68      0.90      0.77       155

    accuracy                           0.78       390
   macro avg       0.85      0.75      0.78       390
weighted avg       0.81      0.78      0.78       390



In [124]:
#crate a matix of the 
metrics.confusion_matrix(x_test_target, predicted)

array([[ 48,   0,   1,   1,  12],
       [  1,  45,   0,   0,  12],
       [  0,   0,  49,   0,   7],
       [  0,   0,   0,  25,  34],
       [  5,   4,   2,   5, 139]], dtype=int64)

In [125]:

parameters = {
'vect__ngram_range': [(1, 1), (1, 2)],
 'tfidf__use_idf': (True, False),
'clf__alpha': (1e-2, 1e-3),}

In [126]:
gs_clf = GridSearchCV(text_clf, parameters, cv=5, n_jobs=-1)

In [127]:
gs_clf = gs_clf.fit(y_training_text[:600],x_traning_target[:600])

In [128]:
target_names_[gs_clf.predict(['python python python'])[0]]

'python'

In [129]:
gs_clf.best_score_
for param_name in sorted(parameters.keys()): print("%s: %r" % (param_name, gs_clf.best_params_[param_name]))

clf__alpha: 0.001
tfidf__use_idf: False
vect__ngram_range: (1, 2)
