In [53]:
#Needed Packages 
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn import svm
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn import metrics
from sklearn.model_selection import GridSearchCV


In [54]:
#load the prepreocessed dataframe into jupiter Notebook 
df= pd.read_excel (r'preprocessed_data.xlsx')

In [55]:
#visualize the data head
#data consits of the label = product_category 
#and the the text data, which consits of the title, body and tag 
df.head(100)

Unnamed: 0,product_category,sum
0,4,My html form won't POST to mysql <p>Hello i ha...
1,4,Counting number of characters in a WCHAR strin...
2,3,How to get the appropriate ComboBox Tag <p>I'm...
3,0,Angular loop is not updating <p>Just started o...
4,4,"why vector is always slower than C array, at l..."
...,...,...
95,3,Using more than one object to iterate though i...
96,4,How to track position of an Android device in ...
97,2,Setting a class' metaclass using a decorator <...
98,1,Java compiler automatically renaming parameter...


In [56]:
#visualize the data info 
#the data has 3899 entries 
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 2 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   product_category  10000 non-null  int64 
 1   sum               10000 non-null  object
dtypes: int64(1), object(1)
memory usage: 156.4+ KB


In [57]:
#execute the test-training split training =8000 entries, test=2000 entries
df = df.sample(frac=1).reset_index(drop=True)

In [58]:
#test-training-split (training 80% test 20 %)
#calculating the split 
train_ =len(df)/100*80
test_ =len(df)/100*20

#assigning the splited dataframe 
train=df.head(int(train_))
test=df.tail(int(test_))

In [59]:
#visualize train and test info 
#train.info()
#test.info()

In [60]:
#create the a list out of the dataframe 
x_traning_target=train["product_category"].values.tolist()
y_training_text=train["sum"].values.tolist()
x_test_target=test['product_category'].values.tolist()
y_test_text=test['sum'].values.tolist()

In [61]:
#TEXT PREPROCESSING 
#tokenizing and filtering of stopword 
#these is executed using the CountVectorizer (as in class)
#the package buids a dictionary of features and trainforms them 
#into a feature vector: 
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(y_training_text)
X_train_counts.shape


(8000, 76597)

In [62]:
#The value of a word vocab is linted towords its frequency 
#in the whole training dataa
count_vect.vocabulary_.get(u'algorithm')

11432

In [63]:
#Term Frequency times Inverse Document Frequency
#Avoiding the potial discrepancies of long documents 
tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
X_train_tf = tf_transformer.transform(X_train_counts)
X_train_tf.shape


tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(8000, 76597)

In [64]:
#continue
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(8000, 76597)

In [65]:
#Naive Bayes 
#Training of the classifier
clf = MultinomialNB().fit(X_train_tfidf, x_traning_target)

In [66]:
#Define a list of the product categories
target_names_=['javascript','java','python','c#','others']


In [67]:
#testing the Naive Bayes Classifizication
docs_new = ['Hier kann eine Probefrage rein!']

X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

#execute the prediction
predicted = clf.predict(X_new_tfidf)

#Output of the prediction
for doc, category in zip(docs_new, predicted):
    print('%r => %s' % (doc, target_names_[predicted[0]]))

'Hier kann eine Probefrage rein!' => others


In [68]:
#create a pipeline 
text_clf = Pipeline([('vect', CountVectorizer()),('tfidf', TfidfTransformer()),('clf', MultinomialNB()),])
text_clf.fit(y_training_text,x_traning_target)

Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('clf', MultinomialNB())])

In [69]:
#testing the classifier 
docs_test =y_test_text
predicted = text_clf.predict(docs_test)
np.mean(predicted ==x_test_target)

0.643

In [70]:
#improve the classivier by Stochastic Gradient Descent 
text_clf = Pipeline([('vect', CountVectorizer()),('tfidf', TfidfTransformer()),
                     ('clf', SGDClassifier(loss= 'squared_loss', penalty='elasticnet',
 alpha=1e-3, random_state=42, max_iter=30, tol=None)),])

In [71]:
#train the model 
text_clf.fit(y_training_text,x_traning_target)

Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('clf',
                 SGDClassifier(alpha=0.001, loss='squared_loss', max_iter=30,
                               penalty='elasticnet', random_state=42,
                               tol=None))])

In [72]:
#testing the model and returing a prediction mean 
predicted = text_clf.predict(y_test_text)
np.mean(predicted == x_test_target)

0.7815

In [73]:
#creating a report 
print(metrics.classification_report(x_test_target, predicted, target_names=target_names_))

              precision    recall  f1-score   support

  javascript       0.89      0.50      0.64       220
        java       0.83      0.51      0.63       166
      python       0.97      0.58      0.73       181
          c#       0.79      0.07      0.14       147
      others       0.76      0.97      0.85      1286

    accuracy                           0.78      2000
   macro avg       0.85      0.53      0.60      2000
weighted avg       0.80      0.78      0.75      2000



In [74]:
#crate a matix of the 
metrics.confusion_matrix(x_test_target, predicted)

array([[ 110,    0,    1,    0,  109],
       [   0,   85,    0,    1,   80],
       [   0,    0,  105,    0,   76],
       [   2,    0,    0,   11,  134],
       [  12,   18,    2,    2, 1252]], dtype=int64)

In [75]:

parameters = {
'vect__ngram_range': [(1, 1), (1, 2)],
 'tfidf__use_idf': (True, False),
'clf__alpha': (1e-2, 1e-3),}

In [76]:
gs_clf = GridSearchCV(text_clf, parameters, cv=5, n_jobs=-1)

In [77]:
gs_clf = gs_clf.fit(y_training_text[:600],x_traning_target[:600])

In [78]:
target_names_[gs_clf.predict(['python python python'])[0]]

'python'

In [79]:
gs_clf.best_score_
for param_name in sorted(parameters.keys()): print("%s: %r" % (param_name, gs_clf.best_params_[param_name]))

clf__alpha: 0.001
tfidf__use_idf: True
vect__ngram_range: (1, 1)
