In [1]:
import pandas as pd
import random
import numpy as np
import itertools
import matplotlib.pyplot as plt
from time import time

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer,TfidfTransformer

In [3]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.metrics import f1_score, jaccard_score, accuracy_score, confusion_matrix
from sklearn.svm import LinearSVC

In [4]:
#get labelled data
df_data=pd.read_csv('corpus_labels.csv')
df_data=df_data[['corpus','category']]
df_data.rename(columns={'category':'label'}, inplace=True)
df_data.head()

Unnamed: 0,corpus,label
0,play game half half game point scored play ran...,Sports
1,paper front front paper issue paper error numb...,Computers
2,firework security leave midnight midnight ligh...,Society
3,commercial traffic flight passenger midnight t...,Recreation
4,television television program program show pro...,Arts


In [5]:
#Split data to training and testing data
X_train, X_test, y_train, y_test = train_test_split(df_data['corpus'], df_data['label'], 
                                                    test_size=0.2, random_state=4)
print ('Number of records in training data:', X_train.shape,  y_train.shape)
print ('Number of records in testing data:', X_test.shape,  y_test.shape)

Number of records in training data: (866,) (866,)
Number of records in testing data: (217,) (217,)


In [6]:
######################### Classification NAIVE BAYES #########################

In [7]:
#Feed data vectorizer, transformer and naive bayes classifier object to a pipeline
time_start=time()
obj_pipeline = Pipeline([('vect', TfidfVectorizer()), ('tfidf', TfidfTransformer()), 
                         ('clf', MultinomialNB())])
obj_pipeline.fit(X_train, y_train)
time_taken=time()-time_start
print('Time(seconds) taken to train naive bayes model:',time_taken)
#Predict values using test data
predicted_values = obj_pipeline.predict(X_test)
print('Accuracy Score: %s' % accuracy_score(predicted_values, y_test))
print('Classification Report')
print(metrics.classification_report(y_test, predicted_values))

Time(seconds) taken to train naive bayes model: 0.09142184257507324
Accuracy Score: 0.7188940092165899
Classification Report
              precision    recall  f1-score   support

        Arts       1.00      0.42      0.59        24
    Business       0.94      0.62      0.75        24
   Computers       0.93      0.81      0.87        16
       Games       0.00      0.00      0.00         2
      Health       1.00      0.14      0.25         7
        Home       1.00      0.40      0.57         5
  Recreation       0.00      0.00      0.00        13
     Science       0.00      0.00      0.00         3
     Society       0.60      1.00      0.75        82
      Sports       0.87      0.80      0.84        41

    accuracy                           0.72       217
   macro avg       0.63      0.42      0.46       217
weighted avg       0.73      0.72      0.68       217



  _warn_prf(average, modifier, msg_start, len(result))


In [8]:
#predicted values
predicted_values[0:5]

array(['Arts', 'Business', 'Society', 'Society', 'Society'], dtype='<U10')

In [9]:
#f1-score
print('f1-score')
f1_score(y_test, predicted_values, average='weighted') 

f1-score


0.6752658734289885

In [10]:
#jaccard-score
print('jaccard-score')
jaccard_score(y_test, predicted_values, average='weighted')

jaccard-score


0.5460346270345445

In [11]:
######################### Classification SVM #########################

In [12]:
#Feed data vectorizer, transformer and SVM classifier object to a pipeline
time_start=time()
objPipelineSvm = Pipeline([('vect', TfidfVectorizer()), ('tfidf', TfidfTransformer()), 
                           ('clf', LinearSVC())])
objPipelineSvm.fit(X_train, y_train)
time_taken=time()-time_start
print('Time(seconds) taken to train SVM model:',time_taken)
#Predict values using test data
predicted_values = objPipelineSvm.predict(X_test)
print('Accuracy Score: %s' % accuracy_score(predicted_values, y_test))
print('Classification Report')
print(metrics.classification_report(y_test, predicted_values))

Time(seconds) taken to train SVM model: 0.07739925384521484
Accuracy Score: 0.8064516129032258
Classification Report
              precision    recall  f1-score   support

        Arts       0.67      0.75      0.71        24
    Business       0.88      0.92      0.90        24
   Computers       0.94      0.94      0.94        16
       Games       0.50      0.50      0.50         2
      Health       0.56      0.71      0.63         7
        Home       0.67      0.40      0.50         5
  Recreation       0.60      0.23      0.33        13
     Science       0.60      1.00      0.75         3
     Society       0.83      0.91      0.87        82
      Sports       0.89      0.76      0.82        41

    accuracy                           0.81       217
   macro avg       0.71      0.71      0.69       217
weighted avg       0.80      0.81      0.80       217



In [13]:
#f1-score
print('f1-score')
f1_score(y_test, predicted_values, average='weighted') 

f1-score


0.7968180946549042

In [14]:
#jaccard-score
print('jaccard-score')
jaccard_score(y_test, predicted_values, average='weighted')

jaccard-score


0.6835288917997892

In [None]:
######################### END #########################