# NLP classification

In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from tqdm import tqdm
from sklearn.svm import SVC
from keras.models import Sequential
from keras.layers.recurrent import LSTM, GRU
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.embeddings import Embedding
from keras.layers.normalization import BatchNormalization
from keras.utils import np_utils
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB
from keras.layers import GlobalMaxPooling1D, Conv1D, MaxPooling1D, Flatten, Bidirectional, SpatialDropout1D
from keras.preprocessing import sequence, text
from keras.callbacks import EarlyStopping
from nltk import word_tokenize
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
data=pd.read_csv('UTS_TT_DATA.csv')
data.head()

Unnamed: 0,row_number,affected_service,priority,description,title
0,13133,Other,Minor,"NTD ODU red. Rebooted in isolation, Still red....",NWAS-OTHER-*ANOM_ID36945*-ANOM_ID5775
1,17314,Intermittent signal,Minor,EU is reporting slow internet connection.\r\nP...,NWAS - Dropouts -*ANOM_ID42107*- ANOM_ID14570
2,17281,Loss of Signal,Minor,ISSUE: Red ODU\r\n\r\nService Address:|Addres...,NWAS -*ANOM_ID5433*-*ANOM_ID13853*
3,3252,Intermittent signal,Medium,Fault Symptom: Drop Outs\r\n\r\nHave you check...,NWAS || Intermittent || ANOM_ID47835 || -*ANOM...
4,10699,Loss of Signal,Minor,EU is unable to connect to the internet.\r\nPr...,NWAS||SERVICE LOSS||`ICE LOSS||-*ANOM_ID58216*...


In [3]:
def multiclass_logloss(actual, predicted, eps=1e-15):
    """Multi class version of Logarithmic Loss metric.
    :param actual: Array containing the actual target classes
    :param predicted: Matrix with class predictions, one probability per class
    """
    # Convert 'actual' to a binary array if it's not already:
    if len(actual.shape) == 1:
        actual2 = np.zeros((actual.shape[0], predicted.shape[1]))
        for i, val in enumerate(actual):
            actual2[i, val] = 1
        actual = actual2

    clip = np.clip(predicted, eps, 1 - eps)
    rows = actual.shape[0]
    vsota = np.sum(actual * np.log(clip))
    return -1.0 / rows * vsota

In [4]:
data.affected_service.unique()

array(['Other', 'Intermittent signal', 'Loss of Signal', 'Dead UNI-D',
       'Signal degradation', 'Dead WNTD', 'Physical damage',
       'Packet Loss', 'Dead UNI-V', 'Power Supply Unit',
       'Intermittent Power', nan, 'No dataflow', 'Slow speed'],
      dtype=object)

In [4]:
lbl_enc = preprocessing.LabelEncoder()
y = lbl_enc.fit_transform(data.affected_service.values)

TypeError: argument must be a string or number

In [9]:
data['normizateddescription'] = data['description'].apply(lambda i:nltk.word_tokenize(text=i) )

In [10]:
data['description'][1]

'EU is reporting slow internet connection.\r\nProvisioning between NBN and Optus has been checked and no issues found.\r\nOptus systems are showing no sync.\r\nEU has reported visible no damage to NBN equipment.\r\nEU has replaced Ethernet cable between WNTD and RG.\r\nEU has tested using an Ethernet device connected directly to the WNTD and experienced fault.\r\n\r\nPlease investigate.\r\n\r\nOpen orders - none\r\n\r\nFault Type - Intermittent service/dropouts\r\n\r\n##DIAGNOSTIC QUESTIONS##\r\nHas an isolation test been run? - Yes\r\nAny active incidents at time of fault? - No\r\nPower Status - Answer not provided\r\nPower Behaviour - Answer not provided\r\nLED Status - Answer not provided\r\nLED Behaviour - Answer not provided\r\nODU Status - Answer not provided\r\nODU Behaviour - Answer not provided\r\nUNI-D Status - Answer not provided\r\nUNI-D Behaviour - Answer not provided\r\nSignal Strengths Status - Answer not provided\r\nPorts being used - Answer not provided\r\nNTD Serial N

In [12]:
data['normizateddescription'][1]

['EU',
 'is',
 'reporting',
 'slow',
 'internet',
 'connection',
 '.',
 'Provisioning',
 'between',
 'NBN',
 'and',
 'Optus',
 'has',
 'been',
 'checked',
 'and',
 'no',
 'issues',
 'found',
 '.',
 'Optus',
 'systems',
 'are',
 'showing',
 'no',
 'sync',
 '.',
 'EU',
 'has',
 'reported',
 'visible',
 'no',
 'damage',
 'to',
 'NBN',
 'equipment',
 '.',
 'EU',
 'has',
 'replaced',
 'Ethernet',
 'cable',
 'between',
 'WNTD',
 'and',
 'RG',
 '.',
 'EU',
 'has',
 'tested',
 'using',
 'an',
 'Ethernet',
 'device',
 'connected',
 'directly',
 'to',
 'the',
 'WNTD',
 'and',
 'experienced',
 'fault',
 '.',
 'Please',
 'investigate',
 '.',
 'Open',
 'orders',
 '-',
 'none',
 'Fault',
 'Type',
 '-',
 'Intermittent',
 'service/dropouts',
 '#',
 '#',
 'DIAGNOSTIC',
 'QUESTIONS',
 '#',
 '#',
 'Has',
 'an',
 'isolation',
 'test',
 'been',
 'run',
 '?',
 '-',
 'Yes',
 'Any',
 'active',
 'incidents',
 'at',
 'time',
 'of',
 'fault',
 '?',
 '-',
 'No',
 'Power',
 'Status',
 '-',
 'Answer',
 'not',
 'pro

In [13]:
def multiclass_logloss(actual, predicted, eps=1e-15):
    # Convert 'actual' to a binary array if it's not already:
    if len(actual.shape) == 1:
        actual2 = np.zeros((actual.shape[0], predicted.shape[1]))
        for i, val in enumerate(actual):
            actual2[i, val] = 1
        actual = actual2

    clip = np.clip(predicted, eps, 1 - eps)
    rows = actual.shape[0]
    vsota = np.sum(actual * np.log(clip))
    return -1.0 / rows * vsota

In [14]:
data.dropna(subset = ["affected_service"], inplace=True)
lbl_enc = preprocessing.LabelEncoder()
y = lbl_enc.fit_transform(data.affected_service.values)

In [15]:
xtrain, xvalid, ytrain, yvalid = train_test_split(data.normizateddescription.values, y, 
                                                  stratify=y, 
                                                  random_state=42, 
                                                  test_size=0.1, shuffle=True)

In [16]:
print (xtrain.shape)
print (xvalid.shape)

(17950,)
(1995,)


### very first model: TF-IDF(Term Frequency - Inverse Document Frequency)+LR(Logistic Regression)

In [41]:
tfv = TfidfVectorizer(min_df=3,  max_features=None, 
            strip_accents='unicode', analyzer='word',token_pattern=r'\w{1,}',
            ngram_range=(1, 3), use_idf=1,smooth_idf=1,sublinear_tf=1,
            stop_words = 'english')

# Fitting TF-IDF to both training and test sets (semi-supervised learning)
tfv.fit(list(xtrain) + list(xvalid))
xtrain_tfv =  tfv.transform(xtrain) 
xvalid_tfv = tfv.transform(xvalid)

TypeError: 'list' object is not callable

In [5]:
data.affected_service.unique()

array(['Other', 'Intermittent signal', 'Loss of Signal', 'Dead UNI-D',
       'Signal degradation', 'Dead WNTD', 'Physical damage',
       'Packet Loss', 'Dead UNI-V', 'Power Supply Unit',
       'Intermittent Power', nan, 'No dataflow', 'Slow speed'],
      dtype=object)