### 1. Download Tickets Data

In [None]:
#Download file using wget
!wget https://privdatastorage.blob.core.windows.net/github/support-tickets-classification/datasets/all_tickets.csv --quiet

In [None]:
#Check if the file is available
!ls -l

### 2. Tickets Data Exploration

Load tickets data as dataframe

In [None]:
import pandas as pd
import numpy as np
import textwrap

In [None]:
# Load the downloaded CSV from the working directory into a DataFrame.
# The path is relative; adjust it if the file lives elsewhere.
tix_df = pd.read_csv('all_tickets.csv', sep=',')

In [None]:
#Total number of tickets
tix_df.shape

In [None]:
#Columns in the dataframe
tix_df.columns

In [None]:
#Check the contents of dataframe
tix_df.sample(n=5)

Checking ticket content

In [None]:
my_wrap = textwrap.TextWrapper()

In [None]:
# Pick one ticket at random and show its title, then its body wrapped to the TextWrapper width
tix_num = np.random.randint(0, tix_df.shape[0])
ticket_title = tix_df.loc[tix_num, 'title']
ticket_body = tix_df.loc[tix_num, 'body']
print('Title: ')
print(ticket_title)
print('Body: ')
for wrapped_line in my_wrap.wrap(ticket_body):
    print(wrapped_line)

In [None]:
#Checkout missing values
tix_df.isnull().sum()

Visualization

In [None]:
#Number of tickets per ticket type, as a horizontal bar chart
ticket_type_counts = tix_df['ticket_type'].value_counts()
ticket_type_counts.plot.barh()

In [None]:
#Number of tickets per category, as a horizontal bar chart
category_counts = tix_df['category'].value_counts()
category_counts.plot.barh()

In [None]:
#Number of tickets per impact level, as a horizontal bar chart
impact_counts = tix_df['impact'].value_counts()
impact_counts.plot.barh()

In [None]:
#Number of tickets per urgency level, as a horizontal bar chart
urgency_counts = tix_df['urgency'].value_counts()
urgency_counts.plot.barh()

In [None]:
#Number of tickets per sub-category 1; a larger figure is used for this chart
sub_category1_counts = tix_df['sub_category1'].value_counts()
sub_category1_counts.plot.barh(figsize=(20, 15))

In [None]:
#Sub-category 1 for specific category
#Compare on the string form so the filter matches whether 'category' was parsed as text or as
#integers (isin(['4']) selects no rows on an integer column, which leaves nothing to plot).
category_mask = tix_df['category'].astype(str) == '4'
tix_df.loc[category_mask, 'sub_category1'].value_counts().plot(kind='barh')

### 3. Create Training & Test Dataset

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
#Build a new column which combines 'title' and 'body'.
#A space separator keeps the last word of the title from fusing with the first word of the body.
#Missing values are replaced by empty strings first, so a missing title does not turn into the
#literal token 'nan' and a missing body does not make the combined text NaN.
tix_df['title_body'] = (tix_df['title'].fillna('').astype('str')
                        + ' '
                        + tix_df['body'].fillna('').astype('str'))

#Column to predict
column_to_predict = 'ticket_type'

In [None]:
# Split the combined text (X) and the target column (y) into training and testing sets.
# No test_size is given, so train_test_split's default proportions apply; random_state fixes the shuffle.
features = tix_df['title_body']
target = tix_df[column_to_predict]
trainX, testX, trainY, testY = train_test_split(features, target, random_state=2)

In [None]:
#Training data: shape of the features, then shape of the labels
for train_part in (trainX, trainY):
    print(train_part.shape)

In [None]:
#Test data: shape of the features, then shape of the labels
for test_part in (testX, testY):
    print(test_part.shape)

### 4. Tokenization & Vectorization

Use **CountVectorizer** to turn the ticket text into numeric features.

In [None]:
# import and instantiate CountVectorizer (with the default parameters)
from sklearn.feature_extraction.text import CountVectorizer
cvect = CountVectorizer()

In [None]:
#Learn the vocabulary from the training ticket text
cvect.fit(trainX)

#Check the vocabulary size
len(cvect.vocabulary_)

In [None]:
#Peek at the vocabulary: the first 50 terms in alphabetical order.
#vocabulary_ maps each term to its column index; sorting its keys avoids get_feature_names(),
#which was removed in scikit-learn 1.2, and the slice avoids dumping every term into the output.
sorted(cvect.vocabulary_)[:50]

#### How do we reduce Vocabulary?

1. Stemming / Lemmatization
2. Remove Stop words
3. Remove words that appear very rarely (min_df)
4. Remove words which appear a lot (max_df)

In [None]:
#Stemming
from nltk.stem import PorterStemmer

Build a new CountVectorizer with stemmer

In [None]:
# Callable built from a default CountVectorizer; get_stemmed_text calls it on a document
# and iterates over the tokens it returns
analyzer = CountVectorizer().build_analyzer()
# Porter stemmer instance applied to each token by get_stemmed_text
stemmer = PorterStemmer()

In [None]:
#Function to stem words
def get_stemmed_text(doc):
    """Return a generator of stemmed tokens for one document.

    The document is tokenized with the module-level `analyzer`, and each
    resulting token is passed through the module-level `stemmer`.
    """
    tokens = analyzer(doc)
    return (stemmer.stem(token) for token in tokens)

In [None]:
stem_cvectorizer = CountVectorizer(analyzer=get_stemmed_text)

In [None]:
#This will take a few seconds
stem_cvectorizer.fit(trainX)

#Vocabulary size, read from the fitted vocabulary_ mapping
#(get_feature_names() was removed in scikit-learn 1.2)
len(stem_cvectorizer.vocabulary_)

With stemmer and minimum frequency

In [None]:
stem_min_cvectorizer = CountVectorizer(analyzer=get_stemmed_text, min_df=5)

In [None]:
#This will take a few seconds
stem_min_cvectorizer.fit(trainX)

#Vocabulary size, read from the fitted vocabulary_ mapping
#(get_feature_names() was removed in scikit-learn 1.2)
len(stem_min_cvectorizer.vocabulary_)

In [None]:
#stem_min_cvectorizer.vocabulary_

Build Document-term Matrix (DTM)

In [None]:
#Convert Training text into Count Vectors
trainX_ct = stem_min_cvectorizer.transform(trainX)

In [None]:
#Size of Document Term Matrix
trainX_ct.shape

In [None]:
#Let's check the first record
trainX_ct[0]

In [None]:
#What's there in sparse matrix
print(trainX_ct[0])

From the [scikit-learn documentation](http://scikit-learn.org/stable/modules/feature_extraction.html#text-feature-extraction):

> As most documents will typically use a very small subset of the words used in the corpus, the resulting matrix will have **many feature values that are zeros** (typically more than 99% of them).

> For instance, a collection of 10,000 short text documents (such as emails) will use a vocabulary with a size in the order of 100,000 unique words in total while each document will use 100 to 1000 unique words individually.

> In order to be able to **store such a matrix in memory** but also to **speed up operations**, implementations will typically use a **sparse representation** such as the implementations available in the `scipy.sparse` package.

Convert the test text into numerical features as well

In [None]:
testX_ct = stem_min_cvectorizer.transform(testX)

In [None]:
testX_ct.shape

### 4. Building a Ticket Classifier

Let's first try the K-Nearest Neighbour algorithm

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# instantiate the model (with the default parameters)
knn = KNeighborsClassifier()

# fit the model with data (occurs in-place)
knn.fit(trainX_ct, trainY)

Evaluation on Test Dataset

In [None]:
from sklearn import metrics

In [None]:
#Calculate accuracy on Test Dataset
predicted_test_y = knn.predict(testX_ct)
metrics.accuracy_score(testY, predicted_test_y)

Classification Report

In [None]:
import seaborn as sns
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt

In [None]:
def print_confusion_matrix(testY, predicted_test_y):
    """Plot a confusion-matrix heatmap for a set of predictions.

    The matrix is transposed before plotting, so heatmap rows are predicted
    labels and heatmap columns are true labels.

    Args:
        testY: true labels (array-like).
        predicted_test_y: predicted labels with one entry per element of
            testY; an (n, 1) column of predictions is flattened.

    Returns:
        None. The figure is displayed with plt.show().
    """
    # Flatten both inputs so a column-shaped model output is treated like a 1-D label vector.
    y_true = np.ravel(testY)
    y_pred = np.ravel(predicted_test_y)

    # Use every label seen in either vector for both the matrix and the tick labels.
    # Taking the ticks from testY alone misaligns them whenever a class is predicted
    # that does not occur in testY.
    labels = np.unique(np.concatenate([y_true, y_pred]))
    mat = confusion_matrix(y_true, y_pred, labels=labels)

    plt.figure(figsize=(4, 4))
    sns.set()
    sns.heatmap(mat.T, square=True, annot=True, fmt='d', cbar=False,
                xticklabels=labels,
                yticklabels=labels)

    plt.xlabel('true label')
    plt.ylabel('predicted label')
    plt.show()

In [None]:
print_confusion_matrix(testY, predicted_test_y)

#### We can build Classifier using other algorithms e.g SVM

In [None]:
from sklearn.svm import SVC

In [None]:
#Train an SVM with default parameters
svc = SVC()
svc.fit(trainX_ct, trainY)

In [None]:
#Calculate accuracy on Test Dataset
predicted_test_y_svc = svc.predict(testX_ct)
metrics.accuracy_score(testY, predicted_test_y_svc)

In [None]:
print_confusion_matrix(testY, predicted_test_y_svc)

### 5. Using TF-IDF Vectorizer

In [None]:
# import and instantiate TF-IDF Vectorizer, using the stemming analyzer and min_df=5
# (same analyzer and min_df as stem_min_cvectorizer)
from sklearn.feature_extraction.text import TfidfVectorizer
tvect = TfidfVectorizer(analyzer=get_stemmed_text, min_df=5)

In [None]:
#Fit the TF-IDF vectorizer on the training text
tvect.fit(trainX)

#Check the vocabulary size
len(tvect.vocabulary_)

In [None]:
#Convert Training into numerical values
trainX_tfidf = tvect.transform(trainX)
trainX_tfidf.shape

In [None]:
#Convert the test ticket text to tf-idf vectors with the vectorizer fitted on the training text
testX_tfidf = tvect.transform(testX)

In [None]:
print(trainX_tfidf[0])

Build an SVM

In [None]:
svc_tf = SVC()
svc_tf.fit(trainX_tfidf, trainY)

In [None]:
#Calculate accuracy on Test Dataset
predicted_test_y_svc_tf = svc_tf.predict(testX_tfidf)
metrics.accuracy_score(testY, predicted_test_y_svc_tf)

In [None]:
print_confusion_matrix(testY, predicted_test_y_svc_tf)

In [None]:
svc_tf_b = SVC(class_weight='balanced')
svc_tf_b.fit(trainX_tfidf, trainY)

In [None]:
#Calculate accuracy on Test Dataset
predicted_test_y_svc_tf_b = svc_tf_b.predict(testX_tfidf)
metrics.accuracy_score(testY, predicted_test_y_svc_tf_b)

In [None]:
print_confusion_matrix(testY, predicted_test_y_svc_tf_b)

TF-IDF with ngram



```
tvect_ngram = TfidfVectorizer(ngram_range=(1,2)) 
#Tokens can be made of 1 word or 2 words
```

The movie was awesome

Words as tokens = "The", "movie", "was", "awesome"

ngrams (1,2) -> 
- "The", 
- "movie", 
- "was", 
- "awesome", 
- "The movie", 
- "movie was", 
- "was awesome"

### 7. Building a Deep Learning Model

In [None]:
import tensorflow as tf

We will use the TF-IDF features in this case. These can be replaced by the CountVectorizer features.

In [None]:
#Start building a Keras Sequential Model
tf.keras.backend.clear_session()
model = tf.keras.Sequential()

In [None]:
#Add hidden layers: two fully-connected ReLU layers (100 and 50 units), each followed by 40% dropout.
#The input width is the size of the fitted TF-IDF vocabulary.
model.add(tf.keras.layers.Dense(100, activation='relu', input_shape=(len(tvect.vocabulary_),)))
model.add(tf.keras.layers.Dropout(0.4))
model.add(tf.keras.layers.Dense(50, activation='relu'))
model.add(tf.keras.layers.Dropout(0.4))

#Add Output layer: a single sigmoid unit
#NOTE(review): one sigmoid output presumes ticket_type has exactly two classes encoded as 0/1 - confirm against the data
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

In [None]:
#Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

In [None]:
#NOTE(review): this call passes the TF-IDF matrices returned by tvect.transform directly to Keras,
#and the next cell repeats the same fit on densified copies. Presumably this cell fails on sparse
#input with the TensorFlow version in use - confirm. If it does succeed, running both cells trains
#the model for 20 epochs in total.
model.fit(trainX_tfidf, trainY,
           validation_data=(testX_tfidf, testY), 
           epochs=10, batch_size=32)

In [None]:
#Convert the csr sparse matrices to dense arrays before fitting.
#toarray() returns a plain numpy.ndarray, whereas todense() returns the legacy numpy.matrix type.
model.fit(trainX_tfidf.toarray(), trainY,
          validation_data=(testX_tfidf.toarray(), testY),
          epochs=10, batch_size=32)

In [None]:
#Predict on the test dataset, using a dense ndarray rather than the legacy numpy.matrix from todense()
predicted_test_y_dense = model.predict(testX_tfidf.toarray())

#Convert predictions into 0/1 labels.
#The single-unit output layer yields one column per sample, so flatten it to a 1-D label vector
#and cast the booleans to integers before comparing with testY.
predicted_test_y_dense_binary = (predicted_test_y_dense >= 0.5).astype(int).ravel()

#Calculate score
metrics.accuracy_score(testY, predicted_test_y_dense_binary)

In [None]:
#Loss and accuracy on the test dataset; toarray() gives a plain ndarray instead of the legacy numpy.matrix
model.evaluate(testX_tfidf.toarray(), testY)

In [None]:
print_confusion_matrix(testY, predicted_test_y_dense_binary)