In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn import svm
from sklearn.model_selection import GridSearchCV
from sklearn import model_selection
import pickle

from html.parser import HTMLParser
import re
import itertools
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

stopwords_eng = stopwords.words('english')

import string

from unidecode import unidecode

from googletrans import Translator
from autocorrect import Speller

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kenem001\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
df_original = pd.read_csv('../Data/data_generalised_cleaned_us_3.csv')

df = df_original
df = df[df['Translated'].notna()]
df = df[df['Rep_Class'].notna()]

In [3]:
# Data split

def data_split(max):
    global X, y, x_train, x_test, y_train, y_test, train_max, test_max

    X = df.loc[:, 'Translated']
    y = df.loc[:, 'Rep_Class']

    ts = 0.8

    x_train, x_test, y_train, y_test = model_selection.train_test_split(
        X, y, train_size=ts)

    train_max = int(max * ts)
    test_max = int(max * (1 - ts))

    x_train, x_test, y_train, y_test = \
        np.array(x_train[0:train_max]), \
        np.array(x_test[0:test_max]),  \
        np.array(y_train[0:train_max]),  \
        np.array(y_test[0:test_max])


In [4]:
def clean(input_text):
    '''
    This function cleans the body received from e-mails so that features
    to be processed by the Support Vector Machine are extracted from it 
    more effectively

    via https://www.geeksforgeeks.org/python-efficient-text-data-cleaning/
    '''

    apos_dict = {"'s":" is","n't":" not","'m":" am","'ll":" will",
           "'d":" would","'ve":" have","'re":" are"}

    warning_message = 'CAUTION: This email originated from OUTSIDE the Government Email Infrastructure. DO NOT CLICK LINKS or OPEN attachments unless you recognise the sender and know the content is safe.'

    if warning_message in input_text:
        input_text = re.sub(warning_message, '', input_text)

    # Convert to unicode
    input_text = unidecode(input_text) 

    # Remove URLs, hashtags
    input_text = re.sub(r'https?:\/\/.\S+', '', input_text)
    input_text = re.sub(r'#', '', input_text)

    # Contraction replacement
    for key, value in apos_dict.items():
        if key in input_text:
            input_text = input_text.replace(key,value)

    # Split attached words
    input_text = ' '.join([s for s in re.split('([A-Z][a-z]+[^A-Z]*)', input_text) if s])

    # Remove punctuation
    input_text_list = []
    for word in input_text:
        if word not in string.punctuation:
            input_text_list.append(word)

    input_text_list = ''.join(input_text_list).split(' ')
    input_text = ' '.join(input_text_list)

    # Translation + Spell check
    input_text = re.sub(r'(?<![\w\d])kons(?![\w\d])', 'konsumatur', input_text)
    input_text = re.sub(r'(?<![\w\d])cons(?![\w\d])', 'consumer', input_text)

    # translator = Translator()

    # lang = translator.detect(input_text).lang

    # input_text = translator.translate(input_text, dest='en').text
    # input_text_list = input_text.split(' ')

    spell = Speller(lang='en')
    
    for i in range(len(input_text_list)):
        input_text_list[i] = spell(input_text_list[i])

    input_text = ' '.join(input_text_list)

    # Convert to lowercase
    input_text = input_text.lower()

    # Remove stopwords
    exceptions = ['not']
    ignore = ['hi', 'hello', 'dear', 'sir', 'madam', 'ms', 'mr', 'regards']

    input_text_tokens = input_text.split()

    input_text_list = []
    for word in input_text_tokens:
        if (word not in stopwords_eng) and (word not in ignore) or (word in exceptions):
            input_text_list.append(word)

    input_text = ' '.join(input_text_list)

    input_text = re.sub(' +', ' ', input_text)

    return input_text, lang

In [6]:
best = 0

vec = CountVectorizer()

while True:
    data_split(df.shape[0])
    features = vec.fit_transform(x_train)
    model = svm.SVC(kernel='linear', C=1, gamma=0.01, probability=True)
    test = vec.transform(x_test)

    model.fit(features, y_train)

    score = model.score(test, y_test)

    if score > best:
        best = score
        with open("model.pkl", "wb") as f:
            pickle.dump(model, f)

        with open("vec.pkl", "wb") as f:
            pickle.dump(vec, f)

        print(best)


0.5809304113802384
0.5859284890426759
0.5990003844675125
0.5993848519800077
0.6032295271049596
0.6055363321799307
0.6151480199923106
0.615916955017301
0.6163014225297963


In [None]:
best = 0

vec = CountVectorizer()

while True:
    data_split(df.shape[0])
    features = vec.fit_transform(x_train)

    tuned_parameters = {
        'kernel': ['linear'],
        'gamma': [0.001, 0.01, 0.1],
        'C': [0.1, 1, 10]
    }

    model = GridSearchCV(svm.SVC(probability=True), tuned_parameters)
    test = vec.transform(x_test)

    model.fit(features, y_train)

    score = model.score(test, y_test)

    if score > best:
        best = score
        with open("model.pkl", "wb") as f:
            pickle.dump(model, f)

        with open("vec.pkl", "wb") as f:
            pickle.dump(vec, f)

        print(best)


0.6036139946174548
0.6143790849673203
0.6147635524798154
0.6166858900422915


KeyboardInterrupt: 

In [None]:
vec = CountVectorizer()
data_split(1000)
features = vec.fit_transform(x_train)
model = svm.SVC(kernel='linear', C=1, gamma=0.01, probability=True)
test = vec.transform(x_test)

model.fit(features, y_train)

score = model.score(test, y_test)
print(score)

0.5276381909547738


In [None]:
model.classes_

array(['Contract and Sales', 'Delivery of Goods/Provision of Service',
       'Flights', 'Invoicing/Billing', 'Pricing Tariff',
       'Quality of Goods and Service', 'Unfair Commercial Practice',
       'Warranty/Statutory Commercial Guarantees'], dtype=object)

In [None]:
vec = CountVectorizer()
data_split(df.shape[0])
features = vec.fit_transform(x_train)

tuned_parameters = {
    'kernel' : ['linear'],
    'gamma' : [0.001, 0.01, 0.1],
    'C' : [0.1, 1, 10]
}
model = GridSearchCV(svm.SVC(probability=True), tuned_parameters)
test = vec.transform(x_test)

model.fit(features, y_train)

score = model.score(test, y_test)

print(score)

0.6036139946174548
