# Service Quality Monitoring in Confined Spaces Through Mining Twitter Data

## Task1: Aspect Extraction


### Baseline Approaches: Skip-gram

In [1]:
import warnings
warnings.filterwarnings("ignore")

#general rules
from tabulate import tabulate
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from collections import Counter, defaultdict

# gensim
from gensim.models.word2vec import Word2Vec

# classification tools
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn import svm
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier

# text-preprocessing tools
# python -m spacy download en
import spacy

# gloabl parameters
TRAIN_SET_PATH = "scs.txt"
TEST_SET_PATH = "fss.txt"
encoding="utf-8"

In [2]:
import nltk
# nltk.download('stopwords')
from nltk.stem.porter import *
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

lemmatiser = WordNetLemmatizer()
stemmer = SnowballStemmer(language='english')
nlp = spacy.load('en', disable=['parser', 'ner'])

stops = set(stopwords.words('english'))  # nltk stopwords list

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

def stemming(texts):
    return [[stemmer.stem(word) for word in line] for line in texts]

def remove_stop_words(texts):
    refined_texts = [[word for word in line if word not in stops] for line in texts]
    return refined_texts

def remote_punctuation(texts):
    tokenizer = RegexpTokenizer(r'\w+')
    results = [tokenizer.tokenize(" ".join(line)) for line in texts]
    print(results)

def process_texts(x):
    x1 = lemmatization(x)
    x2 = remove_stop_words(x1)
    x3 = stemming(x2)
    return x3

In [3]:
X, y = [], []

with open(TRAIN_SET_PATH, "r") as infile:
    for line in infile:
        label, text = line.split("|")
        if (label != '-1'):
            X.append(text.split())
            y.append(label)

In [None]:
tokenizer = RegexpTokenizer(r'\w+')
X = [tokenizer.tokenize(" ".join(line)) for line in X]

X, y = np.array(process_texts(X)), np.array(y)
print ("total examples %s" % len(y))

In [5]:
to_be_removed = [i for i,x in enumerate(X) if(len(x)==0)]
to_be_removed.sort(reverse=True)
print(to_be_removed)

for i in to_be_removed:
    print('removing index no:',i)
    X = np.delete(X, i)
    y= np.delete(y, i)

print ("total examples %s" % len(y))

[1291, 1077, 897, 835, 827, 722, 706, 601, 300, 142]
removing index no: 1291
removing index no: 1077
removing index no: 897
removing index no: 835
removing index no: 827
removing index no: 722
removing index no: 706
removing index no: 601
removing index no: 300
removing index no: 142
total examples 1363


In [6]:
# and a tf-idf version of the same
class TfidfEmbeddingVectorizer(object):
    def __init__(self, word2vec):
        self.word2vec = word2vec
        self.word2weight = None
        if len(word2vec)>0:
            self.dim=len(word2vec[next(iter(word2vec))])
        else:
            self.dim=0
        
    def fit(self, X, y):
        tfidf = TfidfVectorizer(analyzer=lambda x: x)
        tfidf.fit(X)
        # if a word was never seen - it must be at least as infrequent
        # as any of the known words - so the default idf is the max of 
        # known idf's
        max_idf = max(tfidf.idf_)
        self.word2weight = defaultdict(
            lambda: max_idf, 
            [(w, tfidf.idf_[i]) for w, i in tfidf.vocabulary_.items()])
    
        return self
    
    def transform(self, X):
        return np.array([
                np.mean([self.word2vec[w] * self.word2weight[w]
                         for w in words if w in self.word2vec] or
                        [np.zeros(self.dim)], axis=0)
                for words in X
            ])

# Adaptive Online Clustering

In [12]:
# from sklearn.metrics.pairwise import cosine_similarity
# from scipy import sparse

# def similarity(d, center):
#     d_sparse = sparse.csr_matrix(d)
#     center_sparse = sparse.csr_matrix(center)
#     return cosine_similarity(d_sparse, center_sparse, dense_output=True)

# def max_similarity(clusters, centers, d):
#     d_sparse = sparse.csr_matrix(d)
#     sims=[]
#     for k,cluster in clusters.items():
#         center_sparse = sparse.csr_matrix(centers[k])
#         sims.append(cosine_similarity(d_sparse, center_sparse, dense_output=True))
#     maxValue = np.max(sims)
#     maxIndex = np.where(sims == maxValue)[0][0]
#     return maxIndex, maxValue

# def recalc_centroid(X, list_of_ids):
#     return np.average(X[list_of_ids],axis=0)

In [13]:
# clusters={}
# centers = {}
# numTopic = 0 
# sigma = 0.5
# maxValue_lst=[]
# for i,d in enumerate(X_vectors):
#     if len(clusters)==0:
#         print('creating a new cluster')
#         clusters[numTopic] = []
#         centers[numTopic] = []
#         clusters[numTopic].append(i)
#         centers[numTopic].append(d)
#         numTopic += 1
#     else:
#         maxIndex, maxValue = max_similarity(clusters, centers, d)
#         maxValue_lst.append(maxValue)
#         if maxValue >= sigma:
#             clusters[maxIndex].append(i)
#             centers[maxIndex] = recalc_centroid(X_vectors, clusters[maxIndex])
#         else:
#             if (np.sum(d)>0):
#                 print('creating new cluster')
#                 clusters[numTopic] = []
#                 centers[numTopic] = []
#                 clusters[numTopic].append(i)
#                 centers[numTopic].append(d)
#                 numTopic += 1

In [14]:
# print('Number of clusters:',len(clusters))

In [15]:
# len([x for x in maxValue_lst if (x>0.9)])/len(maxValue_lst)

# Classification (SVM & Linear Regression & MLP)

In [36]:
X_TEST, y_TEST = [], []

with open(TEST_SET_PATH, "r") as infile:
    for line in infile:
        label, text = line.split("|")
        if (label != '-1'):
            X_TEST.append(text.split())
            y_TEST.append(label)
            
tokenizer = RegexpTokenizer(r'\w+')
X_TEST = [tokenizer.tokenize(" ".join(line)) for line in X_TEST]

X_TEST, y_TEST = np.array(process_texts(X_TEST)), np.array(y_TEST)
print ("total examples %s" % len(y_TEST))

to_be_removed = [i for i,x in enumerate(X_TEST) if(len(x)==0)]
to_be_removed.sort(reverse=True)
print(to_be_removed)

for i in to_be_removed:
    print('removing index no:',i)
    X_TEST = np.delete(X_TEST, i)
    y_TEST= np.delete(y_TEST, i)

print ("total examples %s" % len(y_TEST))

total examples 1190
[769]
removing index no: 769
total examples 1189


In [42]:
y_list_train = []
for i,v in enumerate(y):
    classes = v.split(',')
    classes2=[int(cl.strip()) for cl in classes]
    y_list_train.append(tuple(classes2))

In [43]:
y_list_test = []
for i,v in enumerate(y_TEST):
    classes = v.split(',')
    classes2=[int(cl.strip()) for cl in classes]
    y_list_test.append(tuple(classes2))

In [44]:
def get_classification_report_as_df(report):
    cl_results = report.split()[4:39]
    cl_results_rest = report.split()[39:]
    df = pd.DataFrame(columns=['P','R','F'])
    for i in range(0,len(cl_results),5):
        df.loc[len(df)] = cl_results[i+1:i+4]

    for i in range(0,len(cl_results_rest),6):
        df.loc[len(df)] = cl_results_rest[i+2:i+5]

    return df

In [45]:
def mean_std_from_results(reports_dic):
    mean_result = pd.DataFrame(columns=['P','R','F'])
    std_result = pd.DataFrame(columns=['P','R','F'])
    for j in range(0,len(reports_dic[1])): # for each aspect
        df = pd.DataFrame(columns=['P','R','F'])
        for i in reports_dic: # for each fold
            df.loc[len(df)] = list(reports_dic[i].loc[j])
        df = df.apply(pd.to_numeric)
        mean_result.loc[len(mean_result)] = df.mean()
        std_result.loc[len(std_result)] = df.std()
    return mean_result, std_result

In [46]:
svm_reports_dic=dict()
svm_f1_dic=dict()
svm_ROC_dic=dict()

lr_reports_dic=dict()
lr_f1_dic=dict()
lr_ROC_dic=dict()

mlp_reports_dic=dict()
mlp_f1_dic=dict()
mlp_ROC_dic=dict()

for i in range(1,11):
    
    model = Word2Vec(X, size=100, window=3, min_count=5, workers=2)
    w2v = {w: vec for w, vec in zip(model.wv.index2word, model.wv.syn0)}
    vec = TfidfEmbeddingVectorizer(w2v)
    vec.fit(X,y)
    X_vectors = vec.transform(X)
    X_vectors_TEST = vec.transform(X_TEST)
    
    #SVM
    mlb = MultiLabelBinarizer()
    y_train_transformed = mlb.fit_transform(y_list_train)
    clf = OneVsRestClassifier(svm.SVC(kernel='rbf',C=1,gamma=1/len(y_list_train)))
    clf = clf.fit(X_vectors, y_train_transformed) 
    predicted_labels = clf.predict(X_vectors_TEST)
    y_test_transformed = mlb.fit_transform(y_list_test)
    report = classification_report(y_test_transformed, predicted_labels)
    svm_reports_dic[i] = get_classification_report_as_df(report)
    svm_f1_dic[i] = f1_score(y_test_transformed, predicted_labels, average='micro')
    svm_ROC_dic[i] = roc_auc_score(y_test_transformed, predicted_labels)
    
    #LR
    clf = OneVsRestClassifier(LogisticRegression(random_state=42))
    clf = clf.fit(X_vectors, y_train_transformed) 
    predicted_labels = clf.predict(X_vectors_TEST)
    y_test_transformed = mlb.fit_transform(y_list_test)
    report = classification_report(y_test_transformed, predicted_labels)
    lr_reports_dic[i] = get_classification_report_as_df(report)
    lr_f1_dic[i] = f1_score(y_test_transformed, predicted_labels, average='micro')
    lr_ROC_dic[i] = roc_auc_score(y_test_transformed, predicted_labels)
    
    #MLP
    clf = OneVsRestClassifier(MLPClassifier(solver='lbfgs', alpha=1e-5,
                    hidden_layer_sizes=(512), random_state=1))

    clf.fit(X_vectors, y_train_transformed)
    predicted_labels = clf.predict(X_vectors_TEST)
    y_test_transformed = mlb.fit_transform(y_list_test)
    report = classification_report(y_test_transformed, predicted_labels)
    mlp_reports_dic[i] = get_classification_report_as_df(report)
    mlp_f1_dic[i] = f1_score(y_test_transformed, predicted_labels, average='micro')
    mlp_ROC_dic[i] = roc_auc_score(y_test_transformed, predicted_labels)

In [47]:
svm_mean, svm_std = mean_std_from_results(svm_reports_dic)
lr_mean, lr_std = mean_std_from_results(lr_reports_dic)
mlp_mean, mlp_std = mean_std_from_results(mlp_reports_dic)

svm_mean.to_excel('w2v_svm_mean.xlsx')
svm_std.to_excel('w2v_svm_std.xlsx')

lr_mean.to_excel('w2v_lr_mean.xlsx')
lr_std.to_excel('w2v_lr_std.xlsx')

mlp_mean.to_excel('w2v_mlp_mean.xlsx')
mlp_std.to_excel('w2v_mlp_std.xlsx')

In [60]:
print('svm',np.mean(list(svm_ROC_dic.values())))
print('lr',np.mean(list(lr_ROC_dic.values())))
print('mlp',np.mean(list(mlp_ROC_dic.values())))

svm 0.5
lr 0.5203075036391409
mlp 0.5690304473664938


In [61]:
mlp_mean

Unnamed: 0,P,R,F
0,0.402,0.063,0.106
1,0.127,0.04,0.058
2,0.7,0.451,0.547
3,0.866,0.855,0.859
4,0.43,0.049,0.087
5,0.627,0.069,0.123
6,0.019,0.024,0.022
7,0.828,0.623,0.71
8,0.451,0.22,0.258
9,0.748,0.623,0.647
