# Service Quality Monitoring in Confined Spaces Through Mining Twitter Data

## Task1: Aspect Extraction

### Baseline Approaches: LDA

In [2]:
import warnings
warnings.filterwarnings("ignore")

# General tools
import os
import re
from collections import Counter, defaultdict
import pandas as pd
import numpy as np
from tabulate import tabulate


# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

# text-processing tools
import spacy
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer


# classification tools
from sklearn import svm
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier


# global parameters
# Data file locations, resolved relative to the notebook's working directory.
# TRAIN_SET_PATH is passed to preprocess_data() to build the training corpus,
# TEST_SET_PATH to build the evaluation corpus.
TRAIN_SET_PATH = "scs.txt"
TEST_SET_PATH = "fss.txt"

In [3]:
# NLTK Stop words
# Start from NLTK's English stop-word list and append a few extra tokens.
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])
# spaCy pipeline used by lemmatization(); the parser and NER components are
# disabled because only lemmas and part-of-speech tags are read from it.
nlp = spacy.load('en', disable=['parser', 'ner'])
# English Snowball stemmer used by stemming().
stemmer = SnowballStemmer(language='english')

In [4]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    """Tokenize every document and drop stop words.

    Each item of ``texts`` is converted with ``str`` and tokenized by
    ``simple_preprocess``; tokens present in the module-level ``stop_words``
    list are discarded.

    Returns:
        A list holding one list of remaining tokens per input document.
    """
    # Snapshot the stop-word list into a set once per call so each membership
    # test is a hash lookup rather than a linear scan of the whole list.
    stop_set = set(stop_words)
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_set] for doc in texts]

def make_bigrams(texts, bigram_mod):
    """Apply the bigram phrase model to every tokenized document.

    ``bigram_mod`` is indexed with each document in turn and the results are
    collected, in order, into a new list.
    """
    merged_docs = []
    for tokens in texts:
        merged_docs.append(bigram_mod[tokens])
    return merged_docs

def make_trigrams(texts, bigram_mod, trigram_mod):
    """Apply the bigram model and then the trigram model to every document.

    For each document the bigram model is applied first and its output is fed
    to the trigram model; results are returned as a list in input order.
    """
    merged_docs = []
    for tokens in texts:
        with_bigrams = bigram_mod[tokens]
        merged_docs.append(trigram_mod[with_bigrams])
    return merged_docs

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Lemmatize tokenized documents, keeping only selected parts of speech.

    Each document's tokens are joined with single spaces and run through the
    module-level spaCy pipeline ``nlp``; the lemma of every token whose
    ``pos_`` is in ``allowed_postags`` is kept.

    Tag reference: https://spacy.io/api/annotation

    Returns:
        A list with one list of lemmas per input document.
    """
    return [
        [tok.lemma_ for tok in nlp(" ".join(tokens)) if tok.pos_ in allowed_postags]
        for tokens in texts
    ]

def stemming(texts):
    """Stem every token of every document with the module-level ``stemmer``.

    Returns:
        A list with one list of stemmed tokens per input document.
    """
    stemmed_docs = []
    for tokens in texts:
        stemmed_docs.append([stemmer.stem(tok) for tok in tokens])
    return stemmed_docs

In [5]:
def sent_to_words(sentences):
    """Lazily tokenize sentences with gensim's ``simple_preprocess``.

    Every sentence is converted with ``str`` first. ``deacc=True`` asks
    ``simple_preprocess`` to strip accent marks from the tokens.

    Yields:
        One list of tokens per input sentence, in input order.
    """
    yield from (
        gensim.utils.simple_preprocess(str(raw_sentence), deacc=True)
        for raw_sentence in sentences
    )
        
def read_data(path):
    """Load a pipe-separated ``aspect|txt`` file and tokenize its text column.

    Args:
        path: Location of the data file. It is read without a header row,
            using ``|`` as separator and latin1 encoding; its two columns are
            named ``aspect`` and ``txt``.

    Returns:
        A ``(data_words, df)`` tuple: ``data_words`` is a list with one token
        list per kept row, and ``df`` is the filtered DataFrame (rows whose
        ``aspect`` equals the string ``'-1'`` are dropped and the index is
        reset), so both are aligned row by row.
    """
    # Read the file the caller asked for. Previously TRAIN_SET_PATH was
    # hard-coded here, so requesting the test set silently returned the
    # training data.
    df = pd.read_csv(path, sep="|", header=None, encoding="latin1")
    df.columns = ["aspect", "txt"]
    df = df[df.aspect!='-1'].reset_index(drop=True)

    # Convert to list
    data = df.txt.values.tolist()

    # Remove every whitespace-delimited token containing '@'
    # (e-mail addresses, and therefore also @mentions)
    data = [re.sub(r'\S*@\S*\s?', '', sent) for sent in data]

    # Collapse whitespace runs (including new lines) into a single space
    data = [re.sub(r'\s+', ' ', sent) for sent in data]

    # Remove distracting single quotes
    data = [re.sub("\'", "", sent) for sent in data]
    data_words = list(sent_to_words(data))

    return data_words,df

def preprocess_data(path):
    """Read a data file and turn its text into stemmed token lists.

    Pipeline: ``read_data`` -> bigram merging -> lemmatization (nouns,
    adjectives, verbs, adverbs only) -> stop-word removal -> stemming.

    Args:
        path: Data file location, forwarded to ``read_data``.

    Returns:
        A ``(data_stemmed, df)`` tuple: the processed token lists and the
        DataFrame returned by ``read_data``.
    """
    data_words,df = read_data(path)

    # Learn bigram phrases from the corpus (higher threshold -> fewer phrases)
    # and wrap the model in a Phraser for faster application. The trigram
    # model that used to be trained here was never applied, so it is no longer
    # built.
    bigram = gensim.models.Phrases(data_words, min_count=3, threshold=10)
    bigram_mod = gensim.models.phrases.Phraser(bigram)

    # Form Bigrams
    data_words_bigrams = make_bigrams(data_words, bigram_mod)

    data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])
    data_words_nostops = remove_stopwords(data_lemmatized)
    data_stemmed = stemming(data_words_nostops)

    return data_stemmed,df

In [6]:
# transform y to features
def y_to_feature(y):
    """Parse comma-separated label strings into tuples of ints.

    Each element of ``y`` is split on commas; every piece is stripped of
    surrounding whitespace and converted with ``int``.

    Returns:
        A list with one tuple of integer labels per input element.
    """
    return [
        tuple(int(piece.strip()) for piece in raw_labels.split(','))
        for raw_labels in y
    ]

# Creating the Dictionary and Corpus needed for Topic Modeling

In [7]:
# Preprocess the training file and parse its aspect labels into tuples of ints
texts,df = preprocess_data(TRAIN_SET_PATH)
y_train = y_to_feature(list(df.aspect))

# Create Dictionary: token <-> integer id mapping built from the training texts
id2word = corpora.Dictionary(texts)

# Create Corpus
# Term Document Frequency: each document becomes a bag-of-words of (token id, count) pairs
corpus = [id2word.doc2bow(text) for text in texts]

# Building the Topic Model

In [8]:
# Machine-specific locations of the MALLET installation.
# mallet_path is handed to gensim's LdaMallet wrapper in the classification cells.
mallet_path = '/usr/bin/mallet' # update this path
os.environ['MALLET_HOME'] = '/usr/mallet'

# Classification

In [26]:
# Preprocess the test file and parse its aspect labels into tuples of ints
texts_test,df_test = preprocess_data(TEST_SET_PATH)
y_test = y_to_feature(list(df_test.aspect))
# Dictionary of the test vocabulary; kept for inspection only and NOT used to
# encode the test corpus (see below).
id2word_test = corpora.Dictionary(texts_test)

# Create Corpus
# Term Document Frequency
# Encode the test documents with the TRAINING dictionary so that token ids
# mean the same words the LDA model was trained on. A separately built
# dictionary assigns its own ids, which would feed the model unrelated words.
# Tokens absent from the training dictionary are skipped by doc2bow.
corpus_test = [id2word.doc2bow(text) for text in texts_test]

# Classification for 6 topics (10 runs)

In [27]:
def get_features(lda_model, corpus):
    """Turn per-document topic output into plain feature vectors.

    ``lda_model[corpus]`` is iterated; each item is expected to be a sequence
    of 2-element pairs, and only the second element of every pair is kept.

    Returns:
        A list with one list of values per document.
    """
    return [
        [weight for _, weight in doc_topics]
        for doc_topics in lda_model[corpus]
    ]

def get_classification_report_as_df(report, n_classes=7):
    """Parse a text classification report into a precision/recall/F1 table.

    The report is split on whitespace and read positionally:

    * the first 4 tokens are skipped as the column header;
    * the next ``n_classes`` rows have 5 tokens each
      (label, precision, recall, f1, support);
    * every remaining row has 6 tokens
      (two-word name such as ``micro avg``, precision, recall, f1, support).

    Labels containing whitespace, or summary rows with a different token
    count, are not supported by this positional parsing.

    Args:
        report: Report text to parse.
        n_classes: Number of per-class rows in the report. Defaults to 7,
            the value that was previously hard-coded.

    Returns:
        A DataFrame with string-valued columns ``P``, ``R``, ``F``: one row
        per class followed by one row per summary line.

    Raises:
        ValueError: If a row is truncated and does not provide all three
            metric values.
    """
    columns = ['P','R','F']
    tokens = report.split()
    header_len = 4
    class_end = header_len + 5 * n_classes
    # (token segment, tokens per row, offset of the precision value in a row)
    layout = (
        (tokens[header_len:class_end], 5, 1),
        (tokens[class_end:], 6, 2),
    )

    rows = []
    for segment, width, offset in layout:
        for start in range(0, len(segment), width):
            metrics = segment[start + offset:start + offset + 3]
            if len(metrics) != len(columns):
                raise ValueError("cannot parse classification report: truncated row")
            rows.append(metrics)

    # Build the frame in one go instead of growing it row by row.
    return pd.DataFrame(rows, columns=columns)

def mean_std_from_results(reports_dic):
    """Aggregate per-run report tables into per-row mean and std tables.

    Args:
        reports_dic: Mapping of run identifier -> DataFrame with columns
            ``P``, ``R``, ``F`` (values convertible by ``pd.to_numeric``).
            All tables are expected to have the same number of rows in the
            same order; the keys themselves are not interpreted.

    Returns:
        A ``(mean_result, std_result)`` tuple of DataFrames with columns
        ``P``, ``R``, ``F``; row ``j`` holds the mean / sample standard
        deviation of row ``j`` across all runs. Both are empty when
        ``reports_dic`` is empty.
    """
    columns = ['P','R','F']
    mean_result = pd.DataFrame(columns=columns)
    std_result = pd.DataFrame(columns=columns)
    if not reports_dic:
        return mean_result, std_result

    # Take the row count from the first report instead of requiring a
    # hard-coded key 1 to be present in the mapping.
    run_reports = list(reports_dic.values())
    n_rows = len(run_reports[0])

    for j in range(n_rows): # for each aspect / summary row
        # Positional access so the result does not depend on index labels.
        per_run = pd.DataFrame(
            [list(run_report.iloc[j]) for run_report in run_reports], # one line per run
            columns=columns,
        ).apply(pd.to_numeric)
        mean_result.loc[len(mean_result)] = per_run.mean()
        std_result.loc[len(std_result)] = per_run.std()
    return mean_result, std_result

In [30]:
# Per-run result stores for the 6-topic model, keyed by run number (1..10):
# parsed classification report, micro-averaged F1 and ROC AUC per classifier.
svm_reports_dic6=dict()
svm_f1_dic6=dict()
svm_ROC_dic6=dict()

lr_reports_dic6=dict()
lr_f1_dic6=dict()
lr_ROC_dic6=dict()

mlp_reports_dic6=dict()
mlp_f1_dic6=dict()
mlp_ROC_dic6=dict()

for i in range(1,11):
    # Train a new 6-topic MALLET LDA model in every run and use the
    # per-document topic values as classifier features.
    ldamallet6 = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=6, id2word=id2word)
    train_features = get_features(ldamallet6, corpus)
    test_features = get_features(ldamallet6, corpus_test)

    # Fit the label binarizer on the TRAINING labels only and reuse that
    # mapping for the test labels. Re-fitting on the test labels could produce
    # a different set/order of columns than the classifiers were trained on.
    mlb = MultiLabelBinarizer()
    y_train_transformed = mlb.fit_transform(y_train)
    y_test_transformed = mlb.transform(y_test)

    #SVM
    clf = OneVsRestClassifier(svm.SVC(kernel='rbf',C=1,gamma=1/len(y_train_transformed)))
    clf = clf.fit(train_features, y_train_transformed)
    predicted_labels = clf.predict(test_features)
    report = classification_report(y_test_transformed, predicted_labels)
    svm_reports_dic6[i] = get_classification_report_as_df(report)
    svm_f1_dic6[i] = f1_score(y_test_transformed, predicted_labels, average='micro')
    svm_ROC_dic6[i] = roc_auc_score(y_test_transformed, predicted_labels)

    #LR
    clf = OneVsRestClassifier(LogisticRegression(random_state=42))
    clf = clf.fit(train_features, y_train_transformed)
    predicted_labels = clf.predict(test_features)
    report = classification_report(y_test_transformed, predicted_labels)
    lr_reports_dic6[i] = get_classification_report_as_df(report)
    lr_f1_dic6[i] = f1_score(y_test_transformed, predicted_labels, average='micro')
    lr_ROC_dic6[i] = roc_auc_score(y_test_transformed, predicted_labels)

    #MLP
    # (512,) is a one-element tuple: a single hidden layer of 512 units.
    clf = OneVsRestClassifier(MLPClassifier(solver='lbfgs', alpha=1e-5,
                    hidden_layer_sizes=(512,), random_state=1))

    clf.fit(train_features, y_train_transformed)
    predicted_labels = clf.predict(test_features)
    report = classification_report(y_test_transformed, predicted_labels)
    mlp_reports_dic6[i] = get_classification_report_as_df(report)
    mlp_f1_dic6[i] = f1_score(y_test_transformed, predicted_labels, average='micro')
    mlp_ROC_dic6[i] = roc_auc_score(y_test_transformed, predicted_labels)
    

# Classification for 10 topics (10 runs)

In [31]:
# Per-run result stores for the 10-topic model, keyed by run number (1..10):
# parsed classification report, micro-averaged F1 and ROC AUC per classifier.
svm_reports_dic10=dict()
svm_f1_dic10=dict()
svm_ROC_dic10=dict()

lr_reports_dic10=dict()
lr_f1_dic10=dict()
lr_ROC_dic10=dict()

mlp_reports_dic10=dict()
mlp_f1_dic10=dict()
mlp_ROC_dic10=dict()

for i in range(1,11):
    # Train a new 10-topic MALLET LDA model in every run and use the
    # per-document topic values as classifier features.
    ldamallet10 = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=10, id2word=id2word)
    train_features = get_features(ldamallet10, corpus)
    test_features = get_features(ldamallet10, corpus_test)

    # Fit the label binarizer on the TRAINING labels only and reuse that
    # mapping for the test labels. Re-fitting on the test labels could produce
    # a different set/order of columns than the classifiers were trained on.
    mlb = MultiLabelBinarizer()
    y_train_transformed = mlb.fit_transform(y_train)
    y_test_transformed = mlb.transform(y_test)

    #SVM
    clf = OneVsRestClassifier(svm.SVC(kernel='rbf',C=1,gamma=1/len(y_train_transformed)))
    clf = clf.fit(train_features, y_train_transformed)
    predicted_labels = clf.predict(test_features)
    report = classification_report(y_test_transformed, predicted_labels)
    svm_reports_dic10[i] = get_classification_report_as_df(report)
    svm_f1_dic10[i] = f1_score(y_test_transformed, predicted_labels, average='micro')
    svm_ROC_dic10[i] = roc_auc_score(y_test_transformed, predicted_labels)

    #LR
    clf = OneVsRestClassifier(LogisticRegression(random_state=42))
    clf = clf.fit(train_features, y_train_transformed)
    predicted_labels = clf.predict(test_features)
    report = classification_report(y_test_transformed, predicted_labels)
    lr_reports_dic10[i] = get_classification_report_as_df(report)
    lr_f1_dic10[i] = f1_score(y_test_transformed, predicted_labels, average='micro')
    lr_ROC_dic10[i] = roc_auc_score(y_test_transformed, predicted_labels)

    #MLP
    # (512,) is a one-element tuple: a single hidden layer of 512 units.
    clf = OneVsRestClassifier(MLPClassifier(solver='lbfgs', alpha=1e-5,
                    hidden_layer_sizes=(512,), random_state=1))

    clf.fit(train_features, y_train_transformed)
    predicted_labels = clf.predict(test_features)
    report = classification_report(y_test_transformed, predicted_labels)
    mlp_reports_dic10[i] = get_classification_report_as_df(report)
    mlp_f1_dic10[i] = f1_score(y_test_transformed, predicted_labels, average='micro')
    mlp_ROC_dic10[i] = roc_auc_score(y_test_transformed, predicted_labels)


In [32]:
# Collapse the per-run report tables of the 6-topic experiment into one
# mean table and one standard-deviation table per classifier.
svm_mean6, svm_std6 = mean_std_from_results(svm_reports_dic6)
lr_mean6, lr_std6 = mean_std_from_results(lr_reports_dic6)
mlp_mean6, mlp_std6 = mean_std_from_results(mlp_reports_dic6)

In [33]:
# Collapse the per-run report tables of the 10-topic experiment into one
# mean table and one standard-deviation table per classifier.
svm_mean10, svm_std10 = mean_std_from_results(svm_reports_dic10)
lr_mean10, lr_std10 = mean_std_from_results(lr_reports_dic10)
mlp_mean10, mlp_std10 = mean_std_from_results(mlp_reports_dic10)

# Exporting the results

In [48]:
# Write the 6-topic mean/std tables to Excel files in the working directory,
# one pair of files per classifier (SVM, logistic regression, MLP).
svm_mean6.to_excel('lda_svm6_mean.xlsx')
svm_std6.to_excel('lda_svm6_std.xlsx')

lr_mean6.to_excel('lda_lr6_mean.xlsx')
lr_std6.to_excel('lda_lr6_std.xlsx')

mlp_mean6.to_excel('lda_mlp6_mean.xlsx')
mlp_std6.to_excel('lda_mlp6_std.xlsx')

In [49]:
# Write the 10-topic mean/std tables to Excel files in the working directory,
# one pair of files per classifier (SVM, logistic regression, MLP).
svm_mean10.to_excel('lda_svm10_mean.xlsx')
svm_std10.to_excel('lda_svm10_std.xlsx')

lr_mean10.to_excel('lda_lr10_mean.xlsx')
lr_std10.to_excel('lda_lr10_std.xlsx')

mlp_mean10.to_excel('lda_mlp10_mean.xlsx')
mlp_std10.to_excel('lda_mlp10_std.xlsx')