# Hackathon: Airlines Interior Services

- **Lihan Fang**
- **Wen Tian**
- **Haofan Chen**
- **Siying Chen**
- **Xiaotong Zhang**

***Feb. 2021***

# Import packages

In [None]:
import os
import string
import datetime
import time
import re
from tqdm.notebook import tqdm

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.decomposition import NMF
from sklearn.decomposition import LatentDirichletAllocation as LDA
from sklearn.manifold import TSNE
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer

!pip install gensim
from gensim import corpora
from gensim.models import Word2Vec
from gensim.models.ldamulticore import LdaMulticore
from gensim.test.utils import datapath

!pip install aspect-based-sentiment-analysis
import aspect_based_sentiment_analysis as absa

!pip install wordcloud
from wordcloud import WordCloud

import tensorflow as tf

# Import data

In [None]:
# Load the airline review dataset into a DataFrame.
# NOTE(review): the './drive/MyDrive' prefix looks like a mounted Google Drive
# (Colab) path — confirm the mount exists before running this cell.
reviews = pd.read_csv('./drive/MyDrive/data/reviews_airline.csv')
# reviews = pd.read_excel('raw_messages_for_test.xlsx')
# reviews.date.apply(lambda x: datetime.datetime.strptime(x,"%Y-%m-%d"))
# Preview the first rows.
reviews.head()

In [None]:
# Count the non-null review bodies per airline, largest first,
# and display the ten most-reviewed airlines.
n_reviews = (
    reviews.groupby('airline')['body']
    .count()
    .sort_values(ascending=False)
)
n_reviews.head(10)

# Topic Modeling
## Preprocess data

In [None]:
# Module-level resources read by clean_doc() below.

# Punctuation characters to strip (ASCII punctuation from the stdlib).
punc = string.punctuation
# Stop words: download the NLTK list, then keep it as a set
# for fast membership tests.
nltk.download('stopwords')
stop = set(stopwords.words('english'))
# Lemma/Stemmer: clean_doc() uses the Porter stemmer; the lemmatizer is
# only referenced by a commented-out alternative inside clean_doc().
stemmer = PorterStemmer()
lemma = WordNetLemmatizer()

# Clean function
def clean_doc(doc):
    """Normalise one review: lower-case, drop stop words/punctuation, stem.

    Relies on the module-level ``punc`` (punctuation characters), ``stop``
    (set of stop words) and ``stemmer`` (Porter stemmer).

    Stop words are checked twice: once on the token with only its
    surrounding punctuation trimmed, so contractions that appear in the stop
    list with their apostrophe (e.g. "don't") are recognised, and once after
    all punctuation has been removed. Checking only after punctuation removal
    lets such contractions through as "dont", "isnt", ...

    Args:
        doc: Raw review text.

    Returns:
        The stemmed tokens joined by single spaces.
    """
    strip_punc = str.maketrans('', '', punc)
    kept = []
    for raw in doc.lower().split():
        # Trim leading/trailing punctuation only, keeping inner apostrophes.
        word = raw.strip(punc)
        if word in stop:
            continue
        # Remove any punctuation left inside the token.
        word = word.translate(strip_punc)
        if not word or word in stop:
            continue
        # Stemming/Lemmatizing
        kept.append(stemmer.stem(word))
        # kept.append(lemma.lemmatize(word))
    return ' '.join(kept)

# Test the clean function on the second American Airlines review
# (positional index 1 within the filtered rows).
doc = reviews[reviews.airline=='american-airlines'].body.iloc[1]
# doc = reviews[0]
# Display the cleaned text as the cell output.
clean_doc(doc)

In [None]:
# Build the cleaned corpus: remove the "✅ " marker from every review body,
# run clean_doc() on it, and time the whole pass.
start = time.time()
corpus = [clean_doc(re.sub('✅ ', '', body)) for body in reviews.body]
end = time.time()
# Persist the cleaned corpus so it can be reloaded without re-cleaning
file = np.array(corpus)
np.save('corpus.npy', file)
print('Cleaning takes {} sec in total.'.format(end-start))
corpus

In [None]:
# # Load corpus
# file = np.load('corpus.npy')
# corpus = file.tolist()
# corpus

## Vectorization

### TF-IDF

In [None]:
# TF-IDF features over the cleaned corpus: keep at most 200 terms, ignoring
# terms found in more than 95% of documents or in fewer than 2 documents.
tfidf_vectorizer = TfidfVectorizer(max_features=200, max_df=0.95, min_df=2)
# X is the document-term matrix (one row per document in `corpus`).
X = tfidf_vectorizer.fit_transform(corpus)
print(X.shape)
# NOTE(review): get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; on newer versions use get_feature_names_out() — confirm the
# installed version.
print(tfidf_vectorizer.get_feature_names())

### Word2Vec

In [None]:
# word2vec = Word2Vec(corpus, size=200,  window=5,  min_count=5,  negative=3, sample=0.001, hs=1, workers=4)

## Topic extraction

### NMF

In [None]:
# NMF decomposition of the TF-IDF matrix X into W (documents x topics)
# and H (topics x terms).
# n_components: Number of topics
# NOTE(review): the `alpha` argument was deprecated in scikit-learn 1.0 in
# favour of alpha_W / alpha_H — confirm against the installed version.
nmf = NMF(n_components = 8, init='nndsvd', max_iter=300, alpha=0.1, l1_ratio=0.5)
# Get the W matrix
W = nmf.fit_transform(X)
# Get the H matrix
H = nmf.components_

# Print the highest-weighted words of every topic
def print_top_words(H, feature_names, n_top_words):
    """Print, for each row of ``H``, its ``n_top_words`` strongest features.

    Args:
        H: Topic-term weights, one row per topic; each row must support
            ``argsort()``.
        feature_names: Term for each column of ``H``.
        n_top_words: Number of terms listed per topic.
    """
    for number, weights in enumerate(H, start=1):
        print("Topic #%d:" % number)
        # argsort() is ascending: reverse it, then keep the leading entries.
        strongest = weights.argsort()[::-1][:n_top_words]
        print(", ".join(feature_names[col] for col in strongest))

# Show the five highest-weighted terms of every NMF topic (rows of H).
print_top_words(H, tfidf_vectorizer.get_feature_names(), n_top_words = 5)

### LDA

In [None]:
# LDA model with 8 topics, using all CPU cores (n_jobs=-1) and verbose logging.
# NOTE(review): no random_state is set, so topics may differ between runs.
# NOTE(review): X is the TF-IDF matrix built above; LDA is usually fitted on
# raw term counts (CountVectorizer is imported but unused) — confirm intent.
start = time.time()
lda = LDA(n_components=8, n_jobs=-1, verbose=5)
# Train the model; Y holds one row of topic weights per document.
Y = lda.fit_transform(X)
end = time.time()
print('LDA training takes {} sec.'.format(end-start))

In [None]:
# Document-topic table: one row per document, one column per topic
# ("Topic1" .. "TopicN"), weights rounded to two decimals.
topic_names = ["Topic{}".format(k) for k in range(1, lda.n_components + 1)]
df_document_topic = pd.DataFrame(np.round(Y, 2), columns = topic_names)
# Dominant topic = 1-based position of the largest weight in each row
dominant_topic = df_document_topic.values.argmax(axis=1) + 1
df_document_topic['Dominant_topic'] = dominant_topic
df_document_topic

In [None]:
# Make dataframe: document-topic weights, one labelled row per document
docnames = ['Doc' + str(i) for i in range(len(corpus))]
df_document_topic = pd.DataFrame(np.round(Y, 2), columns=topic_names, index=docnames)

# Get dominant topic for each document.
# +1 so the number agrees with the 1-based names in `topic_names`
# (column 0 is "Topic1"), matching the previous cell.
dominant_topic = np.argmax(df_document_topic.values, axis=1) + 1
df_document_topic['dominant_topic'] = dominant_topic

# Topic-Keyword Matrix: rows are topics, columns are vocabulary terms
df_topic_keywords = pd.DataFrame(lda.components_)

# Assign Column and Index
df_topic_keywords.columns = tfidf_vectorizer.get_feature_names()
df_topic_keywords.index = topic_names

# For every term: the topic where its weight is largest, and that weight
df_topic_no = pd.DataFrame(df_topic_keywords.idxmax())
df_scores = pd.DataFrame(df_topic_keywords.max())

tmp = pd.merge(df_topic_no, df_scores, left_index=True, right_index=True)
tmp.columns = ['topic', 'relevance_score']

# Show keywords related to certain topic: the (up to) five highest-scoring
# terms among those assigned to it. sort=False keeps the topics in order of
# first appearance.
# NOTE(review): a topic that is not the strongest topic of any term gets no
# row here, and the later `.keywords.iloc[0]` lookups would then raise
# IndexError for it.
all_topics = pd.DataFrame(
    [
        [topic, words.nlargest(5, 'relevance_score').index.tolist()]
        for topic, words in tmp.groupby('topic', sort=False)
    ],
    columns=['Dominant_topic', 'keywords'],
)
all_topics

## Visualization

### t-SNE

In [None]:
# # Calculate the clustering target
# start = time.time()
# target = np.array([np.argmax(Y[i]) for i in range(len(Y))])
# tsne = TSNE()
# trans = tsne.fit_transform(Y)
# data = pd.DataFrame(np.concatenate((trans, target.reshape(-1,1)), axis=1), columns=['F1', 'F2', 'labels'])
# data['labels'] = data.labels.apply(int)
# plt.figure(figsize=(10, 10))
# sns.scatterplot(data=data, x='F1', y='F2', hue='labels', legend=True)
# plt.title('t-SNE Dimension Reduction of Topic Clustering')
# plt.show()
# plt.savefig('tsne.png')
# end = time.time()
# print('t-SNE takes {} sec.'.format(end-start))

### WordCloud

In [None]:
# Draw one word cloud per topic on a 2 x 4 grid, built from the topic's
# keyword list in `all_topics`.
plt.figure(figsize=(15,5))
for i in range(8):
    plt.subplot(2, 4, i+1)
    temp = all_topics[all_topics.Dominant_topic=='Topic'+str(i+1)].keywords.iloc[0]
    wc = ', '.join(temp)
    cloud = WordCloud(width=800, height=600, mode='RGBA', background_color=None).generate(wc)
    plt.imshow(cloud, interpolation='bilinear')
    plt.title('Topic '+ str(i+1))
    plt.axis('off')

# Save before showing: once plt.show() has rendered the figure it is released
# by pyplot, so a savefig() issued afterwards writes an empty image.
plt.savefig('wordcloud1.png')
plt.show()

# Aspect-based Sentiment Analysis

In [None]:
# Only the first 5 keywords will be used.
# Series.apply returns a new Series, so the result must be assigned back;
# the slice length matches the five keywords kept per topic when
# `all_topics` was built.
all_topics['keywords'] = all_topics.keywords.apply(lambda x: x[:5])
all_topics

## Sentiment analysis function

In [None]:
# Load pretrained ABSA model; `nlp` is the callable used by every sentiment
# step below, including ReviewsComprehension().
nlp = absa.load()

In [None]:
# Target text: a single sample review, the 1000th row counted from the end
text = reviews.body.iloc[-1000]
text

In [None]:
# Aspects of a topic: the keyword list stored for "Topic7" in all_topics
# (the topic number is hard-coded for this demonstration).
aspects = all_topics[all_topics.Dominant_topic=='Topic'+str(7)].keywords.iloc[0]
aspects

In [None]:
# Sentiment Analysis: run the ABSA model on the sample text for the chosen
# aspects and materialise the results as a list.
# NOTE(review): presumably one result per aspect, in the same order — the
# next cell pairs them by position; confirm against the library version.
asp = list(nlp(text, aspects))

In [None]:
# Show analysis results: aspect keyword, predicted sentiment and raw scores,
# pairing asp[i] with aspects[i] by position.
for i, a in enumerate(asp):
    print(aspects[i], '\t', a.sentiment, '\t', a.scores)

In [None]:
# Overall score on this topic: mean of each score component over all aspects.
# Components 0, 1 and 2 are accumulated as neu, neg and pos respectively.
neu = sum((a.scores[0] for a in asp), .0)
neg = sum((a.scores[1] for a in asp), .0)
pos = sum((a.scores[2] for a in asp), .0)
n_aspects = len(asp)
neu/n_aspects, neg/n_aspects, pos/n_aspects

## Reviews comprehension

In [None]:
# Reviews Comprehension Function
def ReviewsComprehension(docs, topics, path='result.csv', save_every=10):
    """Label every document's sentiment towards every topic and save the table.

    For each document and topic, the module-level ABSA callable ``nlp`` is run
    with the topic's keywords as aspects. The three score components
    (accumulated as neu, neg, pos from ``scores[0..2]``) are averaged over the
    aspects and the topic cell is set to 0, -1 or 1 for whichever mean is
    strictly the largest. A cell stays NaN when no mean is strictly largest or
    when the model returns no aspect results. The ``result`` column is the sum
    of the document's topic labels.

    Args:
        docs: Sequence of review texts.
        topics: DataFrame with a ``Dominant_topic`` column holding
            "Topic1".."TopicN" and a ``keywords`` column of keyword lists.
        path: CSV file the table is written to.
        save_every: Write a checkpoint after this many documents; a falsy
            value disables intermediate checkpoints.

    Returns:
        DataFrame with one row per document and the columns
        "Topic1".."TopicN" plus "result". The same table is written to
        ``path`` once all documents are processed.
    """
    # Number of documents and topics
    n_topics = len(topics)
    n_docs = len(docs)

    # Make DataFrame (pre-filled with NaN)
    columns = ['Topic'+str(i+1) for i in range(n_topics)] + ['result']
    result = pd.DataFrame(columns=columns, index=range(n_docs))

    # The keyword list of a topic does not depend on the document:
    # look each one up once instead of once per document.
    topic_aspects = [
        topics[topics.Dominant_topic=='Topic'+str(j+1)].keywords.iloc[0]
        for j in range(n_topics)
    ]

    # Sentiment analysis
    for i, text in enumerate(tqdm(docs)):
        for j, aspects in enumerate(topic_aspects):
            asp = list(nlp(text, aspects))
            if not asp:
                # Nothing to average; leave the cell as NaN.
                continue
            neu = sum(a.scores[0] for a in asp) / len(asp)
            neg = sum(a.scores[1] for a in asp) / len(asp)
            pos = sum(a.scores[2] for a in asp) / len(asp)
            if neu > neg and neu > pos:
                result.iloc[i, j] = 0
            elif neg > neu and neg > pos:
                result.iloc[i, j] = -1
            elif pos > neu and pos > neg:
                result.iloc[i, j] = 1
        # Calculate the overall review
        result.iloc[i, -1] = result.iloc[i, :-1].sum()
        # Checkpoint the result every `save_every` documents
        if save_every and (i+1) % save_every == 0:
            result.to_csv(path)

    # Final save: without it the rows after the last checkpoint never reach
    # the file, and they are lost when the CSV is read back.
    result.to_csv(path)

    return result

In [None]:
# Word count of every review, to show that some are too long for our model.
# NOTE(review): lengths are whitespace-separated words, which is not
# necessarily the model's own token count.
data = list(reviews.body)
len_ = [len(doc.split()) for doc in data]

sns.set_style('dark')
plt.hist(len_)
plt.title('Some reviews are too long (>512 tokens)')

In [None]:
# Split lengthy texts into pieces that fit into the model
def _split_long_docs(docs, max_words=300):
    """Return ``docs`` with every over-long text cut into word chunks.

    A text of at most ``max_words`` whitespace-separated words is kept
    unchanged. A longer one is replaced by consecutive chunks of
    ``max_words`` words (the last chunk may be shorter), each re-joined with
    single spaces. Every text contributes exactly once: either as itself or
    as its chunks.

    Args:
        docs: Iterable of texts.
        max_words: Maximum number of words per returned piece.

    Returns:
        List of texts, in the original order.
    """
    pieces = []
    for doc in docs:
        words = doc.split()
        if len(words) <= max_words:
            pieces.append(doc)
            continue
        pieces.extend(
            ' '.join(words[k:k + max_words])
            for k in range(0, len(words), max_words)
        )
    return pieces


data_ = _split_long_docs(data)
# for doc in data:
#     t = doc.split()
#     if len(t) < 300:
#         data_.append(' '.join(t))
len(data_)

In [None]:
# Run the sentiment comprehension over all (split) reviews; the table is
# written to the default path 'result.csv'.
ReviewsComprehension(data_, all_topics)

## Visualization

In [None]:
# Load the data written by ReviewsComprehension() and drop every row that
# contains a missing value.
result = pd.read_csv('result.csv')
result.dropna(inplace=True)
result.head()

In [None]:
# Support function for plotting
def stat(result, n_topics):
    """Count negative / neutral / positive labels per topic in long format.

    Args:
        result: DataFrame with the columns "Topic1".."Topic<n_topics>" whose
            values are -1, 0 or 1. Other values are ignored.
        n_topics: Number of topic columns to summarise.

    Returns:
        DataFrame with the columns "Topic", "Sentiment" and "Counts", one row
        per (topic, sentiment) pair that occurs in ``result``.
    """
    # Integer keys also match the float labels (-1.0, 0.0, 1.0) that result
    # from reading the table back from CSV, since equal numbers hash alike.
    sentiment_names = {-1: 'Negative', 0: 'Neutral', 1: 'Positive'}
    rows = []
    for k in range(n_topics):
        topic = 'Topic' + str(k + 1)
        for value, count in result[topic].value_counts().items():
            sentiment = sentiment_names.get(value)
            if sentiment is not None:
                rows.append([topic, sentiment, count])

    return pd.DataFrame(rows, columns=['Topic', 'Sentiment', 'Counts'])

In [None]:
# Visualize the results: grouped bar chart of sentiment counts per topic
# for the 8 topics, saved as 'sentiment.png'.
temp = stat(result, 8)
plt.figure(figsize=(15, 8))
sns.barplot(data=temp, x='Topic', y='Counts', hue='Sentiment')
plt.title('Aspect-based Topic Sentiment Analysis')
plt.savefig('sentiment.png', bbox_inches='tight')

## Further observation

In [None]:
# Different class information: number of reviews per seat type
reviews.seat_type.value_counts()

In [None]:
# Take the first 200 review bodies of each seat class.
# NOTE(review): "most recent" holds only if `reviews` is ordered newest
# first — confirm against the dataset.
economy_reviews = reviews.loc[reviews.seat_type == 'Economy Class', 'body'].iloc[:200].tolist()
business_reviews = reviews.loc[reviews.seat_type == 'Business Class', 'body'].iloc[:200].tolist()
premium_reviews = reviews.loc[reviews.seat_type == 'Premium Economy', 'body'].iloc[:200].tolist()
first_reviews = reviews.loc[reviews.seat_type == 'First Class', 'body'].iloc[:200].tolist()

In [None]:
# Deal with the length issues: keep only reviews shorter than 300 words
# (longer ones are dropped) and collapse their whitespace to single spaces.
def _short_docs(docs, limit=300):
    """Return the whitespace-normalised texts of ``docs`` with fewer than ``limit`` words."""
    word_lists = (text.split() for text in docs)
    return [' '.join(words) for words in word_lists if len(words) < limit]


economy = _short_docs(economy_reviews)
business = _short_docs(business_reviews)
premium = _short_docs(premium_reviews)
first = _short_docs(first_reviews)

In [None]:
# Sentiment analysis per seat class; each call also writes its table to the
# given CSV file, which the plotting cells below read back.
temp1 = ReviewsComprehension(economy, all_topics, 'economy.csv')
temp2 = ReviewsComprehension(business, all_topics, 'business.csv')
temp3 = ReviewsComprehension(premium, all_topics, 'premium.csv')
temp4 = ReviewsComprehension(first, all_topics, 'first.csv')

In [None]:
## Visualize the results
# Economy Class: reload the saved table, drop rows containing missing values,
# and plot the sentiment counts per topic.
result = pd.read_csv('economy.csv')
result.dropna(inplace=True)
temp = stat(result, 8)
plt.figure(figsize=(15, 8))
sns.barplot(data=temp, x='Topic', y='Counts', hue='Sentiment')
plt.title('Economy Class Sentiment Analysis')
plt.savefig('economy.png', bbox_inches='tight')

In [None]:
# Business Class: reload the saved table, drop rows containing missing values,
# and plot the sentiment counts per topic.
result = pd.read_csv('business.csv')
result.dropna(inplace=True)
temp = stat(result, 8)
plt.figure(figsize=(15, 8))
sns.barplot(data=temp, x='Topic', y='Counts', hue='Sentiment')
plt.title('Business Class Sentiment Analysis')
plt.savefig('business.png', bbox_inches='tight')

In [None]:
# Premium Economy Class: reload the saved table, drop rows containing missing
# values, and plot the sentiment counts per topic.
result = pd.read_csv('premium.csv')
result.dropna(inplace=True)
temp = stat(result, 8)
plt.figure(figsize=(15, 8))
sns.barplot(data=temp, x='Topic', y='Counts', hue='Sentiment')
plt.title('Premium Economy Sentiment Analysis')
plt.savefig('premium.png', bbox_inches='tight')

In [None]:
# First Class: reload the saved table, drop rows containing missing values,
# and plot the sentiment counts per topic.
result = pd.read_csv('first.csv')
result.dropna(inplace=True)
temp = stat(result, 8)
plt.figure(figsize=(15, 8))
sns.barplot(data=temp, x='Topic', y='Counts', hue='Sentiment')
plt.title('First Class Sentiment Analysis')
plt.savefig('first.png', bbox_inches='tight')