In [1]:
import pandas as pd
import string
import numpy as np

# Preprocessing
from nltk.corpus import stopwords, words
from nltk.stem import WordNetLemmatizer

# LDA
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer

# Preprocessing

In [2]:
# Read the raw tweet export and keep only the sentiment label and the tweet body.
data = pd.read_csv('../raw_data/airlines_tweets.csv').loc[:, ['airline_sentiment', 'text']]
data.head()

Unnamed: 0,airline_sentiment,text
0,neutral,@VirginAmerica What @dhepburn said.
1,positive,@VirginAmerica plus you've added commercials t...
2,neutral,@VirginAmerica I didn't today... Must mean I n...
3,negative,@VirginAmerica it's really aggressive to blast...
4,negative,@VirginAmerica and it's a really big bad thing...


In [3]:
# Sanity check: (rows, columns) of the frame after keeping the two columns above.
data.shape

(14640, 2)

## Remove user names

In [4]:
def _drop_mentions(tweet):
    """Return `tweet` without the whitespace-separated tokens that start with "@"."""
    kept = []
    for token in tweet.split():
        if token.startswith("@"):
            continue
        kept.append(token)
    return ' '.join(kept)


data['text'] = data['text'].map(_drop_mentions)

In [5]:
# Preview the first rows to confirm the @-mentions are gone.
data.head()

Unnamed: 0,airline_sentiment,text
0,neutral,What said.
1,positive,plus you've added commercials to the experienc...
2,neutral,I didn't today... Must mean I need to take ano...
3,negative,"it's really aggressive to blast obnoxious ""ent..."
4,negative,and it's a really big bad thing about it


## Remove punctuation and lowercase

In [6]:
# Delete every ASCII punctuation character in a single literal pass, then lowercase.
# The previous loop passed each character to str.replace(..., regex=True); since
# pandas 2.0 a single character is compiled as a real regex there, so "(", "*",
# "+", "?", "[" and "\" raise re.error and "." would match (and delete) everything.
# A translation table treats every character literally and avoids 32 separate passes.
data['text'] = (
    data['text']
    .str.translate(str.maketrans('', '', string.punctuation))
    .str.lower()
)
data.head()

Unnamed: 0,airline_sentiment,text
0,neutral,what said
1,positive,plus youve added commercials to the experience...
2,neutral,i didnt today must mean i need to take another...
3,negative,its really aggressive to blast obnoxious enter...
4,negative,and its a really big bad thing about it


## Remove stop words

In [7]:
# NLTK's English stop-word list, held in a set for membership tests.
stop_words = set(stopwords.words('english'))


def _without_stop_words(sentence):
    """Return `sentence` with every token that appears in `stop_words` removed."""
    return ' '.join(filter(lambda token: token not in stop_words, sentence.split()))


# NOTE(review): the displayed head still contains "youve" / "didnt" -- presumably
# because punctuation was stripped before this step, so contractions no longer
# match the apostrophised entries in the stop-word list. Confirm whether intended.
data['text'] = data['text'].map(_without_stop_words)
data.head()

Unnamed: 0,airline_sentiment,text
0,neutral,said
1,positive,plus youve added commercials experience tacky
2,neutral,didnt today must mean need take another trip
3,negative,really aggressive blast obnoxious entertainmen...
4,negative,really big bad thing


## Lemmatize

In [8]:
# Reduce each token to its WordNet lemma (default part of speech), token by token.
lemmatizer = WordNetLemmatizer()
data['text'] = data['text'].map(
    lambda sentence: ' '.join(map(lemmatizer.lemmatize, sentence.split()))
)
data.head()

Unnamed: 0,airline_sentiment,text
0,neutral,said
1,positive,plus youve added commercial experience tacky
2,neutral,didnt today must mean need take another trip
3,negative,really aggressive blast obnoxious entertainmen...
4,negative,really big bad thing


## Remove non-English words

In [9]:
# Vocabulary from NLTK's `words` corpus as a set, used for the membership filter in the next cell.
# NOTE(review): the tweets were lowercased earlier; any capitalised corpus entries will never match -- confirm this is acceptable.
en_words = set(words.words())

In [10]:
def _keep_known_words(sentence):
    """Return `sentence` restricted to the tokens that are present in `en_words`."""
    recognised = [token for token in sentence.split() if token in en_words]
    return " ".join(recognised)


data['text'] = data['text'].map(_keep_known_words)

In [11]:
# Turn tweets that lost every word into NaN so that dropna() can remove them.
# The result is assigned back to the column rather than calling
# replace(..., inplace=True) on `data.text`: that is chained in-place assignment on
# an intermediate Series, which pandas 2.2 warns about and which leaves `data`
# unchanged once copy-on-write is enabled (the default in pandas 3).
data['text'] = data['text'].replace('', np.nan)
data.head()

Unnamed: 0,airline_sentiment,text
0,neutral,said
1,positive,plus youve added commercial experience tacky
2,neutral,didnt today must mean need take another trip
3,negative,really aggressive blast obnoxious entertainmen...
4,negative,really big bad thing


In [12]:
# Discard the rows whose text is NaN.
data = data.dropna(subset=['text'])

In [13]:
# (rows, columns) after dropping the rows whose text is NaN.
data.shape

(14494, 2)

## Remove tweets with fewer than 3 words

In [15]:
# Method-chained variant of the length filter: count the whitespace-separated
# tokens of each tweet, keep tweets with at least 3, renumber the rows from 0
# and discard the helper column again.
# NOTE(review): the output recorded under this cell shows `date`/`text` columns,
# which this frame does not have -- it looks stale; re-run the cell.
data = (
    data
    .assign(length=lambda frame: frame['text'].str.split().map(len))
    .query("length >= 3")
    .reset_index(drop=True)
    .drop(columns='length')
)
data

Unnamed: 0,date,text
0,2022-06-01 12:19:12.452841,yes right getting even bad bad
1,2022-06-01 12:19:12.452841,hopefully backup save near future must think e...
2,2022-06-01 12:19:12.452841,e fan die post marketing
3,2022-06-01 12:19:12.452841,probably release around midnight japan
4,2022-06-01 12:19:12.452841,still still talking cruise episode looking prop
...,...,...
500,2022-06-01 12:19:12.452841,yes right getting even bad bad
501,2022-06-01 12:19:12.452841,hopefully backup save near future must think e...
502,2022-06-01 12:19:12.452841,e fan die post marketing
503,2022-06-01 12:19:12.452841,probably release around midnight japan


In [14]:
# Number of whitespace-separated tokens in each tweet.
data['length'] = [len(tweet.split()) for tweet in data['text']]
# Keep only tweets with at least three tokens.
data = data.loc[data['length'].ge(3)]
data

Unnamed: 0,airline_sentiment,text,length
1,positive,plus youve added commercial experience tacky,6
2,neutral,didnt today must mean need take another trip,8
3,negative,really aggressive blast obnoxious entertainmen...,9
4,negative,really big bad thing,4
5,negative,seriously would pay flight seat didnt really b...,10
...,...,...,...
14633,negative,flight leaving tomorrow morning auto night fli...,9
14635,positive,thank got different flight,4
14636,negative,leaving minute late flight warning communicati...,11
14638,negative,money change flight dont answer phone suggesti...,9


In [15]:
# Renumber the rows 0..n-1 (the old index is discarded) and remove the helper column.
data = data.reset_index(drop=True).drop(columns='length')
data

Unnamed: 0,airline_sentiment,text
0,positive,plus youve added commercial experience tacky
1,neutral,didnt today must mean need take another trip
2,negative,really aggressive blast obnoxious entertainmen...
3,negative,really big bad thing
4,negative,seriously would pay flight seat didnt really b...
...,...,...
12924,negative,flight leaving tomorrow morning auto night fli...
12925,positive,thank got different flight
12926,negative,leaving minute late flight warning communicati...
12927,negative,money change flight dont answer phone suggesti...


In [22]:
# Subset of tweets labelled as negative; this is what the topic model is fitted on.
data_neg = data.loc[data['airline_sentiment'].eq('negative')]

# LDA model

In [37]:
%%time
vectorizer = TfidfVectorizer().fit(data['text'])

X = vectorizer.transform(data_neg['text'])

lda_model = LatentDirichletAllocation(n_components=3).fit(X)

CPU times: user 16.7 s, sys: 9.98 ms, total: 16.7 s
Wall time: 16.7 s


In [38]:
def print_topics(model, vectorizer, n_top_words=10):
    """Print the highest-weighted words of every topic of a fitted topic model.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``components_``: one array of word weights
        per topic, supporting ``argsort()`` and integer indexing.
    vectorizer : object
        Fitted vectorizer whose feature names are indexed by the same positions
        as the entries of each topic. Must provide ``get_feature_names_out()``
        or the legacy ``get_feature_names()``.
    n_top_words : int, default 10
        Number of words printed per topic, in decreasing order of weight.

    Returns
    -------
    None
        For each topic, prints ``Topic <idx>:`` followed by a list of
        ``(word, weight)`` tuples.
    """
    # get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
    # and fall back so that older installations keep working.
    get_names = getattr(vectorizer, "get_feature_names_out", None)
    if get_names is None:
        get_names = vectorizer.get_feature_names
    # Looked up once: the original rebuilt the whole vocabulary for every printed word.
    feature_names = get_names()

    for idx, topic in enumerate(model.components_):
        # argsort is ascending, so walk it backwards to get the largest weights first.
        top_indices = topic.argsort()[:-n_top_words - 1:-1]
        print("Topic %d:" % (idx))
        # str()/float() keep the printed form plain regardless of NumPy's scalar repr.
        print([(str(feature_names[i]), float(topic[i])) for i in top_indices])
        
# Show the top words and their weights for each of the fitted LDA topics.
print_topics(lda_model, vectorizer)

Topic 0:
[('flight', 99.72268446892434), ('seat', 69.34925887493473), ('one', 43.44983336444326), ('crew', 40.580383542351065), ('gate', 38.83220430613449), ('like', 36.925181930751236), ('agent', 35.59129228962725), ('want', 34.39913951979312), ('bag', 33.22015860748108), ('united', 30.55571864226753)]
Topic 1:
[('flight', 175.08450036049024), ('plane', 105.31422263820687), ('hour', 76.30322455236409), ('gate', 68.29620020039482), ('luggage', 65.95729241464414), ('sitting', 59.6878357289925), ('bag', 57.715800667843595), ('waiting', 54.0996774172336), ('delay', 53.45090793748363), ('connection', 51.515353408686096)]
Topic 2:
[('flight', 209.05301984187602), ('hold', 163.81364948825578), ('hour', 154.59371893880078), ('service', 153.8997514949972), ('customer', 146.5293282781081), ('get', 139.3000117995159), ('call', 130.70240803844808), ('help', 121.2837037091878), ('time', 118.74148496581856), ('phone', 107.4833682343449)]


In [39]:
# Top-10 words of every topic, as plain lists of strings (weights omitted).
# get_feature_names() was removed in scikit-learn 1.2, so prefer
# get_feature_names_out() and fall back to the legacy method on older installs.
# The vocabulary is fetched once instead of once per selected word.
_get_feature_names = (
    getattr(vectorizer, 'get_feature_names_out', None)
    or vectorizer.get_feature_names
)
feature_names = _get_feature_names()
topics = [
    # argsort is ascending; the reversed slice yields the 10 largest weights first.
    [str(feature_names[i]) for i in topic.argsort()[:-10 - 1:-1]]
    for topic in lda_model.components_
]
topics

[['flight',
  'seat',
  'one',
  'crew',
  'gate',
  'like',
  'agent',
  'want',
  'bag',
  'united'],
 ['flight',
  'plane',
  'hour',
  'gate',
  'luggage',
  'sitting',
  'bag',
  'waiting',
  'delay',
  'connection'],
 ['flight',
  'hold',
  'hour',
  'service',
  'customer',
  'get',
  'call',
  'help',
  'time',
  'phone']]