## Project 4: Data Cleaning & Modeling

### Import All Libraries & Packages

In [1]:
import numpy  as np
import pandas as pd
import re

import matplotlib.pyplot as plt
import seaborn           as sns

import html.parser
import string
import nltk

import spacy

from   nltk.tokenize      import WordPunctTokenizer,word_tokenize
from   nltk.stem.wordnet  import WordNetLemmatizer
from   nltk.corpus        import stopwords
from   nltk.stem          import PorterStemmer

from   sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from   sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from   sklearn.model_selection         import train_test_split
from   sklearn.decomposition           import TruncatedSVD
from   sklearn.decomposition           import NMF
from   sklearn.cluster                 import KMeans
from   sklearn.metrics.pairwise        import cosine_similarity

from   sklearn.decomposition           import PCA

%matplotlib inline

### Set Global Parameters

In [2]:
# Global parameters and shared NLP objects used by the cells below.
SEED                    = 42     # Random seed passed to the splitters / models below
sample_test_size        = 0.05    # Fraction of rows held out by train_test_split (5%)
cv_value                = 5     # Cross validation number of folds (not referenced in the visible cells)

# Stop words: copied into a list so the next cell can extend it with domain terms
stop_words          = list(ENGLISH_STOP_WORDS)

exclude_punctuation = set(string.punctuation) 
lemmatizer          = WordNetLemmatizer()
stemmer             = PorterStemmer()

# Alternative load that disables the parser and NER pipeline components:
#sp                  = spacy.load('en', disable=['parser', 'ner'])
sp                  = spacy.load('en')

# Tokenizer object (splits text into alphabetic and non-alphabetic runs)
tok                 = WordPunctTokenizer()

# Contraction -> expanded negation lookup, taken from a GitHub repository by tthustla.
# Negations matter for sentiment analysis because they can flip the overall picture.
# Not used in the end (the only reference in clean_tweets is commented out) but kept for later.
negations_dic = {"isn't":"is not", "aren't":"are not", "wasn't":"was not", "weren't":"were not",
                "haven't":"have not","hasn't":"has not","hadn't":"had not","won't":"will not",
                "wouldn't":"would not", "don't":"do not", "doesn't":"does not","didn't":"did not",
                "can't":"can not","couldn't":"could not","shouldn't":"should not","mightn't":"might not",
                "mustn't":"must not"}

# Single alternation regex matching any key of negations_dic as a whole word
neg_pattern = re.compile(r'\b(' + '|'.join(negations_dic.keys()) + r')\b')

In [3]:
# Extend stop words with airline/domain terms and tokenizer fragments
# ('don', 'doesn', 'isn', ...) that carry no topic information.
# Words are appended only when missing, so duplicates in the literal below are
# ignored and re-running this cell does not keep growing `stop_words`.
# NOTE(review): 'jet blue' contains a space, so it can never equal a single
# token produced by the tokenizer; 'jetblue' is the entry that actually matches.
extra_stop_words = ['flight', 'fly','flights','flies','sent','just','don','flying',
                    'got','plane','getting','get','thanks','delta','going','southwest',
                    'know','sure','delayed','airline','olivia','want','look','did',
                    'did','time','let','airline','really','time','need','new','hey','united',
                    'doe','people','doesn','isn','american','didn','doe','man','jetblue','make',
                    'tayler','englishboston','airline','jet blue','yeah','john','dont','doing'
                   ]

for extra_word in extra_stop_words:
    if extra_word not in stop_words:
        stop_words.append(extra_word)

### Load Data Files

In [7]:
# Load the Delta-only tweet export. The file is read with header=None and the
# three column names are supplied explicitly.

file_path = './data/'
# Alternative export: 'pulled_airline_tweets_delta_no_RT_main.csv'
file_name = 'pulled_airline_tweets_delta_no_RT2_subset_w_secs.csv'

csv_file = '{}{}'.format(file_path, file_name)

# utf-8-sig strips a leading byte-order mark if the file has one
read_options = dict(encoding='utf-8-sig',
                    header=None,
                    names=['airline', 'date', 'raw tweet'])

df_all = pd.read_csv(csv_file, **read_options)

In [5]:
"""
# Let's load all files and then combine all
fn_jb = './data/pulled_airline_tweets_jetblue_no_RT.csv'
fn_sw = './data/pulled_airline_tweets_southwest_no_RT.csv'
fn_un = './data/pulled_airline_tweets_united_no_RT.csv'
fn_am = './data/pulled_airline_tweets_american_no_RT.csv'
fn_de = './data/pulled_airline_tweets_delta_no_RT.csv'


df_jb = pd.read_csv(fn_jb,
                    encoding ='utf-8-sig',   #utf-16-le, utf-8-sig
                    header   = None,
                    names  =['airline', 'date', 'raw tweet'])   # Add header

df_sw = pd.read_csv(fn_sw,
                    encoding ='utf-8-sig',   #utf-16-le, utf-8-sig
                    header   = None,
                    names  =['airline', 'date', 'raw tweet'])   # Add header

df_un = pd.read_csv(fn_un,
                    encoding ='utf-8-sig',   #utf-16-le, utf-8-sig
                    header   = None,
                    names  =['airline', 'date', 'raw tweet'])   # Add header

df_am = pd.read_csv(fn_am,
                    encoding ='utf-8-sig',   #utf-16-le, utf-8-sig
                    header   = None,
                    names  =['airline', 'date', 'raw tweet'])   # Add header

df_de = pd.read_csv(fn_de,
                    encoding ='utf-8-sig',   #utf-16-le, utf-8-sig
                    header   = None,
                    names  =['airline', 'date', 'raw tweet'])   # Add header

# Now test appending
df_all = df_jb.append(df_sw, ignore_index=True)
df_all = df_all.append(df_un, ignore_index=True)
df_all = df_all.append(df_am, ignore_index=True)
df_all = df_all.append(df_de, ignore_index=True)
""";

In [8]:
df_all.head(5)

Unnamed: 0,airline,date,raw tweet
0,Delta,11/10/19 23:59:48,"b'Hey @Delta, when do you plan to change the S..."
1,Delta,11/10/19 23:56:57,b'@Delta - Annoying and disappointing. Getting...
2,Delta,11/10/19 23:56:51,b'@Delta Why is DL 2167 late ?'
3,Delta,11/10/19 23:49:50,"b""@MLGPuckett @Delta @dallascowboys @Vikings @..."
4,Delta,11/10/19 23:48:07,b'@solitarybrother @Delta Nice'


In [9]:
df_all.shape

(5159, 3)

In [10]:
df_all.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5159 entries, 0 to 5158
Data columns (total 3 columns):
airline      5159 non-null object
date         5159 non-null object
raw tweet    5159 non-null object
dtypes: object(3)
memory usage: 121.0+ KB


In [11]:
# Grab the text only
df_text_raw = df_all['raw tweet']

In [12]:
# Let's take a look
df_text_raw[0:5]

0    b'Hey @Delta, when do you plan to change the S...
1    b'@Delta - Annoying and disappointing. Getting...
2                      b'@Delta Why is DL 2167 late ?'
3    b"@MLGPuckett @Delta @dallascowboys @Vikings @...
4                      b'@solitarybrother @Delta Nice'
Name: raw tweet, dtype: object

### Text Cleaning, Tokenization, Lemmatization, etc.

#### Examples

In [13]:
# Remove b' from the beginning of the text 
print(df_text_raw[11])

b'@jeffwilcox @Delta Never seen it before - still not used to this \xe2\x80\x98tip for everything\xe2\x80\x99 culture.\n\nI have status so can use the priority lines anyway, so will avoid it in the future.'


In [14]:
print(df_text_raw[11].replace("b'",''))

@jeffwilcox @Delta Never seen it before - still not used to this \xe2\x80\x98tip for everything\xe2\x80\x99 culture.\n\nI have status so can use the priority lines anyway, so will avoid it in the future.'


In [15]:
# Remove \n
print(re.sub(r'\\n','',df_text_raw[6]))

b'@Delta seems like every Sunday lately when I fly and want to watch NFL games the satellite is down and the WiFi isn\xe2\x80\x99t working. #Fail DL1224 https://t.co/GjDY4vGMe8'


In [16]:
# Remove http(s) URLs
print(re.sub('https?://[A-Za-z0-9./]+','',df_text_raw[6]))

b'@Delta seems like every Sunday lately when I fly and want to watch NFL games the satellite is down and the WiFi isn\xe2\x80\x99t working. #Fail DL1224 '


In [17]:
# Remove @ mention
print(re.sub('@[A-Za-z0-9_]+','',df_text_raw[6]))

b' seems like every Sunday lately when I fly and want to watch NFL games the satellite is down and the WiFi isn\xe2\x80\x99t working. #Fail DL1224 https://t.co/GjDY4vGMe8'


In [18]:
# Replace (hashtag) #word with word
print(re.sub(r'#([^\s]+)', r'\1', df_text_raw[16]))

b'@Cab98Nate @Delta I\xe2\x80\x99m not one either but can\xe2\x80\x99t They avoid certain altitudes and the direction of where they fly to avoid severe turbulence? Was the worst. Didn\xe2\x80\x99t say they could control it. Other airlines have better pilots? \xf0\x9f\xa4\xb7\xf0\x9f\x8f\xbb\xe2\x80\x8d\xe2\x99\x82\xef\xb8\x8f'


In [19]:
# HTML decoding
print(html.unescape(df_text_raw[16]))

b'@Cab98Nate @Delta I\xe2\x80\x99m not one either but can\xe2\x80\x99t They avoid certain altitudes and the direction of where they fly to avoid severe turbulence? Was the worst. Didn\xe2\x80\x99t say they could control it. Other airlines have better pilots? \xf0\x9f\xa4\xb7\xf0\x9f\x8f\xbb\xe2\x80\x8d\xe2\x99\x82\xef\xb8\x8f'


In [20]:
# Escaped non-ASCII bytes (\xNN sequences such as curly quotes and emoji); removed in the next cell
print(df_text_raw[16])

b'@Cab98Nate @Delta I\xe2\x80\x99m not one either but can\xe2\x80\x99t They avoid certain altitudes and the direction of where they fly to avoid severe turbulence? Was the worst. Didn\xe2\x80\x99t say they could control it. Other airlines have better pilots? \xf0\x9f\xa4\xb7\xf0\x9f\x8f\xbb\xe2\x80\x8d\xe2\x99\x82\xef\xb8\x8f'


In [21]:
print(re.sub(r'\\x[a-z0-9A-Z]{2}','', df_text_raw[16]))

b'@Cab98Nate @Delta Im not one either but cant They avoid certain altitudes and the direction of where they fly to avoid severe turbulence? Was the worst. Didnt say they could control it. Other airlines have better pilots? '


In [22]:
# Remove punctuation symbols
text = df_text_raw[16]
print(''.join(ch for ch in text if ch not in exclude_punctuation))

bCab98Nate Delta Ixe2x80x99m not one either but canxe2x80x99t They avoid certain altitudes and the direction of where they fly to avoid severe turbulence Was the worst Didnxe2x80x99t say they could control it Other airlines have better pilots xf0x9fxa4xb7xf0x9fx8fxbbxe2x80x8dxe2x99x82xefxb8x8f


In [23]:
# All lower case 
print(text.lower())

b'@cab98nate @delta i\xe2\x80\x99m not one either but can\xe2\x80\x99t they avoid certain altitudes and the direction of where they fly to avoid severe turbulence? was the worst. didn\xe2\x80\x99t say they could control it. other airlines have better pilots? \xf0\x9f\xa4\xb7\xf0\x9f\x8f\xbb\xe2\x80\x8d\xe2\x99\x82\xef\xb8\x8f'


In [24]:
# Numbers
text = df_text_raw[3]
print(text)

b"@MLGPuckett @Delta @dallascowboys @Vikings @hastr0 @CoreyDunn @PUCKETT111 I'm always rooting for an NFC north rival win... enjoy Jerry's World and get the W! #GoPackGo"


In [25]:
# Remove everything that is not a letter (digits, punctuation, symbols)
print(re.sub("[^a-zA-Z]", " ", text))

b  MLGPuckett  Delta  dallascowboys  Vikings  hastr   CoreyDunn  PUCKETT    I m always rooting for an NFC north rival win    enjoy Jerry s World and get the W   GoPackGo 


In [26]:
print(tok.tokenize(text))

['b', '"@', 'MLGPuckett', '@', 'Delta', '@', 'dallascowboys', '@', 'Vikings', '@', 'hastr0', '@', 'CoreyDunn', '@', 'PUCKETT111', 'I', "'", 'm', 'always', 'rooting', 'for', 'an', 'NFC', 'north', 'rival', 'win', '...', 'enjoy', 'Jerry', "'", 's', 'World', 'and', 'get', 'the', 'W', '!', '#', 'GoPackGo', '"']


In [27]:
# Remove stop-words
print(' '.join(i for i in tok.tokenize(text) if (i.strip() not in stop_words)))

b "@ MLGPuckett @ Delta @ dallascowboys @ Vikings @ hastr0 @ CoreyDunn @ PUCKETT111 I ' m rooting NFC north rival win ... enjoy Jerry ' s World W ! # GoPackGo "


In [28]:
# Lemmatization via nltk
#print(' '.join(lemmatizer.lemmatize(word) for word in text.split()))

print(' '.join(lemmatizer.lemmatize(word) for word in word_tokenize(text)))
# The lemmatizer is installed on my /Users/phoitack/nltk_data
#print("rocks :", lemmatizer.lemmatize("rocks")) 

b '' @ MLGPuckett @ Delta @ dallascowboys @ Vikings @ hastr0 @ CoreyDunn @ PUCKETT111 I 'm always rooting for an NFC north rival win ... enjoy Jerry 's World and get the W ! # GoPackGo ''


In [29]:
# Lemmatization via spacy
# I would not use this for now
doc = sp(text)

print(' '.join([token.lemma_ for token in doc]))

b"@MLGPuckett @delta @dallascowboys @Vikings @hastr0 @CoreyDunn @PUCKETT111 -PRON- be always root for an NFC north rival win ... enjoy Jerry 's World and get the W ! # gopackgo "


In [30]:
# Stem via NLTK
#print(' '.join([token.lemma_ for token in doc]))
input_str = word_tokenize(text)

print('\nRaw: \n')
print(text)
print('\nStemmed: \n')
print(' '.join(stemmer.stem(word) for word in word_tokenize(text)))



Raw: 

b"@MLGPuckett @Delta @dallascowboys @Vikings @hastr0 @CoreyDunn @PUCKETT111 I'm always rooting for an NFC north rival win... enjoy Jerry's World and get the W! #GoPackGo"

Stemmed: 

b '' @ mlgpuckett @ delta @ dallascowboy @ vike @ hastr0 @ coreydunn @ puckett111 I 'm alway root for an nfc north rival win ... enjoy jerri 's world and get the W ! # gopackgo ''


#### Putting it all together

In [31]:
# Put it all in a function
def clean_tweets(text):
    """Normalise one raw tweet into lower-case, letters-only text.

    The tweets in this dataset are stored as the *string form* of a Python
    bytes object, e.g. ``b'@Delta Why is DL 2167 late ?'``, so newlines and
    non-ASCII characters appear as literal ``\\n`` / ``\\xNN`` escape text.

    Steps, in order: lower-case, strip the ``b'...'`` / ``b"..."`` wrapper,
    replace escaped newlines with a space, drop URLs and @mentions, turn
    ``#tag`` into ``tag``, HTML-unescape, blank out ``\\xNN`` escapes, replace
    every remaining non-letter with a space, and trim the ends.

    Parameters
    ----------
    text : str
        Raw tweet text (with or without the bytes-literal wrapper).

    Returns
    -------
    str
        Cleaned text. Interior runs of spaces are kept as-is; the result may
        be an empty string when nothing but mentions/URLs/symbols was present.
    """
    # Lower case
    text = text.lower()

    # Strip the bytes-literal wrapper only at the very start/end of the text.
    # A global replace of "b'" would also eat possessives such as "club's",
    # and a later "drop a leading b" check would eat real words ("best").
    if text[:2] in ("b'", 'b"'):
        closing_quote = text[1]
        text = text[2:]
        if text.endswith(closing_quote):
            text = text[:-1]

    # Escaped newlines become a space so neighbouring words are not fused
    # (and so the URL pattern below cannot swallow the word after a link).
    text = re.sub(r'\\n', ' ', text)

    # Negation expansion is intentionally disabled:
    #text = neg_pattern.sub(lambda x: negations_dic[x.group()], text)

    # Remove http(s) URLs
    text = re.sub('https?://[A-Za-z0-9./]+', '', text)

    # Remove @mentions
    text = re.sub('@[A-Za-z0-9_]+', '', text)

    # Replace (hashtag) #word with word
    text = re.sub(r'#([^\s]+)', r'\1', text)

    # Decode HTML entities such as &amp;
    text = html.unescape(text)

    # Blank out escaped non-ASCII bytes (\xNN), e.g. curly quotes and emoji
    text = re.sub(r'\\x[a-z0-9A-Z]{2}', ' ', text)

    # Replace every non-letter (digits, punctuation, symbols) with a space.
    # After this step no quotes or punctuation remain, so no separate
    # punctuation pass is needed.
    text = re.sub("[^a-zA-Z]", " ", text)

    return text.strip()


text_new = clean_tweets(df_text_raw[99])

print('Raw Tweet:\n',df_text_raw[99])
print('\nCleaned Tweet:\n',text_new)

Raw Tweet:
 b"@MeatheadMilitia @SouthwestAir @Delta Ah I see. Well. Either way the savings are worth it I think. That's a whole new video card. \xf0\x9f\x98\x84"

Cleaned Tweet:
 ah i see  well  either way the savings are worth it i think  that s a whole new video card


In [32]:
def remove_stopwords(text):
    """Tokenise `text` with the shared `tok` tokenizer and drop every token
    found in the module-level `stop_words` list; the surviving tokens are
    re-joined with single spaces."""
    kept_tokens = []
    for token in tok.tokenize(text):
        if token.strip() in stop_words:
            continue
        kept_tokens.append(token)

    return ' '.join(kept_tokens)

text_new2 = remove_stopwords(text_new)

print(text_new2)

ah way savings worth think s video card


In [33]:
def lemmatize(text):
    """Tokenise `text` with NLTK's `word_tokenize`, pass each token through
    the shared WordNet `lemmatizer`, and return the results joined by single
    spaces."""
    lemmas = map(lemmatizer.lemmatize, word_tokenize(text))
    return ' '.join(lemmas)

text_new3 = lemmatize(text_new2)

print(text_new3)

ah way saving worth think s video card


In [34]:
def stem(text):
    """Tokenise `text` with NLTK's `word_tokenize`, pass each token through
    the shared Porter `stemmer`, and return the results joined by single
    spaces."""
    stems = map(stemmer.stem, word_tokenize(text))
    return ' '.join(stems)

text_new3 = stem(text_new2)

print(text_new3)

ah way save worth think s video card


In [35]:
def _english_vocabulary():
    """Return the NLTK `words` corpus as a set, building it on first use only."""
    vocabulary = getattr(_english_vocabulary, '_cache', None)
    if vocabulary is None:
        vocabulary = set(nltk.corpus.words.words())
        # Cached on the function object so later calls skip the rebuild
        _english_vocabulary._cache = vocabulary
    return vocabulary


def remove_non_english(text):
    """Drop alphabetic tokens that are not in the NLTK `words` corpus.

    A token is kept when its lower-cased form is in the corpus, or when it is
    not purely alphabetic (numbers, punctuation and mixed tokens pass through
    untouched). Kept tokens are re-joined with single spaces.

    The vocabulary set is built once and reused; rebuilding it for every
    tweet is what made the row-by-row loop over the dataset slow.
    """
    words = _english_vocabulary()

    text = ' '.join(w for w in word_tokenize(text)
                    if w.lower() in words or not w.isalpha())

    return(text)

print(remove_non_english(text_new3))

ah way save worth think s video card


In [36]:
def remove_one_and_two_letter(text):
    """Return `text` with every whitespace-separated token of one or two
    characters removed; the remaining tokens are joined by single spaces."""
    long_enough = filter(lambda token: len(token) > 2, text.split())
    return ' '.join(long_enough)

In [37]:
# Build the `clean_tweet` column by running every raw tweet through the
# cleaning functions, in order. Stop-word removal and lemmatization run a
# second time after the short-token filter.
# NOTE(review): presumably the second pass catches stop words that only
# surface after lemmatization — confirm it is still needed.
cleaning_steps = (
    clean_tweets,
    remove_stopwords,
    lemmatize,
    remove_one_and_two_letter,
    remove_stopwords,
    lemmatize,
)

cleaned_tweets = df_all['raw tweet']
for cleaning_step in cleaning_steps:
    cleaned_tweets = cleaned_tweets.apply(cleaning_step)

df_all['clean_tweet'] = cleaned_tweets

In [38]:
# First pass of cleaning tweets
print(len(df_all['clean_tweet']))

5159


In [39]:
df1 = df_all.dropna()
df1.shape

(5159, 4)

In [40]:
# Start removing non-english words
# This will take a while to complete

#clean_tweet2 = []

#print('Cleaning and parsing the tweets...\n')

#update_freq = 100

#total_tweets = len(df_all['clean_tweet'])

#for i in range(0,len(df_all['clean_tweet'])):
#    if( (i+1)%update_freq == 0 ):
#        print('Tweets %d of %d has been processed' % ( i+1, total_tweets ) )                                                                   
#    clean_tweet2.append(remove_non_english(df_all['clean_tweet'][i]))

In [41]:
#print(clean_tweet2[667])

In [42]:
# Append to df_all
#df_all['clean_tweet_v2'] = clean_tweet2

#df_all

# Now write out to csv file
df_all.to_csv('./data/pulled_all_delta_airline_tweets_clean_subset_secs.csv',encoding='utf-8-sig')

#df_all.info()

In [43]:
#df_temp = pd.read_csv('./data/pulled_all_airline_tweets_clean_final.csv',encoding ='utf-8-sig')

#df_temp.info()

### Vectorization

In [None]:
# Set X matrix
X = df_all['clean_tweet']
#X = df_all['clean_tweet_v2']


#### Count-Vectorizer on entire dataset

This section is to look at the brute force vectorization feature output. I would like to know which misspelled words were missed.

In [None]:
cvec = CountVectorizer(stop_words='english',binary=False)

X_cvec = cvec.fit_transform(X)

df_cvec = pd.DataFrame(X_cvec.toarray(), columns=cvec.get_feature_names())

df_cvec.head(5)

#### Train-Test Split

In [None]:
# Train, test split
# In this case I do not have a target as of yet hence Unsupervised Learning
X_train, X_test = train_test_split(X, test_size=sample_test_size, random_state=SEED)

#### Count-Vectorizer

In [None]:
# Start with default options

min_df_input = 8
max_df_input = 0.9

cv1 = CountVectorizer(stop_words='english',min_df=min_df_input,max_df=max_df_input)
X_train_cv1 = cv1.fit_transform(X_train)
X_test_cv1  = cv1.transform(X_test)

# Bigrams
cv2 = CountVectorizer(ngram_range=(1,2), stop_words='english',min_df=min_df_input,max_df=max_df_input)
X_train_cv2 = cv2.fit_transform(X_train)
X_test_cv2  = cv2.transform(X_test)

# Trigrams
cv3 = CountVectorizer(ngram_range=(1,3), stop_words='english',min_df=min_df_input,max_df=max_df_input)
X_train_cv3 = cv3.fit_transform(X_train)
X_test_cv3  = cv3.transform(X_test)


In [None]:
df_cv1 = pd.DataFrame(X_train_cv1.toarray(), columns=cv1.get_feature_names())

In [None]:
df_cv1.head()

In [None]:
#for col in data.columns: 
#    print(col) 

#### Term Frequency - Inverse Document Frequency (TF-IDF)

In [None]:
# Let's take a stab at it
# Default vectorizer options

min_df_input = 10
max_df_input = 0.90

tfidf1         = TfidfVectorizer(stop_words='english',min_df=min_df_input,max_df=max_df_input)
X_train_tfidf1 = tfidf1.fit_transform(X_train)
X_test_tfidf1  = tfidf1.transform(X_test)

# Bi-grams
tfidf2         = TfidfVectorizer(ngram_range=(1,2), stop_words='english',min_df=min_df_input,max_df=max_df_input)
X_train_tfidf2 = tfidf2.fit_transform(X_train)
X_test_tfidf2  = tfidf2.transform(X_test)

# Tri-grams
tfidf3         = TfidfVectorizer(ngram_range=(1,3), stop_words='english',min_df=min_df_input,max_df=max_df_input)
X_train_tfidf3 = tfidf3.fit_transform(X_train)
X_test_tfidf3  = tfidf3.transform(X_test)

In [None]:
df_tfidf1 = pd.DataFrame(X_train_tfidf1.toarray(), columns=tfidf1.get_feature_names()).head()

df_tfidf1

In [None]:
#for col in data.columns: 
#    print(col) 

In [None]:
pd.DataFrame(X_train_tfidf1.toarray(), columns=tfidf1.get_feature_names()).shape

In [None]:
pd.DataFrame(X_train_tfidf2.toarray(), columns=tfidf2.get_feature_names()).head()

In [None]:
pd.DataFrame(X_train_tfidf3.toarray(), columns=tfidf3.get_feature_names()).head()

In [None]:
pd.DataFrame(X_train_tfidf3.toarray(), columns=tfidf3.get_feature_names()).shape

Note that we have more columns than rows, so we definitely need to reduce the dimensionality of the feature matrix.

### Dimensionality Reduction

#### Latent Semantic Analysis (LSA)

In [None]:
#print(X_train)

In [None]:
# Acronynms: Latent Semantic Analysis (LSA) is just another name for 
#  Singular Value Decomposition (SVD) applied to Natural Language Processing (NLP)

#s_num_topics = 2
#e_num_topics = s_num_topics+1

#for i in range(s_num_topics,e_num_topics):
    
#    lsa = TruncatedSVD(i,n_iter=100)
    
#    X_train_cv1_lsa_topic = lsa.fit_transform(X_train_cv1)
#    X_train_cv2_lsa_topic = lsa.fit_transform(X_train_cv2)
#    X_train_cv3_lsa_topic = lsa.fit_transform(X_train_cv3)
    
#    X_train_tfidf1_lsa_topic = lsa.fit_transform(X_train_tfidf1)
#    X_train_tfidf2_lsa_topic = lsa.fit_transform(X_train_tfidf2)
#    X_train_tfidf3_lsa_topic = lsa.fit_transform(X_train_tfidf3)
    
#    lsa.explained_variance_ratio_
    

In [None]:
num_topics = 7

lsa1 = TruncatedSVD(num_topics,n_iter=100, random_state=SEED)

X_train_tfidf1_lsa1_topic = lsa1.fit_transform(X_train_tfidf1)

lsa1.explained_variance_ratio_


In [None]:
# Bi-grams
#lsa2 = TruncatedSVD(num_topics,n_iter=100)

#X_train_tfidf2_lsa2_topic = lsa2.fit_transform(X_train_tfidf2)


In [None]:
# Tri-grams
#lsa3 = TruncatedSVD(num_topics,n_iter=100)

#X_train_tfidf3_lsa3_topic = lsa3.fit_transform(X_train_tfidf3)

In [None]:
# This is from the lecture notebook (Topic modeling with LSA and NMF)
def display_topics(model, feature_names, no_top_words, topic_names=None):
    """Print the highest-weighted terms of every topic in a fitted model.

    model         : fitted decomposition exposing `components_` (one row per topic)
    feature_names : sequence mapping column index -> term
    no_top_words  : how many terms to list per topic
    topic_names   : optional per-topic labels; topics without a label are
                    printed with their 1-based number instead
    """
    for ix, topic in enumerate(model.components_):
        has_label = bool(topic_names) and bool(topic_names[ix])
        if has_label:
            print("\nTopic: '",topic_names[ix],"'")
        else:
            print("\nTopic ", ix+1)

        # argsort is ascending, so walk it backwards to get the largest weights first
        top_indices = topic.argsort()[:-no_top_words - 1:-1]
        print(", ".join(feature_names[i] for i in top_indices))

In [None]:
display_topics(lsa1, tfidf1.get_feature_names(), 10)

In [None]:
# cosine similarity of training vect, LSA
CS1 = cosine_similarity(X_train_tfidf1_lsa1_topic).round(3)
print(CS1)
print(CS1.shape)

In [None]:
# cosine similarity
cosine_similarity((X_train_tfidf1_lsa1_topic[0], X_train_tfidf1_lsa1_topic[1], 
                   X_train_tfidf1_lsa1_topic[2], X_train_tfidf1_lsa1_topic[3],
                   X_train_tfidf1_lsa1_topic[4], X_train_tfidf1_lsa1_topic[5])).round(3)

In [None]:
cosine_similarity((X_train_tfidf1_lsa1_topic[0], X_train_tfidf1_lsa1_topic[3])).round(2)

In [None]:
#Vt = pd.DataFrame(X_train_tfidf1_lsa1_topic.round(5),
#                  index = X_train,
#                  columns = ["component_1","component_2","component_3","component_4","component_5"])
#Vt

In [None]:
#display_topics(lsa2, tfidf2.get_feature_names(), 10)

In [None]:
#display_topics(lsa3, tfidf3.get_feature_names(), 10)

In [None]:
# https://www.analyticsvidhya.com/blog/2018/10/stepwise-guide-topic-modeling-latent-semantic-analysis/
#for i, comp in enumerate(lsa1.components_):
#    terms_comp = zip(tfidf1.get_feature_names(), comp)
#    sorted_terms = sorted(terms_comp, key= lambda x:x[1], reverse=True)[:7]
#    print("Topic "+str(i)+": ")
#    for t in sorted_terms:
#        print(t[0])
#        print(" ")

#### Non-Negative Matrix Factorization (NMF)

In [None]:
nmf_topic = 7

nmf1 = NMF(nmf_topic, random_state=SEED)
X_train_tfidf1_nmf1_topic = nmf1.fit_transform(X_train_tfidf1)

In [None]:
# NMF with
display_topics(nmf1, tfidf1.get_feature_names(), 10)

In [None]:
# Topics from LSA
display_topics(lsa1, tfidf1.get_feature_names(), 10)

#### Latent Dirichlet Allocation (LDA)

In [None]:
import warnings 
warnings.simplefilter("ignore", DeprecationWarning)
# Load the LDA model from sk-learn
from sklearn.decomposition import LatentDirichletAllocation as LDA
import pyLDAvis
import pyLDAvis.sklearn

In [None]:
# Tweak the two parameters below
lda_topics = 5
number_words = 10

# Create and fit the LDA model
lda1 = LDA(n_components=lda_topics, n_jobs=-1, random_state=SEED)
X_train_tfidf1_lda1_topic = lda1.fit_transform(X_train_tfidf1)

lda2 = LDA(n_components=lda_topics, n_jobs=-1, random_state=SEED)
X_train_tfidf2_lda2_topic = lda2.fit_transform(X_train_tfidf2)

In [None]:
display_topics(lda1, tfidf1.get_feature_names(), number_words)

In [None]:
display_topics(lda2, tfidf2.get_feature_names(), number_words)

In [None]:
# Let plot using pLDAvis
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [None]:
# Uni-grams
pyLDAvis.enable_notebook()

vis_sk = pyLDAvis.sklearn.prepare(lda1, X_train_tfidf1, tfidf1)

vis_sk

In [None]:
# bi-grams
vis_sk2 = pyLDAvis.sklearn.prepare(lda2, X_train_tfidf2, tfidf2)

vis_sk2

#### LDA w/o pyLDAvis (gensim)

In [None]:
# Gensim
import gensim
#import gensim.corpora as corpora, models, similarities, matutils
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from gensim import corpora, models, similarities, matutils

from gensim.models.coherencemodel import CoherenceModel
from gensim.models.ldamodel import LdaModel
from gensim.corpora.dictionary import Dictionary

In [None]:
doc_word = X_train_tfidf1.transpose()

In [None]:
corpus = matutils.Sparse2Corpus(doc_word)

In [None]:
id2word = dict((v, k) for k, v in tfidf1.vocabulary_.items())

In [None]:
# Seeded like every other model in this notebook so the topics are reproducible
lda_wo_vis = models.LdaModel(corpus=corpus, num_topics=7, id2word=id2word, passes=5,
                             random_state=SEED)

In [None]:
lda_wo_vis.print_topics()

In [None]:
lda_corpus = lda_wo_vis[corpus]
lda_corpus

#### LDA w/ pyLDAvis (gensim)

In [None]:
import pyLDAvis
import pyLDAvis.gensim  # don't skip this

import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [None]:
# borrowed from class notebook
#doc_word = X_train_cv1.transpose()
# Tokenize text
def tokenize_text(text):
    """Split `text` into tokens with NLTK's `word_tokenize` and return the
    resulting token list."""
    return word_tokenize(text)

# One token list per training tweet, in the same order as X_train
text_data = [tokenize_text(line) for line in X_train]


In [None]:
id2word = corpora.Dictionary(text_data)

In [None]:
corpus = [id2word.doc2bow(text) for text in text_data]

In [None]:
# Convert sparse matrix of counts to a gensim corpus
#doc_word = count_vectorizer.transform(ng_train.data).transpose()
#corpus = matutils.Sparse2Corpus(doc_word)

In [None]:
#id2word = dict((v, k) for k, v in cv1.vocabulary_.items())


In [None]:
#lda = models.LdaModel(corpus=corpus, num_topics=3, id2word=id2word, passes=5)
# Fit a 7-topic gensim LDA model on the bag-of-words corpus built from the
# tokenized training tweets, seeded with SEED for reproducibility.
# NOTE(review): `LdaModel` is reached through the `ldamulticore` module
# namespace; presumably this is the same single-core class as
# `gensim.models.LdaModel` (update_every / alpha='auto' are passed) — confirm.
lda_model = gensim.models.ldamulticore.LdaModel(corpus=corpus,
                                            id2word=id2word,
                                            num_topics=7, 
                                            update_every=1,
                                            chunksize=100,
                                            passes=5,
                                            alpha='auto',   # let the model learn the document-topic prior
                                            random_state=SEED);

In [None]:
from pprint import pprint

pprint(lda_model.print_topics())

In [None]:
# Transform the docs from the word space to the topic space (like "transform" in sklearn)
#lda_corpus = lda_model[corpus]
#lda_corpus

In [None]:
#X_train_tfidf1_lda2_topic = lda_model.fit(X_train_tfidf1)

In [None]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary=lda_model.id2word)
vis

In [None]:
# Perplexity
# NOTE: log_perplexity returns the per-word likelihood bound (a log-scale
# value), not the perplexity itself.
print('\nPerplexity: ', lda_model.log_perplexity(corpus)) 

# Compute Coherence Score
# The 'c_v' measure needs the *tokenized* documents (a list of token lists).
# `text_data` is exactly what `id2word` and `corpus` were built from; passing
# the raw strings in X_train would make each document a sequence of characters.
coherence_model_lda = CoherenceModel(model=lda_model, texts=text_data, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

### Modeling

#### K-Means with LSA (SVD)

In [None]:
# Elbow scan on the LSA topic space (uni-gram TF-IDF only):
# refit K-Means for every k in 2..30 and record the inertia of each fit.
s_num_cluster, e_num_cluster = 2, 30

cluster_range = range(s_num_cluster, e_num_cluster + 1)

# inertia_vals2 / inertia_vals3 are placeholders for the bi-gram and tri-gram
# LSA runs, which are currently disabled, so they stay empty.
inertia_vals1, inertia_vals2, inertia_vals3 = [], [], []

for k in cluster_range:

    # n_init is left at the scikit-learn default; only k changes between fits
    km1 = KMeans(n_clusters=k, random_state=SEED, n_jobs=-1)

    km_fit1 = km1.fit(X_train_tfidf1_lsa1_topic)

    inertia_vals1 += [km_fit1.inertia_]
    
    #print('Status: ',)


In [None]:
plt.rc('font', size=12)
plt.figure(figsize=(10,8));
plt.plot(cluster_range,inertia_vals1,linewidth=2.5,label='Uni-gram');
#plt.plot(cluster_range,inertia_vals2,linewidth=2.5,label='Bi-gram');
#plt.plot(cluster_range,inertia_vals3,linewidth=2.5,label='Tri-gram');
plt.xlabel('Number of clusters, k');
plt.ylabel('Inertia from KMeans');
#plt.grid();
plt.title('Inertia vs. Number of Clusters (SVD). Num Topics: '+ str(num_topics));
plt.legend(loc='best')
plt.savefig('kmeans_inertia_tfidf.svg')

In [None]:
# For two topics, let's pick k = 5 for uni-grams and k = 5 for bi and trigrams

# Uni-gram
num_clusters = 5

km1f      = KMeans(n_clusters = num_clusters, n_jobs = -1, random_state=SEED)
km_fit1f  = km1f.fit(X_train_tfidf1_lsa1_topic)

#print(terms1)

#clusters_1 = km1f.labels_.tolist()

#print(clusters_1)

#print(order_centroids1)

In [None]:
plt.rcParams['figure.figsize'] = [5,5]
sns.set_style("whitegrid")
sns.set_context("talk")

# helper function that allows us to display data in 2 dimensions an highlights the clusters
def display_cluster(X, km, num_topics, num_clust):
    """Scatter-plot the first two columns of `X`, coloured by K-Means cluster.

    Draws on the current matplotlib figure; callers that plot several
    configurations should open a new figure before each call.

    X          : 2-D array of reduced features; only columns 0 and 1 are plotted
    km         : fitted KMeans (reads `labels_` and `cluster_centers_`)
    num_topics : number of topics, used in the title only
    num_clust  : number of clusters to draw; 0 plots every point in one colour
    """
    color = 'brgcmyk'
    alpha = 0.5
    s = 20
    
    plt.title('k-Means, Topics:'+str(num_topics) + ' Clusters:' + str(num_clust))
    
    if num_clust == 0:
        plt.scatter(X[:,0],X[:,1],c = color[0],alpha = alpha,s = s)
    else:
        for i in range(num_clust):
            # Cycle through the palette so more than 7 clusters do not index
            # past the end of `color`
            cluster_color = color[i % len(color)]
            members = km.labels_ == i
            plt.scatter(X[members,0],X[members,1],c = cluster_color,alpha = alpha,s=s)
            # Centroid marker, same colour as its cluster
            plt.scatter(km.cluster_centers_[i][0],km.cluster_centers_[i][1],c = cluster_color, marker = 'x', s = 100)

In [None]:
display_cluster(X_train_tfidf1_lsa1_topic,km1f,num_topics,num_clusters)

In [None]:
# Plot the LSA topic space clustered with k = 2..max_num_clusters and save
# one image per k.

max_num_clusters = 7

for i in range(2,max_num_clusters+1):
    
    # Local model names: the final 5-cluster `km1f` fitted above is left intact
    km_scan = KMeans(n_clusters = i, n_jobs = -1, random_state=SEED)
    km_scan.fit(X_train_tfidf1_lsa1_topic)
    
    # New figure per k; otherwise every scatter piles onto the same axes and
    # each saved image also contains the clusters of all smaller k
    plt.figure()
    
    # Title uses the actual number of LSA topics, matching the file name below
    display_cluster(X_train_tfidf1_lsa1_topic,km_scan,num_topics,i)
    
    image_file_name = 'kmeans_tfidf_topic_'+str(num_topics)+'_'+'cluster_'+str(i)+'.svg'

    plt.savefig(image_file_name)

In [None]:
#print(len(terms1))

#print("Top terms per cluster:")

#order_centroids1 = km1f.cluster_centers_.argsort()[:, ::-1]

#terms1 = tfidf1.get_feature_names()

#for i in range(n_clust):
#    print("Cluster %d:" % i),
#    for ind in order_centroids1[i, :50]:
#        print(' %s' % terms1[ind]),

#print(terms1)

In [None]:
# Grid over number of LSA topics and number of K-Means clusters.
# Both ranges start at 2; raise n_topics / n_clusters to widen the scan
# (currently only the single 2-topic, 2-cluster combination is run).

n_topics   = 2
n_clusters = 2

for i_topic in range(2,n_topics+1):

    # The LSA projection depends only on the topic count, so fit it once per
    # i_topic. It is seeded for reproducibility and stored under its own names
    # so the 7-topic `lsa1` / `X_train_tfidf1_lsa1_topic` fitted earlier are
    # not overwritten.
    lsa_scan   = TruncatedSVD(i_topic,n_iter=100, random_state=SEED)
    X_lsa_scan = lsa_scan.fit_transform(X_train_tfidf1)

    for i_clust in range(2,n_clusters+1):

        km_scan = KMeans(n_clusters = i_clust, n_jobs = -1, random_state=SEED)
        km_scan.fit(X_lsa_scan)

        # New figure per combination so plots are not drawn on top of each other
        plt.figure()
        display_cluster(X_lsa_scan,km_scan,i_topic,i_clust)

        image_file_name = 'kmeans_tfidf_topic_'+str(i_topic)+'_'+'cluster_'+str(i_clust)+'.svg'

        plt.savefig(image_file_name)

#### K-Means with NMF

In [None]:
# Elbow scan on the NMF topic space (uni-gram TF-IDF only):
# refit K-Means for every k in 2..30 and record the inertia of each fit.
s_num_cluster, e_num_cluster = 2, 30

cluster_range = range(s_num_cluster, e_num_cluster + 1)

inertia_vals_nmf1 = []

for k in cluster_range:

    # n_init is left at the scikit-learn default; only k changes between fits
    km1 = KMeans(n_clusters=k, random_state=SEED, n_jobs=-1)

    km_fit_nmf1 = km1.fit(X_train_tfidf1_nmf1_topic)

    inertia_vals_nmf1 += [km_fit_nmf1.inertia_]


In [None]:
# Inertia-vs-k curve for K-Means on the NMF topic space.
plt.rc('font', size=12)
plt.figure(figsize=(10,8));
plt.plot(cluster_range,inertia_vals_nmf1,linewidth=2.5,label='Uni-gram, NMF');
plt.xlabel('Number of clusters, k');
plt.ylabel('Inertia from KMeans');
#plt.grid();
# Title and file name use the NMF topic count (`nmf_topic`), not the LSA
# variable `num_topics` or a hard-coded number
plt.title('Kmeans Inertia NMF. Num Topics: '+ str(nmf_topic));
plt.legend(loc='best')
plt.savefig('kmeans_inertia_tfidf_nmf_topic'+str(nmf_topic)+'.svg')

In [None]:
num_clusters = 5

km1n      = KMeans(n_clusters = num_clusters, n_jobs = -1, random_state=SEED)
km_fit_nmf1  = km1n.fit(X_train_tfidf1_nmf1_topic)

In [None]:
display_cluster(X_train_tfidf1_nmf1_topic,km1n,nmf_topic,num_clusters)

#### K-Means with LDA from Sci-kit Learn

In [None]:
# Elbow scan on the scikit-learn LDA topic space (uni-gram TF-IDF only):
# refit K-Means for every k in 2..30 and record the inertia of each fit.
s_num_cluster, e_num_cluster = 2, 30

cluster_range = range(s_num_cluster, e_num_cluster + 1)

inertia_vals_lda1 = []

for k in cluster_range:

    # n_init is left at the scikit-learn default; only k changes between fits
    km1 = KMeans(n_clusters=k, random_state=SEED, n_jobs=-1)

    km_fit_lda1 = km1.fit(X_train_tfidf1_lda1_topic)

    inertia_vals_lda1 += [km_fit_lda1.inertia_]


In [None]:
# Inertia-vs-k curve for K-Means on the scikit-learn LDA topic space.
plt.rc('font', size=12)
plt.figure(figsize=(10,8));
plt.plot(cluster_range,inertia_vals_lda1,linewidth=2.5,label='Uni-gram, LDA');
plt.xlabel('Number of clusters, k');
plt.ylabel('Inertia from KMeans');
#plt.grid();
plt.title('Kmeans Inertia LDA. Num Topics: '+ str(lda_topics));
plt.legend(loc='best')
# Saved under an "lda" name (the figure was previously written to an "nmf"
# file), with the topic count taken from `lda_topics`
plt.savefig('kmeans_inertia_tfidf_lda_topic'+str(lda_topics)+'.svg')

In [None]:
num_clusters = 5

km_lda1      = KMeans(n_clusters = num_clusters, n_jobs = -1, random_state=SEED)
km_fit_lda1  = km_lda1.fit(X_train_tfidf1_lda1_topic)

In [None]:
display_cluster(X_train_tfidf1_lda1_topic,km_lda1,lda_topics,num_clusters)