<h1>Analyzing Text Message Data - NLP ML</h1>

In [32]:
# import libraries
import pandas as pd
from bs4 import BeautifulSoup
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
import nltk

In [2]:
# Load the cleaned NUS SMS dataset from a CSV file located next to the notebook.
# NOTE(review): the preview below shows "Unnamed: 0" and "Unnamed: 0.1" columns, which
# look like row indexes written out by earlier to_csv calls — confirm and consider dropping them.
clean_nus_sms = pd.read_csv('./clean_nus_sms.csv')

In [3]:
# Preview the first rows to check which columns were loaded.
clean_nus_sms.head()

Unnamed: 0.1,Unnamed: 0,id,Message,length,country,Date
0,0,10120,Bugis oso near wat...,21,SG,2003/4
1,1,10121,"Go until jurong point, crazy.. Available only ...",111,SG,2003/4
2,2,10122,I dunno until when... Lets go learn pilates...,46,SG,2003/4
3,3,10123,Den only weekdays got special price... Haiz......,140,SG,2003/4
4,4,10124,Meet after lunch la...,22,SG,2003/4


In [4]:
# Work on a copy so the following steps read from it rather than from the loaded frame.
clean_nus_sms_copy = clean_nus_sms.copy()

# Preview the copy; it should show the same rows as the original frame.
clean_nus_sms_copy.head()

Unnamed: 0.1,Unnamed: 0,id,Message,length,country,Date
0,0,10120,Bugis oso near wat...,21,SG,2003/4
1,1,10121,"Go until jurong point, crazy.. Available only ...",111,SG,2003/4
2,2,10122,I dunno until when... Lets go learn pilates...,46,SG,2003/4
3,3,10123,Den only weekdays got special price... Haiz......,140,SG,2003/4
4,4,10124,Meet after lunch la...,22,SG,2003/4


In [14]:
# Text Preprocessing

# 1) Tokenization

def tokenize_words(text):
    """Split a single message into word-level tokens.

    Parameters
    ----------
    text : object
        The raw message. Only ``str`` values are tokenized.

    Returns
    -------
    list
        The tokens produced by ``word_tokenize`` for a string input, or an
        empty list for any non-string value (e.g. a missing message).
    """
    # Non-string cells (NaN, None, numbers) have nothing to tokenize.
    if not isinstance(text, str):
        return []
    return word_tokenize(text)

# Start the working frame with one column holding the token list of every message;
# the later preprocessing steps add their own columns to this frame.
new_clean_nus_sms = pd.DataFrame(
    {'Words_Tokenized_Message': clean_nus_sms_copy['Message'].apply(tokenize_words)}
)
new_clean_nus_sms.head()

Unnamed: 0,Words_Tokenized_Message
0,"[Bugis, oso, near, wat, ...]"
1,"[Go, until, jurong, point, ,, crazy, .., Avail..."
2,"[I, dunno, until, when, ..., Lets, go, learn, ..."
3,"[Den, only, weekdays, got, special, price, ......"
4,"[Meet, after, lunch, la, ...]"


In [25]:
# 2) Noise removal

def preprocess_text(words):
    """Remove noise from a tokenized message and return it as a single string.

    The tokens are joined with single spaces; HTML markup, URLs,
    punctuation/symbols and digits are then stripped, the text is tokenized
    again and English stop words are dropped (compared case-insensitively).

    Parameters
    ----------
    words : list of str
        Tokens of one message. Any falsy value (such as an empty list) is
        treated as an empty message.

    Returns
    -------
    str
        The remaining tokens joined by single spaces, or ``''`` when
        ``words`` is empty.
    """
    if not words:
        return ''

    # Rebuild the message as one string so the regex-based cleaning can run on it.
    # NOTE(review): the input has already been through word_tokenize, which may have
    # split tags and URLs into separate tokens — confirm the HTML and URL steps below
    # still match on the re-joined text, or run them on the raw message instead.
    text = ' '.join(words)

    # remove HTML tags
    text = BeautifulSoup(text, "html.parser").get_text()

    # remove URLs
    text = re.sub(r'http\S+', '', text)

    # remove punctuation and symbols
    text = re.sub(r'[^\w\s]', '', text)

    # remove digits
    text = re.sub(r'\d+', '', text)

    # tokenize the cleaned text
    tokens = word_tokenize(text)

    # The stop-word set is identical for every message, so build it on the first
    # call and keep it on the function object instead of rebuilding it per row.
    try:
        stop_words = preprocess_text._stop_words
    except AttributeError:
        stop_words = preprocess_text._stop_words = frozenset(stopwords.words('english'))

    # remove stopwords (case-insensitive match, original casing is kept)
    filtered_tokens = [word for word in tokens if word.lower() not in stop_words]

    # join the surviving tokens back into a single string
    return ' '.join(filtered_tokens)
# Run the noise-removal step on every tokenized message and store the cleaned
# string in a new column of the working frame.
new_clean_nus_sms['Noise_Removed_Message'] = new_clean_nus_sms['Words_Tokenized_Message'].apply(preprocess_text)
new_clean_nus_sms.head()



Unnamed: 0,Words_Tokenized_Message,Noise_Removed_Message
0,"[Bugis, oso, near, wat, ...]",Bugis oso near wat
1,"[Go, until, jurong, point, ,, crazy, .., Avail...",Go jurong point crazy Available bugis n great ...
2,"[I, dunno, until, when, ..., Lets, go, learn, ...",dunno Lets go learn pilates
3,"[Den, only, weekdays, got, special, price, ......",Den weekdays got special price Haiz Cant eat l...
4,"[Meet, after, lunch, la, ...]",Meet lunch la


In [33]:
# Download the WordNet corpus ahead of the lemmatization step below.
# NOTE(review): word_tokenize and stopwords are used in earlier cells, but this notebook
# never downloads their NLTK resources (presumably 'punkt' and 'stopwords') — confirm they
# are installed, otherwise a fresh environment will fail before reaching this cell.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /home/marius/nltk_data...


True

In [35]:
# 3) Text normalization

# normalize 
def tokenize_and_normalize(text):
    """Tokenize a cleaned message and drop English stop words.

    Stop words are matched case-insensitively, but the returned tokens keep
    their original casing; no lower-casing or other normalization is applied
    here.

    Parameters
    ----------
    text : str
        The message to tokenize.

    Returns
    -------
    list of str
        The tokens of ``text`` that are not English stop words, in order.
    """
    # The stop-word set is identical for every message, so build it on the first
    # call and keep it on the function object instead of rebuilding it per row.
    try:
        stop_words = tokenize_and_normalize._stop_words
    except AttributeError:
        stop_words = tokenize_and_normalize._stop_words = frozenset(stopwords.words('english'))

    return [word for word in word_tokenize(text) if word.lower() not in stop_words]

# stemming
def stem_tokens(tokens):
    """Return the Porter stem of every token.

    Parameters
    ----------
    tokens : iterable of str
        Tokens of one message.

    Returns
    -------
    list
        One stem per input token, in the same order.
    """
    stemmer = PorterStemmer()
    return list(map(stemmer.stem, tokens))

# lemmatization
def lemmatize_tokens(tokens):
    """Return the WordNet lemma of every token.

    No part-of-speech is passed to the lemmatizer, so its default is used
    for every token.

    Parameters
    ----------
    tokens : iterable of str
        Tokens of one message.

    Returns
    -------
    list
        One lemma per input token, in the same order.
    """
    wordnet_lemmatizer = WordNetLemmatizer()
    lemmas = []
    for token in tokens:
        lemmas.append(wordnet_lemmatizer.lemmatize(token))
    return lemmas

# Tokenize the noise-free text first; the two columns below are derived from 'Tokens',
# so this assignment has to run before them.
new_clean_nus_sms['Tokens'] = new_clean_nus_sms['Noise_Removed_Message'].apply(tokenize_and_normalize)
# Keep the stemmed and the lemmatized variants side by side for comparison.
new_clean_nus_sms['Stemmed_Tokens'] = new_clean_nus_sms['Tokens'].apply(stem_tokens)
new_clean_nus_sms['Lemmatized_Tokens'] = new_clean_nus_sms['Tokens'].apply(lemmatize_tokens)
new_clean_nus_sms.head()

Unnamed: 0,Words_Tokenized_Message,Noise_Removed_Message,Tokens,Stemmed_Tokens,Lemmatized_Tokens
0,"[Bugis, oso, near, wat, ...]",Bugis oso near wat,"[Bugis, oso, near, wat]","[bugi, oso, near, wat]","[Bugis, oso, near, wat]"
1,"[Go, until, jurong, point, ,, crazy, .., Avail...",Go jurong point crazy Available bugis n great ...,"[Go, jurong, point, crazy, Available, bugis, n...","[go, jurong, point, crazi, avail, bugi, n, gre...","[Go, jurong, point, crazy, Available, bugis, n..."
2,"[I, dunno, until, when, ..., Lets, go, learn, ...",dunno Lets go learn pilates,"[dunno, Lets, go, learn, pilates]","[dunno, let, go, learn, pilat]","[dunno, Lets, go, learn, pilate]"
3,"[Den, only, weekdays, got, special, price, ......",Den weekdays got special price Haiz Cant eat l...,"[Den, weekdays, got, special, price, Haiz, Can...","[den, weekday, got, special, price, haiz, cant...","[Den, weekday, got, special, price, Haiz, Cant..."
4,"[Meet, after, lunch, la, ...]",Meet lunch la,"[Meet, lunch, la]","[meet, lunch, la]","[Meet, lunch, la]"


In [37]:
# Select the lemmatized tokens as the input data for the analyses below.
X = new_clean_nus_sms['Lemmatized_Tokens']
X

0                                  [Bugis, oso, near, wat]
1        [Go, jurong, point, crazy, Available, bugis, n...
2                         [dunno, Lets, go, learn, pilate]
3        [Den, weekday, got, special, price, Haiz, Cant...
4                                        [Meet, lunch, la]
                               ...                        
48593                                         [Come, NOON]
48594                                               [LOVE]
48595                                                [CYA]
48596                                              [GUEST]
48597                           [MANY, MANY, MANY, PEOPLE]
Name: Lemmatized_Tokens, Length: 48598, dtype: object

In [7]:
# Conduct sentiment analysis on the text messages.

In [8]:
# Find the most common topics users text about

In [9]:
# Determine how similar various users’ texts are (by topic, vocabulary use, or Levenshtein distance)

In [10]:
# Figure out what kinds of biases exist in the text message data

In [11]:
# Create frequency distributions for messaging, sentiments, or key words/phrases

In [12]:
# Use POS tagging to find commonly used phrases