In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import pandas as pd
from bs4 import BeautifulSoup
import re
import matplotlib.pyplot as plt

In [2]:
data = pd.read_csv('data/sentiment_data.csv', sep=',', header=0)
data = data.sample(frac=1)
data.head()

Unnamed: 0,msg_id,from_number,date_inserted,message,binary_sentiment,sentiment,respond
109,118077,19125960000.0,7/23/2020 0:03,I'm not seeing the 40%off at checkout.,0,1,1
604,597998,12762520000.0,6/8/2020 18:34,"Done, GOD Bless 🙏🏼",1,3,1
15,25676,13367060000.0,7/4/2020 0:57,Hey wyd,1,2,1
31,42516,19167990000.0,6/22/2020 20:47,I went to the site and acquired some items. Wh...,1,3,1
85,89830,12109130000.0,7/14/2020 13:58,Henry...got a sale going now?,0,2,1


In [3]:
def clean_msg(msg):
    msg = BeautifulSoup(msg, "lxml").get_text()
    # Removing the URL links
    msg = re.sub(r"https?://[A-Za-z0-9./]+", ' ', msg)
    # Keeping only letters
    msg = re.sub(r"[^a-zA-Z.!?']", ' ', msg)
    # Removing additional whitespaces
    msg = re.sub(r" +", ' ', msg)
    return msg

In [4]:
# train_size = int(len(data)/2)
messages = [clean_msg(msg) for msg in data['message']]
train_size = int(len(messages)/2)
train_msg = messages[:train_size]
test_msg = messages[train_size:]

# filtered_msg = [i for i, msg in enumerate(messages) if len(msg) > 0]
# train_size = int(len(filtered_msg)/2)
# train_msg = [messages[i] for i in filtered_msg[:train_size]]
# test_msg = [messages[i] for i in filtered_msg[train_size:]]

vectorizer = TfidfVectorizer()
vectorizer.fit(train_msg)
trans_train_msg = vectorizer.transform(train_msg)
trans_test_msg = vectorizer.transform(test_msg)

In [5]:
binary_sent = data['binary_sentiment']
train_bin_sent = binary_sent[:train_size]
test_bin_sent = binary_sent[train_size:]

# train_bin_sent = [binary_sent[i] for i in filtered_msg[:train_size]]
# test_bin_sent = [binary_sent[i] for i in filtered_msg[train_size:]]

binary_model = KMeans(n_clusters=2, init='k-means++')
binary_model.fit(trans_train_msg)
prediction = binary_model.predict(trans_test_msg)
print("Accuracy: {}".format(accuracy_score(test_bin_sent, prediction)))
# print(prediction)
# print(test_bin_sent)

Accuracy: 0.5943600867678959


In [6]:
sentiment = data['sentiment']
train_sent = sentiment[:train_size]
test_sent = sentiment[train_size:]

# train_sent = [sentiment[i] for i in filtered_msg[:train_size]]
# test_sent = [sentiment[i] for i in filtered_msg[train_size:]]

sentiment_model = KMeans(n_clusters=3, init='k-means++')
sentiment_model.fit(trans_train_msg)
prediction = sentiment_model.predict(trans_test_msg)
print("Accuracy: {}".format(accuracy_score(test_sent, prediction)))
# print(prediction)
# print(test_sent)

Accuracy: 0.06073752711496746


In [7]:
binary_resp = data['respond']
train_bin_resp = binary_resp[:train_size]
test_bin_resp = binary_resp[train_size:]

# train_bin_resp = [binary_sent[i] for i in filtered_msg[:train_size]]
# test_bin_resp = [binary_sent[i] for i in filtered_msg[train_size:]]

resp_model = KMeans(n_clusters=2, init='k-means++')
resp_model.fit(trans_train_msg)
prediction = resp_model.predict(trans_test_msg)
print("Accuracy: {}".format(accuracy_score(test_bin_resp, prediction)))
# print(prediction)
# print(test_bin_sent)

Accuracy: 0.613882863340564
