In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import pandas as pd
from bs4 import BeautifulSoup
import re
import matplotlib.pyplot as plt

In [2]:
data = pd.read_csv('data/sentiment_data.csv', sep=',', header=0)
data = data.sample(frac=1)
data.head()

Unnamed: 0,msg_id,from_number,date_inserted,message,binary_sentiment,sentiment,respond
873,366952,,,YTKzck,0,2,0
349,331763,19179160000.0,7/9/2020 21:02,I don't see the 35%off in my cart is there a c...,0,1,1
510,484325,19123290000.0,7/19/2020 21:06,UUGATL,0,2,0
742,77326,,,You got it,1,3,0
864,339151,,,Ok,0,2,0


In [3]:
def clean_msg(msg):
    msg = BeautifulSoup(msg, "lxml").get_text()
    # Removing the URL links
    msg = re.sub(r"https?://[A-Za-z0-9./]+", ' ', msg)
    # Keeping only letters
    msg = re.sub(r"[^a-zA-Z.!?']", ' ', msg)
    # Removing additional whitespaces
    msg = re.sub(r" +", ' ', msg)
    return msg

In [4]:
messages = [clean_msg(msg) for msg in data['message']]
train_size = int(len(messages)/2)
train_msg = messages[:train_size]
test_msg = messages[train_size:]

# filtered_msg = [i for i, msg in enumerate(messages)]
# train_size = int(len(filtered_msg)/2)
# train_msg = [messages[i] for i in filtered_msg[:train_size]]
# test_msg = [messages[i] for i in filtered_msg[train_size:]]

vectorizer = TfidfVectorizer()
vectorizer.fit(train_msg)
trans_train_msg = vectorizer.transform(train_msg)
trans_test_msg = vectorizer.transform(test_msg)

In [5]:
binary_sent = data['binary_sentiment']
train_bin_sent = binary_sent[:train_size]
test_bin_sent = binary_sent[train_size:]

# train_bin_sent = [binary_sent[i] for i in filtered_msg[:train_size]]
# test_bin_sent = [binary_sent[i] for i in filtered_msg[train_size:]]

binary_model = LogisticRegression()
binary_model.fit(trans_train_msg, train_bin_sent)
prediction = binary_model.predict(trans_test_msg)
print("Accuracy: {}".format(accuracy_score(test_bin_sent, prediction)))
# print(prediction)
# print(test_bin_sent)

Accuracy: 0.806941431670282


In [6]:
sentiment = data['sentiment']
train_sent = sentiment[:train_size]
test_sent = sentiment[train_size:]

# train_sent = [sentiment[i] for i in filtered_msg[:train_size]]
# test_sent = [sentiment[i] for i in filtered_msg[train_size:]]

sentiment_model = LogisticRegression()
sentiment_model.fit(trans_train_msg, train_sent)
prediction = sentiment_model.predict(trans_test_msg)
print("Accuracy: {}".format(accuracy_score(test_sent, prediction)))
# print(prediction)
# print(test_sent)

Accuracy: 0.7418655097613883


In [7]:
binary_resp = data['respond']
train_bin_resp = binary_resp[:train_size]
test_bin_resp = binary_resp[train_size:]

# train_bin_resp = [binary_sent[i] for i in filtered_msg[:train_size]]
# test_bin_resp = [binary_sent[i] for i in filtered_msg[train_size:]]

resp_model = LogisticRegression()
resp_model.fit(trans_train_msg, train_bin_resp)
prediction = resp_model.predict(trans_test_msg)
print("Accuracy: {}".format(accuracy_score(test_bin_resp, prediction)))
# print(prediction)
# print(test_bin_sent)

Accuracy: 0.8524945770065075
