In [1]:
import pandas as pd
import numpy as np  
import re 
import string

In [2]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [3]:
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

import warnings
warnings.filterwarnings("ignore")

In [4]:
import nltk
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/theDoctor/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
from textblob import TextBlob

In [6]:
def load_dataset(filepath):
    df = pd.read_csv(filepath, encoding='latin-1')
    df.columns = ["text","sentiment"]
    return df

In [16]:

def load_text_data(filename, append = False):
    if not append:
        data = []
        
    textfile = open(filename, "r")
    for tweet in textfile:
        parts = tweet.split(',')
        if len(parts) > 2:
            newTweet = [parts[0],parts[1], 0, "".join(parts[3:])]
            data.append(newTweet)
    textfile.close()
    df = pd.DataFrame(data)
    df.columns = ["User", "Date", "Retweet", "text"]
    return df


In [8]:
def delete_redundant_cols(df, cols):
    for col in cols:
        del df[col]
        return df

In [9]:
def preprocess_tweet_text(tweet):

    tweet = tweet.lower()

    tweet = re.sub(r"http\S+|www\S+|https\S+","",tweet, flags=re.MULTILINE)   

    tweet = tweet.translate(str.maketrans("","",string.punctuation))

    tweet = re.sub(r'\@\w+|\#',"",tweet)

    tweet_tokens = word_tokenize(tweet)
    filtered_words = [word for word in tweet_tokens if word not in stop_words]

    ps = PorterStemmer()
    stemmed_words = [ps.stem(w) for w in filtered_words]

    lemmatizer = WordNetLemmatizer()
    lemma_words = [lemmatizer.lemmatize(w, pos='a') for w in stemmed_words]

    return " ".join(lemma_words)
preprocess_tweet_text("Hi there, how are you preparing for your exams?")    

'hi prepar exam'

In [10]:
def get_feature_vector(train_fit):
    vector = TfidfVectorizer(sublinear_tf=True)
    vector.fit(train_fit)
    return vector

In [11]:
def int_to_string(sentiment):
    if sentiment < -0.1:
        return "Negative"
    elif sentiment < 0.1:
        return "Neutral"
    else:
        return "Positive"

In [18]:
# Load dataset
dataset = load_text_data("datasets/dataset 2021090613_29_33.txt")
# Remove unwanted columns from dataset
# n_dataset = remove_unwanted_cols(dataset, ['t_id', 'created_at', 'query', 'user'])
#Preprocess data
dataset["cleanText"] = dataset['text'].apply(preprocess_tweet_text)



# determine the sentiment of the tweet using TextBlob and use that result to train the model
sent = []
for x in dataset['cleanText']:
    s = TextBlob(x).sentiment.polarity
    if s < -0.1:
        r = -1
    elif s > 0.1:
        r = 1
    else:
        r = 0
    sent.append(r) 
#create the sentiment column
dataset["sentiment"] = sent

In [19]:
# Split dataset into Train, Test

# Same tf vector will be used for Testing sentiments on unseen trending data
tf_vector = get_feature_vector(np.array(dataset['cleanText']).ravel())
X = tf_vector.transform(np.array(dataset['cleanText']).ravel())
y = np.array(dataset['sentiment']).ravel()
# y = y.astype('float')

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=30)

# Training Naive Bayes model
NB_model = MultinomialNB()
NB_model.fit(X_train, y_train)
y_predict_nb = NB_model.predict(X_test)
print(accuracy_score(y_test, y_predict_nb))

# Training Logistics Regression model
LR_model = LogisticRegression(solver='lbfgs')
LR_model.fit(X_train, y_train)
y_predict_lr = LR_model.predict(X_test)
print(accuracy_score(y_test, y_predict_lr))

0.7272727272727273
0.7272727272727273


In [20]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52 entries, 0 to 51
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   User       52 non-null     object
 1   Date       52 non-null     object
 2   Retweet    52 non-null     int64 
 3   text       52 non-null     object
 4   cleanText  52 non-null     object
 5   sentiment  52 non-null     int64 
dtypes: int64(2), object(4)
memory usage: 2.6+ KB


In [13]:
dataset[dataset['sentiment'] < 0]

Unnamed: 0,User,Date,Retweet,text,cleanText,sentiment
4,two001snake,Mon Sep 06 18:26:37 +0000 2021,0,ALL #Corrupt @USCongress #FakeNews #MSM #TERR...,corrupt uscongress fakenew msm terrifi wethepe...,-1
9,shepersists2,Mon Sep 06 18:23:54 +0000 2021,0,I am anti war but the way Americans left Afgh...,anti war way american left afghanistan disgust...,-1
47,APissedOffCons1,Mon Sep 06 17:59:17 +0000 2021,0,If you were trying to destroy the country wou...,tri destroy countri would anyth differ biden now…,-1


In [14]:
dataset[dataset['sentiment'] > 0]

Unnamed: 0,User,Date,Retweet,text,cleanText,sentiment
1,global_police,Mon Sep 06 18:29:09 +0000 2021,0,@TheEconomist Good it's such a wonderful time...,theeconomist good wonder time sister biden,1
6,twistedcomputer,Mon Sep 06 18:24:35 +0000 2021,0,#Maddow gets paid millions as #Biden's modern...,maddow get paid million biden modern day tokyo...,1
18,SaintlySicilian,Mon Sep 06 18:20:32 +0000 2021,0,I know that in certain countries (North Korea...,know certain countri north korea former soviet...,1
21,00001Kat,Mon Sep 06 18:18:44 +0000 2021,0,Wait-I thought we were going to “follow the #...,waiti thought go “ follow scienc ” much biden ...,1
29,clayandbuck,Mon Sep 06 18:14:27 +0000 2021,0,"""What does the #Biden administration have to ...",biden administr hang hat right claytravi biden...,1
30,DickMorrisTweet,Mon Sep 06 18:13:31 +0000 2021,0,People Think Better Of Trump As They Get To K...,peopl think good trump get know biden – lunch ...,1
33,ChuckNorton1,Mon Sep 06 18:12:24 +0000 2021,0,Taliban Holding 6 planes full of Americans an...,taliban hold 6 plane full american siv hostag,1
39,leylaboulton,Mon Sep 06 18:04:40 +0000 2021,0,Let’s hope the #Biden administration can do m...,let ’ hope biden administr protect women ’ rig...,1


In [None]:
# test_file_name = "trending_tweets/08-04-2020-1586291553-tweets.csv"
# test_ds = load_dataset(test_file_name, ["t_id", "hashtag", "created_at", "user", "text"])
# test_ds = remove_unwanted_cols(test_ds, ["t_id", "created_at", "user"])

# # Creating text feature
# test_ds.text = test_ds["text"].apply(preprocess_tweet_text)
# test_feature = tf_vector.transform(np.array(test_ds.iloc[:, 1]).ravel())

# # Using Logistic Regression model for prediction
# test_prediction_lr = LR_model.predict(test_feature)

# # Averaging out the hashtags result
# test_result_ds = pd.DataFrame({'hashtag': test_ds.hashtag, 'prediction':test_prediction_lr})
# test_result = test_result_ds.groupby(['hashtag']).max().reset_index()
# test_result.columns = ['heashtag', 'predictions']
# test_result.predictions = test_result['predictions'].apply(int_to_string)

# print(test_result)