In [52]:
import pandas as pd
import numpy as np
import re
import string
import nltk
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
# ML Libraries
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
# Global Parameters
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\c2099176\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [137]:
# Loading the dataset
def load_dataset(type_):
    with open(f"../data/{type_}_text.txt", "r", encoding="utf-8") as fp:
        x = fp.readlines()
    with open(f"../data/{type_}_labels.txt", "r", encoding="utf-8") as fp:
        y = [int(i) for i in fp.readlines()]
        
    data = {'text': x, 'label': y}
    df = pd.DataFrame(data)
    
    return df

In [41]:
# Pre-processing the tweets
def preprocess_tweets(tweet):
    tweet.lower()
    # Remove urls
    tweet = re.sub(r"http\S+|www\S+|https\S+", '', tweet, flags=re.MULTILINE)
    # Remove user @ references and '#' from tweet
    tweet = re.sub(r'\@\w+|\#','', tweet)
    # Remove punctuations
    tweet = tweet.translate(str.maketrans('', '', string.punctuation))
    # Remove stopwords
    tweet_tokens = word_tokenize(tweet)
    filtered_words = [w for w in tweet_tokens if not w in stop_words]
    
    # Stemming and Lemmatizing both are normalization techniques.
    # We will use either of these not both.
    # Stemming is faster than lemmatization    
    
    ps = PorterStemmer()
    stemmed_words = [ps.stem(w) for w in filtered_words]
    #lemmatizer = WordNetLemmatizer()
    #lemma_words = [lemmatizer.lemmatize(w, pos='a') for w in stemmed_words]
    
    return " ".join(stemmed_words)

In [109]:
# Convert Value to String
def value_to_string(sentiment):
    if sentiment == 0:
        return "Negative"
    elif sentiment == 1:
        return "Neutral"
    else:
        return "Positive"

In [234]:
def naive_bayes_model(X_train, y_train, X_any, y_any):
    # Training Naive Bayes model
    NB_model = MultinomialNB()
    NB_model.fit(X_train, y_train)
    y_predict_nb = NB_model.predict(X_any)
    print(accuracy_score(y_any, y_predict_nb))
    return NB_model

In [242]:
def logistic_reg_model(X_train, y_train, X_any, y_any):
    # Training Logistics Regression model
    LR_model = LogisticRegression(solver='lbfgs', max_iter=10000)
    LR_model.fit(X_train, y_train)
    y_predict_lr = LR_model.predict(X_any)
    print(accuracy_score(y_any, y_predict_lr))
    return LR_model

In [256]:
# Load train dataset
train_dataset = load_dataset("train")
# Preprocess train data
train_dataset.text = train_dataset['text'].apply(preprocess_tweets)

# Load val dataset
val_dataset = load_dataset("val")
# Preprocess val data
val_dataset.text = val_dataset['text'].apply(preprocess_tweets)

# Load test dataset
test_dataset = load_dataset("test")
# Preprocess data
test_dataset.text = test_dataset['text'].apply(preprocess_tweets)

In [257]:
vector = TfidfVectorizer(sublinear_tf=True)

In [213]:
# train tf vector
tf_vector = vector.fit(np.array(train_dataset.iloc[:, 0]).ravel())
X_train = tf_vector.transform(np.array(train_dataset.iloc[:, 0]).ravel())
y_train = np.array(train_dataset.iloc[:, 1]).ravel()

In [238]:
# validation tf vector
X_val = tf_vector.transform(np.array(val_dataset.iloc[:, 0]).ravel())
y_val = np.array(val_dataset.iloc[:, 1]).ravel()

In [250]:
# test tf vector
X_test = tf_vector.transform(np.array(test_dataset.iloc[:, 0]).ravel())
y_test = np.array(test_dataset.iloc[:, 1]).ravel()
# test_feature = tf_vector.transform(np.array(test_dataset.iloc[:, 0]).ravel())

In [231]:
naive_bayes_model(X_train, y_train, X_val, y_val)

0.601


In [253]:
logistic_reg_model(X_train, y_train, X_val, y_val)

0.6675


LogisticRegression(max_iter=10000)

In [254]:
logistic_reg_model(X_train, y_train, X_test, y_test)

0.5945945945945946


LogisticRegression(max_iter=10000)