Set `jupyter.notebookFileRoot` to `${workspaceFolder}`.

In [23]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import ComplementNB
from sklearn.linear_model import LogisticRegression

import src.models.naive_bayes as naive_bayes
import src.models.logreg as logreg
from src.models.vectorization import CustomTfidfVectorizer
import src.models.tokenization as tkn


In [24]:
#frame = pd.read_csv("dataset.csv", delimiter=",", quotechar="'", names=("tweet", "label"))

path = "src/models/dataset.xlsx" # adjust to where you keep the dataset

frame = pd.read_excel(
    path, 
    sheet_name="Worksheet", 
    header=None, 
    usecols="A:B", 
    names=("tweet", "label")
)

X = frame["tweet"].values
y = frame["label"].values 

print("tweets: size ", X.size)
print(X[:10])
print(y[:10])

print("AI tweets:", np.count_nonzero(y))

tweets: size  13683
['Jolly Jovial #JeromePowell at the FOMC #Powell #FOMC #FOMCMeeting #RateCut #Rate #RateCuts #25bps #Economy #Inflation #Deflation #Economics #Fed #TheFed #FederalReserve $BTC #BTC #Bitcoin'
 'Concept art of Devil May Cry when it was still in development as Resident Evil 4'
 '\'"I have this weird habit of,  when having spare time between projects,  just staring at my desktop and trying to build up the courage or motivation to start a new game. Like I don\'\'t like the commitment of starting a new game,  or always feel like there\'\'s something more productive."\''
 "'This is not a joke. I’ve been through this exact experience with my boyfriend. When he was stressed or depressed,  he started using curse words and making inappropriate jokes. He made some pretty bad jokes and used some really inappropriate language. If you'"
 "'Social Justice is the topic that everyone seems to be talking about these days. A social justice movement has been growing in America for years

In [25]:
# Run if you wish to split phrases written in Pascal case
X = list(map(tkn.split_pascals, X))

In [26]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=5000, test_size=1000)

In [27]:
# Custom-preprocessed sets
custom_vec = CustomTfidfVectorizer(min_df=5, max_df_ratio=0.6)

X_train_custom_prep = custom_vec.fit_transform(X_train)
X_test_custom_prep = custom_vec.transform(X_test)

In [28]:
# Sklearn-preprocessed sets
sklearn_vec = TfidfVectorizer(min_df=5, max_df=0.6, sublinear_tf=True)

X_train_sklearn_prep = sklearn_vec.fit_transform(X_train).toarray()
X_test_sklearn_prep = sklearn_vec.transform(X_test).toarray()

In [29]:
# Custom logistic regression 
lr = logreg.LogisticRegressionModel()

## on custom-preprocessed data
lr.train(X_train_custom_prep, y_train)

accuracy = (lr.classify(X_test_custom_prep) == y_test).mean()
print("Accuracy logreg custom/custom: ", accuracy)

## on sklearn-preprocessed data
lr.train(X_train_sklearn_prep, y_train)

accuracy = (lr.classify(X_test_sklearn_prep) == y_test).mean()
print("Accuracy logreg sklearn/custom: ", accuracy)

  aux = ys - np.reciprocal(1 + np.exp(-xs @ beta))


Accuracy logreg custom/custom:  0.717
Accuracy logreg sklearn/custom:  0.846


In [33]:
# Custom naive Bayes
nb = naive_bayes.NaiveBayesModel()

# it uses its own preprocessing
X_train_bayes = nb.preprocess_set(X_train)
X_test_bayes = nb.preprocess_set(X_test)

nb.train(X_train_bayes, y_train)

results = nb.classify(X_test_bayes)
correct = 0
for result, label in zip(results, y_test):
    if result == label:
        correct += 1
acc = correct/1000
print("Accuracy custom Bayes: ", acc)

Accuracy custom Bayes:  0.547


In [34]:
# Sklearn's LogisticRegression
slr = LogisticRegression(penalty=None)

## on custom-preprocessed data
slr.fit(X_train_custom_prep, y_train)

accuracy = (slr.predict(X_test_custom_prep) == y_test).mean()
print("Accuracy logreg custom/sklearn: ", accuracy)

## on sklearn-preprocessed data
slr.fit(X_train_sklearn_prep, y_train)

accuracy = (slr.predict(X_test_sklearn_prep) == y_test).mean()
print("Accuracy logreg sklearn/sklearn: ", accuracy)

Accuracy logreg custom/sklearn:  0.821
Accuracy logreg sklearn/sklearn:  0.893


In [35]:
# Sklearn's ComplementNB
cnb = ComplementNB()

## on custom-preprocessed data
cnb.fit(X_train_custom_prep, y_train)

accuracy = (cnb.predict(X_test_custom_prep) == y_test).mean()
print("Accuracy Bayes custom/sklearn: ", accuracy)

## on sklearn-preprocessed data
cnb.fit(X_train_sklearn_prep, y_train)

accuracy = (cnb.predict(X_test_sklearn_prep) == y_test).mean()
print("Accuracy Bayes sklearn/sklearn: ", accuracy)

Accuracy Bayes custom/sklearn:  0.811
Accuracy Bayes sklearn/sklearn:  0.875
