In [22]:
import pandas as pd
import matplotlib.pyplot as plt

# nltk -> natural language tool kit, the library we will use basic NLP functions

from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, f1_score

In [None]:
data = pd.read_csv("data/samples/sample_20000.csv")

In [None]:
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

def preprocess(text):
    # Tokenize the text
    tokens = word_tokenize(text.lower())
    # Lemmatize each token
    # lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
    # Stem each token
    
    lemmatized_tokens = [stemmer.stem(token) for token in tokens]
    # Join the tokens back into a string
    return " ".join(lemmatized_tokens)


data["selftext_string"] = data["selftext"].astype(str)
data["selftext_preprocessed"] = data["selftext_string"].apply(preprocess)
print(data["selftext_preprocessed"].head(5).values)

["my four roommat and i ( all in colleg ) went out to dinner with a few other friend includ one of their girlfriend and her sister . we alreadi had a histori of not be super nice to each other , as i see her boss my roommat around all the time ( none of my busi i know ) but thi day push her to almost full out hatr of me . we had a tabl for over ten peopl and they are sit across from me . the meal goe fine until we get the bill . the waitress hand out pen for the receipt sinc we all pay by card . we all get similar pen except for her , she get one of the nicest pen i have ever seen . thi thing had heft and mass , and wrote like a dream ( i borrow it after she use it to sign my receipt ) . it look almost exactli like [ thi ] ( http : //static1.jetpens.com/images/a/000/006/6234.jpg ) . as we are about to leav she start talk to her sister about take the pen . i overhear thi and mention that they should n't take the pen , they in no way need it and the waitress is just work to support herse

In [None]:
data["link_flair_text"].unique()

In [18]:
asshole_flairs = ["asshole", 
                  "slight asshole",
                  "Asshole", 
                  "asshole (a bit)", 
                  "Asshole (but funny/justified)", 
                  "justified asshole",
                  "huge asshole", 
                  "asshole (Kind of)",
                  "asshole (tiny bit)", 
                  "Crouching Liar; hidden asshole",
                  "Not the A-hole POO Mode",
                  "Asshole POO Mode"]
data["target"] = data["link_flair_text"].apply(lambda x: 1 if x in asshole_flairs else 0)

In [19]:
documents = data["selftext_preprocessed"].tolist()
titles = data["title"].tolist()
tfidfvectorizer = TfidfVectorizer(stop_words="english")
countvectorizer = TfidfVectorizer(stop_words="english", use_idf=False, norm=None)

tfidf = tfidfvectorizer.fit_transform(raw_documents=documents)
tfidf_df = pd.DataFrame(tfidf.todense(), index=titles, columns=tfidfvectorizer.get_feature_names_out())

absolute_tfidf = countvectorizer.fit_transform(raw_documents=documents)
absolute_tfidf_df = pd.DataFrame(absolute_tfidf.todense(), index=titles, columns=countvectorizer.get_feature_names_out())


In [20]:
X_train, X_test, y_train, y_test = train_test_split(tfidf_df, data["target"], test_size=0.2, random_state=42)

In [25]:
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train, y_train)
y_pred = log_reg.predict(X_test)
print(classification_report(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.79      0.99      0.88      3136
           1       0.53      0.02      0.05       863

    accuracy                           0.78      3999
   macro avg       0.66      0.51      0.46      3999
weighted avg       0.73      0.78      0.70      3999

F1 Score: 0.046511627906976744
