In [1]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /Users/nicolas/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/nicolas/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/nicolas/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /Users/nicolas/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
# Logistik Regression Modell

def get_wordnet_pos(word):
    tag = nltk.pos_tag([word])[0][1][0].upper()
    return {"J": wordnet.ADJ, "N": wordnet.NOUN, "V": wordnet.VERB, "R": wordnet.ADV}.get(tag, wordnet.NOUN)

stop_words = set(stopwords.words('english'))
additional_stopwords = {"al", "god", "https", "ahly", "http"}
stop_words = stop_words.union(additional_stopwords)

def custom_preprocess(text):
    text = text.lower()
    text = re.sub(r'\bha\s+ha\b', 'haha', text)
    text = re.sub(r"\b(\w+)'\s*s\b", r"\1's", text)
    return text

def preprocess(tweet):
    tweet = tweet.lower()
    words = re.findall(r'\b[a-zA-Z]+\b', tweet)
    lemmatizer = WordNetLemmatizer()
    processed_words = []
    for w in words:
        lemma = lemmatizer.lemmatize(w, get_wordnet_pos(w))
        if lemma not in stop_words:
            processed_words.append(lemma)
    return " ".join(processed_words)

df = pd.read_csv("./assets/combined.csv")
df.dropna(subset=["tweet", "Status"], inplace=True)
df["processed_tweet"] = df["tweet"].apply(preprocess)

X = df["processed_tweet"]
y = df["Status"]

tfidf = TfidfVectorizer()
X_tfidf = tfidf.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.9537689642814479
              precision    recall  f1-score   support

           0       0.96      0.95      0.96      4420
           1       0.94      0.96      0.95      3951

    accuracy                           0.95      8371
   macro avg       0.95      0.95      0.95      8371
weighted avg       0.95      0.95      0.95      8371

