# In [None]:

import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score
import xgboost as xgb

# NLTK setup: stopwords.words() and WordNetLemmatizer read the "stopwords" and
# "wordnet" corpora from disk. Fetch each one only when it is missing, so the
# download happens on the first run and later runs need no network access.
for _resource in ("stopwords", "wordnet"):
    try:
        nltk.data.find(f"corpora/{_resource}")
    except LookupError:
        nltk.download(_resource, quiet=True)

# Kept as a set so the per-token membership test in clean_text is O(1).
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# 1. Load the raw dataset from CSV
df = pd.read_csv("data/Suicide_Detection.csv")

# 2. Label encoding: turn the string class names into integer targets
#    (non-suicide -> 0, suicide -> 1)
label_ids = {"non-suicide": 0, "suicide": 1}
df["class"] = df["class"].map(label_ids)

# 3. Text cleaning function
# Patterns are compiled once here because clean_text runs on every row.
_URL_PATTERN = re.compile(r"http\S+|www\.\S+")
_NON_LETTER_PATTERN = re.compile(r"[^a-z\s]")


def clean_text(text):
    """Normalize one raw document into a space-joined string of lemmas.

    Steps, in order: lowercase, remove URLs, replace every character that is
    not a lowercase ASCII letter or whitespace with a space (this drops
    digits, punctuation and emoji), remove tokens found in ``stop_words``,
    and lemmatize what is left with ``lemmatizer``.

    Args:
        text: Raw document. A non-string value (for example a missing cell
            in the input data) is treated as empty text instead of raising.

    Returns:
        The cleaned tokens joined by single spaces; ``""`` when no token
        survives.
    """
    if not isinstance(text, str):
        return ""

    text = text.lower()
    # The dot after "www" is escaped so only a literal "www." prefix matches.
    # Unescaped, it matched any character: "awww so cute" lost the word "so".
    text = _URL_PATTERN.sub("", text)
    text = _NON_LETTER_PATTERN.sub(" ", text)

    # str.split() with no argument already collapses whitespace runs and
    # ignores leading/trailing whitespace, so no separate squeeze is needed.
    return " ".join(
        lemmatizer.lemmatize(token)
        for token in text.split()
        if token not in stop_words
    )

# Replace the raw text column with its cleaned version before splitting.
cleaned_text = df["text"].apply(clean_text)
df["text"] = cleaned_text

# 4. Train/Test split: 20% held out for testing, stratified on the label,
#    with a fixed seed so the split is the same on every run.
features = df["text"]
labels = df["class"]
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    stratify=labels,
    random_state=42,
)

# 5. TF-IDF vectorization (with n-grams)
tfidf_params = {
    "max_features": 24700,
    "ngram_range": (1, 2),  # unigram + bigram
    "sublinear_tf": True,
}
vectorizer = TfidfVectorizer(**tfidf_params)

# Fit on the training split only; the test split is transformed with the
# already-fitted vectorizer.
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# 6. XGBoost model: binary logistic classifier, evaluated with the area
#    under the precision-recall curve ("aucpr").
#    `use_label_encoder` is deliberately not passed: the option is deprecated
#    in XGBoost, and the targets are already the integers 0/1.
model = xgb.XGBClassifier(
    learning_rate=0.2,
    n_estimators=400,
    subsample=0.8,
    colsample_bytree=0.8,
    objective="binary:logistic",
    eval_metric="aucpr",
)

# 7. Training
#    eval_set only makes XGBoost log the metric for the train and test
#    matrices after each boosting round (verbose=True); no early stopping is
#    configured here.
model.fit(
    X_train_tfidf, y_train,
    eval_set=[(X_train_tfidf, y_train), (X_test_tfidf, y_test)],
    verbose=True
)

# 8. Prediction: score the held-out split, then print accuracy followed by
#    the per-class classification report.
y_pred = model.predict(X_test_tfidf)
test_accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
print("Accuracy:", test_accuracy)
print(report)