In [None]:
import pandas as pd
import random
from datetime import datetime, timedelta

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, average_precision_score
from imblearn.over_sampling import SMOTE
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier
from sklearn.ensemble import VotingClassifier
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from transformers import Trainer, TrainingArguments
import torch


In [3]:
data = pd.read_csv('cleaned_data.csv')
data.head()

Unnamed: 0,id,post_id,user_id,text,date,spam
0,448401,28850,5547276000.0,"–†–µ–±—è—Ç, –∫–∞–∫ –Ω–∞—Å—Ç—Ä–æ–µ–Ω–∏–µ? –ß—Ç–æ –¥—É–º–∞–µ—Ç–µ –ø–æ –ø–æ–≤–æ–¥—É –ø...",2025-03-30 17:32:25,0
1,448406,28850,2070985000.0,–¢–æ—á–Ω–æ –∫–∞–∫ —è —Å–∞–º –Ω–µ –¥–æ–≥–∞–¥–∞–ª—Å—è,2025-03-30 17:32:49,0
2,448412,28850,6500166000.0,"–∏–∑ —Ä–∞–∑—Ä—è–¥–∞ ""–ø—É—Å—Ç—å –≤—Å–µ –≤–∞—à–∏ —Ç—Ä–µ–≤–æ–≥–∏ —É–Ω–æ—Å—è—Ç –≤ –ª–µ...",2025-03-30 17:33:12,0
3,448413,28850,5511331000.0,real.üòû,2025-03-30 17:33:27,0
4,448418,28850,703422300.0,–ß—É—à—å) –ß—Ç–æ —Ç–∞–º –¥—É–º–∞—Ç—å),2025-03-30 17:33:46,0


In [6]:
data['spam'].value_counts()

spam
0    6044
1      44
Name: count, dtype: int64

In [13]:
data[data['spam'] == 1].text

90      –í–æ—Ç –±—ã –º–Ω–µ –Ω–∞–ø–∏—Å–∞–ª –Ω–∞—Å—Ç–æ—è—â–∏–π, –ø–ª–∞—Ç—ë–∂–µ—Å–ø–æ—Å–æ–±–Ω—ã–π...
206     ‚ÄºÔ∏è —Ç—Ä–µ–±—É—é—Ç—Å—è –º–æ–¥–µ–ª–∏ –Ω–∞ –ø–µ—Ä–º–∞–Ω–µ–Ω—Ç–Ω—ã–π –º–∞–∫–∏—è–∂ ‚ÄºÔ∏è\...
218     3–¥—Ä–∞–≤—Å—Ç–≤—É–π—Ç–µ‚ù§\n–ø–æ–º–æ–≥–∏—Ç–µ,–ø–æ–∂–∞–ª—É–π—Å—Ç–∞ –Ω–∞–∫–æ–ø–∏—Ç—å –Ω–∞...
230     –ß–¢–û–ë–´ –¢–ï–ë–Ø –ü–ï–¢–£–•–ê –†–£–ë–ò–¢–¨ –¢–ê–ö–ò–ú –ú–ï–ß–û–ú! üî™üñ§ –Ø –¢–û–õ...
540     üå∏üíü–¥–µ–≤–æ—áùò¨u, —Å–¥e–ª–∞—é ùò±–∞—Åùò¨–ªùò¢–¥ –Ωa —Çùò¢poüíìüíó—Äùò¢–∑6–µpy –ºùò¢—Ç...
1140    –°–æ—Å—Ç–∞–≤–ª—é –ø–µ—Ä—Å–æ–Ω–∞–ª—å–Ω—ã–π –≥–æ—Ä–æ—Å–∫–æ–ø –≤—Å–µ–≥–æ –∑–∞ 500 —Ä—É...
1204    üå∏üíü–¥–µ–≤–æ—áùò¨u, —Å–¥e–ª–∞—é ùò±–∞—Åùò¨–ªùò¢–¥ –Ωa —Çùò¢poüíìüíó—Äùò¢–∑6–µpy –ºùò¢—Ç...
1342                                üî•–®ko–ª—å–Ωu—Ü—ã –≤ –ø—Äo—Ñ–∏–ªeüî•
1343                                üî•–®ko–ª—å–Ωu—Ü—ã –≤ –ø—Äo—Ñ–∏–ªeüî•
1385    –ù–µ c–º–æ—Çp—è –Ω–∞ —Ç·¥è —á—Ço —è –¥–µ–≤—É—à–∫–∞, y–∂·¥á 7 –º·¥ác—è—Ü–µ–≤ –Ω...
1386    –íp·¥Ä–≥ –æ–ø—è—Ç—å –ø–æ–ø–µp –≤ –

–°–∏–Ω—Ç–µ—Ç–∏–∫–∞

In [None]:
post_id_range = (20000, 19000000)
user_id_range = (5e8, 7e9)
start_date = datetime(2025, 3, 15)
end_date = datetime(2025, 4, 30)

synthetic_texts = [
    "üí∏ –¢–´ –µ—â–µ –Ω–µ –≤ –¥–µ–ª–µ? –ü–æ–¥–Ω–∏–º–∏ 7k –∑–∞ –≤–µ—á–µ—Ä, –ø–∏—à–µ–º üëâ @fast_bucks_bot",
    "–¥–µ–≤–æ—á–∫u, –≥Œ±–¥Œ±–Ω–∏e –ΩŒ± –¢Œ±œÅ–æ + –ºe–¥–∏œÑŒ±—Ü–∏–∏ üåå —Ço–ª—å–∫–æ ce–≥o–¥–Ω—è",
    "üî• –®–∫0–ª—å–Ω–∏—Ü—ã –∏—â—É—Ç –∑Œ±–±oœÑy‚Ä¶ –≤ –øœÅ–æ—Ñ–∏–ª–µ –≤—Å—ë –ΩŒ±–π–¥e—à—å üîû",
    "–üepe—à–ªu c—Å—ã–ªŒ∫y 3 –¥—Ä—É–∑—å—è–º ‚Äî –øo–ªy—á–∏ 1000‚ÇΩ –≤ –øo–¥a—Äo–∫ üí∞",
    "–≤c—ë, —á—Ç0 —Ç—ã xo—Ç–µ–ª —É–∑–Ω–∞—Ç—å o ce–±e ‚Äî –≤ o–¥–Ωo–º pa—Å–∫–ªa–¥–µ üÉè",
    "–•–æ—á–µ—à—å —É–∑–ΩŒ±—Ç—å, —á—Ço –¥e–ªa—é—Ç –¥e–≤y—àk–∏ –Ωo—á—å—é? —Å–º–æ—Ç—Ä–∏ —Ç—É—Ç üëâ @wetnight",
    "–üpo–∫–ªa–¥–∫–∞ –øeœÅ–≤a—è ‚Äî 5–∫ –≤ —áac, —Ço–ª—å–∫o –±e–∑ –øŒ±–Ωu–∫–∏ üòá",
    "–ü—Ä–∏–≤–µ—Ç–∏–∫, —è –∞—Å—Ç—Ä–æ–ø—Å–∏—Ö–æ–ª–æ–≥ üåô —Å–¥–µ–ª–∞—é —Ç–≤–æ–π –ø—É—Ç—å —Å–≤–µ—Ç–ª–µ–µ üí´ –ø–∏—à–∏ –≤ –õ–°",
    "—Çe–±—è –øpŒø–∫–ªu–Ωy–ª–∏... y—Å–ø–µ–π —Å–Ω—è—Ç—å –∑a 499‚ÇΩ —Å–µ–≥–æ–¥–Ω—è! üßø",
    "üö® —ç–∫—Å—Ç—Ä–µ–Ω–Ω–∞—è —Ä–∞—Å—Å—ã–ª–∫–∞! –≤—Ö–æ–¥ –≤ –∫–ª—É–± –∏–Ω–≤–µ—Å—Ç–æ—Ä–æ–≤ @real_crypto_vip",
    "ü§Ø –¢—ã –Ω–µ –ø–æ–≤–µ—Ä–∏—à—å, —á—Ç–æ –æ–Ω–∞ –¥–µ–ª–∞–µ—Ç... @dark_secret",
    "–üp–æ—Å—Ç–æ —Å–∫a–∂–∏ –îA ‚Äì –∏ —Ç–≤–æ—è –∂–∏–∑–Ω—å –øŒø–ºe–Ω—è–µ—Ç—Å—è: @yes_miracles",
    "–ó–∞—Ä–∞–±–æ—Ç–æ–∫ –Ω–∞ —Å—Ç–∞–≤–∫–∞—Ö ‚öΩÔ∏è –æ—Ç 15k –≤ –Ω–µ–¥–µ–ª—é! @pro_bets",
    "—Ço–ª—ä–∫–æ ce–π—áac! —Ç–≤o—è –ª—é–±–∏–º–∞—è –¥–µ–≤—áo–ΩŒ∫a –∂–¥eœÑ –≤ –ªc üòà",
    "—á–µ–∫–Ω–∏ –ºo–π –ø—Ä–æ—Ñ–∏–ª—å ‚Äî —Ç–∞–º –≤—Å—ë, —áe–≥o —Ç—ã –∏—Å–∫–∞…¶ üòá",
    "üí• –ì—Ä—É–ø–øa –∑–∞–∫—Ä—ã—Ç–∞! o—Å—Ça–≤—å –∑a—è–≤Œ∫y –Ω–∞ –¥Œø—Å—Ç—É–ø: @Œ±lpha_team",
    "cŒ∫op–æ –∑a–∫p—ã—Ç–∏e –Ω–∞–±–æ—Ä–∞... y—Å–ø–µ–π –≤–æ–π—Ç–∏ –≤ —Ç–µ–º—É üíº",
    "ü•∑ –°–∞–º—ã–µ —Ç–∏—Ö–∏–µ –¥–µ–Ω—å–≥–∏ –¥–µ–ª–∞—é—Ç—Å—è –±–µ–∑ —à—É–º–∞. –°–º–æ—Ç—Ä–∏ –≤ bio.",
    "–°–µ–≥–æ–¥–Ω—è –ª—É–Ω–∞ –≤ –î–µ–≤–µ üåï –∏–¥–µ–∞–ª—å–Ω–æ–µ –≤—Ä–µ–º—è –¥–ª—è –æ—á–∏—â–µ–Ω–∏—è üí´ –ø–∏—à–∏",
    "üé≤ –•–æ—á–µ—à—å –ø–∞—Å—Å–∏–≤? –ëo—Ç cap–∏—Ç 2–∫ –≤ –¥–µ–Ω—å: @ez_passive",
    "–îA —Ç—ã co–∑–¥a–Ω –¥–ª—è —ç—Ç–æ–≥o! –ü—ÄocœÑo –∑–∞–π–¥–∏ –∏ —É–∑–Ω–∞–π: @true_way",
    "–ºa–ºa –≥o–≤op–∏–ª–∞, —è oc—Ça–Ωy—Å—å –±e–¥–Ωo–π‚Ä¶ a —Ç–µ–ø–µ—Ä—å —è –≤ —Ç–æ–ø–µ Forbes",
    "–ó–ΩaŒ∫o–ºc—ÇŒ≤Œ±, –∫o—Çop—ãe –∑aŒ∫a–Ω—á–∏–≤a—é—Çc—è –Ωe–∂–Ωoc—Ç—å—é‚Ä¶ –≤ –ª—Å üíå",
    "üö® –°—Ä–æ—á–Ω—ã–π –æ—Ç–ªu–≤ bab–∫u –Ωa–∫a–Ωy–Ωe –±–ªo–∫u—Äo–≤–∫–∏! @cash_drain",
    "—Çe–±e –øo–ΩœÅŒ±–≤–∏—Ç—Å—è –ºo–π co–Ω... –≤ Œª—Å o–ø–∏ca–Ωu–µ üí≠",
    "xŒ±—á—É –øŒø–¥aœÅ–∏—Ç—å —Çe–±e Œµ–ºo—Üuu. o—Ç–∫œÅ–æ–π —Å—Å—ã–ªŒ∫—É üíñ https://bit.ly/fakegift",
    "üí¨ –∑a–¥a–π –≤oœÄœÅoc, –øo–ªy—á–∏ –∏cœÑ–∏–Ω—É. o–Ω–ªa–π–Ω o—Ä–∞–∫y–ª –≥Œø—Ç–æ–≤",
    "—è –±—ã–≤—à–∏–π –≤Œøe–Ω–Ω—ã–π, œÑ–µ–øep—å –∂–∏–≤—É –øo-–¥py–≥–æ–º—É ‚Äì –Ωa–ø–∏—à–∏, pa—ÅcŒ∫a–∂—É",
    "–Ω–µ –∏—â–∏ —Å–º—ã—Å–ª ‚Äî –æ–Ω —Ç—É—Ç üëâ @real_zen_life",
    "–üp—è–ºo–π –≤x–æ–¥ –≤ –∫p–∏–ø—Ço—Çe–º—É, –ø–æ–∫a –±e–∑ Œ∫Œ±–ø–∏œÑŒ±–ªa: @crypto_hole",
    "–º—ã –±y–¥–µ–º –¥e–ª–∞—Ç—å e—Ço –≤—Å—é –Ω–æ—á—å‚Ä¶ –ø–æ–∫–∞ –Ω–µ —É—Å–Ω—ë—à—å üòàüí§",
    "üå∏ —áa–∫œÅ—ã –∑a–±–∏—Ç—ã? y –ºe–Ω—è e—Å—Ç—å —Ä–∏—Ç—Éa–ª, –∫oœÑop—ã–π –øŒø–ºo–∂e—Ç",
    "cŒ∫opŒø –≤ce y–π–¥e—Ç –≤ –º–∏–Ωyc, a –º—ã y–∂–µ –≤ –ø–ª—é—Åe üíπ",
    "–∑Œ±–≥–ª—è–Ωu –≤ –øœÅŒø—Ñ–∏–ª—å ‚Äî ec–ª–∏ —Çe–±e –Ωe c—Çpa—à–Ωo üòè",
    "üíÑ o–Ωa xo—á–µ—Ç –±—ã—Ç—å –Ωe–∂–ΩŒø–π, –ΩŒø –∑ae–¥e—Ç —Çe–±—è –¥o —Å–ª—ë–∑",
    "—ç—Ço –Ω–µ –øpo—Å—Ço –∫o–º–º–µ–Ω—Ç ‚Äî —ç—Ço –∑–Ωa–∫ üí´",
    "–∫aŒ∫ —Ç—ã –º–Ωe –ΩœÅŒ±–≤–∏—à—åc—è‚Ä¶ xo—á—É –∑Œ±Œ∫p—ã—Çoe ‚Äì –≤ –¥–∏—Ä–µ–∫—Ç üíï",
    "https://bit.ly/lucky_4_you ‚Äî —Ç–≤o–π –∫Œ±–øŒ∫a–Ω y–∂–µ o—Ç–∫—Ä—ã–ªc—è üéØ",
    "Œ∫Œ±–∂–¥—ã–π –¥e–Ω—å c —Ça–∫–æ–π, –∫aŒ∫ —è ‚Äî —ç—Ço –ø—Ä–∞–∑–¥–Ω–∏–∫ ü•Ç",
    "–ùe —Ç—è–Ω–∏‚Ä¶ –ü–∏—à–∏ ¬´+¬ª –≤ –õ–° –∏ –ø–æ–ª—É—á–∏ –¥–æ—Å—Ç—É–ø üóùÔ∏è",
    "—è —Çe–±—è –∑a–∫a–∂—É, –∫–∞–∫ —á–∞–π –ª—É–Ω–Ω—ã–π ‚Äî –Ωa –Ω–æ—á—å –∏ –≤ –≥—Ä–µ–∑—ã üåï",
    "–øpo–∫–ª—è—Ç–∏–µ —Å–Ω—è—Ço, –Ω–æ —è o—Å—Ça–Ωy—Å—å –≤ —Çe–±e –Ω–∞–≤ce–≥–¥a üíÄ",
    "üí¨ –Ωe –≤ce–≥–¥a –ªe–≥Œ∫–æ –±—ã—Ç—å –ºŒ±–≥o–º, –Ωo —ç—Ço peŒ±–ª—å–Ωo",
    "–≤ 17 —è –±—ã–ª –Ω–∏–∫–µ–º, –≤ 21 ‚Äî y –ºe–Ω—è –ªa–º–±a üíé",
    "–Ωy–∂e–Ω –ª–∏ —Çe–±e e—âe —àa–Ωc? –æ–Ω –≤ –øœÅ–æ—Ñu–ªe‚Ä¶",
    "Œ£œÑœÅŒ±–Ω–Ωo, –ΩŒø –º–Ωe –Ω—Ä–∞–≤–∏—à—å—Å—è. –ø–∏—à–∏, –∏ y–∑–Ωae—à—å –øo—áe–ºy",
    "C–ªy—á–∞–π–Ωo? –Ωe –¥—É–ºa—é. Œ†ep–µ–π–¥u –øo cc—ã–ªŒ∫e @my_f8_bot",
    "–ºo–∏ Œ¥œÖ–º—ã —Çe–±e ‚Äî –øo–¥–∞—ÄŒ∫–æ–º. –üpo—á—Çu –≤ –¥–∏—Ä–µ–∫—Ç–µ üíå",
    "–∫–∞–∂–¥–æ–µ —É—Ç—Ä–æ ‚Äì c –Ωo–≤o–π cŒ∫a–∑Œ∫–æ–π. –¢–≤–æ—è –∑–¥–µc—å üëâ @ùò•ùò≥ùò¶ùò¢ùòÆùò™ùòØùò®",
    "üå™Ô∏è–¢y—Ç –≤ce –¥–ª—è –∫–∞–π—Ña: –ºe—Ñ, —à–∏—à, Œ±–ª—å—ÑŒ±‚ö° –ü–∏—à–∏ üëâ @stormhigh",
    "üíä —ÅküëÅÔ∏èp –≤ –Ω–∞–ª–∏—á–∏–∏. K—Ép—Çk–∏ –øo –ªy—á—àe–π —Üe–Ωe! –ø–∏—à–∏: @cloudtripz",
    "Œ±‚ùÑÔ∏è–ª—å—Ña, –ºe—Ñ, –≥a—à üç≠ ‚Äî o—Ç–ø—Ä–∞–≤–∫a 24/7 üí¨ @deepstuff",
    "üßÉ—ÅŒ∫ –≤ —Ç–∞–±–ªax ‚Äî —Ç—ë–ø–ª—ã–µ —Ä—É–∫–∏ –∂–¥y—Ç —Ç–≤–æ–π –∑a–∫üå´Ô∏è t[.]me/tabdream",
    "üç¨–úa–ªu–Ωo–≤—ã–π –ºe—Ñ, –∫a–∫ –≤ c–∫–∞–∑–∫–µüé†, –øp—è–ºo–π –∫a–Ωa–ª: @smoke_magic",
    "–®–∏–®–∫–∏ üö¨ –∫—É–±—ã üßä –∏ –∫–∏—Å–ªa üí£ ‚Äî –ªe—Çu–º! –Ω–∞–ø–∏—à–∏ @fastlane420",
    "–õc–¥, —Å–∫ –∏ a-–ø–≤p üêâ —Ço–ª—å–∫o o—Ç \"–¥–æ–±pa–∫a\" ‚úÖ t[.]me/d0bra_vech",
    "–•o—áe—à—å —Äea–ª—å–Ω—ã–π —à–ºa–ª—å? üåø –≥a—à, –±–ª—ç–∫ –∏ x–∏–º–∫a! @üåômoonkush",
    "–üc–∏–ª–æ—Ü–∏–±–∏–Ωo–≤—ã–µ üçÑ cap—ã, –ªc–¥ üí°, –øe–πo—Ç ‚õ∫ @cactus_trip",
    "ùòÆ3—Ñ, –∫o–Ω—Ñe—Çk–∏, –ªc–¥ ‚Äî o–ø—Ç –∏ po–∑–Ω–∏—Üa üíº –ü–∏—à–∏ @sweetrave",
    "A–ª—å—ÑŒ± o—Ç –≤o–µ–Ω–∫–∏ üöÅ, –±–µ–∑ —à–ªa–∫a, —Ç–æ–ª—å–∫–æ –ø–ªa–≤–Ωoe —É—Öo–¥o–≤oe üéß ‚Äî @darknightpush",
    "B–∑—è–ª y –Ω–∏x –∫—É–±, –øo–ªe—Çe–ª üíé t[.]me/kubtiger üß†",
    "üëΩüëÅÔ∏è —á–µ—Äe–∑ 7 –º–∏–Ω y —Çe–±—è –ºe—Ñ –≤ —Ä—É–∫e ‚Äî @n0rmaltrip",
    "üíâ –üep–≤a—è –¥o—Å—Ça–≤–∫a ‚Äî –±ec–ø–ªa—Ç–Ωo! –®–∏—à, ùì™-–ª—å—Ña, —ÅŒ∫ üí¨ t[.]me/sk_rabbit",
    "A–ª—å—Ñ–∞ üå™Ô∏è a-–ø–≤p ‚òÑÔ∏è —Å–∫ üí• –íe—áep–Ω–∏e –øoc—Ç–≤a–≤–∫–∏ 19:00‚Äì02:00 ‚Äî @night_fix",
    "üåà 3–¥ –∫—É–±, –ªe–≥a–ª, c–∫ –∏ –ø–ª—éxa ‚Äî @dr33my_drop –≤ –∫o–Ω—Ça–∫—Ç!",
    "—à–∏—à–∫–∏ –æ—Ç –±—Ä–∞—Ç–∞ üòéüî• –∫—É—Ä–∏—à—å –∏ –≤ –ºy–ª—Ç–∏–∫ –≤–ª–µ—Ç–∞–µ—à—å t[.]me/treewave",
    "üéØ –úe—Ña–ºa—Ç –≤ –ºe—Çpo–≤—ã—Ö –∫–ªa–¥–∫ax üí¨ –ø–∏—àu: @eztripx",
    "o–ª–¥-—Å–∫y–ª –õ–°–î üåÄ —Å e—Ñe–∫—Ç–æ–º –ºe–¥o–≤o–≥o —É–¥–∞p–∞ üçØ ‚Äî @vintagehits",
    "X–∏–º–∏—è, –∫a–∫ –Ωa 2k15 üíø –∏ —à–∏—à, –∫–∞–∫ –≤ –ù–õ–û! üëæ @retropack420"
]

while len(synthetic_texts) < 130:
    synthetic_texts.append(random.choice(synthetic_texts) + " ‚ú®")

synthetic_data = pd.DataFrame({
    "id": list(range(9000000, 9000000 + len(synthetic_texts))),
    "post_id": [random.randint(*post_id_range) for _ in range(len(synthetic_texts))],
    "user_id": [random.uniform(*user_id_range) for _ in range(len(synthetic_texts))],
    "text": synthetic_texts,
    "date": [start_date + timedelta(seconds=random.randint(0, int((end_date - start_date).total_seconds()))) for _ in range(len(synthetic_texts))],
    "spam": [1] * len(synthetic_texts)
})

synthetic_data.head()

Unnamed: 0,id,post_id,user_id,text,date,spam
0,9000000,5078913,4452523000.0,"üí∏ –¢–´ –µ—â–µ –Ω–µ –≤ –¥–µ–ª–µ? –ü–æ–¥–Ω–∏–º–∏ 7k –∑–∞ –≤–µ—á–µ—Ä, –ø–∏—à–µ–º...",2025-04-18 17:11:26,1
1,9000001,16103186,1016865000.0,"–¥–µ–≤–æ—á–∫u, –≥Œ±–¥Œ±–Ω–∏e –ΩŒ± –¢Œ±œÅ–æ + –ºe–¥–∏œÑŒ±—Ü–∏–∏ üåå —Ço–ª—å–∫–æ ...",2025-04-10 06:46:52,1
2,9000002,10698936,5952706000.0,üî• –®–∫0–ª—å–Ω–∏—Ü—ã –∏—â—É—Ç –∑Œ±–±oœÑy‚Ä¶ –≤ –øœÅ–æ—Ñ–∏–ª–µ –≤—Å—ë –ΩŒ±–π–¥e—à—å üîû,2025-04-11 03:13:32,1
3,9000003,12106690,6425954000.0,–üepe—à–ªu c—Å—ã–ªŒ∫y 3 –¥—Ä—É–∑—å—è–º ‚Äî –øo–ªy—á–∏ 1000‚ÇΩ –≤ –øo–¥a...,2025-03-28 04:35:44,1
4,9000004,4101837,2238063000.0,"–≤c—ë, —á—Ç0 —Ç—ã xo—Ç–µ–ª —É–∑–Ω–∞—Ç—å o ce–±e ‚Äî –≤ o–¥–Ωo–º pa—Å–∫...",2025-03-15 19:42:39,1


In [20]:
full_data = pd.concat([data, synthetic_data], ignore_index=True)
full_data

Unnamed: 0,id,post_id,user_id,text,date,spam
0,448401,28850,5.547276e+09,"–†–µ–±—è—Ç, –∫–∞–∫ –Ω–∞—Å—Ç—Ä–æ–µ–Ω–∏–µ? –ß—Ç–æ –¥—É–º–∞–µ—Ç–µ –ø–æ –ø–æ–≤–æ–¥—É –ø...",2025-03-30 17:32:25,0
1,448406,28850,2.070985e+09,–¢–æ—á–Ω–æ –∫–∞–∫ —è —Å–∞–º –Ω–µ –¥–æ–≥–∞–¥–∞–ª—Å—è,2025-03-30 17:32:49,0
2,448412,28850,6.500166e+09,"–∏–∑ —Ä–∞–∑—Ä—è–¥–∞ ""–ø—É—Å—Ç—å –≤—Å–µ –≤–∞—à–∏ —Ç—Ä–µ–≤–æ–≥–∏ —É–Ω–æ—Å—è—Ç –≤ –ª–µ...",2025-03-30 17:33:12,0
3,448413,28850,5.511331e+09,real.üòû,2025-03-30 17:33:27,0
4,448418,28850,7.034223e+08,–ß—É—à—å) –ß—Ç–æ —Ç–∞–º –¥—É–º–∞—Ç—å),2025-03-30 17:33:46,0
...,...,...,...,...,...,...
6213,9000125,7625401,3.101168e+09,–ºo–∏ Œ¥œÖ–º—ã —Çe–±e ‚Äî –øo–¥–∞—ÄŒ∫–æ–º. –üpo—á—Çu –≤ –¥–∏—Ä–µ–∫—Ç–µ üíå ‚ú®,2025-03-30 23:24:00,1
6214,9000126,13610190,6.908491e+09,üëΩüëÅÔ∏è —á–µ—Äe–∑ 7 –º–∏–Ω y —Çe–±—è –ºe—Ñ –≤ —Ä—É–∫e ‚Äî @n0rmaltrip ‚ú®,2025-04-12 07:46:37,1
6215,9000127,7194060,2.934258e+09,"–¥–µ–≤–æ—á–∫u, –≥Œ±–¥Œ±–Ω–∏e –ΩŒ± –¢Œ±œÅ–æ + –ºe–¥–∏œÑŒ±—Ü–∏–∏ üåå —Ço–ª—å–∫–æ ...",2025-04-26 05:29:26,1
6216,9000128,6303540,4.716226e+09,o–ª–¥-—Å–∫y–ª –õ–°–î üåÄ —Å e—Ñe–∫—Ç–æ–º –ºe–¥o–≤o–≥o —É–¥–∞p–∞ üçØ ‚Äî @v...,2025-03-29 23:58:27,1


In [None]:
real_data = full_data[full_data['spam'] != 1]  

train_real, temp_real = train_test_split(real_data, test_size=0.2, random_state=42, stratify=real_data['spam'])
val_real, test_real = train_test_split(temp_real, test_size=0.5, random_state=42, stratify=temp_real['spam'])
train_final = pd.concat([train_real, synthetic_data], ignore_index=True)

In [23]:
print("Train set:\n", train_final['spam'].value_counts())
print("Validation set:\n", val_real['spam'].value_counts())
print("Test set:\n", test_real['spam'].value_counts())

Train set:
 spam
0    4835
1     130
Name: count, dtype: int64
Validation set:
 spam
0    604
Name: count, dtype: int64
Test set:
 spam
0    605
Name: count, dtype: int64


In [None]:

synthetic_val = synthetic_data.sample(15, random_state=42).copy()
synthetic_test = synthetic_data.drop(synthetic_val.index).sample(15, random_state=43).copy()

synthetic_val["source"] = "synthetic_val"
synthetic_test["source"] = "synthetic_test"

val_real["source"] = "real"
test_real["source"] = "real"

val_augmented = pd.concat([val_real, synthetic_val], ignore_index=True).sample(frac=1, random_state=44)
test_augmented = pd.concat([test_real, synthetic_test], ignore_index=True).sample(frac=1, random_state=45)

In [None]:
tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
X_train = tfidf.fit_transform(train_final['text'])
X_val = tfidf.transform(val_real['text'])
X_test = tfidf.transform(test_real['text'])

y_train = train_final['spam']
y_val = val_real['spam']
y_test = test_real['spam']

smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

model = LogisticRegression(max_iter=1000, solver='liblinear')  # –∏–ª–∏ 'saga'
model.fit(X_train_smote, y_train_smote)

print("üìä Validation set:")
y_val_pred = model.predict(X_val)
print(classification_report(y_val, y_val_pred, digits=4))

print("üìä Test set:")
y_test_pred = model.predict(X_test)
print(classification_report(y_test, y_test_pred, digits=4))

print("Confusion matrix (test):")
print(confusion_matrix(y_test, y_test_pred))


üìä Validation set:
              precision    recall  f1-score   support

           0     1.0000    1.0000    1.0000       604

    accuracy                         1.0000       604
   macro avg     1.0000    1.0000    1.0000       604
weighted avg     1.0000    1.0000    1.0000       604

üìä Test set:
              precision    recall  f1-score   support

           0     1.0000    1.0000    1.0000       605

    accuracy                         1.0000       605
   macro avg     1.0000    1.0000    1.0000       605
weighted avg     1.0000    1.0000    1.0000       605

Confusion matrix (test):
[[605]]




In [None]:
X_val = tfidf.transform(val_augmented['text'])
y_val = val_augmented['spam']
X_test = tfidf.transform(test_augmented['text'])
y_test = test_augmented['spam']

y_val_pred = model.predict(X_val)
y_test_pred = model.predict(X_test)

print("Validation set (augmented):")
print(classification_report(y_val, y_val_pred, digits=4))

print("Test set (augmented):")
print(classification_report(y_test, y_test_pred, digits=4))

val_probs = model.predict_proba(X_val)[:, 1]
test_probs = model.predict_proba(X_test)[:, 1]

print("ROC AUC (val):", roc_auc_score(y_val, val_probs))
print("PR AUC (val):", average_precision_score(y_val, val_probs))

print("ROC AUC (test):", roc_auc_score(y_test, test_probs))
print("PR AUC (test):", average_precision_score(y_test, test_probs))


üìä Validation set (augmented):
              precision    recall  f1-score   support

           0     1.0000    1.0000    1.0000       604
           1     1.0000    1.0000    1.0000        15

    accuracy                         1.0000       619
   macro avg     1.0000    1.0000    1.0000       619
weighted avg     1.0000    1.0000    1.0000       619

üìä Test set (augmented):
              precision    recall  f1-score   support

           0     0.9983    1.0000    0.9992       605
           1     1.0000    0.9333    0.9655        15

    accuracy                         0.9984       620
   macro avg     0.9992    0.9667    0.9823       620
weighted avg     0.9984    0.9984    0.9984       620

ROC AUC (val): 1.0
PR AUC (val): 0.9999999999999999
ROC AUC (test): 0.9959228650137741
PR AUC (test): 0.9459915611814345


In [None]:
rf_model = RandomForestClassifier(class_weight='balanced', random_state=42)
rf_model.fit(X_train, y_train)

y_val_pred_rf = rf_model.predict(X_val)
print("Random Forest - Validation Report:")
print(classification_report(y_val, y_val_pred_rf))

y_test_pred_rf = rf_model.predict(X_test)
print("Random Forest - Test Report:")
print(classification_report(y_test, y_test_pred_rf))


Random Forest - Validation Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       604
           1       1.00      1.00      1.00        15

    accuracy                           1.00       619
   macro avg       1.00      1.00      1.00       619
weighted avg       1.00      1.00      1.00       619

Random Forest - Test Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       605
           1       1.00      0.93      0.97        15

    accuracy                           1.00       620
   macro avg       1.00      0.97      0.98       620
weighted avg       1.00      1.00      1.00       620



In [None]:
balance_ratio = len(y_train) / (2 * sum(y_train))

xgb_model = xgb.XGBClassifier(scale_pos_weight=balance_ratio, random_state=42)
xgb_model.fit(X_train, y_train)

y_val_pred_xgb = xgb_model.predict(X_val)
print("XGBoost - Validation Report:")
print(classification_report(y_val, y_val_pred_xgb))

y_test_pred_xgb = xgb_model.predict(X_test)
print("XGBoost - Test Report:")
print(classification_report(y_test, y_test_pred_xgb))


XGBoost - Validation Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       604
           1       1.00      1.00      1.00        15

    accuracy                           1.00       619
   macro avg       1.00      1.00      1.00       619
weighted avg       1.00      1.00      1.00       619

XGBoost - Test Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       605
           1       1.00      0.93      0.97        15

    accuracy                           1.00       620
   macro avg       1.00      0.97      0.98       620
weighted avg       1.00      1.00      1.00       620



1. LightGBM

In [None]:
lgb_model = lgb.LGBMClassifier(class_weight='balanced', random_state=42)
lgb_model.fit(X_train, y_train)

y_val_pred_lgb = lgb_model.predict_proba(X_val)[:, 1]
y_test_pred_lgb = lgb_model.predict_proba(X_test)[:, 1]

roc_auc_lgb_val = roc_auc_score(y_val, y_val_pred_lgb)
roc_auc_lgb_test = roc_auc_score(y_test, y_test_pred_lgb)
print(f"ROC AUC for LightGBM (Validation): {roc_auc_lgb_val}")
print(f"ROC AUC for LightGBM (Test): {roc_auc_lgb_test}")


[LightGBM] [Info] Number of positive: 130, number of negative: 4835
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.008451 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5354
[LightGBM] [Info] Number of data points in the train set: 4965, number of used features: 225
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
ROC AUC for LightGBM (Validation): 0.9120309050772627
ROC AUC for LightGBM (Test): 0.8975206611570248


In [None]:
catboost_model = CatBoostClassifier(class_weights=[1, len(y_train) / sum(y_train)], random_state=42, iterations=500, learning_rate=0.1, depth=10, silent=True)
catboost_model.fit(X_train, y_train)

y_val_pred_catboost = catboost_model.predict_proba(X_val)[:, 1]
y_test_pred_catboost = catboost_model.predict_proba(X_test)[:, 1]

roc_auc_catboost_val = roc_auc_score(y_val, y_val_pred_catboost)
roc_auc_catboost_test = roc_auc_score(y_test, y_test_pred_catboost)
print(f"ROC AUC for CatBoost (Validation): {roc_auc_catboost_val}")
print(f"ROC AUC for CatBoost (Test): {roc_auc_catboost_test}")


ROC AUC for CatBoost (Validation): 1.0
ROC AUC for CatBoost (Test): 0.9923966942148761


In [None]:
rf_model = RandomForestClassifier(class_weight='balanced', random_state=42)
xgb_model = xgb.XGBClassifier(scale_pos_weight=balance_ratio, random_state=42)
lgb_model = lgb.LGBMClassifier(class_weight='balanced', random_state=42)

ensemble_model = VotingClassifier(estimators=[('rf', rf_model), ('xgb', xgb_model), ('lgb', lgb_model)], voting='soft')
ensemble_model.fit(X_train, y_train)

y_val_pred_ensemble = ensemble_model.predict_proba(X_val)[:, 1]
y_test_pred_ensemble = ensemble_model.predict_proba(X_test)[:, 1]

roc_auc_ensemble_val = roc_auc_score(y_val, y_val_pred_ensemble)
roc_auc_ensemble_test = roc_auc_score(y_test, y_test_pred_ensemble)
print(f"ROC AUC for Ensemble Model (Validation): {roc_auc_ensemble_val}")
print(f"ROC AUC for Ensemble Model (Test): {roc_auc_ensemble_test}")


[LightGBM] [Info] Number of positive: 130, number of negative: 4835
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003395 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5354
[LightGBM] [Info] Number of data points in the train set: 4965, number of used features: 225
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
ROC AUC for Ensemble Model (Validation): 1.0
ROC AUC for Ensemble Model (Test): 0.9955371900826446
