In [25]:
import pandas as pd

In [26]:
train_df = pd.read_csv("Constraint_Train.csv")
val_df = pd.read_csv("Constraint_Val.csv")
test_df = pd.read_csv("Constraint_Test.csv")

In [27]:
train_df = pd.read_csv("Constraint_Train.csv")
print(train_df.head())
print(train_df.columns)
print(train_df['label'].value_counts())

   id                                              tweet label
0   1  The CDC currently reports 99031 deaths. In gen...  real
1   2  States reported 1121 deaths a small rise from ...  real
2   3  Politically Correct Woman (Almost) Uses Pandem...  fake
3   4  #IndiaFightsCorona: We have 1524 #COVID testin...  real
4   5  Populous states can generate large case counts...  real
Index(['id', 'tweet', 'label'], dtype='object')
label
real    3360
fake    3060
Name: count, dtype: int64


In [28]:
train_df.head()

Unnamed: 0,id,tweet,label
0,1,The CDC currently reports 99031 deaths. In gen...,real
1,2,States reported 1121 deaths a small rise from ...,real
2,3,Politically Correct Woman (Almost) Uses Pandem...,fake
3,4,#IndiaFightsCorona: We have 1524 #COVID testin...,real
4,5,Populous states can generate large case counts...,real


In [29]:
test_df.head()

Unnamed: 0,id,tweet
0,1,Our daily update is published. States reported...
1,2,Alfalfa is the only cure for COVID-19.
2,3,President Trump Asked What He Would Do If He W...
3,4,States reported 630 deaths. We are still seein...
4,5,This is the sixth time a global health emergen...


In [30]:
val_df.head()

Unnamed: 0,id,tweet,label
0,1,Chinese converting to Islam after realising th...,fake
1,2,11 out of 13 people (from the Diamond Princess...,fake
2,3,"COVID-19 Is Caused By A Bacterium, Not Virus A...",fake
3,4,Mike Pence in RNC speech praises Donald Trump’...,fake
4,5,6/10 Sky's @EdConwaySky explains the latest #C...,real


In [31]:
train_df['label'] = train_df['label'].map({'fake': 0, 'real': 1})
val_df['label'] = val_df['label'].map({'fake': 0, 'real': 1})
print(train_df.isnull().sum())

id       0
tweet    0
label    0
dtype: int64


In [32]:
import re

def clean_text(text):
    text = re.sub(r"http\S+", "", text)  # URL'leri kaldır
    text = re.sub(r"@\w+", "", text)     # mentionları kaldır
    text = re.sub(r"#\w+", "", text)     # hashtag'leri kaldır
    text = re.sub(r"[^\w\s]", "", text)  # noktalama işaretlerini kaldır
    text = re.sub(r'[^A-Za-z\s]', '', text) # özel karakterleri kaldır
    return text.lower()                 # küçült

train_df['clean_tweet'] = train_df['tweet'].apply(clean_text)
val_df['clean_tweet'] = val_df['tweet'].apply(clean_text)
test_df['clean_tweet'] = test_df['tweet'].apply(clean_text)

In [33]:
train_df.head()

Unnamed: 0,id,tweet,label,clean_tweet
0,1,The CDC currently reports 99031 deaths. In gen...,1,the cdc currently reports deaths in general t...
1,2,States reported 1121 deaths a small rise from ...,1,states reported deaths a small rise from last...
2,3,Politically Correct Woman (Almost) Uses Pandem...,0,politically correct woman almost uses pandemic...
3,4,#IndiaFightsCorona: We have 1524 #COVID testin...,1,we have testing laboratories in india and a...
4,5,Populous states can generate large case counts...,1,populous states can generate large case counts...


In [34]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1,2))
X_train = tfidf.fit_transform(train_df['clean_tweet'])
X_val = tfidf.transform(val_df['clean_tweet'])

In [35]:
from sklearn.linear_model import LogisticRegression

# Modeli oluştur ve eğit
model = LogisticRegression(max_iter=1000)
model.fit(X_train, train_df['label']) 

In [36]:
import joblib

# Kaydet
joblib.dump(model, "logreg_model.joblib")
joblib.dump(tfidf, "tfidf_vectorizer.joblib")

['tfidf_vectorizer.joblib']

In [37]:
test_df.head()

Unnamed: 0,id,tweet,clean_tweet
0,1,Our daily update is published. States reported...,our daily update is published states reported ...
1,2,Alfalfa is the only cure for COVID-19.,alfalfa is the only cure for covid
2,3,President Trump Asked What He Would Do If He W...,president trump asked what he would do if he w...
3,4,States reported 630 deaths. We are still seein...,states reported deaths we are still seeing a ...
4,5,This is the sixth time a global health emergen...,this is the sixth time a global health emergen...


In [38]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Tahmin yap
preds = model.predict(X_val)

# Performans metriklerini yazdır
print("Doğruluk (Accuracy):", accuracy_score(val_df['label'], preds))
print("\nSınıflandırma Raporu:\n")
print(classification_report(val_df['label'], preds, target_names=['fake', 'real']))

# Confusion matrix
print("\nKarmaşıklık Matrisi:")
print(confusion_matrix(val_df['label'], preds))


Doğruluk (Accuracy): 0.9126168224299065

Sınıflandırma Raporu:

              precision    recall  f1-score   support

        fake       0.91      0.91      0.91      1020
        real       0.92      0.92      0.92      1120

    accuracy                           0.91      2140
   macro avg       0.91      0.91      0.91      2140
weighted avg       0.91      0.91      0.91      2140


Karmaşıklık Matrisi:
[[ 928   92]
 [  95 1025]]


In [39]:
from sklearn.metrics import accuracy_score

# Train setinde modelin performansını ölç
train_preds = model.predict(X_train)
train_accuracy = accuracy_score(train_df['label'], train_preds)

# Val setinde modelin performansını ölç
val_preds = model.predict(X_val)
val_accuracy = accuracy_score(val_df['label'], val_preds)

print(f"Train Accuracy: {train_accuracy:.4f}")
print(f"Val Accuracy: {val_accuracy:.4f}")
print(f"Fark: {train_accuracy - val_accuracy:.4f}")

# Eğer fark > 0.05 ise overfitting var demektir
if train_accuracy - val_accuracy > 0.05:
    print("⚠️ Overfitting var!")
else:
    print("✓ Overfitting yok, model iyi!")

Train Accuracy: 0.9466
Val Accuracy: 0.9126
Fark: 0.0340
✓ Overfitting yok, model iyi!


MODELİ AĞ ANALİZİ İÇİN KULLANACAĞIMIZ VERİDE KULANNMA

In [40]:

ag_df = pd.read_csv('Covid-19 Twitter Dataset (Apr-Jun 2020).csv')
ag_df.head()

Unnamed: 0,id,created_at,source,original_text,lang,favorite_count,retweet_count,original_author,hashtags,user_mentions,place,clean_tweet,compound,neg,neu,pos,sentiment
0,1.25e+18,2020-04-19,"<a href=""http://twitter.com/download/android"" ...",RT @GlblCtzn: .@priyankachopra is calling on l...,en,0.0,31.0,RJIshak,,"GlblCtzn, priyankachopra",Jakarta Capital Region,call leader help protect refuge covid19 provid...,0.8176,0.0,0.452,0.548,pos
1,1.25e+18,2020-04-19,"<a href=""http://twitter.com/download/android"" ...",RT @OGSG_Official: OGUN STATE SUPPORT FOR CBN-...,en,0.0,61.0,makinwaoluwole,,OGSG_Official,Nigeria,ogun state support cbn nirsal covid19 target c...,0.6486,0.0,0.602,0.398,pos
2,1.25e+18,2020-04-19,"<a href=""http://twitter.com/download/iphone"" r...",RT @AdvoBarryRoux: These 5 police officials ba...,en,0.0,1.0,TembeAmu,,AdvoBarryRoux,,polic offici base namahadi polic station busi ...,0.2732,0.0,0.851,0.149,pos
3,1.25e+18,2020-04-19,"<a href=""http://twitter.com/download/iphone"" r...",RT @MobilePunch: COVID-19: Oyo discharges two ...,en,0.0,0.0,ilyasrabiu,,MobilePunch,"Lagos, Nigeria",covid19 oyo discharg two patient,0.0,0.0,1.0,0.0,neu
4,1.25e+18,2020-04-19,"<a href=""http://twitter.com/download/android"" ...",My Condolences to the Family of those who did ...,en,0.0,13869.0,bucketeconomist,Covid_19,,,condol famili surviv,0.0,0.0,1.0,0.0,neu


In [41]:
print(ag_df.columns.tolist())

['id', 'created_at', 'source', 'original_text', 'lang', 'favorite_count', 'retweet_count', 'original_author', 'hashtags', 'user_mentions', 'place', 'clean_tweet', 'compound', 'neg', 'neu', 'pos', 'sentiment']


In [42]:
# ...existing code...
# Lang sütunundaki benzersiz değerler ve frekansları
unique_langs = ag_df['lang'].dropna().unique()
print("Benzersiz lang değerleri:", unique_langs)
print("Toplam benzersiz:", len(unique_langs))
print("\nDeğerlerin frekansları:")
print(ag_df['lang'].value_counts(dropna=False))
# ...existing code...

Benzersiz lang değerleri: ['en']
Toplam benzersiz: 1

Değerlerin frekansları:
lang
en     143902
NaN         1
Name: count, dtype: int64


In [43]:
# 1) Mevcut verisetinin bir yedeğini al
backup_path = "Covid-19 Twitter Dataset (Apr-Jun 2020)_backup.csv"
ag_df.to_csv(backup_path, index=False)
print(f"Yedek kaydedildi: {backup_path}")

# 2) Saklamak istediğimiz sütunları 
desired_columns = ['clean_tweet', 'user_mentions', 'hashtags', 'original_author', 'retweet_count', 'favorite_count', "original_text", "created_at"]  

# 3) Eksik sütun kontrolü
missing = [c for c in desired_columns if c not in ag_df.columns]
if missing:
    print("Eksik sütunlar bulundu:", missing)
else:
    # 4) Yalnızca istenen sütunları bırak ve yeni dosya olarak kaydet
    filtered_path = "Covid-19 Twitter Dataset (Apr-Jun 2020)_filtered.csv"
    ag_df_filtered = ag_df[desired_columns].copy()
    ag_df_filtered.to_csv(filtered_path, index=False)
    print(f"Filtrelendi ve kaydedildi: {filtered_path}")

Yedek kaydedildi: Covid-19 Twitter Dataset (Apr-Jun 2020)_backup.csv
Filtrelendi ve kaydedildi: Covid-19 Twitter Dataset (Apr-Jun 2020)_filtered.csv


In [44]:
df_filtered = pd.read_csv("Covid-19 Twitter Dataset (Apr-Jun 2020)_filtered.csv")

In [45]:
df_filtered.head()
df_filtered.isnull().sum()

clean_tweet           484
user_mentions       36571
hashtags           115393
original_author         1
retweet_count           1
favorite_count          1
original_text           1
created_at              1
dtype: int64

In [46]:
import re, joblib, numpy as np

# load model/tfidf from disk if missing in memory
try:
    model
    tfidf
except NameError:
    model = joblib.load('logreg_model.joblib')
    tfidf = joblib.load('tfidf_vectorizer.joblib')

# keep only the filtered columns so we don't accidentally carry extras
try:
    base_cols = [c for c in desired_columns if c in df_filtered.columns]
except NameError:
    base_cols = []
if not base_cols:
    base_cols = list(df_filtered.columns)
df_filtered = df_filtered[base_cols].copy()

# candidate text source columns (use the first available)
src_candidates = ['original_text','tweet','text','full_text']
available_src = [c for c in src_candidates if c in df_filtered.columns]
if not available_src:
    raise ValueError('No usable text column found: ' + ', '.join(src_candidates))

def clean_text_fix(text):
    if pd.isna(text):
        return ''
    s = str(text)
    s = re.sub(r'http\S+', '', s)
    s = re.sub(r'@\w+', '', s)
    s = re.sub(r'#\w+', '', s)
    s = re.sub(r'[^A-Za-z0-9\s]', ' ', s)   # keep numbers, avoid over-cleaning
    s = re.sub(r'\s+', ' ', s).strip()
    return s.lower()

def get_best_clean(row):
    # try preferred columns in order
    for c in available_src:
        raw = row.get(c, '')
        cleaned = clean_text_fix(raw)
        if cleaned:
            return cleaned
    # if single columns fail, combine and clean all candidates
    combined = ' '.join([str(row.get(c,'')) for c in available_src])
    combined_clean = clean_text_fix(combined)
    if combined_clean:
        return combined_clean
    # last resort: lowercase of the first available source or placeholder
    fallback = str(row.get(available_src[0], '')).strip().lower()
    return fallback if fallback else '<no_text>'

# ensure each row has a clean_tweet value
df_filtered['clean_tweet'] = df_filtered.apply(get_best_clean, axis=1)

# fill any remaining blanks from the first available source
mask_empty = df_filtered['clean_tweet'].fillna('').str.strip() == ''
if mask_empty.any():
    df_filtered.loc[mask_empty, 'clean_tweet'] = df_filtered.loc[mask_empty, available_src[0]].fillna('').astype(str).str.strip().str.lower()
    df_filtered['clean_tweet'] = df_filtered['clean_tweet'].replace('', '<no_text>')

# transform and predict
X_pred = tfidf.transform(df_filtered['clean_tweet'].fillna(''))
pred_labels = model.predict(X_pred)
try:
    pred_probs = model.predict_proba(X_pred)[:, 1]
except Exception:
    scores = model.decision_function(X_pred)
    pred_probs = (scores - scores.min()) / (scores.max() - scores.min() + 1e-9)

# attach results
df_filtered = df_filtered.copy()
df_filtered['pred_label'] = pred_labels
df_filtered['pred_prob_real'] = pred_probs

# only keep filtered columns + predictions before saving
save_columns = base_cols + ['pred_label', 'pred_prob_real']
df_filtered = df_filtered.reindex(columns=save_columns)

out_path = 'Covid-19 Twitter Dataset (Apr-Jun 2020)_filtered_with_preds.csv'
df_filtered.to_csv(out_path, index=False)
print(f'Tahminler kaydedildi: {out_path}')


Tahminler kaydedildi: Covid-19 Twitter Dataset (Apr-Jun 2020)_filtered_with_preds.csv


In [48]:
new_df = pd.read_csv("Covid-19 Twitter Dataset (Apr-Jun 2020)_filtered_with_preds.csv")
new_df.isnull().sum()
new_df.columns.tolist()

['clean_tweet',
 'user_mentions',
 'hashtags',
 'original_author',
 'retweet_count',
 'favorite_count',
 'original_text',
 'created_at',
 'pred_label',
 'pred_prob_real']

In [49]:
new_df.head()

Unnamed: 0,clean_tweet,user_mentions,hashtags,original_author,retweet_count,favorite_count,original_text,created_at,pred_label,pred_prob_real
0,rt is calling on leaders to help protect refug...,"GlblCtzn, priyankachopra",,RJIshak,31.0,0.0,RT @GlblCtzn: .@priyankachopra is calling on l...,2020-04-19,1,0.704819
1,rt ogun state support for cbn nirsal covid 19 ...,OGSG_Official,,makinwaoluwole,61.0,0.0,RT @OGSG_Official: OGUN STATE SUPPORT FOR CBN-...,2020-04-19,1,0.900089
2,rt these 5 police officials based at namahadi ...,AdvoBarryRoux,,TembeAmu,1.0,0.0,RT @AdvoBarryRoux: These 5 police officials ba...,2020-04-19,1,0.634284
3,rt covid 19 oyo discharges two patients,MobilePunch,,ilyasrabiu,0.0,0.0,RT @MobilePunch: COVID-19: Oyo discharges two ...,2020-04-19,1,0.85209
4,my condolences to the family of those who did ...,,Covid_19,bucketeconomist,13869.0,0.0,My Condolences to the Family of those who did ...,2020-04-19,0,0.337746
