In [1]:
import pandas as pd
import numpy as np

# Load LIAR dataset
train_df = pd.read_csv('train.tsv', sep='\t', header=None)
valid_df = pd.read_csv('valid.tsv', sep='\t', header=None)
test_df = pd.read_csv('test.tsv', sep='\t', header=None)

# Column names from dataset documentation
columns = ['id', 'label', 'statement', 'subject', 'speaker', 'job_title', 
           'state_info', 'party_affiliation', 'barely_true_counts', 
           'false_counts', 'half_true_counts', 'mostly_true_counts', 
           'pants_on_fire_counts', 'context']

train_df.columns = columns
valid_df.columns = columns
test_df.columns = columns

# Combine all data for preprocessing
df = pd.concat([train_df, valid_df, test_df])

# Explore the data
print(df.head())
print(df['label'].value_counts())

import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from textstat import flesch_reading_ease, smog_index, flesch_kincaid_grade
from textblob import TextBlob

nltk.download('stopwords')
nltk.download('wordnet')

# Binary classification (simplify multi-class labels)
def simplify_label(label):
    if label in ['true', 'mostly-true', 'half-true']:
        return 'real'
    else:
        return 'fake'

df['binary_label'] = df['label'].apply(simplify_label)

# Text preprocessing
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    # Lowercase
    text = text.lower()
    # Remove special chars
    text = re.sub(r'\W', ' ', text)
    # Remove single chars
    text = re.sub(r'\s+[a-z]\s+', ' ', text, flags=re.I)
    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()
    # Lemmatization
    tokens = text.split()
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return ' '.join(tokens)

df['cleaned_text'] = df['statement'].apply(preprocess_text)

# Feature extraction
def extract_features(text):
    features = {}
    
    # Readability features
    features['flesch_reading_ease'] = flesch_reading_ease(text)
    features['smog_index'] = smog_index(text)
    features['flesch_kincaid_grade'] = flesch_kincaid_grade(text)
    
    # Text statistics
    features['char_count'] = len(text)
    features['word_count'] = len(text.split())
    features['avg_word_length'] = features['char_count'] / max(1, features['word_count'])
    features['sentence_count'] = len(re.split(r'[.!?]', text))
    
    # Sentiment features
    blob = TextBlob(text)
    features['polarity'] = blob.sentiment.polarity
    features['subjectivity'] = blob.sentiment.subjectivity
    
    # Style features
    features['exclamation_count'] = text.count('!')
    features['question_count'] = text.count('?')
    features['uppercase_count'] = sum(1 for c in text if c.isupper())
    
    return features

# Apply feature extraction
feature_list = []
for text in df['statement']:
    feature_list.append(extract_features(text))

features_df = pd.DataFrame(feature_list)
df = pd.concat([df, features_df], axis=1)

# Metadata features
df['speaker_freq'] = df.groupby('speaker')['speaker'].transform('count')
df['party_affiliation'] = df['party_affiliation'].fillna('unknown')

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Select features
text_features = 'cleaned_text'
meta_features = ['flesch_reading_ease', 'smog_index', 'flesch_kincaid_grade',
                'char_count', 'word_count', 'avg_word_length', 'sentence_count',
                'polarity', 'subjectivity', 'exclamation_count', 'question_count',
                'uppercase_count', 'speaker_freq']
target = 'binary_label'

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    df[meta_features + [text_features]], 
    df[target], 
    test_size=0.2, 
    random_state=42,
    stratify=df[target]
)

# Preprocessing pipeline
text_transformer = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=5000))
])

meta_transformer = Pipeline([
    ('scaler', StandardScaler())
])

preprocessor = ColumnTransformer([
    ('text', text_transformer, text_features),
    ('meta', meta_transformer, meta_features)
])

# Models to try
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'SVM': SVC(probability=True, random_state=42),
    'XGBoost': XGBClassifier(random_state=42)
}

results = {}

for name, model in models.items():
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', model)
    ])
    
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    y_proba = pipeline.predict_proba(X_test)[:, 1]
    
    results[name] = {
        'accuracy': accuracy_score(y_test, y_pred),
        'roc_auc': roc_auc_score(y_test, y_proba),
        'report': classification_report(y_test, y_pred)
    }
    
    print(f"Model: {name}")
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
    print(f"ROC AUC: {roc_auc_score(y_test, y_proba):.4f}")
    print(classification_report(y_test, y_pred))
    print("="*80)

# Save the best model
best_model_name = max(results, key=lambda x: results[x]['roc_auc'])
best_model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', models[best_model_name])
])

best_model.fit(X_train, y_train)

import joblib
joblib.dump(best_model, 'fake_news_detector.pkl')

           id        label                                          statement  \
0   2635.json        false  Says the Annies List political group supports ...   
1  10540.json    half-true  When did the decline of coal start? It started...   
2    324.json  mostly-true  Hillary Clinton agrees with John McCain "by vo...   
3   1123.json        false  Health care reform legislation is likely to ma...   
4   9028.json    half-true  The economic turnaround started at the end of ...   

                              subject         speaker             job_title  \
0                            abortion    dwayne-bohac  State representative   
1  energy,history,job-accomplishments  scott-surovell        State delegate   
2                      foreign-policy    barack-obama             President   
3                         health-care    blog-posting                   NaN   
4                        economy,jobs   charlie-crist                   NaN   

  state_info party_affiliation  barely

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kmani\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\kmani\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


InvalidIndexError: Reindexing only valid with uniquely valued Index objects

In [3]:
!pip install textblob


Collecting textblob
  Downloading textblob-0.19.0-py3-none-any.whl.metadata (4.4 kB)
Downloading textblob-0.19.0-py3-none-any.whl (624 kB)
   ---------------------------------------- 0.0/624.3 kB ? eta -:--:--
   --------------------------------- ------ 524.3/624.3 kB 4.2 MB/s eta 0:00:01
   ---------------------------------------- 624.3/624.3 kB 3.9 MB/s eta 0:00:00
Installing collected packages: textblob
Successfully installed textblob-0.19.0
