# TruthLens Modelling - Phase 2: Multi-class Classification
The aim of phase 2 is to further classify text which has already been flagged as "fake" into one of four different types of fake news. These four classes - Fabricated, Polarised, Satire and Commentary - are a reduced adaptation of the Molina et al. Disinformation Taxonomy.

The dataset used in this phase is the custom dataset I created, which has already been cleaned and preprocessed (see the "TruthLens Data Collection" and "TruthLens Data Cleaning" notebooks).

### Feature Extraction 

In [None]:

import spacy
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Load a spaCy model for NER (used by get_entity_counts further down this cell)
nlp = spacy.load('en_core_web_sm')

# Assume df2 is already loaded and cleaned:
# df2 = pd.read_csv('your_cleaned_df2.csv')

# NOTE(review): X_multiclass and y_multiclass are only assigned at the end of
# this cell, after the feature-extraction steps. Unless an earlier cell or a
# previous execution already defined them, the split below raises NameError,
# and on a re-run it evaluates the feature matrix left over from the previous
# run. Consider moving this split/fit/report below the point where the
# feature matrix is built.

# Split data into training and testing sets (80/20, fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(X_multiclass, y_multiclass, test_size=0.2, random_state=42)
# Train a classifier on the training data
# (`clf` is reassigned later in this cell to a model fitted on all rows)
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)

# Predict on the held-out test set
# (`predictions` is likewise overwritten later in this cell)
predictions = clf.predict(X_test)
print("Classification Report on Test Set:\n", classification_report(y_test, predictions))

###########################
# 1. Topic Modeling (LDA)
###########################
# English stopwords are filtered out of every document before topic modelling
stop_words = set(stopwords.words('english'))


def preprocess(text):
    """Return the tokens of ``text`` that are not English stopwords.

    Tokenisation is delegated to ``gensim.utils.simple_preprocess``; any
    token present in ``stop_words`` is dropped and the remaining tokens keep
    their original order.
    """
    kept = []
    for word in gensim.utils.simple_preprocess(text):
        if word in stop_words:
            continue
        kept.append(word)
    return kept

# Tokenise every document once; the token lists feed both the dictionary
# and the bag-of-words corpus below.
df2['tokens'] = df2['content'].apply(preprocess)

# Create a dictionary and corpus for LDA
dictionary = corpora.Dictionary(df2['tokens'])
corpus = [dictionary.doc2bow(tokens) for tokens in df2['tokens']]

# Train LDA model (e.g., 5 topics)
# random_state is fixed, matching the seed used for the train/test split and
# the RandomForest in this notebook, so that the learned topics (and the
# topic-distribution features derived from them) are reproducible across runs.
lda_model = gensim.models.LdaModel(
    corpus,
    num_topics=5,
    id2word=dictionary,
    passes=10,
    random_state=42,
)

# Function to extract topic distribution for a document
def get_topic_distribution(text):
    """Return the LDA topic probabilities for ``text`` as a fixed-length list.

    The text is tokenised with ``preprocess``, converted to a bag of words
    with ``dictionary`` and scored by ``lda_model``. The result always has
    ``lda_model.num_topics`` entries, ordered by topic index; a topic that the
    model does not report for this document is given probability 0.0.
    """
    bow = dictionary.doc2bow(preprocess(text))
    topic_dist = lda_model.get_document_topics(bow, minimum_probability=0.0)
    # gensim applies a small internal floor to minimum_probability, so a topic
    # with a vanishing probability can still be left out of the sparse result.
    # Filling a dense vector keeps every document's feature length identical,
    # which the later np.vstack over feature vectors relies on.
    probabilities = [0.0] * lda_model.num_topics
    for topic_id, prob in topic_dist:
        probabilities[topic_id] = prob
    return probabilities

df2['topic_dist'] = df2['content'].apply(get_topic_distribution)

###############################
# 2. Sentiment Analysis
###############################
def get_sentiment(text):
    """Return ``(polarity, subjectivity)`` for ``text`` as scored by TextBlob."""
    sentiment = TextBlob(text).sentiment
    return sentiment.polarity, sentiment.subjectivity

# Each document yields a two-element Series, which pandas spreads across the
# 'polarity' and 'subjectivity' columns.
df2[['polarity', 'subjectivity']] = df2['content'].apply(
    lambda doc: pd.Series(get_sentiment(doc))
)

#######################################
# 3. Named Entity Recognition (NER)
#######################################
def get_entity_counts(text):
    """Count the PERSON, ORG and GPE entities the spaCy pipeline finds in ``text``.

    Returns a dict with exactly those three keys; entities carrying any other
    label are ignored.
    """
    doc = nlp(text)
    counts = dict.fromkeys(('PERSON', 'ORG', 'GPE'), 0)
    for entity in doc.ents:
        label = entity.label_
        if label not in counts:
            continue
        counts[label] += 1
    return counts

# Run NER once per document, then unpack the per-label counts into columns
df2['entity_counts'] = df2['content'].apply(get_entity_counts)
df2['person_count'] = df2['entity_counts'].apply(lambda counts: counts['PERSON'])
df2['org_count'] = df2['entity_counts'].apply(lambda counts: counts['ORG'])
df2['gpe_count'] = df2['entity_counts'].apply(lambda counts: counts['GPE'])

########################################
# 4. Domain-Specific Keyword Counts
########################################
# Keyword lists per domain; each category becomes one count feature
domain_keywords = {
    'politics': ['election', 'government', 'senate', 'congress'],
    'health': ['vaccine', 'covid', 'pandemic', 'healthcare'],
    'finance': ['stock', 'market', 'economy', 'trade']
}


def count_domain_keywords(text):
    """Return, per category in ``domain_keywords``, the keyword hits in ``text``.

    Matching is case-insensitive and substring-based (``str.count`` on the
    lower-cased text), so a keyword is also counted when it appears inside a
    longer word, e.g. 'market' within 'markets'.
    """
    lowered = text.lower()
    return {
        category: sum(lowered.count(term) for term in terms)
        for category, terms in domain_keywords.items()
    }

df2['domain_counts'] = df2['content'].apply(count_domain_keywords)
# Expand domain keyword counts into separate columns
domain_df = df2['domain_counts'].apply(pd.Series)
# Assign column by column rather than pd.concat: concat appends a second copy
# of every category column when this cell is re-run on the same df2, and with
# duplicated column labels a by-name lookup on a row returns a Series instead
# of a scalar, which corrupts the combined feature vector. Plain assignment
# overwrites existing columns, so re-running is safe.
for category in domain_df.columns:
    df2[category] = domain_df[category]

###############################################
# 5. Combine Features into a Single Feature Vector
###############################################
def combine_features(row):
    """Flatten one row's engineered features into a single list.

    Order of the returned values: the topic probabilities from
    ``row['topic_dist']``, then polarity and subjectivity, then the PERSON,
    ORG and GPE entity counts, then one keyword count per category of
    ``domain_keywords`` in alphabetical order of category name. A category
    with no matching entry in ``row`` contributes 0.
    """
    topic_part = list(row['topic_dist'])
    sentiment_part = [row['polarity'], row['subjectivity']]
    entity_part = [row[column] for column in ('person_count', 'org_count', 'gpe_count')]
    domain_part = [row.get(category, 0) for category in sorted(domain_keywords)]
    return topic_part + sentiment_part + entity_part + domain_part

df2['feature_vector'] = df2.apply(combine_features, axis=1)

# Create a final feature matrix (each row is a document's feature vector)
X_multiclass = np.vstack(df2['feature_vector'].values)
y_multiclass = df2['label']

###############################################
# Example: Training a Multi-Class Classifier
###############################################
# RandomForestClassifier, train_test_split and classification_report are
# imported at the top of this cell.

# Hold-out evaluation: fit on 80% of the rows and report on the unseen 20%.
# This is the performance estimate to trust; scoring a model on the rows it
# was fitted on overstates how well it generalises.
X_train, X_test, y_train, y_test = train_test_split(
    X_multiclass, y_multiclass, test_size=0.2, random_state=42
)
holdout_clf = RandomForestClassifier(random_state=42)
holdout_clf.fit(X_train, y_train)
print("Classification Report on Test Set:\n",
      classification_report(y_test, holdout_clf.predict(X_test)))

# Final model fitted on every labelled row.
clf = RandomForestClassifier(random_state=42)
clf.fit(X_multiclass, y_multiclass)

# Predictions for the full (training) data, aligned row-for-row with
# y_multiclass. The print shows how many documents were assigned to each
# class; it is not a classification report.
predictions = clf.predict(X_multiclass)
print("Predicted class counts on the training data:\n",
      pd.Series(predictions).value_counts(), "\n")

df2

In [None]:
from sklearn.metrics import classification_report
# Per-class precision/recall/F1 of `predictions` against `y_multiclass`.
# NOTE(review): in the preceding cell `predictions` is produced by `clf` for
# X_multiclass, the same data `clf` was fitted on, so this report reflects
# fit to the training data rather than performance on unseen documents.
print("Full Classification Report:\n", classification_report(y_multiclass, predictions))