In [4]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
import numpy as np
import pandas as pd
import sklearn

# Same coding step to using one data
df = pd.read_csv("train.csv")
df['author'].fillna('Unknown', inplace=True)
df['title'].fillna('Ambiguous', inplace=True)
df['text'].fillna('Ambiguous', inplace=True)
df.drop_duplicates(inplace=True)

# TF-IDF Vectorization for text and title
vectorizer_text = TfidfVectorizer(max_features=5000)
X_text = vectorizer_text.fit_transform(df['text']).toarray()

vectorizer_title = TfidfVectorizer(max_features=1000)
X_title = vectorizer_title.fit_transform(df['title']).toarray()

# One-hot encoding for authors
encoder = OneHotEncoder()
X_author = encoder.fit_transform(df[['author']]).toarray()

y = df['label'].values



# Combine the feature matrices
x = np.hstack((X_text, X_title, X_author))
# Train-test split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Train
knn = KNeighborsClassifier()
knn.fit(x_train, y_train)
y_pre = knn.predict(x_test)

In [5]:
print(classification_report(y_test, y_pre, digits=10))

              precision    recall  f1-score   support

           0  0.9940973930 0.9479362101 0.9704681873      2132
           1  0.9478138223 0.9940828402 0.9703971119      2028

    accuracy                      0.9704326923      4160
   macro avg  0.9709556077 0.9710095252 0.9704326496      4160
weighted avg  0.9715341523 0.9704326923 0.9704335380      4160



ACC AUC:

In [6]:
# Score and test:
test_acc = sklearn.model_selection.cross_val_score(knn, x_test, y_test, scoring = "accuracy")
print("The test accuracy is:", test_acc)
test_auc = sklearn.model_selection.cross_val_score(knn, x_test, y_test, scoring = "roc_auc")
print("The testing auc is:", test_auc)

The test accuracy is: [0.90625    0.92668269 0.95552885 0.91346154 0.95913462]
The testing auc is: [0.98306506 0.98414915 0.99247786 0.98591378 0.99239599]


Test one sample:

In [7]:
# Test
sample = {
    'text': df['text'][0],
    'title': df['title'][0],
    'author': df['author'][0],
    'lable': df['label'][0]
}

# TF-IDF Vectorization for the new example
new_text = vectorizer_text.transform([sample['text']]).toarray()
new_title = vectorizer_title.transform([sample['title']]).toarray()
new_author = encoder.transform([[sample['author']]]).toarray()

# Combine the feature matrices for the new example
sample_x = np.hstack((new_text, new_title, new_author))

# se the trained KNN model to make a prediction
pre = knn.predict_proba(sample_x)
print('probability of 0', pre[0][0])
print('probability of 1', pre[0][1])
print('true value', sample['lable'])


probability of 0 0.0
probability of 1 1.0
true value 1




In [11]:
import numpy as np
import pandas as pd

def predict_all_articles(df, model, vectorizer_text, vectorizer_title, encoder):
    df['author'].fillna('Unknown', inplace=True)
    df['title'].fillna('Ambiguous', inplace=True)
    df['text'].fillna('Ambiguous', inplace=True)
    
    # Transform title and text for all articles
    X_title = vectorizer_title.transform(df['title']).toarray()
    X_text = vectorizer_text.transform(df['text']).toarray()

    # Transform the authors
    try:
        X_author = encoder.transform(df[['author']]).toarray()
    except:
        # If the author is not recognized from the training data
        X_author = np.zeros((df.shape[0], len(encoder.categories_[0])))

    #combine
    x = np.hstack((X_title, X_text, X_author))
    
    # Make batch predictions
    prediction_probs = model.predict(x)
    
    # Round the probabilities to get binary class labels
    predictions = np.round(prediction_probs).flatten().astype(int)
    
    # Map 0 and 1 to "Real News" and "Fake News"
    prediction_labels = ["Real News" if p == 0 else "Fake News" for p in predictions]
    
    # Add a new column to the original DataFrame to store predictions
    df['Prediction'] = prediction_labels
    
    return df

def get_accuracy(df):
    # Map "Real News" and "Fake News" back to 0 and 1
    df['PredictionLabel'] = df['Prediction'].map({"Real News": 0, "Fake News": 1})
    
    # Calculate the number of correct predictions
    correct_predictions = df[df['label'] == df['PredictionLabel']].shape[0]
    
    # Calculate the total number of predictions
    total_predictions = df.shape[0]
    
    # Calculate the accuracy
    accuracy = (correct_predictions / total_predictions) * 100
    
    print(f"Accuracy: {accuracy}%")


In [12]:
df_test = pd.read_csv("test.csv")
df_submit = pd.read_csv("submit.csv")
# Combine test and submit datasets based on 'id'
df_test = pd.merge(df_test, df_submit, on='id')

df_with_predictions = predict_all_articles(df_test, knn, vectorizer_text, vectorizer_title, encoder)
get_accuracy(df_with_predictions)


Accuracy: 54.98076923076923%


In [18]:

# Test
sample = {
    'text': """The Security Police point to negative reporting about Finland in Russian media and the closure of the Finnish consulate in St. Petersburg as indicators.
Antti Pelttari, the head of Finland's Security Police, states that although Russia is currently focused on its operations in Ukraine and reducing its international isolation, the threat from Russian intelligence and influence in Finland has not disappeared.
The ongoing war in Ukraine, increasing tensions between Western countries and Russia, and the imposition of more sanctions are likely to escalate Russia's countermeasures against Finland.
The Finnish Security Police also assess that the threat from intelligence activities and influence targeting critical infrastructure has increased, particularly in marine infrastructure.""",
    'title': "Finland sounds alarm: Russia ready to take action",
    'author': "tom"
}
print(df['author'][0])

# test for real news
def test_real(news_dict):
    # TF-IDF Vectorization for the new example
    new_text = vectorizer_text.transform([news_dict['text']]).toarray()
    new_title = vectorizer_title.transform([news_dict['title']]).toarray()
    # new_author = encoder.transform([[sample["author"]]]).toarray()

    try:
        X_author = encoder.transform([[sample['author']]]).toarray()
    except:
        # If the author is not recognized from the training data
        X_author = np.zeros((sample.shape[0], len(encoder.categories_[0])))

    # Combine the feature matrices for the new example
    sample_x = np.hstack((new_text, new_title, X_author))

    # se the trained KNN model to make a prediction
    pre = knn.predict_proba(sample_x)
    print('probability of 0', pre[0][0])
    print('probability of 1', pre[0][1])

test_real(sample)

Darrell Lucus




AttributeError: 'dict' object has no attribute 'shape'