# Import Necessary Dependencies

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

#from text to vector
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack

#Models & Evaluations
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score ,classification_report, confusion_matrix


# Loading Datasets

In [3]:
true_news = pd.read_csv("True.csv")
fake_news = pd.read_csv("Fake.csv")

In [4]:
true_news.head(n=2)

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"


In [5]:
fake_news.head(n=2)

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Yearâ€™...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"


# Combine True & Fake Data

In [6]:
# Before Combination of data let's check some necessary things

true_news.isnull().sum()

title      0
text       0
subject    0
date       0
dtype: int64

In [7]:
fake_news.isnull().sum()

title      0
text       0
subject    0
date       0
dtype: int64

In [8]:
true_news.drop("date", inplace=True, axis=1)

In [9]:
fake_news.drop("date", inplace=True, axis=1)

In [10]:
true_news.head(2)

Unnamed: 0,title,text,subject
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews


In [11]:
fake_news.head(2)

Unnamed: 0,title,text,subject
0,Donald Trump Sends Out Embarrassing New Yearâ€™...,Donald Trump just couldn t wish all Americans ...,News
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News


In [12]:
true_news['label'] = 1

In [13]:
fake_news['label'] = 0

In [14]:
print(true_news.shape)
print(fake_news.shape)

(21417, 4)
(23481, 4)


In [15]:
#Lets combine both the dataset

final_ds = pd.concat([true_news, fake_news], ignore_index=True)

In [16]:
final_ds.shape

(44898, 4)

In [17]:
shuffled_df = final_ds.sample(frac=1).reset_index(drop=True)

In [18]:
shuffled_df.head()

Unnamed: 0,title,text,subject,label
0,WATCH: FOX and FRIENDS Host BRIAN KILMEADE Imm...,You almost have to feel sorry for Whoopi Goldb...,politics,0
1,Senators want to change massive new Israel aid...,WASHINGTON (Reuters) - Republican U.S. lawmake...,politicsNews,1
2,Trump considering options for new Afghanistan ...,WASHINGTON (Reuters) - President Donald Trump ...,politicsNews,1
3,Senator Markey places hold on Obama's nominee ...,(Reuters) - Democratic U.S. Senator Edward Mar...,politicsNews,1
4,"Is Hillaryâ€™s Meltdown Real, or a Staged Exit?",21st Century Wire says As shocking and controv...,US_News,0


In [19]:
shuffled_df['subject'].value_counts()

subject
politicsNews       11272
worldnews          10145
News                9050
politics            6841
left-news           4459
Government News     1570
US_News              783
Middle-east          778
Name: count, dtype: int64

In [20]:
left_news = shuffled_df[shuffled_df['subject'] == 'US_News']['title']
left_news_arr = pd.array(left_news)
left_news_arr[0:10]

<NumpyExtensionArray>
[                                              'Is Hillaryâ€™s Meltdown Real, or a Staged Exit?',
                              'New 9/11 Trailer â€“ Featuring Charlie Sheen and Whoopi Goldberg',
                          'Boiler Room EP #110 â€“ A Deeper Game: Masters of Chaos Strike Again',
   'MOCKINGBIRD REDUX? CNNâ€™s Role in Peddling Fake â€˜Nothing Burgerâ€™ Russia-Gate News Revealed',
                                     'AFGHANISTAN: Trump Surges Into the Graveyard of Empires',
           '10 U.S. Navy Sailors Held by Iranian Military â€“ Signs of a Neocon Political Stunt',
 'NEW EMAILS: Clinton Foundation VIP donors buy access â€“ while Hillary was Secretary of State',
       'American Scientists Harvesting Human Organs in Live Pigs, Creating a Human-Pig Embryo',
           'Clinton Emails: How Google Worked With Hillary to Try and Overthrow Syriaâ€™s Assad',
                                              'Clinton and Associatesâ€™ Education Ponzi Scheme'

# Removing Stopwords, Punctuation Remove

In [21]:
stop_words = set([
    'the', 'is', 'in', 'and', 'to', 'with', 'a', 'an', 'of', 'for', 'on', 'at',
    'by', 'this', 'that', 'are', 'was', 'it', 'be', 'as', 'from', 'or', 'has',
    'have', 'had', 'but', 'not', 'he', 'she', 'they', 'you', 'we', 'his', 'her'
])

In [22]:
# Removing URL, Hashtags, newline characters, punctuations, whitespace, emoji ranges, 

def cleaned_resume(text) :
    Cltext = re.sub('https\S+', ' ', text) #links
    Cltext = re.sub('@', ' ', Cltext) #mentions
    Cltext = re.sub('\n', ' ', Cltext) #newline
    Cltext = re.sub('\r', ' ' , Cltext) #carriage returns
    Cltext = re.sub(r'[^\w\s]', ' ', Cltext) #Punctuations
    Cltext = re.sub(r"[\u2600-\u26FF\u2700-\u27BF]+", ' ', Cltext) #Emoji ranges
    Cltext = re.sub('\s+', ' ', Cltext).strip() #extra whitespace
    Cltext = re.sub('_', ' ', Cltext)
    Cltext = re.sub("'", ' ', Cltext)

    #Lowercase from consistent case removal
    Cltext = Cltext.lower()

    words = Cltext.split()
    cleaned_words = [word for word in words if word not in stop_words]

    return ' '.join(cleaned_words)

original_resume = """
My name is Alex, find me @alex_datasci or https://alex.codes.
I am a ðŸ“Š Data Scientist.
I love Python and Machine Learning.
"""

cleaned = cleaned_resume(original_resume)
print(cleaned)

my name alex find me alex datasci i am data scientist i love python machine learning


  Cltext = re.sub('https\S+', ' ', text) #links
  Cltext = re.sub('\s+', ' ', Cltext).strip() #extra whitespace


In [23]:
shuffled_df.head(2)

Unnamed: 0,title,text,subject,label
0,WATCH: FOX and FRIENDS Host BRIAN KILMEADE Imm...,You almost have to feel sorry for Whoopi Goldb...,politics,0
1,Senators want to change massive new Israel aid...,WASHINGTON (Reuters) - Republican U.S. lawmake...,politicsNews,1


In [24]:
shuffled_df['cleaned_title'] = shuffled_df['title'].apply(lambda x : cleaned_resume(x))

In [25]:
shuffled_df['cleaned_title'][6]

'qatar s tamim ready resolve row gulf arabs says sovereignty sacred'

In [26]:
shuffled_df['cleaned_text'] = shuffled_df['text'].apply(lambda x : cleaned_resume(x))

In [27]:
shuffled_df['cleaned_text'][10]

'moscow reuters russia working together saudi arabia unify syrian opposition russia s ria news agency quoted russian foreign minister sergey lavrov saying friday lavrov speaking meeting united nations special envoy syria staffan de mistura who visiting moscow de mistura said new syrian constitution will one main items agenda un sponsored talks geneva next week between opposing sides syria conflict ria reported'

# Text Vecrtorisation

In [33]:
tfid = TfidfVectorizer(stop_words='english', max_features=10000)

In [34]:
tfid_title = tfid.fit_transform(shuffled_df['cleaned_title'])

In [None]:
tfid_text = tfid.transform(shuffled_df['cleaned_text'])

In [None]:
#cobining both the features side by side with columns in vector format

combined_features = hstack([tfid_title, tfid_text])

# Splitting Data for training and testing

In [None]:
X = combined_features
y = shuffled_df['label']

In [38]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [39]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(35918, 20000)
(8980, 20000)
(35918,)
(8980,)


# Selecting Best Model

In [None]:
models = {
    'LR' : LogisticRegression(),
    'DTC' : DecisionTreeClassifier(),
    'RC' : RidgeClassifier(),
    'RFC' : RandomForestClassifier(),
    'KNN' : KNeighborsClassifier(),
}

In [None]:
# for name, model in models.items():
#     # --- Training ---
#     model.fit(X_train, y_train)
#     print(f'--- {name} ---')

#     # --- Make Predictions ---
#     y_pred = model.predict(X_test)

#     # --- Accuracy Scores (from your original code) ---
#     train_acc = model.score(X_train, y_train)
    # test_acc = model.score(X_test, y_test)
    # print(f'Train Accuracy: {train_acc:.4f}')
    # print(f'Test Accuracy: {test_acc:.4f}\n')

    # # --- Classification Report ---
    # print("Classification Report:")
    # print(classification_report(y_test, y_pred))
    
    # # --- Confusion Matrix ---
    # print("Confusion Matrix:")
    # cm = confusion_matrix(y_test, y_pred)
    # plt.figure(figsize=(6, 4))
    # sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
    #             xticklabels=model.classes_, yticklabels=model.classes_)
    # plt.xlabel('Predicted')
    # plt.ylabel('Actual')
    # plt.title(f'Confusion Matrix for {name}')
    # plt.show()
    # print("-" * 50)

In [40]:
#Choosing best fitted model - RC

RC = RidgeClassifier()
RC.fit(X_train, y_train)

In [41]:
y_pred = RC.predict(X_test)

In [42]:
print(y_pred)

[0 0 0 ... 0 1 0]


In [43]:
train_acc = RC.score(X_train, y_train)

In [44]:
test_acc = RC.score(X_test, y_test)

In [45]:
print(f"Train Accuracy : {train_acc} && Test Accuracy : {test_acc}")

Train Accuracy : 0.9995266997048834 && Test Accuracy : 0.9955456570155902


In [46]:
print(f"{confusion_matrix(y_test, y_pred)}")

[[4646   24]
 [  16 4294]]


In [47]:
print(f"{classification_report(y_test, y_pred)}")

              precision    recall  f1-score   support

           0       1.00      0.99      1.00      4670
           1       0.99      1.00      1.00      4310

    accuracy                           1.00      8980
   macro avg       1.00      1.00      1.00      8980
weighted avg       1.00      1.00      1.00      8980



# Actual Model Prediction

In [48]:
data_title = "Leaked NASA Documents Confirm Giza Pyramids Were Ancient Power Plants, Not Tombs"
data_text = "In a revelation that upends centuries of archaeological consensus, newly leaked documents from a joint NASA-Egyptian Ministry of Antiquities project allegedly confirm that the Great Pyramids of Giza were not built as tombs for the pharaohs, but as massive wireless power plants. The documents, reportedly leaked by a whistleblower within the space agency, contain advanced geothermal and satellite imaging data showing high concentrations of quartz crystal and dolomite within the pyramid's granite structure. These materials, when subjected to the immense pressure from the pyramid's mass, would create a powerful piezoelectric effect, generating significant electrical energy from the earth itself. According to the files, the internal shafts, previously thought to be for ventilation or spiritual purposes, were perfectly aligned to channel these telluric currents towards a now-missing capstone, which would have broadcast the energy wirelessly. The findings suggest the ancient Egyptians had mastered principles of wireless energy that even Nikola Tesla only dreamed of, read a notation attributed to a project scientist in the leaked report. Officials from both NASA and the Egyptian government have refused to comment on the documents, calling them a complete fabrication. However, proponents of the ancient technology theory say this is just further proof of a massive historical cover-up designed to protect established narratives and suppress knowledge of free energy."

cleaned_data_title = cleaned_resume(data_title)
cleaned_data_text = cleaned_resume(data_text)

cleaned_title_arr = pd.toarray()
vect = TfidfVectorizer()

vector_data_title = vect.fit_transform(cleaned_data_title)
vector_data_text = vect.transform(cleaned_data_text)

combined_data = hstack([vector_data_title, vector_data_text])
predict_data = RC.predict(combined_data)
# if (predict_data[0] == 0) :
#     print("News is Fake")
# else : 
#     print("News is True")

AttributeError: module 'pandas' has no attribute 'toarray'

In [51]:
new_texts = ["short text for a new prediction"]

# 3. Transform the new data using the FITTED vectorizer
#    DO NOT use fit_transform() here!
X_new = vectorizer.transform(new_texts) # X_new will now have shape (1, 20000)

# 4. Predict
prediction = RC.predict(X_new) # This will now work correctly

NameError: name 'vectorizer' is not defined