# **Data Load**

In [None]:
!python -m spacy download en_core_web_md

Collecting en-core-web-md==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.7.1/en_core_web_md-3.7.1-py3-none-any.whl (42.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 MB[0m [31m14.7 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive so that files kept on Drive can be
# read and written through ordinary paths below that directory
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Location of the dataset CSV inside the mounted Google Drive
csv_path = '/content/drive/MyDrive/ML project/spaCy_Word2Vec.csv'

# Load the CSV into a DataFrame (comma-separated, UTF-8, double-quoted fields)
df = pd.read_csv(
    csv_path,
    sep=',',
    encoding='utf-8',
    quotechar='"',
)

# Show the first rows as the cell output
df.head()

Unnamed: 0,text,type,processed_Text,vector
0,WASHINGTON (Reuters) - The head of a conservat...,True,washington reuter head conservative republic...,[-0.81997824 0.5995497 -2.5212972 0.661411...
1,WASHINGTON (Reuters) - Transgender people will...,True,washington reuters transgender people allow ...,[-6.8455237e-01 7.6636398e-01 -1.6749181e+00 ...
2,WASHINGTON (Reuters) - The special counsel inv...,True,washington reuter special counsel investigat...,[ 1.08025201e-01 6.19717240e-01 -1.75755990e+...
3,WASHINGTON (Reuters) - Trump campaign adviser ...,True,washington reuters trump campaign adviser ge...,[-3.90371442e-01 -2.06498355e-01 -1.27466476e+...
4,SEATTLE/WASHINGTON (Reuters) - President Donal...,True,seattlewashington reuters president donald t...,[-6.48751736e-01 -4.65459138e-01 -1.50599909e+...


In [None]:
# Report the pandas dtype under which the 'vector' column was loaded
vector_dtype = df['vector'].dtype
print(vector_dtype)

object


In [None]:
# Collect the distinct Python types that occur among the 'vector' entries
vector_entry_types = df['vector'].apply(type)
unique_types = vector_entry_types.unique()
print(unique_types)

[<class 'str'> <class 'float'>]


In [None]:
import numpy as np

# Function to convert string representations to numpy arrays
def parse_vector_string(vector_string):
    """Convert the textual form of a vector into a 1-D float numpy array.

    Parameters
    ----------
    vector_string : str or any
        A string such as ``"[0.1 -0.2 3e-1]"``. Surrounding whitespace, one
        enclosing pair of square brackets and comma separators are all
        optional. Any non-string value is passed through untouched.

    Returns
    -------
    numpy.ndarray or the original value
        A float array holding the parsed numbers when the input is a string;
        otherwise the input object itself (``None`` stays ``None``, a NaN
        float stays a NaN float, an existing array stays that array).

    Raises
    ------
    ValueError
        If a token in the string cannot be converted to ``float``.
    """
    if not isinstance(vector_string, str):
        # None, NaN placeholders (plain floats) and values that are already
        # arrays are handed back exactly as they came in.
        return vector_string

    text = vector_string.strip()
    # Remove the brackets only when they are really present; slicing [1:-1]
    # unconditionally would cut characters off an unbracketed or padded string.
    if text.startswith('['):
        text = text[1:]
    if text.endswith(']'):
        text = text[:-1]

    # Treat commas like whitespace so both "1 2 3" and "1, 2, 3" are accepted.
    tokens = text.replace(',', ' ').split()
    return np.array(tokens, dtype=float)

# Parse each entry of the 'vector' column with parse_vector_string and
# store the result back in the DataFrame
parsed_vectors = df['vector'].apply(parse_vector_string)
df['vector'] = parsed_vectors

# Show which Python types the column holds once parsing is done
print(parsed_vectors.apply(type).unique())

[<class 'numpy.ndarray'> <class 'float'>]


In [None]:
# Preview the first rows now that the 'vector' strings have been parsed
df.head()

Unnamed: 0,text,type,processed_Text,vector
0,WASHINGTON (Reuters) - The head of a conservat...,True,washington reuter head conservative republic...,"[-0.81997824, 0.5995497, -2.5212972, 0.6614114..."
1,WASHINGTON (Reuters) - Transgender people will...,True,washington reuters transgender people allow ...,"[-0.68455237, 0.76636398, -1.6749181, 1.137119..."
2,WASHINGTON (Reuters) - The special counsel inv...,True,washington reuter special counsel investigat...,"[0.108025201, 0.61971724, -1.7575599, 1.349151..."
3,WASHINGTON (Reuters) - Trump campaign adviser ...,True,washington reuters trump campaign adviser ge...,"[-0.390371442, -0.206498355, -1.27466476, 0.79..."
4,SEATTLE/WASHINGTON (Reuters) - President Donal...,True,seattlewashington reuters president donald t...,"[-0.648751736, -0.465459138, -1.50599909, 1.10..."


In [None]:
# Preview the last rows of the DataFrame
df.tail()

Unnamed: 0,text,type,processed_Text,vector
44893,21st Century Wire says As 21WIRE reported earl...,Fake,st century wire say wire report early week unl...,"[-0.63247424, 0.47587016, -1.4709344, 0.028013..."
44894,21st Century Wire says It s a familiar theme. ...,Fake,st century wire say s familiar theme dispute c...,"[0.2540681, 0.01993897, -0.8036574, -0.1812838..."
44895,Patrick Henningsen 21st Century WireRemember ...,Fake,patrick henningsen st century wireremember o...,"[-0.404304504, 0.978808105, -2.0422833, 0.6304..."
44896,21st Century Wire says Al Jazeera America will...,Fake,st century wire say al jazeera america history...,"[-0.435610354, 1.05584824, -1.97594726, 0.3294..."
44897,21st Century Wire says As 21WIRE predicted in ...,Fake,st century wire say wire predict new year s lo...,"[-0.483065009, 0.815314412, -1.49170899, 0.115..."


In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import joblib
import spacy

In [None]:
# Ensure 'vector' column contains only valid numpy arrays
def convert_vector(vector):
    """Turn a string-encoded vector into a float numpy array.

    Parameters
    ----------
    vector : str or any
        A string such as ``"[0.1, 0.2]"`` or ``"[0.1 0.2]"``; enclosing square
        brackets are optional and numbers may be separated by commas and/or
        whitespace. Non-string values are returned unchanged.

    Returns
    -------
    numpy.ndarray, float or the original value
        The parsed 1-D float array for a well-formed string, ``np.nan`` for a
        string that contains a non-numeric token, and the input object itself
        when it is not a string.
    """
    if not isinstance(vector, str):
        return vector

    # Split on commas and whitespace alike. np.fromstring(..., sep=', ') does
    # not raise on text it cannot read to the end, so malformed or
    # whitespace-separated strings came back as truncated arrays rather than
    # being marked invalid.
    tokens = vector.strip().strip('[]').replace(',', ' ').split()
    try:
        return np.array(tokens, dtype=float)
    except ValueError:
        # Mark unparsable strings as NaN so the caller can drop them.
        return np.nan

# Run every 'vector' entry through convert_vector, then discard the rows whose
# entry is missing (NaN) after that step
converted_vectors = df['vector'].apply(convert_vector)
df = df.assign(vector=converted_vectors).dropna(subset=['vector'])

# Summarise the length of each remaining vector
vector_lengths = df['vector'].apply(len)
print(vector_lengths.describe())

# Length every vector is required to have (e.g., 300 for SpaCy's en_core_web_md)
expected_length = 300

# Keep only the rows whose vector has exactly the expected length
has_expected_length = vector_lengths == expected_length
df = df[has_expected_length]

# Feature matrix: one row per article, stacked from the per-row vectors
X = np.vstack(df['vector'].values)

# Labels from the 'type' column: 1 where the value is the string 'True', else 0
y = df['type'].eq('True').astype('int64').values

# Display the first few rows of the cleaned dataset
df.head()

count    44267.0
mean       300.0
std          0.0
min        300.0
25%        300.0
50%        300.0
75%        300.0
max        300.0
Name: vector, dtype: float64


Unnamed: 0,text,type,processed_Text,vector
0,WASHINGTON (Reuters) - The head of a conservat...,True,washington reuter head conservative republic...,"[-0.81997824, 0.5995497, -2.5212972, 0.6614114..."
1,WASHINGTON (Reuters) - Transgender people will...,True,washington reuters transgender people allow ...,"[-0.68455237, 0.76636398, -1.6749181, 1.137119..."
2,WASHINGTON (Reuters) - The special counsel inv...,True,washington reuter special counsel investigat...,"[0.108025201, 0.61971724, -1.7575599, 1.349151..."
3,WASHINGTON (Reuters) - Trump campaign adviser ...,True,washington reuters trump campaign adviser ge...,"[-0.390371442, -0.206498355, -1.27466476, 0.79..."
4,SEATTLE/WASHINGTON (Reuters) - President Donal...,True,seattlewashington reuters president donald t...,"[-0.648751736, -0.465459138, -1.50599909, 1.10..."


In [None]:
# Hold out 20% of the samples for testing; the fixed random_state makes the
# split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit a linear-kernel support vector classifier on the training portion
svm_classifier = SVC(kernel='linear').fit(X_train, y_train)

# Predict labels for the held-out samples
y_pred = svm_classifier.predict(X_test)

# Score the predictions against the true test labels
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

# Print one line per metric, followed by the confusion matrix
report_lines = (
    f'Accuracy: {accuracy}',
    f'Precision: {precision}',
    f'Recall: {recall}',
    f'F1 Score: {f1}',
    f'Confusion Matrix:\n{conf_matrix}',
)
for report_line in report_lines:
    print(report_line)

# Persist the fitted classifier to Google Drive
model_path = '/content/drive/MyDrive/ML project/svm_classifier.pkl'
joblib.dump(svm_classifier, model_path)


Accuracy: 0.971086514569686
Precision: 0.9640238313473877
Recall: 0.9770088248954947
F1 Score: 0.9704728950403692
Confusion Matrix:
[[4391  157]
 [  99 4207]]


['/content/drive/MyDrive/ML project/svm_classifier.pkl']

In [None]:
# Load the spaCy 'en_core_web_md' pipeline; it is handed to check_news below
# to turn the entered text into a document vector
nlp = spacy.load('en_core_web_md')

# Function to preprocess and vectorize a new article
def preprocess_and_vectorize(news, nlp):
    """Return the vector the given pipeline assigns to ``news``.

    Parameters
    ----------
    news : str
        Text of the article to vectorize.
    nlp : callable
        Pipeline object; it is called with ``news`` and the ``vector``
        attribute of the result is returned.

    Returns
    -------
    The ``vector`` attribute of ``nlp(news)``.

    NOTE(review): despite the name, no cleaning step is applied before the
    text is vectorized. The training data also carries a ``processed_Text``
    column; if the stored training vectors were computed from that cleaned
    text, raw input here would not be comparable. TODO confirm how the
    training vectors were produced.
    """
    return nlp(news).vector

# Function to check whether a news article is real or fake
def check_news(news, model, nlp):
    """Predict the label of a single news text.

    Parameters
    ----------
    news : str
        Text of the article to classify.
    model : object
        Fitted estimator exposing ``predict``; it receives a one-element list
        containing the article's vector.
    nlp : callable
        Pipeline forwarded to ``preprocess_and_vectorize``.

    Returns
    -------
    The first (and only) element of the model's prediction for this article.
    """
    features = preprocess_and_vectorize(news, nlp)
    predictions = model.predict([features])
    return predictions[0]

# Ask the user for the article text
user_news = input("Enter the news article: ")

# Classify the entered text with the trained SVM and report the verdict;
# a truthy label is shown as real news, anything else as fake news
predicted_label = check_news(user_news, svm_classifier, nlp)
verdict = "Real News" if predicted_label else "Fake News"
print("Predicted Label:", verdict)


Enter the news article: 8 dead after bus carrying farm workers in Florida hit by truck, driver of truck charged with DUI
Predicted Label: Fake News
