# Fake News

## About the data

### 1.id: unique id for a news article
### 2.title: the title of the news article
### 3.author: author of the news article
### 4.text: the text of the article; could be incomplete
### 5.label: a label that marks whether the news article is real or fake

#### 1 => fake news
#### 0 => real news

## Importing libraries

In [None]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier , GradientBoostingClassifier
from sklearn.model_selection import train_test_split , GridSearchCV
from sklearn.metrics import accuracy_score , ConfusionMatrixDisplay , classification_report , roc_curve

In [None]:
# Printing the stopwords in English
print(stopwords.words('english'))

## Data Preparation

In [None]:
# load the data
df = pd.read_csv('train.csv')
print(f"The shape of the dataset is: {df.shape}")
df.head()

In [None]:
df.info()

In [None]:
df.isna().sum()

## Data Preprocessing

### Handling null values

In [None]:
# Replace every missing value with a single space so that the string
# concatenation of `title` and `author` in the next cell never yields NaN.
# NOTE(review): this fills *all* columns, not only the text ones — confirm
# that `id` and `label` contain no missing values.
df.fillna(" ", inplace= True)

In [None]:
# Build the model input from the headline and the author name only.
# NOTE(review): the `text` column is never used — confirm this is intentional.
df['content'] = df['title'] + " " + df['author']

In [None]:
df.head()

## Stemming

### Stemming is the process of reducing a word to its base or root form by stripping affixes; the Porter stemmer used below removes suffixes (e.g. "running" → "run")

In [None]:
port_stem = PorterStemmer()

In [None]:
from functools import lru_cache


@lru_cache(maxsize=1)
def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loaded only once."""
    return frozenset(stopwords.words('english'))


def stemming(content):
    """Normalise a text string into space-separated Porter stems.

    Steps: replace every non-alphabetic character with a space, lower-case
    the result, split on whitespace, drop English stop words, stem each
    remaining word with the shared ``port_stem`` instance, and join the stems
    back together with single spaces.

    Args:
        content: The raw text to normalise.

    Returns:
        A single string of stemmed, lower-case words separated by spaces
        (empty if no word survives the filtering).
    """
    # Replace any non-alphabetic character with a space.
    letters_only = re.sub(r'[^a-zA-Z]', ' ', content)
    # The stop-word list is fetched once and kept as a set: the previous
    # version re-evaluated stopwords.words('english') for every single word
    # and then did a linear list search on it.
    stop_words = _english_stop_words()
    stems = [
        port_stem.stem(word)
        for word in letters_only.lower().split()
        if word not in stop_words
    ]
    return " ".join(stems)

In [None]:
df['content']= df['content'].apply(stemming)

In [None]:
df['content']

## Converting the textual data to numerical data

In [None]:
# Turn the cleaned text into TF-IDF features in two steps:
#   1) count unigrams and bigrams, 2) re-weight the counts with TF-IDF.
# NOTE(review): both steps are fitted on the full dataset, before the
# train/test split performed further down, so the vocabulary and IDF
# statistics also reflect the test rows — consider fitting on the training
# portion only and merely transforming the test portion.
transformer = TfidfTransformer(smooth_idf=False)
count_vectorizer = CountVectorizer(ngram_range=(1, 2))
counts = count_vectorizer.fit_transform(df['content'].values)
tfidf = transformer.fit_transform(counts)

## Split data into train & test data

In [None]:
targets = df['label'].values

In [None]:
print(f"target shape: {targets.shape}")
print(f"X shape: {tfidf.shape}")

In [None]:
X_train, X_test, y_train, y_test = train_test_split(tfidf, targets, test_size=0.2, random_state=49)

In [None]:
# Report the full shape of each partition; the messages announce a "shape",
# so print the whole tuple rather than only the row count.
print(f"The shape of X_train is: {X_train.shape}")
print(f"The shape of X_test is: {X_test.shape}")

## Iterate

In [None]:
def train(model , model_name):
    """Fit ``model`` on the training split and print its train/test scores.

    Args:
        model: An estimator exposing ``fit`` and ``score``.
        model_name: Label used in the printed messages.
    """
    model.fit(X_train, y_train)
    # Score the training split first, then the test split, printing each
    # result as soon as it is available.
    for split_label, features, labels in (
        ("Training", X_train, y_train),
        ("testing", X_test, y_test),
    ):
        print(f"{split_label} accuracy of {model_name} is {model.score(features, labels)}")
def conf_matrix(model):
    """Draw the confusion matrix of the fitted ``model`` on the test split."""
    ConfusionMatrixDisplay.from_estimator(estimator=model, X=X_test, y=y_test)
def class_report(model):
    """Print the classification report of the fitted ``model`` on the test split."""
    predictions = model.predict(X_test)
    report = classification_report(y_test, predictions)
    print(report)

## LogisticRegression

In [None]:
model_lr = LogisticRegression()

In [None]:
train(model_lr, 'LogisticRegression')

In [None]:
conf_matrix(model_lr)

In [None]:
class_report(model_lr)

## SVM

In [None]:
svc_model= SVC()

In [None]:
train(svc_model, 'SVM')

In [None]:
conf_matrix(svc_model)

In [None]:
class_report(svc_model)

## DecisionTreeClassifier

In [None]:
# Sweep the tree depth (50, 52, ..., 70) and record the score on both splits
# for every candidate value.
# NOTE(review): the final depth appears to be picked from these test-set
# scores — consider a separate validation split or cross-validation.
depth_num = range(50, 71, 2)
training_acc, testing_acc = [], []
for depth in depth_num:
    tree_model = DecisionTreeClassifier(max_depth=depth, random_state=42).fit(X_train, y_train)
    training_acc.append(tree_model.score(X_train, y_train))
    testing_acc.append(tree_model.score(X_test, y_test))

In [None]:
print("Training Accuracy Scores:", training_acc[:3])
print("testing Accuracy Scores:", testing_acc[:3])

In [None]:
# Plot training vs. testing score against the candidate max_depth values
# collected by the sweep above.
plt.plot(depth_num , training_acc , label= 'Training')
plt.plot(depth_num , testing_acc , label= 'Testing')
plt.xlabel('Max_Depth')
plt.ylabel('Accuracy_score')
plt.legend();

In [None]:
# final model
# Refit a single tree at the chosen depth; `tree_testing_acc` is reused by
# the model-comparison table further down.
tree_final_model = DecisionTreeClassifier(max_depth=58, random_state=42)
tree_final_model.fit(X_train, y_train)
tree_training_acc = tree_final_model.score(X_train, y_train)
tree_testing_acc = tree_final_model.score(X_test, y_test)
# Messages previously misspelled the class name as "DesicionTreeClassifier".
print(f"Training accuracy of DecisionTreeClassifier is {tree_training_acc}")
print(f"testing accuracy of DecisionTreeClassifier is {tree_testing_acc}")

In [None]:
conf_matrix(tree_final_model)

In [None]:
class_report(tree_final_model)

## RandomForestClassifier

In [None]:
# Base estimator and hyper-parameter grid for the random-forest search.
clf= RandomForestClassifier(random_state=42)
params={
    "n_estimators": range(50,125,25),  # 50, 75, 100
    "max_depth": range(60,81,2)  # 60, 62, ..., 80
}
# Bare expression so the notebook displays the grid.
params

In [None]:
# Grid search over every combination in `params` for the random forest.
rfc_model = GridSearchCV(
    clf,
    param_grid= params,
    cv= 5,  # 5-fold cross-validation
    n_jobs= -1,  # scikit-learn convention: -1 uses all available processors
    verbose=1
)

In [None]:
rfc_model.fit(X_train,y_train)

In [None]:
cv_results= pd.DataFrame(rfc_model.cv_results_)
cv_results.sort_values('rank_test_score').head(10)

In [None]:
rfc_model.best_params_

In [None]:
rfc_model.predict(X_test)

In [None]:
# Score the fitted grid search on both splits; `acc_test` is reused by the
# model-comparison table further down.
acc_train = rfc_model.score(X_train , y_train)
acc_test = rfc_model.score(X_test , y_test)

# Scores are rounded to four decimals for display only.
print(f"Training accuracy: {round(acc_train , 4)}")
print(f"test accuracy: {round(acc_test , 4)}")

In [None]:
conf_matrix(rfc_model)

In [None]:
class_report(rfc_model)

In [None]:
# Collect the test-split score of every model in one table for comparison.
# The first label previously read "Logestic Regression" (typo), which also
# ended up on the bar chart built from this frame.
models = pd.DataFrame({
    "Models": ["Logistic Regression", "SVM", "DecisionTreeClassifier", "RandomForestClassifier"],
    "Score": [
        model_lr.score(X_test, y_test),
        svc_model.score(X_test, y_test),
        tree_testing_acc,
        acc_test,
    ],
})
# Bare expression so the notebook displays the ranking, best score first.
models.sort_values(by="Score", ascending=False)

In [None]:
# Bar chart comparing the score of each model in the `models` table,
# one colour per bar.
sns.set_style('whitegrid')
plt.figure(figsize=(10, 5))
colors = ['orange', 'blue', 'red', 'green']
sns.barplot(data=models, x='Models', y='Score', palette=colors)
plt.xlabel("Models")
plt.ylabel("Score")
plt.title("Model Selection")
plt.show()

## DecisionTreeClassifier got the highest accuracy

## Thanks!