# Importing Modules 

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, classification_report,roc_curve
from imblearn.over_sampling import SMOTE
from sklearn.metrics import confusion_matrix
import seaborn as sns

: 

# Reading Dataset 

In [None]:
# Load the raw tweet dataset from the CSV file next to the notebook.
dataset_path = 'political_tweets_2019_2023.csv'
df = pd.read_csv(dataset_path)
# Bare expression at the end of the cell: a notebook shows the frame as output.
df

# Dataset Analysis & Preprocessing

In [None]:
# Count the missing values in every column and print the per-column totals.
missing_per_column = df.isnull().sum()
print(missing_per_column)

In [None]:
# Show how many rows carry each value of the target column.
label_counts = df['tweet_fake_or_real'].value_counts()
print(label_counts)

In [None]:
# Remove the metadata columns; the cells shown below only read the 'tweet'
# text and the 'tweet_fake_or_real' label.
unused_columns = ["tweet_id", "tweet_posted_date", "user_name", "likes"]
df = df.drop(columns=unused_columns)

In [None]:
# Bare expression: a notebook displays the frame so the effect of the column
# drop can be inspected.
df

# Pie Chart

In [None]:
# Pie chart of the class balance between fake and real tweets.
# The label column is counted once; the False entry is used as the fake-tweet
# count and the True entry as the real-tweet count.
class_counts = df['tweet_fake_or_real'].value_counts()
fake_count = class_counts[False]
real_count = class_counts[True]

labels = ['Fake Tweets', 'Real Tweets']
sizes = [fake_count, real_count]
colors = ['lightcoral', 'lightskyblue']
explode = (0.1, 0)  # offset the first ("fake") wedge to make it stand out

plt.figure(figsize=(4, 4))
# The percentage-label keyword is `autopct`; the previous spelling `actopct`
# is not a parameter of plt.pie and made the call raise a TypeError.
plt.pie(
    sizes,
    explode=explode,
    labels=labels,
    colors=colors,
    autopct='%1.1f%%',
    shadow=True,
    startangle=140,
)
plt.title('Distribution of Fake vs. Real Tweets')
plt.axis('equal')  # Equal aspect ratio ensures the pie chart is circular.
plt.show()

# Bar Graph

In [None]:
# Bar chart of the same class balance: one bar per tweet type.
class_counts = df['tweet_fake_or_real'].value_counts()
fake_count, real_count = class_counts[False], class_counts[True]

categories = ['Fake Tweets', 'Real Tweets']
counts = [fake_count, real_count]
colors = ['lightcoral', 'lightskyblue']

plt.figure(figsize=(3, 3))
plt.bar(categories, counts, color=colors)
plt.title('Distribution of Fake vs. Real Tweets')
plt.xlabel('Tweet Type')
plt.ylabel('Count')
plt.show()

# Cleaning the Dataset with NLP

Applying stopword removal and `word_tokenize` tokenization.

In [None]:
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer

import nltk
# Download the NLTK resources used by clean_text(): the tokenizer models for
# word_tokenize and the stop-word lists.
# Newer NLTK releases load the 'punkt_tab' tables instead of the pickled
# 'punkt' models, so both are fetched to keep word_tokenize working across
# versions.
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)

from functools import lru_cache


@lru_cache(maxsize=1)
def _english_stop_words():
    """Return the NLTK English stop-word set, built only on the first call."""
    return frozenset(stopwords.words('english'))


def clean_text(text):
    """Normalize one tweet into a space-separated string of content words.

    Every character other than ASCII letters and whitespace is removed, the
    text is lower-cased, split with ``word_tokenize``, and English stop words
    are dropped.

    Args:
        text: The raw tweet. Anything that is not a ``str`` (for example the
            NaN/None of a missing cell) is treated as an empty tweet instead
            of raising inside ``re.sub``.

    Returns:
        The remaining tokens joined by single spaces; ``''`` when nothing is
        left or the input was not a string.
    """
    if not isinstance(text, str):
        return ''
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = text.lower()
    # Cached: the stop-word list was previously re-read for every single row.
    stop_words = _english_stop_words()
    tokens = word_tokenize(text)
    return ' '.join(word for word in tokens if word not in stop_words)


In [None]:
# Run clean_text over every tweet and keep the result in a new column.
raw_tweets = df['tweet']
df['cleaned_tweet'] = raw_tweets.apply(clean_text)

# Applying TfidfVectorizer

In [None]:
# Target vector: the label column cast to integers.
y = df['tweet_fake_or_real'].astype(int)

# Feature matrix: TF-IDF weights over unigrams and bigrams, limited to 5000
# features and converted to a dense array.
# NOTE(review): the vectorizer is fitted on every row before the train/test
# split below, so test rows contribute to the vocabulary and IDF weights --
# consider fitting on the training split only.
vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=5000)
X = vectorizer.fit_transform(df['cleaned_tweet']).toarray()

In [None]:
# Bare expression: a notebook displays the integer label vector for a quick check.
y

# Splitting Dataset

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

### Applying SMOTE to balance the dataset

In [None]:
# Oversample the training split only; X_test / y_test are left untouched.
smote = SMOTE(random_state=42)
resampled = smote.fit_resample(X_train, y_train)
X_train_res, y_train_res = resampled

# Applying Machine Learning Algorithm

#### LogisticRegression

In [None]:
from sklearn.linear_model import LogisticRegression

# Baseline classifier with default hyper-parameters, trained on the resampled
# training data.
model = LogisticRegression().fit(X_train_res, y_train_res)

# Score the baseline on the held-out test split.
y_pred = model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)

print("Accuracy:", test_accuracy)

# Classification Report

In [None]:
# Per-class metrics for the baseline predictions on the test split.
report = classification_report(y_test, y_pred)
print("Classification Report:\n", report)

In [None]:
from sklearn.model_selection import GridSearchCV

# Search space: regularization strength and penalty type, all with the
# liblinear solver.
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear'],
}

# 2-fold cross-validated search scored by accuracy.
# NOTE(review): the folds are cut from data that was already oversampled, so
# the reported score may be optimistic -- confirm this is intended.
grid_search = GridSearchCV(
    estimator=LogisticRegression(),
    param_grid=param_grid,
    scoring='accuracy',
    cv=2,
)
grid_search.fit(X_train_res, y_train_res)

# Best Accuracy

In [None]:
# Cross-validated accuracy of the best parameter combination found by the
# search (not a test-set score).
best_cv_accuracy = grid_search.best_score_
print("Best Accuracy:", best_cv_accuracy)

# Confusion Matrix

In [None]:
# Heatmap of the confusion matrix for the baseline predictions (y_pred) on
# the test split, with integer counts written in each cell.
cm = confusion_matrix(y_test, y_pred)
class_names = ['Fake', 'Real']
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

# Model Save

In [None]:
# import joblib
# joblib.dump(model, 'fake_tweet_classifier.pkl')
# joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')