In [1]:
# Mount Google Drive at /content/drive so later cells can read files stored there.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

# Read the training CSV from the mounted Drive folder into a DataFrame.
TRAIN_CSV_PATH = "/content/drive/My Drive/NLP PROJECT/Dataset/train.csv"
train_data = pd.read_csv(TRAIN_CSV_PATH)

# Clean text
# Compiled once instead of on every call to clean_text.
_EMOJI_PATTERN = re.compile("["
                            u"\U0001F600-\U0001F64F"  # Emojis
                            u"\U00002702-\U000027B0"
                            u"\U0001F680-\U0001F6FF"
                            u"\U0001F300-\U0001F5FF"
                            u"\U0001F1E0-\U0001F1FF"
                            u"\U000024C2-\U0001F251"  # very broad range; kept as in the original
                            "]+")

# Lazily-filled cache holding the stopword set and a shared lemmatizer.
_NLP_RESOURCES = {}


def _get_nlp_resources():
    """Return ``(stop_words, lemmatizer)``, fetching NLTK data only on first use.

    The NLTK corpora are downloaded (quietly) a single time per kernel
    session rather than once per row, and the English stopword list is
    stored as a ``frozenset`` so membership tests are O(1).
    """
    if not _NLP_RESOURCES:
        nltk.download('stopwords', quiet=True)
        nltk.download('wordnet', quiet=True)
        _NLP_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _NLP_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _NLP_RESOURCES['stop_words'], _NLP_RESOURCES['lemmatizer']


def clean_text(text):
    """Normalize one raw text value into a space-joined string of lemmas.

    Steps, in order: strip HTML tags, strip emojis, replace @handles with a
    space, strip URLs, collapse every run of non-ASCII-letter characters to
    a single space, lowercase, drop English stopwords, lemmatize each
    remaining token with WordNet, and join the tokens with single spaces.

    Args:
        text: The raw text. ``None`` and NaN (as produced by pandas for
            missing cells) are treated as empty text; any other non-string
            value is converted with ``str()``.

    Returns:
        str: The cleaned text; ``''`` when nothing survives cleaning.
    """
    # Missing cells would otherwise make re.sub raise TypeError.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = str(text)

    text = re.sub(r'<.*?>', '', text)  # Remove HTML tags
    text = _EMOJI_PATTERN.sub(r'', text)  # Remove emojis
    text = re.sub(r'@\S+', ' ', text)  # Remove Twitter handles
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'[^a-zA-Z]+', ' ', text)  # Remove non-alphabet characters
    text = text.lower()  # Convert to lowercase

    stop_words, lemmatizer = _get_nlp_resources()
    # Filter stopwords first, then lemmatize, matching the original order.
    tokens = [
        lemmatizer.lemmatize(word)
        for word in text.split()
        if word not in stop_words
    ]
    return ' '.join(tokens)

# --- Preprocess: add the cleaned-text column, then keep only model inputs ---
train_data['clean text'] = train_data['text'].apply(clean_text)
df_train = train_data.drop(columns=['id', 'keyword', 'location', 'text'])

# --- Hold out 20% of the rows for evaluation (fixed seed for repeatability) ---
X, y = df_train['clean text'], df_train['target']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# --- Bag-of-words features: vocabulary is learned from the training split only ---
count_vectorizer = CountVectorizer(max_features=5000)
count_train = count_vectorizer.fit_transform(X_train)
count_test = count_vectorizer.transform(X_test)

# --- Multinomial Naive Bayes; fit() returns the fitted estimator itself ---
nb_classifier = MultinomialNB().fit(count_train, y_train)

# --- Score the held-out split ---
predictions = nb_classifier.predict(count_test)
conf_mat = confusion_matrix(y_test, predictions)
accuracy = accuracy_score(y_test, predictions)
report = classification_report(y_test, predictions)

# --- Print each metric under its heading ---
for heading, value in (
    ("Confusion Matrix:\n", conf_mat),
    ("\nAccuracy:", accuracy),
    ("\nReport:\n", report),
):
    print(heading, value)



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!

Confusion Matrix:
 [[747 127]
 [185 464]]

Accuracy: 0.7951411687458962

Report:
               precision    recall  f1-score   support

           0       0.80      0.85      0.83       874
           1       0.79      0.71      0.75       649

    accuracy                           0.80      1523
   macro avg       0.79      0.78      0.79      1523
weighted avg       0.79      0.80      0.79      1523

