In [None]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim.models import Word2Vec
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

# Read the training set from its CSV file into a DataFrame
TRAIN_CSV_PATH = "/content/drive/My Drive/NLP PROJECT/Dataset/train.csv"
train_data = pd.read_csv(TRAIN_CSV_PATH)

# Clean text
# Emoji / pictograph character class, compiled once instead of on every call.
_EMOJI_PATTERN = re.compile("["
                            u"\U0001F600-\U0001F64F"
                            u"\U00002702-\U000027B0"
                            u"\U0001F680-\U0001F6FF"
                            u"\U0001F300-\U0001F5FF"
                            u"\U0001F1E0-\U0001F1FF"
                            u"\U000024C2-\U0001F251"
                            "]+")

# Lazily filled cache for the NLTK stopword set and the lemmatizer, so the
# corpora are downloaded and loaded once rather than once per cleaned text.
_NLP_RESOURCES = {}


def _get_nlp_resources():
    """Return ``(stop_words, lemmatizer)``, preparing them on first use.

    The first call downloads the NLTK ``stopwords`` and ``wordnet`` corpora,
    builds a frozenset of English stopwords (constant-time membership tests)
    and creates a single shared ``WordNetLemmatizer``.
    """
    if not _NLP_RESOURCES:
        nltk.download('stopwords')
        nltk.download('wordnet')
        stop_words = frozenset(stopwords.words('english'))
        lemmatizer = WordNetLemmatizer()
        # Populate the cache only after everything loaded successfully, so a
        # failure above leaves it empty and the next call retries cleanly.
        _NLP_RESOURCES.update(stop_words=stop_words, lemmatizer=lemmatizer)
    return _NLP_RESOURCES['stop_words'], _NLP_RESOURCES['lemmatizer']


def clean_text(text):
    """Normalize a raw text string into space-separated lemmatized tokens.

    Steps, in order: strip ``<...>`` tags, remove emoji, replace ``@mentions``
    with a space, remove ``http...`` URLs, collapse every run of non-letters
    into a single space, lowercase, drop English stopwords, and lemmatize
    each remaining word with WordNet.

    Args:
        text: The raw text. Non-string values (e.g. ``None`` or a NaN cell
            from a DataFrame) are treated as empty text.

    Returns:
        The cleaned text as a single string of tokens joined by spaces;
        an empty string when nothing survives cleaning.
    """
    if not isinstance(text, str):
        # Missing values would otherwise crash inside re.sub with a TypeError.
        return ''

    text = re.sub(r'<.*?>', '', text)
    text = _EMOJI_PATTERN.sub(r'', text)
    text = re.sub(r'@\S+', ' ', text)
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'[^a-zA-Z]+', ' ', text)

    stop_words, lemmatizer = _get_nlp_resources()
    # Stopwords are filtered before lemmatization, matching the lowercase
    # forms in the NLTK stopword list.
    tokens = [
        lemmatizer.lemmatize(word)
        for word in text.lower().split()
        if word not in stop_words
    ]
    return ' '.join(tokens)

# Add a cleaned copy of the 'text' column
cleaned_column = train_data['text'].apply(clean_text)
train_data['clean text'] = cleaned_column

# Keep only the columns that were not dropped (cleaned text and target)
df_train = train_data.drop(['id', 'keyword', 'location', 'text'], axis=1)

# Features / labels, then an 80/20 train-test split with a fixed seed
X, y = df_train['clean text'], df_train['target']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Split every cleaned sentence on whitespace to get token lists
train_sentences = list(map(str.split, X_train))
test_sentences = list(map(str.split, X_test))

# Fit Word2Vec on the training tokens only
w2v_params = dict(vector_size=100, window=5, min_count=1, workers=4)
word2vec_model = Word2Vec(sentences=train_sentences, **w2v_params)

# Function to get sentence vectors
def get_sentence_vector(sentence, model, size):
    """Average the embeddings of a sentence's in-vocabulary words.

    Args:
        sentence: Iterable of word tokens.
        model: Object whose ``wv`` attribute supports ``word in model.wv``
            and ``model.wv[word]`` (e.g. a trained Word2Vec model).
        size: Length of each word vector.

    Returns:
        A float array of shape ``(1, size)`` holding the mean of the vectors
        of the words found in ``model.wv``; all zeros if none is found.
    """
    total = np.zeros((1, size))
    known_words = [token for token in sentence if token in model.wv]
    for token in known_words:
        total += model.wv[token].reshape((1, size))
    if known_words:
        # Only divide when at least one word contributed, to avoid 0/0.
        total /= len(known_words)
    return total

# Turn every tokenized sentence into one averaged embedding row
word2vec_size = 100
train_rows = [
    get_sentence_vector(tokens, word2vec_model, word2vec_size)
    for tokens in train_sentences
]
test_rows = [
    get_sentence_vector(tokens, word2vec_model, word2vec_size)
    for tokens in test_sentences
]
# Stack the (1, size) rows into one 2-D matrix per split
train_vectors = np.concatenate(train_rows, axis=0)
test_vectors = np.concatenate(test_rows, axis=0)

# Initialize and train classifier
nb_classifier = MultinomialNB()
# Note: MultinomialNB requires non-negative input values, but Word2Vec vectors
# can contain negative values. As a workaround, rescale every feature to
# [0, 1] with a MinMaxScaler fitted on the training vectors only.
from sklearn.preprocessing import MinMaxScaler

# clip=True keeps held-out values that fall outside the range seen during
# fitting inside [0, 1]; without it, transformed test features could still
# come out negative and defeat the purpose of the scaling.
scaler = MinMaxScaler(clip=True)
train_vectors = scaler.fit_transform(train_vectors)
test_vectors = scaler.transform(test_vectors)

nb_classifier.fit(train_vectors, y_train)

# Predict on the held-out vectors and compute the evaluation metrics
predictions = nb_classifier.predict(test_vectors)
accuracy = accuracy_score(y_test, predictions)
conf_mat = confusion_matrix(y_test, predictions)
report = classification_report(y_test, predictions)

# Print each metric under its heading
for heading, value in (
    ("Confusion Matrix:\n", conf_mat),
    ("\nAccuracy:", accuracy),
    ("\nReport:\n", report),
):
    print(heading, value)

