In [None]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim.models import Word2Vec
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

# Read the training CSV from the mounted Google Drive folder into a DataFrame.
train_data = pd.read_csv(
    filepath_or_buffer="/content/drive/My Drive/NLP PROJECT/Dataset/train.csv",
)

# Clean text
# Patterns are compiled once at definition time instead of on every call.
_HTML_TAG_RE = re.compile(r'<.*?>')
# NOTE(review): the last range (U+24C2..U+1F251) is very broad and numerically
# contains the other BMP/SMP ranges as well as most non-Latin scripts; it is
# kept exactly as in the original so the cleaned output does not change.
_EMOJI_RE = re.compile("["
                       u"\U0001F600-\U0001F64F"
                       u"\U00002702-\U000027B0"
                       u"\U0001F680-\U0001F6FF"
                       u"\U0001F300-\U0001F5FF"
                       u"\U0001F1E0-\U0001F1FF"
                       u"\U000024C2-\U0001F251"
                       "]+")
_MENTION_RE = re.compile(r'@\S+')
_URL_RE = re.compile(r'http\S+')
_NON_ALPHA_RE = re.compile(r'[^a-zA-Z]+')

# Filled on first use by _get_text_resources(); holds the stopword set and
# the shared lemmatizer so they are built once rather than once per token.
_TEXT_RESOURCES = {}


def _get_text_resources():
    """Return ``(stop_words, lemmatizer)``, fetching the NLTK corpora at most once."""
    if not _TEXT_RESOURCES:
        # quiet=True: the downloader otherwise prints status lines on every
        # invocation, which previously flooded the notebook output.
        nltk.download('stopwords', quiet=True)
        nltk.download('wordnet', quiet=True)
        # A frozenset gives O(1) membership tests; the original re-read the
        # stopword list from the corpus for every single token.
        _TEXT_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _TEXT_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _TEXT_RESOURCES['stop_words'], _TEXT_RESOURCES['lemmatizer']


def clean_text(text):
    """Normalize one raw text string into space-separated lemmatized tokens.

    Steps, in order:
      1. remove ``<...>`` tag-like spans,
      2. remove characters matched by the emoji pattern,
      3. replace ``@mention`` tokens with a space,
      4. remove tokens starting with ``http``,
      5. replace every run of non ASCII-letter characters with a space,
      6. lowercase,
      7. drop English stopwords and lemmatize the remaining words.

    Args:
        text: The raw string to clean. Must be a ``str``; other types make
            the regex substitution raise ``TypeError``, as before.

    Returns:
        The cleaned tokens joined by single spaces (``''`` if none remain).
    """
    text = _HTML_TAG_RE.sub('', text)
    text = _EMOJI_RE.sub(r'', text)
    text = _MENTION_RE.sub(' ', text)
    text = _URL_RE.sub('', text)
    text = _NON_ALPHA_RE.sub(' ', text)
    text = text.lower()

    stop_words, lemmatizer = _get_text_resources()
    tokens = [
        lemmatizer.lemmatize(word)
        for word in text.split()
        if word not in stop_words
    ]
    return ' '.join(tokens)

# Apply cleaning: add a normalized copy of every tweet in a new column.
train_data['clean text'] = train_data['text'].apply(clean_text)

# Drop unnecessary columns (the raw text is no longer needed once cleaned).
df_train = train_data.drop(columns=['id', 'keyword', 'location', 'text'])

# Split data: hold out 20% of the rows for evaluation, with a fixed seed.
X = df_train['clean text']
y = df_train['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Tokenize sentences: Word2Vec expects each document as a list of tokens.
train_sentences = [sentence.split() for sentence in X_train]
test_sentences = [sentence.split() for sentence in X_test]

# Train Word2Vec model on the training split only.
# The split and the classifier are seeded with 42, so the embedding is seeded
# the same way. gensim additionally requires a single worker thread for a
# deterministic run (multi-threaded training reorders updates); full
# reproducibility across interpreter launches also needs PYTHONHASHSEED set.
word2vec_model = Word2Vec(
    sentences=train_sentences,
    vector_size=100,
    window=5,
    min_count=1,
    workers=1,
    seed=42,
)

# Function to get sentence vectors
def get_sentence_vector(sentence, model, size):
    """Average the embeddings of the in-vocabulary tokens of ``sentence``.

    Args:
        sentence: Iterable of tokens.
        model: Object exposing ``wv``, which supports ``token in wv`` and
            ``wv[token]`` returning an array of ``size`` elements.
        size: Number of components of each embedding.

    Returns:
        A float64 array of shape ``(1, size)`` holding the mean of the
        embeddings found; all zeros when no token is in the vocabulary.
    """
    # Look up every known token first, keeping the original token order.
    found = [
        model.wv[token].reshape((1, size))
        for token in sentence
        if token in model.wv
    ]

    total = np.zeros((1, size))
    for embedding in found:
        total += embedding

    if not found:
        # Nothing to average: return the zero vector unchanged.
        return total

    total /= len(found)
    return total

# Vectorize data
# Read the embedding width from the trained model instead of repeating the
# literal used at training time; if the two ever diverged, the reshape inside
# get_sentence_vector would fail.
word2vec_size = word2vec_model.wv.vector_size
# Each sentence yields a (1, size) row; concatenating stacks them into an
# (n_sentences, size) feature matrix.
train_vectors = np.concatenate([get_sentence_vector(sentence, word2vec_model, word2vec_size) for sentence in train_sentences])
test_vectors = np.concatenate([get_sentence_vector(sentence, word2vec_model, word2vec_size) for sentence in test_sentences])

# Initialize and train classifier (seeded so the forest is reproducible).
rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(train_vectors, y_train)

# Make predictions and evaluate on the held-out split.
predictions = rf_classifier.predict(test_vectors)
conf_mat = confusion_matrix(y_test, predictions)
accuracy = accuracy_score(y_test, predictions)
report = classification_report(y_test, predictions)

print("Confusion Matrix:\n", conf_mat)
print("\nAccuracy:", accuracy)
print("\nReport:\n", report)


Streaming output truncated to the last 5000 lines.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!

Confusion Matrix:
 [[760 114]
 [331 318]]

Accuracy: 0.7078135259356533

Report:
               precision    recall  f1-score   support

           0       0.70      0.87      0.77       874
           1       0.74      0.49      0.59       649

    accuracy                           0.71      1523
   macro avg       0.72      0.68      0.68      1523
weighted avg       0.71      0.71      0.69      1523

