In [1]:
# PERFROM LOGISTICS REGRESSION TO PREDICT WHICH TWEETS ARE ABOUT  THE REAL  DISASTERS AND WHICH ONES ARE NOT 

import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import re

# DATA
train_path = r"C:\Users\nh013\Desktop\NLP PROCESSING WITH DISESTER TWEETS\train.csv"
test_path = r"C:\Users\nh013\Desktop\NLP PROCESSING WITH DISESTER TWEETS\test.csv"
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)


# FUNCTION FOR TEXT PREPROCESSING
def preprocess_text(text):
    if isinstance(text, str):
        
     
        text = re.sub(r"http\S+|www\S+|https\S+", "", text)
        
       
        text = text.lower()
        
       
        tokens = word_tokenize(text)
        
       
        stop_words = set(stopwords.words("english"))
        filtered_tokens = [token for token in tokens if token not in stop_words]
        
       
        stemmer = SnowballStemmer("english")
        stemmed_tokens = [stemmer.stem(token) for token in filtered_tokens]
        
      
        lemmatizer = WordNetLemmatizer()
        lemmatized_tokens = [lemmatizer.lemmatize(token) for token in stemmed_tokens]
        
       
        # JOIN THE TOKEN BACK INTO TEXT
        preprocessed_text = " ".join(lemmatized_tokens)
    
        return preprocessed_text
    else:
        return text

# TEXT COLUMN TO PREPROCESS 
text_columns = [
    "id",
    "keyword",
    "location",
    "text"
]

# TRAIN DATASET PREPROCESSING
for column in text_columns:
    train_df[column] = train_df[column].apply(preprocess_text)

# TEST DATASET PREPROCESSING
for column in text_columns[:-1]:
    test_df[column] = test_df[column].apply(preprocess_text)

# TRAINING DATA 
X = train_df['text']
y = train_df['target']

# SPLIT THE DATA INTO TRAINING AND VALIDATION SETS
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# TEXT VECTORIZATIONS
vectorizer = TfidfVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_val_vectorized = vectorizer.transform(X_val)

# TRAIN MODEL
model = LogisticRegression()
model.fit(X_train_vectorized, y_train)

# PREDICTION ON VALIDATION SET
val_predictions = model.predict(X_val_vectorized)

# CALCULATE ACCURACY ON VALIDATITY SETS
accuracy = accuracy_score(y_val, val_predictions)
print("Validation Accuracy:", accuracy)

# PREDICTION ON THE TEST SET
test_predictions = model.predict(vectorizer.transform(test_df['text']))

#MAKE SUBMISSION
test_df['target'] = test_predictions


test_df[['id', 'target']].to_csv('submission.csv', index=False)


Validation Accuracy: 0.7905449770190414


In [2]:
#perform naive bays model.....TO PREDICT WHICH TWEETS ARE ABOUT  THE REAL  DISASTERS AND WHICH ONES ARE NOT 


import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
import re

# DATA
train_path = r"C:\Users\nh013\Desktop\NLP PROCESSING WITH DISESTER TWEETS\train.csv"
test_path = r"C:\Users\nh013\Desktop\NLP PROCESSING WITH DISESTER TWEETS\test.csv"

train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)


# FUNCTION FOR TEXT PREPROCESSING
def preprocess_text(text):
    if isinstance(text, str):
        
       
        text = re.sub(r"http\S+|www\S+|https\S+", "", text)
        
        
        text = text.lower()
        
       
        tokens = word_tokenize(text)
        
       
        stop_words = set(stopwords.words("english"))
        filtered_tokens = [token for token in tokens if token not in stop_words]
        
       
        stemmer = SnowballStemmer("english")
        stemmed_tokens = [stemmer.stem(token) for token in filtered_tokens]
        
       
        lemmatizer = WordNetLemmatizer()
        lemmatized_tokens = [lemmatizer.lemmatize(token) for token in stemmed_tokens]
        
        # JOIN THE TOKENS BACK TO ORGINAL TEXT
        preprocessed_text = " ".join(lemmatized_tokens)
    
        return preprocessed_text
    else:
        return str(text)

# TEXT COLUMN TO PREPROCESS
text_columns = [
    "id",
    "keyword",
    "location",
    "text"
]

# TRAIN DATASET PREPROCESSING
for column in text_columns:
    train_df[column] = train_df[column].apply(preprocess_text)

#TEST DATASET PREPROCESSING
for column in text_columns[:-1]:
    test_df[column] = test_df[column].apply(preprocess_text)

# COMBINE PREPROCESSED TEXT COLUMN INTO A SINGLE COLUMN
train_df['preprocessed_text'] = train_df[text_columns[:-1]].apply(lambda x: ' '.join(x), axis=1)
test_df['preprocessed_text'] = test_df[text_columns[:-1]].apply(lambda x: ' '.join(x), axis=1)

# TEXT VECTORIZATION
vectorizer = TfidfVectorizer()
train_vectorized_text = vectorizer.fit_transform(train_df['preprocessed_text'])
test_vectorized_text = vectorizer.transform(test_df['preprocessed_text'])

# GET FEATURE NAMES
feature_names = vectorizer.get_feature_names_out()

# CONVERT THE VECTORIZED TEXT TO DATAFRAME
train_vectorized_df = pd.DataFrame(train_vectorized_text.toarray(), columns=feature_names)
test_vectorized_df = pd.DataFrame(test_vectorized_text.toarray(), columns=feature_names)

# DROP NON NUMERIC COLUMN  FROM train_df
train_df = train_df.select_dtypes(include='number')


# CONCATENATE THE VECTORIZED DATAFRAME  WITH THE ORGINAL DATAFRAME
train_df = pd.concat([train_df, train_vectorized_df], axis=1)
test_df = pd.concat([test_df, test_vectorized_df], axis=1)

# LET'S SPLIT FEATURE AND TARGET 
X_train = train_df.drop(["target"], axis=1)
y_train = train_df["target"]

# TRAIN MODEL
naive_bayes = MultinomialNB()
naive_bayes.fit(X_train, y_train)

# DROP NON NUMERIC COLUMN FROM test_df
test_df = test_df.select_dtypes(include='number')

# PREDICTION ON THE TEST SET
test_predictions = naive_bayes.predict(test_df)

# EVALUATE
print("Test Predictions:")
print(test_predictions)


Test Predictions:
[0 0 0 ... 0 0 0]
