In [1]:
#preprocess step

import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
import re

# DATA
train_path = r"C:\Users\nh013\Desktop\NLP PROCESSING WITH DISESTER TWEETS\train.csv"
test_path = r"C:\Users\nh013\Desktop\NLP PROCESSING WITH DISESTER TWEETS\test.csv"
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)


# FUNCTION FOR TEXT PREPROCESSING
def preprocess_text(text):
    if isinstance(text, str):
        
        text = re.sub(r"http\S+|www\S+|https\S+", "", text)
        
        text = text.lower()
        
        tokens = word_tokenize(text)
        
        stop_words = set(stopwords.words("english"))
        filtered_tokens = [token for token in tokens if token not in stop_words]
        
        stemmer = SnowballStemmer("english")
        stemmed_tokens = [stemmer.stem(token) for token in filtered_tokens]
        
        lemmatizer = WordNetLemmatizer()
        lemmatized_tokens = [lemmatizer.lemmatize(token) for token in stemmed_tokens]
        
       
        # JOIN THE TOKEN BACK INTO TEXT
        preprocessed_text = " ".join(lemmatized_tokens)
    
        return preprocessed_text
    else:
        return text

# TEXT COLUMN TO PREPROCESS 
text_columns = [
    "id",
    "keyword",
    "location",
    "text"
]

# TRAIN DATASET PREPROCESSING
for column in text_columns:
    train_df[column] = train_df[column].apply(preprocess_text)

# TEST DATASET PREPROCESSING
for column in text_columns[:-1]:
    test_df[column] = test_df[column].apply(preprocess_text)


print("Train Dataset:")
print(train_df.head())

print("\nTest Dataset:")
print(test_df.head())


Train Dataset:
   id keyword location                                               text  \
0   1     NaN      NaN         deed reason # earthquak may allah forgiv u   
1   4     NaN      NaN             forest fire near la rong sask . canada   
2   5     NaN      NaN  resid ask shelter place ' notifi offic . evacu...   
3   6     NaN      NaN  13,000 peopl receiv # wildfir evacu order cali...   
4   7     NaN      NaN  got sent photo rubi # alaska smoke # wildfir p...   

   target  
0       1  
1       1  
2       1  
3       1  
4       1  

Test Dataset:
   id keyword location                                               text
0   0     NaN      NaN                 Just happened a terrible car crash
1   2     NaN      NaN  Heard about #earthquake is different cities, s...
2   3     NaN      NaN  there is a forest fire at spot pond, geese are...
3   9     NaN      NaN           Apocalypse lighting. #Spokane #wildfires
4  11     NaN      NaN      Typhoon Soudelor kills 28 in China an

In [None]:
#perform  text vectorization....TF-IDF

import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
import re

# DATA
train_path = r"C:\Users\nh013\Desktop\NLP PROCESSING WITH DISESTER TWEETS\train.csv"
test_path = r"C:\Users\nh013\Desktop\NLP PROCESSING WITH DISESTER TWEETS\test.csv"

train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)


# FUNCTION FOR TEXT PREPROCESSING
def preprocess_text(text):
    if isinstance(text, str):
        
       
        text = re.sub(r"http\S+|www\S+|https\S+", "", text)
        
      
        text = text.lower()
        
       
        tokens = word_tokenize(text)
        
       
        stop_words = set(stopwords.words("english"))
        filtered_tokens = [token for token in tokens if token not in stop_words]
        
       
        stemmer = SnowballStemmer("english")
        stemmed_tokens = [stemmer.stem(token) for token in filtered_tokens]
        
       
        lemmatizer = WordNetLemmatizer()
        lemmatized_tokens = [lemmatizer.lemmatize(token) for token in stemmed_tokens]
        
        # JOIN THE TOKENS BACK TO ORGINAL TEXT
        preprocessed_text = " ".join(lemmatized_tokens)
    
        return preprocessed_text
    else:
        return str(text)

# TEXT COLUMN TO PREPROCESS
text_columns = [
    "id",
    "keyword",
    "location",
    "text"
]

# TRAIN DATASET PREPROCESSING
for column in text_columns:
    train_df[column] = train_df[column].apply(preprocess_text)

#TEST DATASET PREPROCESSING
for column in text_columns[:-1]:
    test_df[column] = test_df[column].apply(preprocess_text)

# COMBINE PREPROCESSED TEXT COLUMN INTO A SINGLE COLUMN
train_df['preprocessed_text'] = train_df[text_columns[:-1]].apply(lambda x: ' '.join(x), axis=1)
test_df['preprocessed_text'] = test_df[text_columns[:-1]].apply(lambda x: ' '.join(x), axis=1)

# TEXT VECTORIZATION
vectorizer = TfidfVectorizer()
train_vectorized_text = vectorizer.fit_transform(train_df['preprocessed_text'])
test_vectorized_text = vectorizer.transform(test_df['preprocessed_text'])

# GET FEATURE NAMES
feature_names = vectorizer.get_feature_names_out()

# CONVERT THE VECTORIZED TEXT TO DATAFRAME
train_vectorized_df = pd.DataFrame(train_vectorized_text.toarray(), columns=feature_names)
test_vectorized_df = pd.DataFrame(test_vectorized_text.toarray(), columns=feature_names)

# DROP NON NUMERIC COLUMN  FROM train_df
train_df = train_df.select_dtypes(include='number')


# CONCATENATE THE VECTORIZED DATAFRAME  WITH THE ORGINAL DATAFRAME
train_df = pd.concat([train_df, train_vectorized_df], axis=1)
test_df = pd.concat([test_df, test_vectorized_df], axis=1)

print("Train DataFrame:")
print(train_df.head())

print("\nTest DataFrame:")
print(test_df.head())
