In [12]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
import nltk

# Ensure the NLTK resources used below are available locally:
# 'stopwords' for remove_stopwords, 'punkt' for word_tokenize,
# 'wordnet' for WordNetLemmatizer.
for resource in ('stopwords', 'punkt', 'wordnet'):
    nltk.download(resource)

# Load the dataset
file_path = 'twitter_training.csv'  # Update this path if necessary
tweets_df = pd.read_csv(file_path, encoding='latin1')

# Replace missing tweets with '' BEFORE casting to str. Casting first can
# turn NaN into the literal string 'nan', which fillna() no longer treats as
# missing, so empty tweets would flow through the pipeline as the word "nan".
tweets_df['Tweet'] = tweets_df['Tweet'].fillna('').astype(str)

# Define preprocessing functions
def clean_tweet(tweet):
    """Return a lower-cased, normalized copy of a raw tweet string.

    The substitutions run in the order listed, and the order matters:
    URLs, mentions and hashtags are replaced by placeholder words before
    punctuation is stripped, otherwise '@' and '#' would already be gone.

    Because lower-casing happens last, the placeholders end up in the
    result as 'url', 'mention' and 'hashtag'. Removed digits and
    punctuation are deleted outright (not replaced by a space), so
    repeated spaces left behind are not collapsed.
    """
    substitutions = (
        (r'[^\x00-\x7F]+', ' '),   # runs of non-ASCII characters -> one space
        (r'http\S+', 'URL'),       # links
        (r'@\w+', 'MENTION'),      # user mentions
        (r'#\w+', 'HASHTAG'),      # hashtags
        (r'\d+', ''),              # digits
        (r'[^\w\s]', ''),          # remaining punctuation / symbols
    )
    cleaned = tweet
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.lower()

def tokenize_tweet(tweet):
    """Split a (cleaned) tweet string into a list of tokens via NLTK's word_tokenize."""
    tokens = word_tokenize(tweet)
    return tokens

# Lazily populated cache of NLTK's English stop words. Building it once
# avoids reloading the word list for every row when remove_stopwords is
# used with DataFrame/Series.apply.
_ENGLISH_STOP_WORDS = None


def _english_stop_words():
    """Return the English stop-word set, loading it from NLTK on first use."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


def remove_stopwords(tokens):
    """Return the tokens that are not English stop words, in original order.

    Matching is exact and case-sensitive, so tokens are expected to be
    lower-cased already (as clean_tweet produces them).
    """
    stop_words = _english_stop_words()
    return [word for word in tokens if word not in stop_words]

def lemmatize_tweet(tokens):
    """Return a new list with each token passed through WordNetLemmatizer.lemmatize.

    lemmatize is called with its default arguments for every token.
    """
    lemmatize = WordNetLemmatizer().lemmatize
    return list(map(lemmatize, tokens))

def stem_tweet(tokens):
    """Return a new list with each token reduced by PorterStemmer.stem.

    stem is called with its default arguments for every token.
    """
    stem = PorterStemmer().stem
    return list(map(stem, tokens))

# Apply preprocessing to the dataset.
# Each stage is (output column, input column, function); every stage reads
# the column written by the one before it, so the order must be kept.
preprocessing_stages = [
    ('cleaned_text', 'Tweet', clean_tweet),
    ('tokenized_text', 'cleaned_text', tokenize_tweet),
    ('filtered_text', 'tokenized_text', remove_stopwords),
    ('lemmatized_text', 'filtered_text', lemmatize_tweet),
    ('stemmed_text', 'lemmatized_text', stem_tweet),
    # Combine words back into a sentence for the final preprocessed text
    ('preprocessed_text', 'stemmed_text', ' '.join),
]
for output_column, input_column, stage in preprocessing_stages:
    tweets_df[output_column] = tweets_df[input_column].apply(stage)

# Display the final preprocessed dataset (original text plus every intermediate stage)
preview_columns = [
    'Tweet',
    'cleaned_text',
    'tokenized_text',
    'filtered_text',
    'lemmatized_text',
    'stemmed_text',
    'preprocessed_text',
]
print(tweets_df[preview_columns].head())

# Save the preprocessed dataset to a CSV file (row index is not written)
output_file_path = 'preprocessed_tweets_dataset.csv'
tweets_df.to_csv(output_file_path, index=False)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/mehwishahmed/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/mehwishahmed/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/mehwishahmed/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


                                               Tweet  \
0  I will finish Borderlands 2 today. I have some...   
1  I'm going to finish Borderlands 2 today. I hav...   
2  Going to finish up Borderlands 2 today. I've g...   
3  Going to finish finish cleaning up Borderlands...   
4  Going to finish up volume 2 today. I've got so...   

                                        cleaned_text  \
0  i will finish borderlands  today i have some n...   
1  im going to finish borderlands  today i have s...   
2  going to finish up borderlands  today ive got ...   
3  going to finish finish cleaning up borderlands...   
4  going to finish up volume  today ive got some ...   

                                      tokenized_text  \
0  [i, will, finish, borderlands, today, i, have,...   
1  [im, going, to, finish, borderlands, today, i,...   
2  [going, to, finish, up, borderlands, today, iv...   
3  [going, to, finish, finish, cleaning, up, bord...   
4  [going, to, finish, up, volume, today, ive,