# 02-Data-Preparation.ipynb

In [64]:
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
import pandas as pd
nltk.download('stopwords')
nltk.download('punkt')

# Define function to preprocess and clean text
def preprocess_text(text):
    """Normalise one piece of raw text into a string of stemmed tokens.

    Processing steps, in order:

    1. Strip URLs, ``@mentions`` and ``#hashtags``.
    2. Keep only ASCII alphanumeric words that are not purely numeric
       (``covid19`` is kept, ``2023`` is dropped) and lowercase them.
    3. Tokenize with ``nltk.word_tokenize``.
    4. Drop English stopwords and punctuation tokens.
    5. Stem each remaining token with the English Snowball stemmer.

    Args:
        text: The raw text. ``bytes`` input is decoded as UTF-8 (undecodable
            bytes are replaced). Any other non-string value, such as
            ``None`` or a float NaN, yields an empty string.

    Returns:
        The cleaned tokens joined by single spaces, or ``''`` when nothing
        survives the cleaning.
    """
    if isinstance(text, bytes):
        # The str regex patterns below raise TypeError on bytes input.
        text = text.decode('utf-8', errors='replace')
    elif not isinstance(text, str):
        return ''

    # Remove URLs, mentions, and hashtags
    text = re.sub(r'https?\S+', '', text)
    text = re.sub(r'@\S+', '', text)
    text = re.sub(r'#\S+', '', text)

    # Keep ASCII alphanumeric words, skipping words made only of digits
    words = re.findall(r'\b(?!\d+\b)[a-zA-Z0-9]+\b', text)
    text = ' '.join(words).lower()
    if not text:
        # Nothing left to tokenize; skip the NLTK work entirely.
        return ''

    stop_words, stemmer = _english_resources()
    tokens = [
        stemmer.stem(token)
        for token in nltk.word_tokenize(text)
        if token not in stop_words and token not in string.punctuation
    ]
    return ' '.join(tokens)


# Holds at most one (stopword set, stemmer) pair, filled on first use so the
# NLTK corpus is read once instead of once per token.
_ENGLISH_RESOURCES = []


def _english_resources():
    """Return the cached ``(stopword frozenset, SnowballStemmer)`` pair."""
    if not _ENGLISH_RESOURCES:
        _ENGLISH_RESOURCES.append(
            (frozenset(stopwords.words('english')), SnowballStemmer('english'))
        )
    return _ENGLISH_RESOURCES[0]

# Define function to preprocess and clean all collected tweets
def preprocess_tweets(tweets):
    """Clean the ``'content'`` entry of every tweet in place.

    Args:
        tweets: An iterable of items that support ``item['content']`` for
            both reading and assignment.

    Returns:
        None. Each item is modified in place.
    """
    for entry in tweets:
        raw_content = entry['content']
        entry['content'] = preprocess_text(raw_content)
        
# Load the raw data set from disk.
df = pd.read_csv('../data/raw_data.csv')
# Replace each row's 'content' with its cleaned form.
df['content'] = df['content'].apply(preprocess_text)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\moham\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\moham\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [76]:
# Drop rows whose cleaned content duplicates an earlier row.
df.drop_duplicates(subset=['content'], inplace=True)
# Drop rows whose content is an empty string.
df.drop(index=df.index[df['content'] == ''], inplace=True)
# Show how many rows remain per language. The result must be printed:
# a bare expression is only displayed when it is the last one in a cell.
print(df['language'].value_counts())
# Save the cleaned data (the DataFrame index is written as the first column).
df.to_csv('../data/clean_data.csv', encoding='utf-8')