## Data Preprocessing Clean  tweets


- **Nettoyage du texte**

    - Remove URLs, mentions (@username), hashtags (or keep them if relevant)
    - Handle emojis (remove or convert to text)
    - Lowercase text
    - Remove extra whitespace

- **Embedding** with minilm

- **Save metadata**

---


In [1]:
import pandas as pd
from sentence_transformers import SentenceTransformer
import os
import numpy as np
import re
import emoji

In [2]:

os.makedirs("../data/processed",exist_ok=True)
os.makedirs("../data/embedding", exist_ok=True)
os.makedirs("../data/metadata",exist_ok=True)

In [3]:
df = pd.read_csv("../data/raw/data.csv")

df.head()


Unnamed: 0,airline_sentiment,negativereason,airline,text,tweet_created
0,neutral,,Virgin America,@VirginAmerica What @dhepburn said.,2/24/2015 11:35
1,positive,,Virgin America,@VirginAmerica plus you've added commercials t...,2/24/2015 11:15
2,neutral,,Virgin America,@VirginAmerica I didn't today... Must mean I n...,2/24/2015 11:15
3,negative,Bad Flight,Virgin America,@VirginAmerica it's really aggressive to blast...,2/24/2015 11:15
4,negative,Can't Tell,Virgin America,@VirginAmerica and it's a really big bad thing...,2/24/2015 11:14


---
preprocessing required for  embeddings

In [4]:

import emoji
import re

def preprocess_text(text):
    """
    Preprocess text for transformer models
    - Remove HTML tags
    - Remove URLs
    - Remove mentions (@username)
    - Remove hashtags (#hashtag -> hashtag or removed)
    - Normalize whitespace
    """

    # Remove HTML tags
    text = re.sub(r'<[^>]+>', '', text)

    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)

    # Remove mentions (@username)
    text = re.sub(r'@\w+', '', text)

    # Hashtags often carry important context , just remove #
    # Remove hashtags (keep the word, remove #)
    text = re.sub(r'#(\w+)', r'\1', text)
    # remove hashtag with word
    # text = re.sub(r'#\w+', '', text)

    # emojis ‚Üí words
    text = emoji.demojize(text)
    text = text.replace(":", " ").replace("_", " ")

    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    return text


    #  remove truly noisy characters
    # text = re.sub(r'[^\w\s.,!?\'"]', ' ', text)
    # text = text.replace(":", "").replace("_", " ")
    # Remove special characters but keep basic punctuation and letters
    # text = re.sub(r'[^\w\s.,!?;:\-\'"()]', ' ', text)



df["clean_text"] = df['text'].apply(preprocess_text)


# null values after cleaning
initial_count = len(df)
df = df[df['clean_text'].str.strip() != '']


print(f"Removed {initial_count - len(df)} empty texts after cleaning")

print(f"\nDuplicates after preprocessing:")
print(f"Train: {df['text'].duplicated().sum()}")

# drop duplicates text after preprocessing
df = df.drop_duplicates(subset=["clean_text"])
print(f" after removing duplicates Train: {df['text'].duplicated().sum()}")


# Reset index
df = df.reset_index(drop=True)

df

Removed 0 empty texts after cleaning

Duplicates after preprocessing:
Train: 0
 after removing duplicates Train: 0


Unnamed: 0,airline_sentiment,negativereason,airline,text,tweet_created,clean_text
0,neutral,,Virgin America,@VirginAmerica What @dhepburn said.,2/24/2015 11:35,What said.
1,positive,,Virgin America,@VirginAmerica plus you've added commercials t...,2/24/2015 11:15,plus you've added commercials to the experienc...
2,neutral,,Virgin America,@VirginAmerica I didn't today... Must mean I n...,2/24/2015 11:15,I didn't today... Must mean I need to take ano...
3,negative,Bad Flight,Virgin America,@VirginAmerica it's really aggressive to blast...,2/24/2015 11:15,"it's really aggressive to blast obnoxious ""ent..."
4,negative,Can't Tell,Virgin America,@VirginAmerica and it's a really big bad thing...,2/24/2015 11:14,and it's a really big bad thing about it
...,...,...,...,...,...,...
14287,positive,,American,@AmericanAir thank you we got on a different f...,2/22/2015 12:01,thank you we got on a different flight to Chic...
14288,negative,Customer Service Issue,American,@AmericanAir leaving over 20 minutes Late Flig...,2/22/2015 11:59,leaving over 20 minutes Late Flight. No warnin...
14289,neutral,,American,@AmericanAir Please bring American Airlines to...,2/22/2015 11:59,Please bring American Airlines to BlackBerry10
14290,negative,Customer Service Issue,American,"@AmericanAir you have my money, you change my ...",2/22/2015 11:59,"you have my money, you change my flight, and d..."


In [5]:
print(f"\nDataset shapes after preprocessing:")
print(f"Train: {df.shape}")


print(f"\nNull alues after preprocessing:")
df.isnull().sum()


Dataset shapes after preprocessing:
Train: (14292, 6)

Null alues after preprocessing:


airline_sentiment       0
negativereason       5222
airline                 0
text                    0
tweet_created           0
clean_text              0
dtype: int64

In [6]:
df.to_csv("../data/processed/data.csv")

print("df saved")

df saved


In [7]:
print(df['airline_sentiment'].unique())

['neutral' 'positive' 'negative']


In [8]:
# load the model
model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')

# prepare your data
X = df['clean_text'].values
y = df['airline_sentiment'].values

# embeddings
X_embeddings = model.encode(X, show_progress_bar=True, batch_size=32)

print(f"Embedding shape: {X_embeddings.shape}") 
print(f"Each text is represented as a {X_embeddings.shape[1]}-dimensional vector")

# save embedding
np.save("../data/embedding/embeddings.npy", np.asarray(X_embeddings))

print("embeddings saved")

Batches:   0%|          | 0/447 [00:00<?, ?it/s]

Embedding shape: (14292, 384)
Each text is represented as a 384-dimensional vector
embeddings saved


### Meta Data

In [10]:

# add more info
train_metadata = pd.DataFrame({
    'id': df.index,
    'airline':df['airline'],
    'text': df['text'],
    'clean_text': df['clean_text'],
    'negativereason':df['negativereason'],
    'tweet_created':df['tweet_created'],
    'label': df['airline_sentiment'],
})

train_metadata.to_csv('../data/metadata/metadata.csv', index=False)

print("done")


done
