In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import os
import seaborn as sns

In [3]:
# Change to the project root directory so the relative 'data/...' paths used
# by this notebook resolve. The move is guarded so that re-running this cell
# does not climb one directory higher each time: we only step up when the raw
# data file is not already reachable from the current working directory.
if not os.path.exists('data/raw/tweets.csv'):
    os.chdir('../')

In [7]:
# Load the raw tweets. Column names are supplied explicitly through `names`,
# so pandas treats every row of the file (including the first) as data.
df = pd.read_csv(
    'data/raw/tweets.csv',
    names=['target', 'ids', 'date', 'flag', 'user', 'text'],
    encoding='latin-1',
)

In [8]:
# Preview the first rows to check that the six named columns loaded as expected.
df.head()

Unnamed: 0,target,ids,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [9]:
import re

In [10]:
def remove_characters(text):
    r"""Strip tweet-specific noise from ``text`` and lowercase what remains.

    Removed, in this order: URLs (``http`` up to the next whitespace),
    ``@mentions``, ``#hashtags``, digit runs, and finally any character that
    is neither a word character nor whitespace. Each match is deleted outright
    (not replaced by a space), so the surrounding whitespace is left untouched
    and underscores survive because ``\w`` includes them.

    Args:
        text: The raw tweet. Non-string values (e.g. NaN from a missing cell)
            are treated as empty instead of raising ``TypeError``.

    Returns:
        The cleaned, lowercased string, or ``""`` for non-string input.
    """
    if not isinstance(text, str):  # handle NaN or non-string
        return ""

    # Order matters: URLs, mentions and hashtags must be removed before the
    # generic punctuation pass, otherwise ':' / '@' / '#' would be stripped
    # first and the URL body, handle, or tag would remain as ordinary words.
    patterns = (
        r'http\S+',   # URLs
        r'@\w+',      # mentions
        r'#\w+',      # hashtags
        r'\d+',       # numbers
        r'[^\w\s]',   # punctuation
    )
    for pattern in patterns:
        text = re.sub(pattern, '', text)

    return text.lower()

In [11]:
# Run every raw tweet through remove_characters and store the result in a new column.
df['cleaned_text'] = df['text'].map(remove_characters)

In [12]:
# Spot-check the first few values produced by remove_characters.
df['cleaned_text'].head()

0       awww thats a bummer  you shoulda got david ...
1    is upset that he cant update his facebook by t...
2     i dived many times for the ball managed to sa...
3      my whole body feels itchy and like its on fire 
4     no its not behaving at all im mad why am i he...
Name: cleaned_text, dtype: object

In [13]:
def clean_text(text):
    """Reduce ``text`` to ASCII letters and digits separated by single spaces.

    Every run of characters outside ``a-z``, ``A-Z`` and ``0-9`` acts as a
    separator: emojis and all other non-ASCII characters, punctuation,
    underscores, and any whitespace. The remaining alphanumeric runs are
    joined with one space, so the result never has leading, trailing, or
    repeated spaces.

    Args:
        text: The value to clean.

    Returns:
        The normalized string, or ``""`` when ``text`` is not a ``str``
        (e.g. NaN).
    """
    if not isinstance(text, str):  # handle NaN or non-string
        return ""

    # A single pass is enough: extracting the alphanumeric runs and joining
    # them with one space gives the same result as replacing non-ASCII and
    # non-alphanumeric characters with spaces, collapsing whitespace, and
    # stripping the ends.
    return ' '.join(re.findall(r'[a-zA-Z0-9]+', text))

In [14]:
# Second pass: normalize each already-stripped value with clean_text, replacing the column.
df['cleaned_text'] = df['cleaned_text'].map(clean_text)

In [15]:
# Spot-check the first few values again after the clean_text pass.
df['cleaned_text'].head()

0    awww thats a bummer you shoulda got david carr...
1    is upset that he cant update his facebook by t...
2    i dived many times for the ball managed to sav...
3       my whole body feels itchy and like its on fire
4    no its not behaving at all im mad why am i her...
Name: cleaned_text, dtype: object

In [16]:
# Inspect one cleaned tweet by position (row 197).
df['cleaned_text'].iloc[197]

'is not going to sleep tonite'

In [17]:
import os

# Make sure the output directory exists; exist_ok=True keeps this cell safe to re-run.
os.makedirs('data/processed', exist_ok=True)

In [18]:
# Save the full frame (original columns plus cleaned_text); index=False leaves the row index out of the file.
df.to_csv('data/processed/cleaned_tweets_2.csv', index=False)

In [19]:
# (rows, columns) of the full frame, before any rows are filtered out.
df.shape

(1600000, 7)

In [20]:
# Keep only the rows whose cleaned text is non-empty once surrounding whitespace is stripped.
sub = df.loc[df["cleaned_text"].str.strip().ne("")]

In [21]:
# (rows, columns) after dropping rows with empty cleaned text; compare with df.shape.
sub.shape

(1596052, 7)

In [22]:
# Write the filtered frame: `sub` excludes rows whose cleaned text is empty.
# Writing `df` here would only duplicate cleaned_tweets_2.csv and keep the
# empty-text rows, which pandas reads back from CSV as NaN by default.
sub.to_csv('data/processed/processed.csv', index=False)