In [3]:
# Install the third-party packages this notebook relies on into the kernel's environment.
# NOTE(review): no versions are pinned, so a fresh environment may resolve to different
# releases than the ones that produced the recorded outputs.
%pip install nltk
# NOTE(review): bs4 is not imported in any cell visible in this notebook - confirm it is needed.
%pip install bs4
# emoji and contractions are imported by the cleaning cell below; ipykernel / ipywidgets
# are not imported directly in the visible cells.
%pip install ipykernel ipywidgets emoji contractions
# langdetect provides detect / LangDetectException used by filter_non_english.
%pip install langdetect
%pip install pandas


Looking in indexes: https://mirrors.aliyun.com/pypi/simple
Note: you may need to restart the kernel to use updated packages.
Looking in indexes: https://mirrors.aliyun.com/pypi/simple
Note: you may need to restart the kernel to use updated packages.
Looking in indexes: https://mirrors.aliyun.com/pypi/simple
Note: you may need to restart the kernel to use updated packages.
Looking in indexes: https://mirrors.aliyun.com/pypi/simple
Note: you may need to restart the kernel to use updated packages.
Looking in indexes: https://mirrors.aliyun.com/pypi/simple
Collecting pandas
  Downloading https://mirrors.aliyun.com/pypi/packages/53/c3/f8e87361f7fdf42012def602bfa2a593423c729f5cb7c97aed7f51be66ac/pandas-2.0.3-cp38-cp38-macosx_11_0_arm64.whl (10.7 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.7/10.7 MB[0m [31m540.3 kB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
Collecting pytz>=2020.1 (from pandas)
  Downloading https://mirrors.aliyun.

In [4]:
# General Libraries
import os
import numpy as np
from tqdm import tqdm
import json

import config
import pandas as pd

# Text Processing and Feature Engineering
import re
import string
import nltk
import emoji
import contractions
from langdetect import detect, LangDetectException
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.corpus import stopwords


# Download the NLTK data packages used by this notebook
for nltk_package in ('punkt', 'wordnet', 'stopwords', 'averaged_perceptron_tagger'):
    nltk.download(nltk_package)

# Register tqdm's pandas integration so that `.progress_apply` (used by clean_data)
# is available on pandas objects
tqdm.pandas()

[nltk_data] Downloading package punkt to /Users/zhouyf/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/zhouyf/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/zhouyf/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/zhouyf/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [5]:
# Read the three raw JSON datasets; the file locations come from the local `config` module
df_gn, df_sf, df_tw = (
    pd.read_json(raw_path)
    for raw_path in (config.googel_news_og, config.stack_overflow_og, config.tweets_og)
)

# Data Cleaning

In [7]:
# Data Cleaning
# Module-level resources shared by the cleaning helpers below:
# a WordNet lemmatizer and the English stopword list (as a set for fast membership tests)
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# 1. Data Cleaning Functions

def strip_emoji(text):
    """Return ``text`` with every emoji replaced by the empty string."""
    without_emoji = emoji.replace_emoji(text, replace='')
    return without_emoji

def strip_all_entities(text):
    """Lower-case ``text`` and strip mentions, links, non-ASCII, punctuation and stopwords.

    Steps, in order:
      1. lower-case and turn carriage returns / newlines into spaces;
      2. delete every ``@`` or ``http(s)://`` together with the non-whitespace run after it;
      3. delete all characters outside the ASCII range;
      4. delete every character listed in ``string.punctuation``;
      5. drop tokens found in the module-level ``stop_words`` set.

    Returns the surviving tokens joined by single spaces.
    """
    lowered = text.lower()
    single_line = re.sub(r'\r|\n', ' ', lowered)
    no_links = re.sub(r"(?:\@|https?\://)\S+", "", single_line)
    ascii_only = re.sub(r'[^\x00-\x7f]', '', no_links)
    # Characters are deleted, not replaced, so "a,b" becomes "ab"
    no_punct = ascii_only.translate(str.maketrans('', '', string.punctuation))
    kept_tokens = [token for token in no_punct.split() if token not in stop_words]
    return ' '.join(kept_tokens)

def clean_hashtags(tweet):
    """Drop trailing hashtags and remove the ``#`` from the remaining ones.

    A block of whitespace-separated hashtags at the very end of the tweet is
    removed entirely; hashtags elsewhere keep their word but lose the ``#``.
    The result is stripped of leading/trailing whitespace.
    """
    trailing_tags = r'(\s+#[\w-]+)+\s*$'
    inline_tag = r'#([\w-]+)'
    without_trailing = re.sub(trailing_tags, '', tweet).strip()
    return re.sub(inline_tag, r'\1', without_trailing).strip()

def filter_chars(text):
    """Blank out every whitespace-separated token that contains ``$`` or ``&``.

    Matching tokens are replaced by an empty string (not removed from the
    sequence), so the joined result can contain consecutive spaces.
    """
    kept = []
    for token in text.split():
        has_marker = '$' in token or '&' in token
        kept.append('' if has_marker else token)
    return ' '.join(kept)

def remove_mult_spaces(text):
    """Replace each run of two or more whitespace characters with one space."""
    whitespace_run = re.compile(r"\s\s+")
    return whitespace_run.sub(" ", text)

def filter_non_english(text):
    """Return ``text`` if langdetect classifies it as English, otherwise ``""``.

    Text for which detection raises ``LangDetectException`` is treated as
    non-English and also yields the empty string.
    """
    try:
        detected = detect(text)
    except LangDetectException:
        return ""
    if detected != "en":
        return ""
    return text

def expand_contractions(text):
    """Return ``text`` after passing it through ``contractions.fix``."""
    expanded = contractions.fix(text)
    return expanded

def remove_numbers(text):
    """Delete every run of digits from ``text`` (surrounding spaces are kept)."""
    digit_run = re.compile(r'\d+')
    return digit_run.sub('', text)

def lemmatize(text):
    """Tokenize ``text`` with NLTK and lemmatize every token.

    Each token goes through the module-level ``lemmatizer`` without a
    part-of-speech argument, so the lemmatizer's default is used. Tokens are
    re-joined with single spaces.
    """
    tokens = word_tokenize(text)
    return ' '.join(map(lemmatizer.lemmatize, tokens))

def remove_short_words(text, min_len=1):
    """Keep only whitespace-separated words of at least ``min_len`` characters.

    With the default ``min_len=1`` no word is dropped; the call only
    normalizes whitespace between words.
    """
    return ' '.join(token for token in text.split() if not len(token) < min_len)

def replace_elongated_words(text):
    """Collapse every run of three or more identical word characters to one.

    Examples: ``"sooooo"`` -> ``"so"``, ``"yeeeesssss"`` -> ``"yes"``.
    Runs of exactly two characters (``"good"``, ``"hello"``) are untouched.

    The earlier pattern put a greedy word-character prefix in front of the
    repeated run. That prefix swallowed part of the run itself, so
    ``"sooooo"`` only shrank to ``"sooo"``. It also required at least one
    character before the run and handled only one run per word. Matching the
    run directly with a backreference fixes all three and makes the function
    idempotent.

    Note: any word character qualifies, so digit runs such as ``"1000"`` are
    collapsed as well (``"10"``).

    Args:
        text: Input string.

    Returns:
        ``text`` with each run of 3+ identical word characters reduced to a
        single occurrence.
    """
    # (\w) captures one character; \1{2,} requires at least two more copies of it
    return re.sub(r'(\w)\1{2,}', r'\1', text)

def remove_repeated_punctuation(text):
    """Reduce each run of ``?``, ``.`` and ``!`` to its final character.

    The pattern deletes every such character that is immediately followed by
    another one, so ``"what???"`` becomes ``"what?"`` and ``"hi?!"`` becomes
    ``"hi!"``.
    """
    all_but_last = r'[\?\.\!]+(?=[\?\.\!])'
    collapsed = re.sub(all_but_last, '', text)
    return collapsed

def remove_extra_whitespace(text):
    """Trim ``text`` and squeeze every internal whitespace run to one space."""
    tokens = text.split()
    return ' '.join(tokens)

def remove_url_shorteners(text):
    """Remove links that point at well-known URL-shortener hosts.

    A link is an optional ``http://`` / ``https://`` scheme, an optional
    ``www.`` prefix, one of the listed shortener hosts, and a path, query or
    fragment (the host must be followed by ``/``, ``?`` or ``#``).

    The earlier pattern had no left boundary and accepted any non-whitespace
    tail. A host such as ``t.co`` therefore matched inside ordinary domains
    and truncated them (``"internet.com"`` became ``"interne"``). The
    lookbehind requires the link to start at a token boundary, and the
    explicit ``/?#`` separator stops ``t.co`` from matching ``t.com``.
    Matching is case-insensitive.

    Args:
        text: Input string.

    Returns:
        ``text`` with every shortener link deleted; surrounding whitespace is
        left as is.
    """
    shortener_link = (
        r'(?<![\w.@/-])'                 # must not start inside a longer word or host
        r'(?:https?://)?(?:www\.)?'
        r'(?:bit\.ly|goo\.gl|t\.co|tinyurl\.com|tr\.im|is\.gd|cli\.gs|u\.nu|url\.ie'
        r'|tiny\.cc|alturl\.com|ow\.ly|bit\.do|adoro\.to)'
        r'[/?#]\S*'                      # host must be followed by a path, query or fragment
    )
    return re.sub(shortener_link, '', text, flags=re.IGNORECASE)

def remove_spaces_tweets(tweet):
    """Return ``tweet`` without leading or trailing whitespace."""
    trimmed = tweet.strip()
    return trimmed

def remove_short_tweets(tweet, min_words=1):
    """Return ``tweet`` unchanged, or ``""`` if it has fewer than ``min_words`` words.

    Words are counted by splitting on whitespace, so with the default
    ``min_words=1`` only empty / whitespace-only tweets are blanked.
    """
    word_count = len(tweet.split())
    if word_count < min_words:
        return ""
    return tweet

# 2. Main Cleaning Function

def clean_tweet(tweet):
    """Run the full text-cleaning pipeline on a single document.

    The input is converted to ``str`` and lower-cased, passed through each
    cleaning helper in a fixed order, and finally whitespace-normalized.

    NOTE(review): ``strip_all_entities`` deletes every character in
    ``string.punctuation``. The later steps that search for ``#``, ``$``,
    ``&``, ``.``, ``?`` or ``!`` (``clean_hashtags``, ``filter_chars``,
    ``remove_repeated_punctuation``, ``remove_url_shorteners``) therefore
    have nothing left to match by the time they run. Confirm whether this
    order is intended before relying on those steps.
    """
    pipeline = (
        strip_emoji,
        expand_contractions,
        # filter_non_english is currently disabled
        strip_all_entities,
        clean_hashtags,
        filter_chars,
        remove_mult_spaces,
        remove_numbers,
        lemmatize,
        remove_short_words,        # default min_len
        replace_elongated_words,
        remove_repeated_punctuation,
        remove_extra_whitespace,
        remove_url_shorteners,
        remove_spaces_tweets,
        remove_short_tweets,       # default min_words
    )

    cleaned = str(tweet).lower()
    for step in pipeline:
        cleaned = step(cleaned)
    return ' '.join(cleaned.split())

# 3. Data Loading and Saving

def load_and_clean_data(data_path, cleaned_data_path):
    """Load a cleaned CSV from cache, or build it from the raw CSV and cache it.

    If ``cleaned_data_path`` exists it is read directly. Otherwise the raw CSV
    at ``data_path`` is read, its ``text`` column is cleaned with
    ``clean_tweet`` into a new ``text_clean`` column, and the result is
    written to ``cleaned_data_path``.

    Note that the output column here is ``text_clean``, whereas ``clean_data``
    writes ``clean_text``.

    Args:
        data_path: Path of the raw CSV; must contain a ``text`` column.
        cleaned_data_path: Path of the cached, cleaned CSV.

    Returns:
        A DataFrame whose ``text`` / ``text_clean`` columns (when present)
        contain only strings, whichever branch produced it.
    """
    if os.path.exists(cleaned_data_path):
        print("Cleaned data file already exists. Loading from file...")
        df = pd.read_csv(cleaned_data_path)
    else:
        df = pd.read_csv(data_path)
        df['text'] = df['text'].fillna('').astype(str)  # Ensure all text entries are strings
        df['text_clean'] = [clean_tweet(tweet) for tweet in tqdm(df['text'], desc="Cleaning Tweets")]
        df.to_csv(cleaned_data_path, index=False)

    # A CSV round-trip turns empty strings into NaN, so rows whose cleaned text
    # is '' would come back as float NaN from the cache. Normalize both text
    # columns so the cached and freshly-computed results agree.
    for column in ('text', 'text_clean'):
        if column in df.columns:
            df[column] = df[column].fillna('').astype(str)
    return df


def clean_data(df):
    """Add a ``clean_text`` column produced by ``clean_tweet`` to ``df``.

    The ``text`` column is first normalized in place (missing values become
    ``''`` and everything is cast to ``str``). Each entry is then cleaned
    with a tqdm progress bar; this needs ``tqdm.pandas()`` to have been
    called beforehand. The same DataFrame object is modified and returned.
    """
    raw_text = df['text'].fillna('').astype(str)
    df['text'] = raw_text
    df["clean_text"] = raw_text.progress_apply(clean_tweet)
    return df

In [8]:
# Clean each dataset in turn; clean_data adds the `clean_text` column and returns the frame
df_gn, df_sf, df_tw = (clean_data(frame) for frame in (df_gn, df_sf, df_tw))

100%|██████████| 11109/11109 [00:01<00:00, 7119.57it/s] 
100%|██████████| 16408/16408 [00:01<00:00, 12417.48it/s]
100%|██████████| 2473/2473 [00:00<00:00, 9381.70it/s]


In [9]:
# Create the output directory on first run
if not os.path.exists(config.data_path):
    os.makedirs(config.data_path)
    print(f"Create data path: {config.data_path}")

# Write each cleaned frame as a UTF-8, pretty-printed JSON list of row records
for out_path, cleaned_frame in (
    (config.googel_news, df_gn),
    (config.stack_overflow, df_sf),
    (config.tweets, df_tw),
):
    with open(out_path, 'w', encoding='utf-8') as fp:
        json.dump(cleaned_frame.to_dict('records'), fp, ensure_ascii=False, indent=2)

Create data path: /Users/zhouyf/Documents/data/majid/drive/MyDrive/project2/data/cleaned_data


In [None]:
# Create the label map once and save it as an asset (Google News example)

#labl_map = {  item:i for i , item in enumerate(list(set(df['label']))) if item}
#inv_lable_map = {item:i for i , item in labl_map.items() }
#with open(config.goole_label_map, 'w' , encoding = 'utf-8') as fp:
#    json.dump(labl_map, fp , ensure_ascii=False )

#with open(config.google_inv_label_map, 'w' , encoding = 'utf-8') as fp:
#    json.dump(inv_lable_map, fp , ensure_ascii=False )