In [86]:
import pandas as pd
import re

# Read the scraped data from disk into a DataFrame.
file_path = 'data scraping.csv'  # Replace with your file path
df = pd.read_csv(filepath_or_buffer=file_path)

# Function to remove date and time with various formats
def remove_date(text):
    """Strip date and time substrings from ``text``.

    Recognised forms (``-`` or ``/`` may separate the date parts):
      - ``YYYY-MM-DD HH:MM:SS+HH:MM`` or ``YYYY-MM-DD HH:MM:SS-HH:MM``,
        optionally followed by a single comma
      - ``YYYY-MM-DD``
      - ``HH:MM:SS``

    Args:
        text: Any cell value. Only ``str`` values are processed.

    Returns:
        For a string, the text with every match removed and leading/trailing
        whitespace stripped. Any non-string value is returned unchanged.
    """
    if not isinstance(text, str):
        return text  # Return non-text data as is

    date_pattern = (
        # Full timestamp with a UTC offset of either sign. The lookahead on
        # the negative branch keeps "HH:MM:SS-HH:MM:SS" (a time range) from
        # being misread as an offset, which would leave a dangling ":SS".
        r'(\d{4}[-/]\d{2}[-/]\d{2} \d{2}:\d{2}:\d{2}'
        r'(?:\+\d{2}:\d{2}|-\d{2}:\d{2}(?!:\d{2})),?)'
        # Bare date.
        r'|(\d{4}[-/]\d{2}[-/]\d{2})'
        # Bare time.
        r'|(\d{2}:\d{2}:\d{2})'
    )
    return re.sub(date_pattern, '', text).strip()

# Function to remove invalid characters directly
def clean_invalid_chars(text):
    """Drop every non-ASCII character from a string.

    Args:
        text: Any cell value. Only ``str`` values are processed.

    Returns:
        For a string, a copy containing only code points in the range
        0x00-0x7F. Any non-string value is returned unchanged.
    """
    if not isinstance(text, str):
        return text  # Non-text data passes through untouched
    # Keep a character only when it lies inside the 7-bit ASCII range.
    return ''.join(ch for ch in text if ord(ch) <= 0x7F)

# Run every column through the cleaners: dates are stripped first, then
# non-ASCII characters. Non-string cells pass through both steps unchanged.
for col in df:
    df[col] = (
        df[col]
        .apply(remove_date)
        .apply(clean_invalid_chars)
    )

print(df.head())


  Date                                              Tweet
0       Td ada banner gede di deretan pkl danau unesa....
1       Nesa! Kkn iku harus beda prodi ta? Atau sak pr...
2       Nesa! Alhamdulillah bersyukur bgt IPK sm 1 uda...
3       nesa! maaf aku masih bingung maksudnya ngulang...
4           Nesa! Terakhir proses SPK tanggal berapa ya ?


In [87]:
def remove_urls(text):
    """Remove URL-like substrings from a string.

    Three shapes are matched: anything starting with ``http://`` or
    ``https://``, anything starting with ``www.``, and any whitespace-free
    token of the form ``<chars>.<chars>/<chars>`` (short links without a
    scheme).

    Args:
        text: Any cell value. Only ``str`` values are processed.

    Returns:
        For a string, the text with every match removed and leading/trailing
        whitespace stripped. Any non-string value is returned unchanged.
    """
    if not isinstance(text, str):
        return text  # Non-text data passes through untouched

    url_pattern = r'(http[s]?://\S+|www\.\S+|\S+\.\S+/\S+)'  # Catch common and short URLs
    without_urls = re.compile(url_pattern).sub('', text)
    return without_urls.strip()

# Run every column through the full cleaning chain (dates, non-ASCII
# characters, then URLs) and write the result to disk.
for col in df:
    df[col] = (
        df[col]
        .apply(remove_date)
        .apply(clean_invalid_chars)
        .apply(remove_urls)
    )


output_file_path = 'cleaned_file.csv'  # Replace with your desired output path
df.to_csv(path_or_buf=output_file_path, index=False)


In [88]:
import string
import re
# Character class matching any single ASCII punctuation mark; the marks are
# escaped so that characters such as ']' and '^' are taken literally.
punctuation_pattern = '[' + re.escape(string.punctuation) + ']'

# Step 3: Remove punctuation from all columns
df = df.replace(to_replace=punctuation_pattern, value='', regex=True)
output_file_path = 'cleaned_file.csv'  # Replace with your desired output path
df.to_csv(path_or_buf=output_file_path, index=False)
print(df.head())

  Date                                              Tweet
0       Td ada banner gede di deretan pkl danau unesa ...
1       Nesa Kkn iku harus beda prodi ta Atau sak prod...
2       Nesa Alhamdulillah bersyukur bgt IPK sm 1 udah...
3       nesa maaf aku masih bingung maksudnya ngulang ...
4             Nesa Terakhir proses SPK tanggal berapa ya 


In [89]:


# Pattern matching runs of emoji characters. Compiled once here rather than
# on every call to remove_emoji.
# NOTE(review): the "enclosed characters" range below spans U+24C2 through
# U+1F251, which also covers many non-emoji code points (for example CJK
# ideographs starting at U+4E00). It is kept as-is to preserve behaviour;
# confirm whether such a wide range is intended.
_EMOJI_PATTERN = re.compile("["
    u"\U0001F600-\U0001F64F"  # emoticons
    u"\U0001F300-\U0001F5FF"  # symbols & pictographs
    u"\U0001F680-\U0001F6FF"  # transport & map symbols
    u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
    u"\U00002702-\U000027B0"  # miscellaneous symbols
    u"\U000024C2-\U0001F251"  # enclosed characters
    u"\U0001F900-\U0001F9FF"  # supplemental symbols and pictographs
    u"\U0001FA00-\U0001FAFF"  # chess symbols and other additional emojis
    u"\U0001F1F2-\U0001F1F3"  # China flag
    u"\U0001F1E6-\U0001F1FF"  # Regional indicator symbols
    "]+", flags=re.UNICODE)


# Function to remove emoji
def remove_emoji(string):
    """Remove emoji characters from a string.

    Args:
        string: Any cell value. Only ``str`` values are processed. (The
            parameter name shadows the ``string`` module inside this
            function; it is kept for backward compatibility.)

    Returns:
        For a string, the text with every run of characters matched by the
        emoji pattern removed. Any non-string value (for example NaN or
        None from a missing cell) is returned unchanged instead of raising
        ``TypeError``, matching the other cleaning helpers.
    """
    if not isinstance(string, str):
        return string
    return _EMOJI_PATTERN.sub(r'', string)

# The data is not re-read here; `df` comes from the earlier cells.

# Show the DataFrame before emoji removal
print("Data sebelum penghapusan emoji:", df, sep="\n")

# Remove emoji from the 'Tweet' column
df['Tweet'] = df['Tweet'].apply(remove_emoji)

# Show the DataFrame after emoji removal
print("\nData setelah penghapusan emoji:", df, sep="\n")

# Save the result to a CSV file
# Replace 'cleaned_file.csv' with the desired file name
df.to_csv(path_or_buf='cleaned_file.csv', index=False)

Data sebelum penghapusan emoji:
     Date                                              Tweet
0          Td ada banner gede di deretan pkl danau unesa ...
1          Nesa Kkn iku harus beda prodi ta Atau sak prod...
2          Nesa Alhamdulillah bersyukur bgt IPK sm 1 udah...
3          nesa maaf aku masih bingung maksudnya ngulang ...
4                Nesa Terakhir proses SPK tanggal berapa ya 
...   ...                                                ...
6421       ptn pil1 sosiologi unesa saran dong pil2 nya y...
6422       Ptn \n1 sasing UNPAD\n2 sasing UNESA\n\nATAU\n...
6423                     Unesa knpsi gadiadain wisuda ha hft
6424                        nesa Assalamualaikum warga unesa
6425       dengerin lagu2 mas pam sekarang jadi kerasa vi...

[6426 rows x 2 columns]

Data setelah penghapusan emoji:
     Date                                              Tweet
0          Td ada banner gede di deretan pkl danau unesa ...
1          Nesa Kkn iku harus beda prodi ta Atau sak pro

In [92]:
# Lower-case every string cell in every column; non-string cells are left
# untouched. Then overwrite the cleaned CSV with the result.
df = df.apply(
    lambda column: column.map(
        lambda value: value.lower() if isinstance(value, str) else value
    )
)

df.to_csv(path_or_buf='cleaned_file.csv', index=False)