In [35]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk

In [28]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [29]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [30]:
# Load the CSV file into a DataFrame
df = pd.read_csv('tweets.csv')

In [31]:
df.columns

Index(['user_name', 'user_location', 'user_description', 'user_created',
       'user_followers', 'user_friends', 'user_favourites', 'user_verified',
       'date', 'text', 'hashtags', 'source'],
      dtype='object')

In [32]:
# Keep only the 'text' column and drop all other columns
df.drop(df.columns.difference(['text']), axis=1, inplace=True)

In [33]:
df

Unnamed: 0,text
0,Apply today!:\nhttps://t.co/eCRVPFByiy\n\n#app...
1,Apply today!:\nhttps://t.co/eCRVPFByiy\n\n#app...
2,I want to wish all @Twitter employees that got...
3,@elonmusk sir has call me to work from office....
4,Hereâ€™s what happens when you fire all your eng...
...,...
25486,Gate changed to G6 ðŸ¥² #Schiphol https://t.co/61...
25487,Well this didnâ€™t age well: https://t.co/nrzamH...
25488,#Schiphol still a shit hole with enormous crea...
25489,@Schiphol #schiphol on a Tuesday @ 6:20. This ...


In [36]:
df['text'] = df['text'].astype(str)

# Define a regular expression pattern to match URLs
url_pattern = r'http\S+'

# Filter out rows containing URLs in the 'text' column
df = df[~df['text'].str.contains(url_pattern)]

# Reset the index after dropping rows
df.reset_index(drop=True, inplace=True)

In [37]:
stop_words = set(stopwords.words('english'))


In [38]:
df

Unnamed: 0,text
0,I want to wish all @Twitter employees that got...
1,@Curtis415 @AdamParkhomenko @MeidasTouch #Elon...
2,@wkamaubell he even made the local news tonigh...
3,We just filed yet another case against Twitter...
4,I own a new social communication app\nNow on A...
...,...
13223,Missed our flight yesterday after queuing for ...
13224,"@bplwijn Long queues, no indications of waitin..."
13225,#schiphol is a disaster again. The security we...
13226,"If you need to fly from #Schiphol today, arriv..."


In [42]:
#  function to clean text
def clean_text(text):
    if isinstance(text, str):  # Check if text is a string
        # Lowercasing
        text = text.lower()
        # Remove punctuation
        text = re.sub(r'[^\w\s]', '', text)
        # Tokenization and remove stopwords
        tokens = word_tokenize(text)
        filtered_tokens = [word for word in tokens if word not in stop_words]
        # Join tokens back into a string
        cleaned_text = ' '.join(filtered_tokens)
        return cleaned_text
    else:
        return ''

In [43]:
# Apply the cleaning function to the 'text' column
df['text'] = df['text'].apply(clean_text)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['text'] = df['text'].apply(clean_text)


In [44]:
df

Unnamed: 0,text
0,want wish twitter employees got fired happy ho...
1,curtis415 adamparkhomenko meidastouch elon sno...
2,wkamaubell even made local news tonight one wo...
3,filed yet another case twitter today twitterla...
4,new social communication app android ios lets ...
...,...
13223,missed flight yesterday queuing 4h booked tomo...
13224,bplwijn long queues indications waiting times ...
13225,schiphol disaster security went strike summer ...
13226,need fly schiphol today arrive early allowed 4...


In [45]:
df.to_csv('tweets_clean.csv', index=False)