In [1]:
# Read from CSV file.

import pandas as pd

df = pd.read_csv('./scraped_tweets.csv').dropna(subset=['text']).sample(frac=1).reset_index()

print(len(df.index))
df['text'].head()

9988


0    This holiday season the shopping rush is comin...
1    Police open fire as vehicle rams Ukraine embas...
2    “Retraining can’t just be a feelgood thing, it...
3    Bring home the kindness.   #WHPmagical  https:...
4    Woods within striking distance of fifth Green ...
Name: text, dtype: object

In [2]:
# Preprocess tweets - remove @, # and links.

import preprocessor as p

df['clean_text'] = df['text'].apply(lambda text: p.clean(str(text)))

df['clean_text'].head()

0    This holiday season the shopping rush is comin...
1    Police open fire as vehicle rams Ukraine embas...
2    “Retraining can’t just be a feelgood thing, it...
3                             Bring home the kindness.
4    Woods within striking distance of fifth Green ...
Name: clean_text, dtype: object

In [3]:
# Convert to lowercase.

df['clean_text'] = df['clean_text'].apply(lambda text: text.lower())

df['clean_text'].head()

0    this holiday season the shopping rush is comin...
1    police open fire as vehicle rams ukraine embas...
2    “retraining can’t just be a feelgood thing, it...
3                             bring home the kindness.
4    woods within striking distance of fifth green ...
Name: clean_text, dtype: object

In [4]:
# Remove punctuation.

df['clean_text'] = df['clean_text'].str.replace('[^\w\s]', '')

df['clean_text'].head()

0    this holiday season the shopping rush is comin...
1    police open fire as vehicle rams ukraine embas...
2    retraining cant just be a feelgood thing it ne...
3                              bring home the kindness
4    woods within striking distance of fifth green ...
Name: clean_text, dtype: object

In [5]:
# Remove whitespaces.

df['clean_text'] = df['clean_text'].apply(lambda text: str(text).strip())

df['clean_text'].head()

0    this holiday season the shopping rush is comin...
1    police open fire as vehicle rams ukraine embas...
2    retraining cant just be a feelgood thing it ne...
3                              bring home the kindness
4    woods within striking distance of fifth green ...
Name: clean_text, dtype: object

In [6]:
# Remove empty texts.

df = df.drop(df[df['clean_text'] == ''].index)

print(len(df.index))
df['clean_text'].head()

9729


0    this holiday season the shopping rush is comin...
1    police open fire as vehicle rams ukraine embas...
2    retraining cant just be a feelgood thing it ne...
3                              bring home the kindness
4    woods within striking distance of fifth green ...
Name: clean_text, dtype: object

In [7]:
# Include only English texts.
from langdetect import detect

# print("Detecting languages: 0.00%...", end="\r")
# for idx, text in enumerate(df['clean_text'].values):
#     print("Detecting languages: {:.2f}%...".format((idx + 1) / len(df.index) * 100), end="\r")
#     try:
#         detect(text)
#     except Exception:
#         print('text: (', text, ')')

df['lang'] = df['clean_text'].apply(lambda text: detect(text))
df = df.drop(df[df['lang'] != 'en'].index)

print(len(df.index))
df['clean_text'].head()

9149


0    this holiday season the shopping rush is comin...
1    police open fire as vehicle rams ukraine embas...
2    retraining cant just be a feelgood thing it ne...
3                              bring home the kindness
4    woods within striking distance of fifth green ...
Name: clean_text, dtype: object

In [8]:
# Lemmatize texts.

from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
df['clean_text'] = df['clean_text'].apply(lambda text: ' '.join(lemmatizer.lemmatize(word) for word in text.split()))

df['clean_text'].head()

0    this holiday season the shopping rush is comin...
1    police open fire a vehicle ram ukraine embassy...
2    retraining cant just be a feelgood thing it ne...
3                              bring home the kindness
4    wood within striking distance of fifth green j...
Name: clean_text, dtype: object

In [9]:
# Remove non-English words.

from nltk.corpus import words
words = set(words.words())

df['clean_text'] = df['clean_text'].apply(lambda text: ' '.join(word for word in text.split() if word in words))

df['clean_text'].head()

0    this holiday season the shopping rush is comin...
1    police open fire a vehicle ram embassy car in ...
2    cant just be a thing it need to be tied to the...
3                              bring home the kindness
4    wood within striking distance of fifth green j...
Name: clean_text, dtype: object

In [10]:
# Remove stopwords.

from nltk.corpus import stopwords
stopwords = stopwords.words('english')

df['clean_text'] = df['clean_text'].apply(lambda text: ' '.join(word for word in text.split() if word not in stopwords))

df['clean_text'].head()

0    holiday season shopping rush coming social med...
1      police open fire vehicle ram embassy car injury
2    cant thing need tied financial performance com...
3                                  bring home kindness
4     wood within striking distance fifth green jacket
Name: clean_text, dtype: object

In [11]:
# Remove common words.

freq = pd.Series(' '.join(df['clean_text']).split()).value_counts()[pd.Series(' '.join(df['clean_text']).split()).value_counts() > 300]

freq.head(10)

new       787
u         621
ha        553
today     497
people    495
one       491
say       464
wa        407
year      396
live      381
dtype: int64

In [12]:
freq = list(freq.index)

df['clean_text'] = df['clean_text'].apply(lambda text: ' '.join(word for word in text.split() if word not in freq))

df['clean_text'].head()

0    holiday season shopping rush coming social med...
1      police open fire vehicle ram embassy car injury
2    cant thing need tied financial performance com...
3                                  bring home kindness
4     wood within striking distance fifth green jacket
Name: clean_text, dtype: object

In [13]:
# Remove rare words.

freq = pd.Series(' '.join(df['clean_text']).split()).value_counts()[pd.Series(' '.join(df['clean_text']).split()).value_counts() <= 3]

freq.head(10)

sticking       3
regulate       3
otherwise      3
outlet         3
waffle         3
oxygen         3
dive           3
scope          3
forgiveness    3
impression     3
dtype: int64

In [14]:
freq = list(freq.index)

df['clean_text'] = df['clean_text'].apply(lambda text: ' '.join(word for word in text.split() if word not in freq))

df['clean_text'].head()

0    holiday season shopping rush coming social med...
1      police open fire vehicle ram embassy car injury
2    cant thing need financial performance company ...
3                                  bring home kindness
4     wood within striking distance fifth green jacket
Name: clean_text, dtype: object

In [15]:
# Remove more empty texts.

df = df.drop(df[df['clean_text'] == ''].index)

print(len(df.index))
df['clean_text'].head()

9079


0    holiday season shopping rush coming social med...
1      police open fire vehicle ram embassy car injury
2    cant thing need financial performance company ...
3                                  bring home kindness
4     wood within striking distance fifth green jacket
Name: clean_text, dtype: object

In [16]:
# Write cleaned texts to CSV file.

df = df.dropna(subset=['clean_text'])
df.to_csv('./clean_text.csv', columns=['clean_text'], index=False)

len(df.index)

9079