In [1]:
# Read from CSV file.

import pandas as pd

# Load the scraped tweets and discard rows whose 'text' column is missing.
df = (
    pd.read_csv('./scraped_tweets.csv')
    .dropna(subset=['text'])
)

print(df.shape[0])
df['text'].head()

9989


0    What a weekend so far. Have you been watching ...
1     See who's stopping by the #YouTubeMusic loung...
2    Yes, we're still running the numbers, but we c...
3    We're just getting started. Keep tuning into t...
4    Cannot WAIT to see what Donald has up his slee...
Name: text, dtype: object

In [2]:
# Preprocess tweets - remove @, # and links.

import preprocessor as p

# Run every tweet through tweet-preprocessor's clean(); str() guards against
# non-string cells coming out of the CSV.
df['clean_text'] = df['text'].map(lambda tweet: p.clean(str(tweet)))

df['clean_text'].head()

0    What a weekend so far. Have you been watching ...
1    See who's stopping by the lounge at ! And tune...
2    Yes, we're still running the numbers, but we c...
3    We're just getting started. Keep tuning into t...
4    Cannot WAIT to see what Donald has up his slee...
Name: clean_text, dtype: object

In [3]:
# Convert to lowercase.

# Lowercase every cleaned tweet.
df['clean_text'] = df['clean_text'].map(lambda tweet: tweet.lower())

df['clean_text'].head()

0    what a weekend so far. have you been watching ...
1    see who's stopping by the lounge at ! and tune...
2    yes, we're still running the numbers, but we c...
3    we're just getting started. keep tuning into t...
4    cannot wait to see what donald has up his slee...
Name: clean_text, dtype: object

In [4]:
# Remove contractions.

import re

# Ordered (pattern, replacement) rules. Order matters:
#   * irregular forms ("won't", "can't") must run before the generic "n't" rule;
#   * "n't" must run before the bare "'t" fallback, otherwise "don't" would be
#     rewritten to "don not" instead of "do not".
_CONTRACTION_RULES = (
    (r"won't", 'will not'),
    (r"can't", 'can not'),
    (r"n't", ' not'),
    (r"'s", ' is'),
    (r"'m", ' am'),
    (r"'re", ' are'),
    (r"'ve", ' have'),
    (r"'ll", ' will'),
    (r"'d", ' would'),
    (r"'t", ' not'),
)


def remove_contractions(text):
    """Expand common English contractions in ``text``.

    Typographic apostrophes (’) are first normalised to ASCII apostrophes,
    then each rule in ``_CONTRACTION_RULES`` is applied in order.

    Args:
        text: The string to expand; the rules are lowercase, so callers are
            expected to lowercase the text first.

    Returns:
        The text with contractions expanded, e.g. "don't" -> "do not",
        "we're" -> "we are". Note that "'s" is always expanded to " is",
        so possessives are rewritten as well.
    """
    text = re.sub(r'’', '\'', text)

    for pattern, replacement in _CONTRACTION_RULES:
        text = re.sub(pattern, replacement, text)

    return text

# Expand contractions in every cleaned tweet.
df['clean_text'] = df['clean_text'].map(remove_contractions)

df['clean_text'].head()

0    what a weekend so far. have you been watching ...
1    see who is stopping by the lounge at ! and tun...
2    yes, we are still running the numbers, but we ...
3    we are just getting started. keep tuning into ...
4    cannot wait to see what donald has up his slee...
Name: clean_text, dtype: object

In [5]:
# Remove punctuation.

# regex=True is required: pandas >= 2.0 treats the pattern as a literal string
# by default, which would leave the text unchanged. The raw string keeps \w and
# \s from being parsed as (invalid) Python string escapes.
df['clean_text'] = df['clean_text'].str.replace(r'[^\w\s]', '', regex=True)

df['clean_text'].head()

0    what a weekend so far have you been watching t...
1    see who is stopping by the lounge at  and tune...
2    yes we are still running the numbers but we ca...
3    we are just getting started keep tuning into t...
4    cannot wait to see what donald has up his slee...
Name: clean_text, dtype: object

In [6]:
# Trim leading and trailing whitespace from each cleaned tweet.

df['clean_text'] = df['clean_text'].map(lambda value: str(value).strip())

df['clean_text'].head()

0    what a weekend so far have you been watching t...
1    see who is stopping by the lounge at  and tune...
2    yes we are still running the numbers but we ca...
3    we are just getting started keep tuning into t...
4    cannot wait to see what donald has up his slee...
Name: clean_text, dtype: object

In [7]:
# Drop rows whose cleaned text is now the empty string.

is_empty = df['clean_text'].eq('')
df = df.drop(index=df.index[is_empty.to_numpy()])

print(df.shape[0])
df['clean_text'].head()

9737


0    what a weekend so far have you been watching t...
1    see who is stopping by the lounge at  and tune...
2    yes we are still running the numbers but we ca...
3    we are just getting started keep tuning into t...
4    cannot wait to see what donald has up his slee...
Name: clean_text, dtype: object

In [8]:
# Include only English texts.
from langdetect import DetectorFactory, detect
from langdetect.lang_detect_exception import LangDetectException

# langdetect is non-deterministic unless seeded; fix the seed so that a
# Restart & Run All keeps the same set of rows.
DetectorFactory.seed = 0


def detect_language(text):
    """Return langdetect's language code for ``text``, or None if detection fails.

    detect() raises LangDetectException when the text has no usable features
    (e.g. only digits); such rows are treated as "not English" instead of
    aborting the whole cell.
    """
    try:
        return detect(text)
    except LangDetectException:
        return None


df['lang'] = df['clean_text'].apply(detect_language)
# None != 'en' is True, so rows whose language could not be detected are dropped too.
df = df.drop(df[df['lang'] != 'en'].index)

print(len(df.index))
df['clean_text'].head()

9181


0    what a weekend so far have you been watching t...
1    see who is stopping by the lounge at  and tune...
2    yes we are still running the numbers but we ca...
3    we are just getting started keep tuning into t...
4    cannot wait to see what donald has up his slee...
Name: clean_text, dtype: object

In [9]:
# Lemmatize texts.

from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer

# Lemmatize each whitespace-separated token and re-join with single spaces.
# NOTE(review): lemmatize() is called without a part-of-speech tag; the output
# below shows "has" reduced to "ha" - consider POS-aware lemmatization.
lemmatizer = WordNetLemmatizer()
df['clean_text'] = df['clean_text'].map(
    lambda text: ' '.join(map(lemmatizer.lemmatize, text.split()))
)

df['clean_text'].head()

0    what a weekend so far have you been watching t...
1    see who is stopping by the lounge at and tune ...
2    yes we are still running the number but we can...
3    we are just getting started keep tuning into t...
4    cannot wait to see what donald ha up his sleev...
Name: clean_text, dtype: object

In [10]:
# Remove non-English words (tokens not present in NLTK's `words` corpus).

# Import the corpus reader under an alias so that binding the vocabulary to
# `words` does not overwrite it; otherwise re-running this cell fails with
# AttributeError because `words` is already a set.
from nltk.corpus import words as words_corpus
words = set(words_corpus.words())

df['clean_text'] = df['clean_text'].apply(lambda text: ' '.join(word for word in text.split() if word in words))

df['clean_text'].head()

0     what a weekend so far have you been watching the
1    see who is stopping by the lounge at and tune ...
2    yes we are still running the number but we can...
3    we are just getting keep tuning into the all w...
4    cannot wait to see what ha up his sleeve tune ...
Name: clean_text, dtype: object

In [11]:
# Remove stopwords.

# Import the corpus reader under an alias so that binding the word list to
# `stopwords` does not overwrite it; otherwise re-running this cell fails
# because `stopwords` is already a list.
from nltk.corpus import stopwords as stopwords_corpus
stopwords = stopwords_corpus.words('english')

# Set lookups are O(1); testing every token against the list is O(len(list)).
stopword_set = set(stopwords)

df['clean_text'] = df['clean_text'].apply(lambda text: ' '.join(word for word in text.split() if word not in stopword_set))

df['clean_text'].head()

0                                 weekend far watching
1         see stopping lounge tune live stream weekend
2    yes still running number confirm officially ho...
3                     getting keep tuning weekend long
4              cannot wait see ha sleeve tune live min
Name: clean_text, dtype: object

In [12]:
# Remove common words.

# Tokens occurring more than this many times across the corpus count as "common".
COMMON_WORD_THRESHOLD = 300

# Count every token once and reuse the result, instead of building the full
# frequency table twice (once to index, once to build the mask).
word_counts = pd.Series(' '.join(df['clean_text']).split()).value_counts()
freq = word_counts[word_counts > COMMON_WORD_THRESHOLD]

freq.head(10)

new       756
u         620
ha        559
one       486
people    483
today     481
say       468
year      406
wa        399
live      377
dtype: int64

In [13]:
# Collect the common words into a set: membership tests are O(1) instead of
# scanning a list for every token. `freq` itself is left untouched, so this
# cell can be re-run without first re-running the cell that computes it.
common_words = set(freq.index)

df['clean_text'] = df['clean_text'].apply(lambda text: ' '.join(word for word in text.split() if word not in common_words))

df['clean_text'].head()

0                                 weekend far watching
1                  stopping lounge tune stream weekend
2    yes still running number confirm officially ho...
3                     getting keep tuning weekend long
4                          cannot wait sleeve tune min
Name: clean_text, dtype: object

In [14]:
# Remove rare words.

# Tokens occurring at most this many times across the corpus count as "rare".
RARE_WORD_MAX_COUNT = 3

# Count every token once and reuse the result, instead of building the full
# frequency table twice (once to index, once to build the mask).
word_counts = pd.Series(' '.join(df['clean_text']).split()).value_counts()
freq = word_counts[word_counts <= RARE_WORD_MAX_COUNT]

freq.head(10)

waffle       3
virtually    3
tractor      3
monitor      3
warming      3
ought        3
renewable    3
largely      3
midrange     3
luxury       3
dtype: int64

In [15]:
# Collect the rare words into a set: there can be many of them, and scanning a
# list for every token makes this step needlessly slow. `freq` itself is left
# untouched, so this cell can be re-run without first re-running the cell that
# computes it.
rare_words = set(freq.index)

df['clean_text'] = df['clean_text'].apply(lambda text: ' '.join(word for word in text.split() if word not in rare_words))

df['clean_text'].head()

0                                 weekend far watching
1                         stopping tune stream weekend
2    yes still running number confirm officially ho...
3                            getting keep weekend long
4                                     cannot wait tune
Name: clean_text, dtype: object

In [16]:
# Drop rows whose text became empty after the word-level filtering above.

emptied_rows = df.loc[df['clean_text'].eq('')].index
df = df.drop(index=emptied_rows)

print(df.shape[0])
df['clean_text'].head()

9105


0                                 weekend far watching
1                         stopping tune stream weekend
2    yes still running number confirm officially ho...
3                            getting keep weekend long
4                                     cannot wait tune
Name: clean_text, dtype: object

In [17]:
# Write cleaned texts to CSV file.

# Discard any rows with a missing clean_text, then persist only that column
# (without the index) to ./clean_text.csv.
df = df.dropna(subset=['clean_text'])
df.to_csv(
    './clean_text.csv',
    columns=['clean_text'],
    index=False,
)

df.shape[0]

9105