In [1]:
# Load the scraped tweets and discard rows that have no tweet text.

import pandas as pd

df = (
    pd.read_csv('./scraped_tweets.csv')
    .dropna(subset=['text'])
)

# Row count after dropping rows with missing text.
print(df.shape[0])
df['text'].head()

9988


0    Yes, we're still running the numbers, but we c...
1    We're just getting started. Keep tuning into t...
2    Cannot WAIT to see what Donald has up his slee...
3    IT. HAS. ARRIVED. You can catch the official @...
4    Breaking boards left and right! Watch @lizakos...
Name: text, dtype: object

In [2]:
# Preprocess tweets with the `preprocessor` package (strips @mentions,
# #hashtags and links) and store the result in a new `clean_text` column.

import preprocessor as p


def clean_tweet(raw_text):
    """Return `p.clean` applied to the string form of `raw_text`."""
    return p.clean(str(raw_text))


df['clean_text'] = df['text'].map(clean_tweet)

df['clean_text'].head()

0    Yes, we're still running the numbers, but we c...
1    We're just getting started. Keep tuning into t...
2    Cannot WAIT to see what Donald has up his slee...
3    IT. HAS. ARRIVED. You can catch the official l...
4    Breaking boards left and right! Watch and the ...
Name: clean_text, dtype: object

In [3]:
# Lowercase every cleaned text.

df['clean_text'] = df['clean_text'].map(str.lower)

df['clean_text'].head()

0    yes, we're still running the numbers, but we c...
1    we're just getting started. keep tuning into t...
2    cannot wait to see what donald has up his slee...
3    it. has. arrived. you can catch the official l...
4    breaking boards left and right! watch and the ...
Name: clean_text, dtype: object

In [4]:
# Expand contractions (e.g. "we're" -> "we are").

import re

def remove_contractions(text):
    """Expand common English contractions in `text`.

    Typographic apostrophes (’) are first normalised to ASCII apostrophes.
    The irregular forms "won't" and "can't" are rewritten before the generic
    suffix rules. The "n't" rule must run before the bare "'t" rule: with
    the opposite order "'t" fires first and "don't" turns into "don not"
    instead of "do not".

    Matching is case-sensitive (patterns are lowercase), and "'s" is always
    expanded to " is", so possessives are rewritten as well.

    Args:
        text: The string to process.

    Returns:
        The string with the listed contractions expanded.
    """
    text = text.replace('’', '\'')

    # Order matters: specific patterns precede the generic ones they overlap.
    expansions = (
        (r'won\'t', 'will not'),
        (r'can\'t', 'can not'),
        (r'n\'t', ' not'),
        (r'\'s', ' is'),
        (r'\'m', ' am'),
        (r'\'re', ' are'),
        (r'\'ve', ' have'),
        (r'\'ll', ' will'),
        (r'\'d', ' would'),
        (r'\'t', ' not'),
    )
    for pattern, replacement in expansions:
        text = re.sub(pattern, replacement, text)

    return text

# Run the contraction rewriter over every cleaned text.
df['clean_text'] = df['clean_text'].map(remove_contractions)

df['clean_text'].head()

0    yes, we are still running the numbers, but we ...
1    we are just getting started. keep tuning into ...
2    cannot wait to see what donald has up his slee...
3    it. has. arrived. you can catch the official l...
4    breaking boards left and right! watch and the ...
Name: clean_text, dtype: object

In [5]:
# Remove punctuation: delete every character that is neither a word
# character nor whitespace.
#
# The pattern is a regular expression, so `regex=True` is passed explicitly;
# with `regex=False` (the default since pandas 2.0) the pattern would be
# matched as a literal string and nothing would be removed. The raw string
# avoids the invalid "\w" / "\s" escape sequences of a plain literal.

df['clean_text'] = df['clean_text'].str.replace(r'[^\w\s]', '', regex=True)

df['clean_text'].head()

0    yes we are still running the numbers but we ca...
1    we are just getting started keep tuning into t...
2    cannot wait to see what donald has up his slee...
3    it has arrived you can catch the official live...
4    breaking boards left and right watch and the v...
Name: clean_text, dtype: object

In [6]:
# Trim leading and trailing whitespace from every text
# (whitespace inside the text is left untouched).


def strip_text(value):
    """Return the string form of `value` without surrounding whitespace."""
    return str(value).strip()


df['clean_text'] = df['clean_text'].map(strip_text)

df['clean_text'].head()

0    yes we are still running the numbers but we ca...
1    we are just getting started keep tuning into t...
2    cannot wait to see what donald has up his slee...
3    it has arrived you can catch the official live...
4    breaking boards left and right watch and the v...
Name: clean_text, dtype: object

In [7]:
# Discard rows whose cleaned text ended up empty.

empty_rows = df.index[df['clean_text'] == '']
df = df.drop(index=empty_rows)

# Row count after removing the empty texts.
print(df.shape[0])
df['clean_text'].head()

9729


0    yes we are still running the numbers but we ca...
1    we are just getting started keep tuning into t...
2    cannot wait to see what donald has up his slee...
3    it has arrived you can catch the official live...
4    breaking boards left and right watch and the v...
Name: clean_text, dtype: object

In [8]:
# Include only English texts.
from langdetect import DetectorFactory, detect
from langdetect.lang_detect_exception import LangDetectException

# langdetect's algorithm is non-deterministic unless a seed is set, which
# would make the surviving row count vary from run to run.
DetectorFactory.seed = 0


def detect_language(text):
    """Return the language code detected for `text`, or None on failure.

    `detect` raises LangDetectException when it cannot extract any features
    from the text; such rows get None and are dropped below together with
    all other non-English rows instead of aborting the whole cell.
    """
    try:
        return detect(text)
    except LangDetectException:
        return None


df['lang'] = df['clean_text'].apply(detect_language)

# Keep only the rows classified as English ('en').
non_english_rows = df.index[df['lang'] != 'en']
df = df.drop(index=non_english_rows)

print(len(df.index))
df['clean_text'].head()

9162


0    yes we are still running the numbers but we ca...
1    we are just getting started keep tuning into t...
2    cannot wait to see what donald has up his slee...
3    it has arrived you can catch the official live...
4    breaking boards left and right watch and the v...
Name: clean_text, dtype: object

In [9]:
# Lemmatize every whitespace-separated word of every text.
# NOTE(review): lemmatize() is called without a part-of-speech argument and
# the displayed output shows "has" becoming "ha" - presumably because words
# are treated as nouns; confirm whether POS-aware lemmatization is wanted.

from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()


def lemmatize_text(text):
    """Lemmatize each token of `text` and re-join them with single spaces."""
    lemmas = [lemmatizer.lemmatize(token) for token in text.split()]
    return ' '.join(lemmas)


df['clean_text'] = df['clean_text'].map(lemmatize_text)

df['clean_text'].head()

0    yes we are still running the number but we can...
1    we are just getting started keep tuning into t...
2    cannot wait to see what donald ha up his sleev...
3    it ha arrived you can catch the official lives...
4    breaking board left and right watch and the vl...
Name: clean_text, dtype: object

In [10]:
# Keep only the words that appear in NLTK's `words` corpus.

from nltk.corpus import words
# Rebinds `words` from the corpus reader to a set of its entries.
words = set(words.words())


def keep_known_words(text):
    """Return `text` reduced to the tokens contained in `words`."""
    kept = [token for token in text.split() if token in words]
    return ' '.join(kept)


df['clean_text'] = df['clean_text'].map(keep_known_words)

df['clean_text'].head()

0    yes we are still running the number but we can...
1    we are just getting keep tuning into the all w...
2    cannot wait to see what ha up his sleeve tune ...
3    it ha you can catch the official both weekend ...
4    breaking board left and right watch and the tr...
Name: clean_text, dtype: object

In [11]:
# Remove stopwords (NLTK's English stopword list).

from nltk.corpus import stopwords
# `stopwords` is rebound from the corpus reader to the list of English stopwords.
stopwords = stopwords.words('english')
# Membership is tested once per word of every text, so look words up in a
# set rather than scanning the list each time.
stopword_set = set(stopwords)

df['clean_text'] = df['clean_text'].apply(
    lambda text: ' '.join(word for word in text.split() if word not in stopword_set)
)

df['clean_text'].head()

0    yes still running number confirm officially ho...
1                     getting keep tuning weekend long
2              cannot wait see ha sleeve tune live min
3         ha catch official weekend music whether tune
4      breaking board left right watch try martial art
Name: clean_text, dtype: object

In [12]:
# Find common words: those occurring more than COMMON_WORD_THRESHOLD times
# across all cleaned texts.

COMMON_WORD_THRESHOLD = 300

# Count every word once; the counts are reused for both the values and the
# filter mask instead of recomputing them over the whole corpus.
word_counts = pd.Series(' '.join(df['clean_text']).split()).value_counts()
freq = word_counts[word_counts > COMMON_WORD_THRESHOLD]

freq.head(10)

new       791
u         619
ha        553
today     496
people    495
one       490
say       463
wa        407
year      396
live      380
dtype: int64

In [13]:
# Drop the words collected in `freq` from every text.
freq = list(freq.index)
# Membership is tested once per word of every text, so look words up in a
# set rather than scanning the list each time.
common_words = set(freq)

df['clean_text'] = df['clean_text'].apply(
    lambda text: ' '.join(word for word in text.split() if word not in common_words)
)

df['clean_text'].head()

0    yes still running number confirm officially ho...
1                     getting keep tuning weekend long
2                          cannot wait sleeve tune min
3            catch official weekend music whether tune
4            breaking board left right try martial art
Name: clean_text, dtype: object

In [14]:
# Find rare words: those occurring at most RARE_WORD_MAX_COUNT times across
# all cleaned texts.

RARE_WORD_MAX_COUNT = 3

# Count every word once; the counts are reused for both the values and the
# filter mask instead of recomputing them over the whole corpus.
word_counts = pd.Series(' '.join(df['clean_text']).split()).value_counts()
freq = word_counts[word_counts <= RARE_WORD_MAX_COUNT]

freq.head(10)

observation    3
wisely         3
portfolio      3
respond        3
ur             3
batch          3
portrait       3
otherwise      3
paving         3
immediate      3
dtype: int64

In [15]:
# Drop the words collected in `freq` from every text.
freq = list(freq.index)
# Membership is tested once per word of every text, so look words up in a
# set rather than scanning the list each time.
rare_words = set(freq)

df['clean_text'] = df['clean_text'].apply(
    lambda text: ' '.join(word for word in text.split() if word not in rare_words)
)

df['clean_text'].head()

0    yes still running number confirm officially ho...
1                            getting keep weekend long
2                                     cannot wait tune
3            catch official weekend music whether tune
4                    breaking board left right try art
Name: clean_text, dtype: object

In [16]:
# Word filtering can leave some texts empty; discard those rows as well.

now_empty = df.index[df['clean_text'] == '']
df = df.drop(index=now_empty)

# Row count after removing the newly emptied texts.
print(df.shape[0])
df['clean_text'].head()

9090


0    yes still running number confirm officially ho...
1                            getting keep weekend long
2                                     cannot wait tune
3            catch official weekend music whether tune
4                    breaking board left right try art
Name: clean_text, dtype: object

In [17]:
# Save the cleaned texts to CSV: only the `clean_text` column, no index.

output_path = './clean_text.csv'

df = df.dropna(subset=['clean_text'])
df.to_csv(output_path, columns=['clean_text'], index=False)

# Number of rows written.
df.shape[0]

9090