In [1]:
import pandas as pd

# Fixed seed for the shuffle below, so that "Restart & Run All" reproduces the
# same row order (and therefore the same previews and per-row results).
SEED = 42

# Load the scraped tweets and shuffle all rows (frac=1 keeps every row).
df = pd.read_csv('./scraped_tweets.csv').sample(frac=1, random_state=SEED)

# Row count followed by a preview of the raw tweet text.
print(len(df.index))
df['text'].head()

9998


7335    Judge Garland is highly qualified for the benc...
4726    Agree - so important to attract girls to tech ...
6163    Dispelling both the pleasant and the unpleasan...
7378    Rising sea levels are already flooding homes a...
957     Five days of Deepavali celebrations begin toda...
Name: text, dtype: object

In [2]:
# Preprocess tweets - remove @, # and links.

import preprocessor as p

# Missing tweets (NaN) are replaced by '' before cleaning. Calling str() on NaN
# would otherwise inject the literal word 'nan' into the corpus as if it were
# tweet text. Any other non-string value is still stringified, as before.
df['clean_text'] = df['text'].fillna('').astype(str).apply(p.clean)

df['clean_text'].head()

7335    Judge Garland is highly qualified for the benc...
4726    Agree - so important to attract girls to tech ...
6163    Dispelling both the pleasant and the unpleasan...
7378    Rising sea levels are already flooding homes a...
957     Five days of Deepavali celebrations begin toda...
Name: clean_text, dtype: object

In [3]:
# Convert to lowercase.

# Vectorised string accessor; every value in `clean_text` is a str at this point.
df['clean_text'] = df['clean_text'].str.lower()

df['clean_text'].head()

7335    judge garland is highly qualified for the benc...
4726    agree - so important to attract girls to tech ...
6163    dispelling both the pleasant and the unpleasan...
7378    rising sea levels are already flooding homes a...
957     five days of deepavali celebrations begin toda...
Name: clean_text, dtype: object

In [None]:
# Remove punctuation.

# The pattern is a regular expression: drop every character that is neither a
# word character (letters, digits, underscore) nor whitespace.
# - regex=True is passed explicitly because pandas >= 2.0 treats the pattern as
#   a literal string by default, which would silently remove nothing.
# - The raw string avoids the invalid-escape warning for '\w' and '\s'.
df['clean_text'] = df['clean_text'].str.replace(r'[^\w\s]', '', regex=True)

df['clean_text'].head()

7335    judge garland is highly qualified for the benc...
4726    agree  so important to attract girls to tech c...
6163    dispelling both the pleasant and the unpleasan...
7378    rising sea levels are already flooding homes a...
957     five days of deepavali celebrations begin toda...
Name: clean_text, dtype: object

In [None]:
# Strip leading and trailing whitespace.
# (Whitespace inside a text is left untouched by this step.)

df['clean_text'] = df['clean_text'].map(str).str.strip()

df['clean_text'].head()

7335    judge garland is highly qualified for the benc...
4726    agree  so important to attract girls to tech c...
6163    dispelling both the pleasant and the unpleasan...
7378    rising sea levels are already flooding homes a...
957     five days of deepavali celebrations begin toda...
Name: clean_text, dtype: object

In [None]:
# Remove empty texts.

# Collect the index labels of rows whose cleaned text is the empty string and
# drop those labels from the frame.
df = df.drop(index=df.index[df['clean_text'].eq('')])

# Remaining row count, then a preview.
print(df.shape[0])
df['clean_text'].head()

9739


7335    judge garland is highly qualified for the benc...
4726    agree  so important to attract girls to tech c...
6163    dispelling both the pleasant and the unpleasan...
7378    rising sea levels are already flooding homes a...
957     five days of deepavali celebrations begin toda...
Name: clean_text, dtype: object

In [None]:
# Include only English texts.
from langdetect import DetectorFactory, detect
from langdetect.lang_detect_exception import LangDetectException

# langdetect's algorithm is randomised; seeding the factory makes the detected
# language (and therefore the set of surviving rows) stable between runs.
DetectorFactory.seed = 0


def detect_language(text):
    """Return the language code detected for ``text``, or None on failure.

    ``detect`` raises ``LangDetectException`` when a text offers no usable
    features (for example, only digits). Returning None lets such rows be
    filtered out as "not English" instead of aborting the whole cell.
    """
    try:
        return detect(text)
    except LangDetectException:
        return None


df['lang'] = df['clean_text'].apply(detect_language)
# None != 'en' is True, so undetectable rows are dropped with the non-English ones.
df = df.drop(df[df['lang'] != 'en'].index)

print(len(df.index))
df['clean_text'].head()

In [None]:
# Lemmatize texts.

from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

# Split each text on whitespace, lemmatize every token (no POS argument is
# given, so the lemmatizer's default is used) and re-join with single spaces.
df['clean_text'] = df['clean_text'].map(
    lambda text: ' '.join(map(lemmatizer.lemmatize, text.split()))
)

df['clean_text'].head()

In [None]:
# Remove non-English words.

from nltk.corpus import words
# Note: this rebinds `words` from the corpus reader to a plain set of its entries.
words = set(words.words())

# Keep only the tokens that appear in the NLTK word list.
df['clean_text'] = df['clean_text'].map(
    lambda text: ' '.join([token for token in text.split() if token in words])
)

df['clean_text'].head()

In [None]:
# Remove stopwords.

from nltk.corpus import stopwords
# Note: this rebinds `stopwords` from the corpus reader to the English stopword list.
stopwords = stopwords.words('english')

# Keep only the tokens that are not English stopwords.
df['clean_text'] = df['clean_text'].map(
    lambda text: ' '.join([token for token in text.split() if token not in stopwords])
)

df['clean_text'].head()

In [None]:
# Remove common words.

# Count every token across all cleaned texts, then keep only the tokens that
# occur more than 300 times in the whole corpus.
freq = (
    pd.Series(' '.join(df['clean_text']).split())
    .value_counts()
    .loc[lambda counts: counts > 300]
)

freq.head(10)

In [None]:
# `freq` arrives as a Series indexed by word; only the words are needed here.
# The guard keeps this cell re-runnable: once `freq` is already a list,
# `list(freq.index)` would raise TypeError (`list.index` is a method).
if not isinstance(freq, list):
    freq = list(freq.index)

# Set copy for constant-time membership tests inside the per-row filter.
common_words = set(freq)

# Drop every token that is one of the over-frequent words.
df['clean_text'] = df['clean_text'].apply(
    lambda text: ' '.join(word for word in text.split() if word not in common_words)
)

df['clean_text'].head()

In [None]:
# Remove rare words.

# Count every token across all cleaned texts, then keep only the tokens that
# occur fewer than 3 times in the whole corpus.
freq = (
    pd.Series(' '.join(df['clean_text']).split())
    .value_counts()
    .loc[lambda counts: counts < 3]
)

freq.head(10)

In [None]:
# `freq` arrives as a Series indexed by word; only the words are needed here.
# The guard keeps this cell re-runnable: once `freq` is already a list,
# `list(freq.index)` would raise TypeError (`list.index` is a method).
if not isinstance(freq, list):
    freq = list(freq.index)

# The rare-word list can be large; a set avoids a linear scan of it for every
# single token in the corpus.
rare_words = set(freq)

# Drop every token that is one of the rare words.
df['clean_text'] = df['clean_text'].apply(
    lambda text: ' '.join(word for word in text.split() if word not in rare_words)
)

df['clean_text'].head()

In [None]:
# Tokenize texts (Not needed).

# from nltk import word_tokenize

# tokenized_texts = []
# for text in df['clean_text'].values:
#     tokenized_texts.append(word_tokenize(text))

# tokenized_texts[:10]

In [None]:
# Create TF-IDF (1 text).
from collections import Counter

import numpy as np

# Term frequency: token counts of the first text only. Whitespace-splitting
# (rather than split(" ")) avoids empty-string "words" when a text contains
# consecutive spaces.
first_tokens = ' '.join(df['clean_text'].iloc[:1]).split()
tfidf = (
    pd.Series(first_tokens, dtype='object')
    .value_counts()
    .rename_axis('words')
    .reset_index(name='tf')
)

# Document frequency: number of texts containing each word as a whole token.
# A substring test such as str.contains would over-count (e.g. "art" inside
# "party") and would interpret the word as a regular expression.
doc_freq = Counter(
    token
    for tokens in df['clean_text'].str.split()
    for token in set(tokens)
)

# idf = ln(N / df). Every word comes from a text in the corpus, so df >= 1.
tfidf['idf'] = np.log(df.shape[0] / tfidf['words'].map(doc_freq))
tfidf['tf-idf'] = tfidf['tf'] * tfidf['idf']

tfidf.head()

In [None]:
# Create TF-IDF (all texts).
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
# Sparse document-term matrix: one row per text, one column per vocabulary term.
matrix = vectorizer.fit_transform(df['clean_text'])

# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# and fall back only on older releases that do not have it yet.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# Dense copy labelled with the vocabulary terms (rows keep a default 0..n-1 index).
tfidf = pd.DataFrame(matrix.toarray(), columns=feature_names)

tfidf.head()