In [1]:
import pandas as pd

df = pd.read_csv('./scraped_tweets.csv').sample(frac=1)

print(len(df.index))
df['text'].head()

9998


797     Found something historic?\nDon’t want to forge...
7431    Our children's health and prosperity is too im...
5674    You become like the 5 people you spend most of...
5246    Tune in LIVE to our NASA Innovative Advanced C...
8823                           pic.twitter.com/WQThgHWcg4
Name: text, dtype: object

In [2]:
# Preprocess tweets - remove @, # and links.

import preprocessor as p

df['clean_text'] = df['text'].apply(lambda text: p.clean(str(text)))

df['clean_text'].head()

797     Found something historic? Don’t want to forget...
7431    Our children's health and prosperity is too im...
5674    You become like the people you spend most of y...
5246    Tune in LIVE to our NASA Innovative Advanced C...
8823                                                     
Name: clean_text, dtype: object

In [3]:
# Convert to lowercase.

df['clean_text'] = df['clean_text'].apply(lambda text: text.lower())

df['clean_text'].head()

797     found something historic? don’t want to forget...
7431    our children's health and prosperity is too im...
5674    you become like the people you spend most of y...
5246    tune in live to our nasa innovative advanced c...
8823                                                     
Name: clean_text, dtype: object

In [4]:
# Remove punctuation.

df['clean_text'] = df['clean_text'].str.replace('[^\w\s]', '')

df['clean_text'].head()

797     found something historic dont want to forget a...
7431    our childrens health and prosperity is too imp...
5674    you become like the people you spend most of y...
5246    tune in live to our nasa innovative advanced c...
8823                                                     
Name: clean_text, dtype: object

In [5]:
# Remove whitespaces.

df['clean_text'] = df['clean_text'].apply(lambda text: str(text).strip())

df['clean_text'].head()

797     found something historic dont want to forget a...
7431    our childrens health and prosperity is too imp...
5674    you become like the people you spend most of y...
5246    tune in live to our nasa innovative advanced c...
8823                                                     
Name: clean_text, dtype: object

In [6]:
# Remove empty texts.

df = df.drop(df[df['clean_text'] == ''].index)

print(len(df.index))
df['clean_text'].head()

9739


797     found something historic dont want to forget a...
7431    our childrens health and prosperity is too imp...
5674    you become like the people you spend most of y...
5246    tune in live to our nasa innovative advanced c...
8138            unaltra importante vittoria forza ragazzi
Name: clean_text, dtype: object

In [7]:
# Include only English texts.
from langdetect import detect

# print("Detecting languages: 0.00%...", end="\r")
# for idx, text in enumerate(df['clean_text'].values):
#     print("Detecting languages: {:.2f}%...".format((idx + 1) / len(df.index) * 100), end="\r")
#     try:
#         detect(text)
#     except Exception:
#         print('text: (', text, ')')

df['lang'] = df['clean_text'].apply(lambda text: detect(text))
df = df.drop(df[df['lang'] != 'en'].index)

print(len(df.index))
df['clean_text'].head()

9148


797     found something historic dont want to forget a...
7431    our childrens health and prosperity is too imp...
5674    you become like the people you spend most of y...
5246    tune in live to our nasa innovative advanced c...
2184         coach genesio to leave lyon at end of season
Name: clean_text, dtype: object

In [8]:
# Lemmatize texts.

from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
df['clean_text'] = df['clean_text'].apply(lambda text: ' '.join(lemmatizer.lemmatize(word) for word in text.split()))

df['clean_text'].head()

797     found something historic dont want to forget a...
7431    our childrens health and prosperity is too imp...
5674    you become like the people you spend most of y...
5246    tune in live to our nasa innovative advanced c...
2184         coach genesio to leave lyon at end of season
Name: clean_text, dtype: object

In [9]:
# Remove non-English words.

from nltk.corpus import words
words = set(words.words())

df['clean_text'] = df['clean_text'].apply(lambda text: ' '.join(word for word in text.split() if word in words))

df['clean_text'].head()

797     found something historic dont want to forget a...
7431    our health and prosperity is too important to ...
5674    you become like the people you spend most of y...
5246    tune in live to our innovative advanced concep...
2184                      coach to leave at end of season
Name: clean_text, dtype: object

In [10]:
# Remove stopwords.

from nltk.corpus import stopwords
stopwords = stopwords.words('english')

df['clean_text'] = df['clean_text'].apply(lambda text: ' '.join(word for word in text.split() if word not in stopwords))

df['clean_text'].head()

797     found something historic dont want forget joke...
7431    health prosperity important ignore consequence...
5674                        become like people spend time
5246    tune live innovative advanced concept symposiu...
2184                               coach leave end season
Name: clean_text, dtype: object

In [11]:
# Remove common words.

freq = pd.Series(' '.join(df['clean_text']).split()).value_counts()[:10]

freq

new       789
u         619
ha        553
today     499
people    495
one       490
say       462
wa        407
year      396
live      380
dtype: int64

In [12]:
freq = list(freq.index)

df['clean_text'] = df['clean_text'].apply(lambda text: ' '.join(word for word in text.split() if word not in freq))

df['clean_text'].head()

797     found something historic dont want forget joke...
7431    health prosperity important ignore consequence...
5674                               become like spend time
5246    tune innovative advanced concept symposium hea...
2184                               coach leave end season
Name: clean_text, dtype: object

In [13]:
# Remove rare words.

freq = pd.Series(' '.join(df['clean_text']).split()).value_counts()[pd.Series(' '.join(df['clean_text']).split()).value_counts() < 5]

freq.head(10)

fled           4
cyclone        4
neighbor       4
warrant        4
forecast       4
holocaust      4
surrounding    4
comfort        4
visibility     4
repeat         4
dtype: int64

In [14]:
freq = list(freq.index)

df['clean_text'] = df['clean_text'].apply(lambda text: ' '.join(word for word in text.split() if word not in freq))

df['clean_text'].head()

797     found something historic dont want forget joke...
7431    health prosperity important ignore consequence...
5674                               become like spend time
5246    tune innovative advanced concept symposium hea...
2184                               coach leave end season
Name: clean_text, dtype: object

In [15]:
# Tokenize texts (Not needed).

# from nltk import word_tokenize

# tokenized_texts = []
# for text in df['clean_text'].values:
#     tokenized_texts.append(word_tokenize(text))

# tokenized_texts[:10]

In [16]:
# Create TF-IDF (1 text).
import numpy as np

tfidf = df['clean_text'][0:1].apply(lambda text: pd.value_counts(text.split(" "))).sum(axis=0).reset_index()
tfidf.columns = ['words', 'tf']
for i, word in enumerate(tfidf['words']):
    tfidf.loc[i, 'idf'] = np.log(df.shape[0] / len(df[df['clean_text'].str.contains(word)]))
tfidf['tf-idf'] = tfidf['tf'] * tfidf['idf']

tfidf.head()

Unnamed: 0,words,tf,idf,tf-idf
0,want,3,3.967999,11.903997
1,whenever,1,6.723395,6.723395
2,article,1,5.510373,5.510373
3,save,1,5.655555,5.655555
4,something,1,4.643954,4.643954


In [17]:
# Create TF-IDF (all texts).
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
tfidf = pd.DataFrame(vectorizer.fit_transform(df['clean_text']).toarray())
tfidf.columns = vectorizer.get_feature_names()

tfidf.head()

Unnamed: 0,abide,ability,able,aboard,absence,absolute,absolutely,abuse,academic,accept,...,ya,yellow,yes,yesterday,yet,york,young,youve,zero,zone
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
